/*
 * Copyright (C) 2007 by
 * 
 * 	Xuan-Hieu Phan
 *	hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
 * 	Graduate School of Information Sciences
 * 	Tohoku University
 *
 * GibbsLDA++ is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * GibbsLDA++ is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GibbsLDA++; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

/* 
 * References:
 * + The Java code of Gregor Heinrich (gregor@arbylon.net)
 *   http://www.arbylon.net/projects/LdaGibbsSampler.java
 * + "Parameter estimation for text analysis" by Gregor Heinrich
 *   http://www.arbylon.net/publications/text-est.pdf
 */

#ifndef	_MODEL_H
#define	_MODEL_H

#include "constants.h"
#include "dataset.h"

using namespace std;

// LDA model.
//
// Aggregates, as public members, everything declared below: the file-name
// options, pointers to the training/new datasets, the id maps, the sizes and
// hyperparameters, the count tables and distribution arrays, a parallel set
// of "new*" members used for inference, and the member functions that
// initialize, estimate, infer and save. Besides words/documents, the members
// also track authors and confs (conferences).
//
// NOTE(review): the class holds many raw pointers and declares a destructor,
// but declares no copy constructor / copy assignment. Whether the pointers are
// owned (and therefore whether copying a model is safe) depends on ~model(),
// which is defined elsewhere -- confirm before copying instances.
class model {
public:
    // fixed options
    string wordmapfile;		// file that contains word map [string -> integer id]
	string authormapfile;	// file that contains author map [string -> integer id]
	string confmapfile;	// file that contains conf map [int -> int id]
    string trainlogfile;	// training log file
    string tassign_suffix;	// suffix for topic assignment file
    string theta_suffix;	// suffix for theta file
	string theta_ak_suffix; // suffix for the theta_ak file
	string theta_ck_suffix; // suffix for the theta_ck file
	string topic_doc_suffix; // suffix for the topic doc file
	string topic_author_suffix; // suffix for the topic author file
	string topic_conf_suffix; // suffix for the topic conference file
	string topic_suffix; // suffix for topic file
    string phi_suffix;		// suffix for phi file
	string psi_suffix;		// suffix for psi file
    string others_suffix;	// suffix for file containing other parameters
    string twords_suffix;	// suffix for file containing words-per-topics
	string tauthors_suffix;	// suffix for file containing authors-per-topics
	string tconf_suffix;	// presumably suffix for file containing confs-per-topics (original comment was truncated) -- TODO confirm
	string test_topic_given_doc_suffix ;// suffix for the P(z|d) file
	string test_doc_given_topic_suffix ;// suffix for the P(d|z) file

    string dir;			// model directory
    string dfile;		// data file
    string model_name;		// model name
    int model_status;		// model status:
				// MODEL_STATUS_UNKNOWN: unknown status
				// MODEL_STATUS_EST: estimating from scratch
				// MODEL_STATUS_ESTC: continue to estimate the model from a previous one
				// MODEL_STATUS_INF: do inference

    dataset * ptrndata;	// pointer to training dataset object
    dataset * pnewdata; // pointer to new dataset object

    mapid2word id2word; // word map [int => string]
	mapid2author id2author; // author map [int => string]
    mapid2conf id2conf; // conf map [int id => int]

    // --- model parameters and variables ---
    int M; // dataset size (i.e., number of docs)
    int V; // vocabulary size
	int A; // number of authors
	int C; // number of confs (conferences)
	int TOTAL_W; // total number of word tokens

    int K; // number of topics
	int KA; // product of number of topics and number of authors K*A, where 0 indicates z=0&x=0, 1 indicates z=0&x=1, ...
    double alpha, beta, mu; // model hyperparameters
    int niters; // number of Gibbs sampling iterations
    int liter; // the iteration at which the model was saved
    int savestep; // saving period
    int twords; // print out top words per each topic
    int withrawstrs; // NOTE(review): not documented in this header; see its use in the implementation

    //double * p; // temp variable for sampling
    int ** z; // topic assignments for words, size M x doc.size()
    int ** nw; // nw[i][j]: number of instances of word/term i assigned to topic j, size V x K
    int ** nd; // nd[i][j]: number of words in document i assigned to topic j, size M x K
	int ** na; // na[i][j]: number of times author i assigned to topic j, size A x K
	int ** nc; // nc[i][j]: number of times conf i assigned to topic j, size C x K
    int * nwsum; // nwsum[j]: total number of words assigned to topic j, size K
    int * ndsum; // ndsum[i]: total number of words in document i, size M
	int * naksum; // naksum[a]: total number of times of topics assigned to author a, size A
	int * nkasum; // nkasum[k]: total number of times of authors assigned to topic k, size K
	int * ncksum; // ncksum[c]: total number of times of topics assigned to conf c, size C
	int * nkcsum; // nkcsum[k]: total number of times of conf assigned to topic k, size K
	int * n; // NOTE(review): purpose and size are not documented in this header -- confirm in the implementation
    double ** theta; // theta: document-topic distributions, size M x K
	double ** theta_ak; // theta_ak: author-topic distributions, size A x K
	double ** theta_ck; // theta_ck: presumably conf-topic distributions, size C x K, used for analysis (original comment was a copy of theta_ak's) -- TODO confirm
    double ** phi; // phi: topic-word distributions, size K x V
	double ** topic_doc_kd; // topic_doc_kd: topic-doc distribution, size K x M, used for analysis
	double ** topic_author_ka; // topic_author_ka: topic-author distribution, size K x A, used for analysis
	double ** topic_conf; // topic_conf: topic-conference distribution, size K x C
	double * topic; // topic: topic distribution, size K
	double * doc; // doc: doc distribution, size M
	double ** test_topic_given_doc; // test_topic_given_doc: document-topic distributions, size M x K
	double ** test_doc_given_topic; // test_doc_given_topic: topic-document distributions, size K x M
    	
    // for inference only
    // The new* members below mirror, by name, the estimation-side members
    // above (newM <-> M, newnw <-> nw, ...). Their exact sizes are set in the
    // implementation, which is not visible from this header.
    int inf_liter;
    int newM;
    int newV;
	int newA;
	int newC;
	int newTOTAL_W; // total number of word tokens in the new data
    int ** newz;
    int ** newnw;
    int ** newnd;
	int ** newna;
	int ** newnc;
    int * newnwsum;
    int * newndsum;
	int * newnaksum;
	int * newnkasum;
	int * newnkcsum;
	int * newncksum;
    double ** newtheta;
	double ** newtheta_ak;
	double ** newtheta_ck;
    double ** newphi;
	double ** newtopic_author_ka;
	double ** newtopic_conf;
    // --------------------------------------
    
    // The default constructor only delegates to set_default_values().
    model() {
	set_default_values();
    }
          
    // Defined out of line (not visible in this header).
    ~model();
    
    // set default values for variables
    void set_default_values();   

    // parse command line to get options
    int parse_args(int argc, char ** argv);
    
    // initialize the model
    int init(int argc, char ** argv);
    
    // load LDA model to continue estimating or to do inference
    int load_model(string model_name);
    
    // save LDA model to files
    // model_name.tassign: topic assignments for words in docs
    // model_name.theta: document-topic distributions
    // model_name.phi: topic-word distributions
    // model_name.others: containing other parameters of the model (alpha, beta, M, V, K)
    // The remaining save_model_* functions are named after the member they
    // presumably write (theta_ak, theta_ck, topic_doc_kd, ...); each takes the
    // output file name. Return-code conventions are defined in the implementation.
    int save_model(string model_name, bool isfinal);
    int save_model_tassign(string filename);
    int save_model_theta(string filename);
	int save_model_theta_ak(string filename);
	int save_model_theta_ck(string filename);
    int save_model_phi(string filename);
	int save_model_others(string filename);
    int save_model_twords(string filename);
	int save_model_tauthors(string filename);
	int save_model_tconfs(string filename);
	int save_model_topic_doc_kd(string filename);
	int save_model_topic_author_ka(string filename);
	int save_model_topic_conf(string filename);
	int save_model_topic(string filename);
	int save_model_test_topic_given_doc(string filename);
	int save_model_test_doc_given_topic(string filename);
    
    // saving inference outputs
    int save_inf_model(string model_name);
    int save_inf_model_tassign(string filename);
    int save_inf_model_newtheta(string filename);
    int save_inf_model_newphi(string filename);
    int save_inf_model_others(string filename);
    int save_inf_model_twords(string filename);
	int compute_and_save_perplexity(string filename);
    
    // init for estimation
    // (init_est / init_estc presumably correspond to MODEL_STATUS_EST and
    // MODEL_STATUS_ESTC respectively -- confirm in the implementation)
    int init_est();
    int init_estc();
	
    // estimate LDA model using Gibbs sampling
    void estimate();
    // NOTE(review): presumably m is a document index and n a word position
    // within that document, and the return value is the sampled topic --
    // confirm against the implementation.
    int sampling(int m, int n);
    // compute_* functions are named after the distribution member they
    // presumably fill in (theta, theta_ak, theta_ck, phi, ...).
    void compute_theta();
	void compute_theta_ak();
	void compute_theta_ck();
	void compute_phi();
	void compute_topic_doc_kd();
	void compute_topic_author_ka();
    void compute_topic_conf();
	void compute_topic();
	//void compute_doc();
    void compute_test_topic_given_doc();
	void compute_test_doc_given_topic();

	
	double compute_perplexity();
	    
    // init for inference
    int init_inf();
    // inference for new (unseen) data based on the estimated LDA model
    void inference();
    // inference-side counterpart of sampling(); same caveat on m and n applies
    int inf_sampling(int m, int n);
    void compute_newtheta();
    void compute_newphi();
	
	void compute_newtheta_ak();
	void compute_newtopic_author_ka();
	void compute_newtopic_conf();

	//academic suggestion
	//void citation_suggestion();
	void conference_suggestion();
};

#endif

