/*
 * Copyright (C) 2007 by
 * 
 * 	Xuan-Hieu Phan
 *	hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
 * 	Graduate School of Information Sciences
 * 	Tohoku University
 *
 * GibbsLDA++ is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * GibbsLDA++ is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GibbsLDA++; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#ifndef	_DATASET_H
#define	_DATASET_H

#include <string>
#include <vector>
#include <map>

using namespace std;

// Vocabulary lookup: word/term string => integer word id.
typedef map<string, int> mapword2id;
// Reverse vocabulary lookup: integer word id => word/term string.
typedef map<int, string> mapid2word;


// Author lookup: author name => integer author id.
typedef map<string, int> mapauthor2id;
// Reverse author lookup: integer author id => author name.
typedef map<int, string> mapid2author;

// Conference lookup: conference string => integer conference id.
typedef map<string, int> mapconf2id;
// Reverse conference lookup: integer conference id => conference string.
typedef map<int, string>mapid2conf;

/**
 * One document of the corpus.
 *
 * The object owns two heap arrays, `words` (with `length` entries) and
 * `authors` (with `authorcount` entries). Both are released with delete[]
 * in the destructor, so any pointer stored into these public members from
 * outside must come from `new int[n]` (or be NULL), and the matching count
 * member must be kept in sync. Copy construction and copy assignment make
 * deep copies of both arrays.
 */
class document {
public:
    int * words;            // owned array of `length` ints; NULL when unset
    std::string rawstr;     // raw text of the document; "" when not supplied
    std::string title;      // document title; "" until settitle() is called
    int length;             // number of entries in `words`

    int * authors;          // owned array of `authorcount` ints; NULL when unset
    int authorcount;        // number of entries in `authors`

    int confid;             // conference id; -1 means "not set"

    /** Creates an empty document: no words, no authors, confid == -1. */
    document()
        : words(NULL), rawstr(""), title(""), length(0),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /** Creates a document with `length` word slots, all initialised to 0. */
    document(int length)
        : words(new int[length]()), rawstr(""), title(""), length(length),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /** Creates a document holding a private copy of words[0..length). */
    document(int length, int * words)
        : words(alloc_copy(words, length)), rawstr(""), title(""), length(length),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /** Same as above, additionally remembering the raw text `rawstr`. */
    document(int length, int * words, std::string rawstr)
        : words(alloc_copy(words, length)), rawstr(rawstr), title(""), length(length),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /** Creates a document holding a copy of the entries of `doc`. */
    document(const std::vector<int> & doc)
        : words(from_vector(doc)), rawstr(""), title(""),
          length(static_cast<int>(doc.size())),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /** Same as above, additionally remembering the raw text `rawstr`. */
    document(const std::vector<int> & doc, std::string rawstr)
        : words(from_vector(doc)), rawstr(rawstr), title(""),
          length(static_cast<int>(doc.size())),
          authors(NULL), authorcount(0), confid(-1) {
    }

    /**
     * Deep copy: the new object gets its own `words` / `authors` arrays.
     * A NULL array in `other` stays NULL in the copy.
     */
    document(const document & other)
        : words(NULL), rawstr(other.rawstr), title(other.title), length(other.length),
          authors(NULL), authorcount(other.authorcount), confid(other.confid) {
        if (other.words) {
            words = alloc_copy(other.words, other.length);
        }
        if (other.authors) {
            try {
                authors = alloc_copy(other.authors, other.authorcount);
            } catch (...) {
                // the destructor will not run for a half-built object
                delete[] words;
                throw;
            }
        }
    }

    /** Deep-copy assignment; `*this` is left untouched if copying throws. */
    document & operator=(const document & other) {
        if (this != &other) {
            document tmp(other);
            swap_with(tmp);     // tmp's destructor frees our previous arrays
        }
        return *this;
    }

    /** Releases both owned arrays. */
    ~document() {
        // arrays were created with new[], so they need delete[];
        // delete[] on NULL is a no-op
        delete[] words;
        delete[] authors;
    }

    /** Replaces the word array with a copy of `doc` and updates `length`. */
    void setwords(const std::vector<int> & doc) {
        // allocate before freeing so a failed allocation leaves us intact
        int * fresh = from_vector(doc);
        delete[] words;
        words = fresh;
        length = static_cast<int>(doc.size());
    }

    /**
     * Replaces the word array with `length` zero-initialised slots.
     * The previous contents are discarded (and freed).
     */
    void setwordcount(int length) {
        int * fresh = new int[length]();
        delete[] this->words;
        this->words = fresh;
        this->length = length;
    }

    /**
     * Replaces the author array with `count` zero-initialised slots.
     * The previous contents are discarded (and freed).
     */
    void setauthorcount(int count) {
        int * fresh = new int[count]();
        delete[] this->authors;
        this->authors = fresh;
        this->authorcount = count;
    }

    /** Sets the conference id. */
    void setconfid(int cid) {
        this->confid = cid;
    }

    /** Sets the document title. */
    void settitle(std::string title) {
        this->title = title;
    }

private:
    // Allocates n ints and fills them from src[0..n); zero-filled if src is NULL.
    static int * alloc_copy(const int * src, int n) {
        if (src == NULL) {
            return new int[n]();
        }
        int * dst = new int[n];
        for (int i = 0; i < n; i++) {
            dst[i] = src[i];
        }
        return dst;
    }

    // Allocates an array holding a copy of every element of v.
    static int * from_vector(const std::vector<int> & v) {
        const int n = static_cast<int>(v.size());
        int * dst = new int[n];
        for (int i = 0; i < n; i++) {
            dst[i] = v[i];
        }
        return dst;
    }

    // Exchanges every member with `other`; never throws.
    void swap_with(document & other) {
        int * p = words;        words = other.words;                other.words = p;
        p = authors;            authors = other.authors;            other.authors = p;
        int n = length;         length = other.length;              other.length = n;
        n = authorcount;        authorcount = other.authorcount;    other.authorcount = n;
        n = confid;             confid = other.confid;              other.confid = n;
        rawstr.swap(other.rawstr);
        title.swap(other.title);
    }
};

class dataset {
public:
    document ** docs;
    document ** _docs; // used only for inference
    map<int, int> _id2id; // also used only for inference
	map<int, int> _aid2aid; // also used only for inference
	map<int, int> _cid2cid; // also used only for inference

	map<int, int> id2_id;//inf only
	map<int, int> aid2_aid;//inf only
	map<int, int> cid2_cid;//inf only


    int M; // number of documents
    int V; // number of words
	int A; // number of authors
	int C; // number of conferences
    
    dataset() {
	docs = NULL;
	_docs = NULL;
	M = 0;
	V = 0;
	A = 0;
	C = 0;
    }
    
    dataset(int M) {
	this->M = M;
	this->V = 0;
	this->A = 0;
	this->C = 0;
	docs = new document*[M];	
	_docs = NULL;
    }   
    
    ~dataset() {
	if (docs) {
	    for (int i = 0; i < M; i++) {
		delete docs[i];
	    }
		delete docs;
	}
	
	
	if (_docs) {
	    for (int i = 0; i < M; i++) {
		delete _docs[i];		
	    }
		delete _docs;
	}
	
    }
    
    void deallocate() {
	if (docs) {
	    for (int i = 0; i < M; i++) {
		delete docs[i];
	    }
	}
	delete docs;
	docs = NULL;

	if (_docs) {
	    for (int i = 0; i < M; i++) {
		delete _docs[i];
	    }
	}
	delete _docs;
	_docs = NULL;
    }
    
    void add_doc(document * doc, int idx) {
	if (0 <= idx && idx < M) {
	    docs[idx] = doc;
	}
    }   
    
    void _add_doc(document * doc, int idx) {
	if (0 <= idx && idx < M) {
	    _docs[idx] = doc;
	}
    }       

    static int write_wordmap(string wordmapfile, mapword2id * pword2id);
    static int read_wordmap(string wordmapfile, mapword2id * pword2id);
    static int read_wordmap(string wordmapfile, mapid2word * pid2word);

	static int write_confmap(string confmapfile, mapconf2id * pconf2id);
	static int read_id2confmap(string confmapfile, mapid2conf * pid2conf);
	static int read_conf2idmap(string confmapfile, mapconf2id * pconf2id);

	static int write_authormap(string authormapfile, mapauthor2id * pauthor2id);
	static int read_authormap(string authormapfile, mapauthor2id * pauthor2id);
	static int read_authormap(string authormapfile, mapid2author * pid2author);
    
    int read_trndata(string dfile, string wordmapfile, string authormapfile, string confmapfile);
    int read_newdata(string dfile, string wordmapfile, string authormapfile, string confmapfile);
    int read_newdata_withrawstrs(string dfile, string wordmapfile);
};

#endif

