/*
 * Copyright (C) 2007 by
 * 
 * 	Xuan-Hieu Phan
 *	hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
 * 	Graduate School of Information Sciences
 * 	Tohoku University
 *
 * GibbsLDA++ is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * GibbsLDA++ is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GibbsLDA++; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include "constants.h"
#include "strtokenizer.h"
#include "dataset.h"
#include "utils.h"

using namespace std;

int dataset::write_wordmap(string wordmapfile, mapword2id * pword2id) {
    FILE * fout = fopen(wordmapfile.c_str(), "w");
    if (!fout) {
	printf("Cannot open file %s to write!\n", wordmapfile.c_str());
	return 1;
    }    
    
    mapword2id::iterator it;
    fprintf(fout, "%d\n", pword2id->size());
    for (it = pword2id->begin(); it != pword2id->end(); it++) {
	fprintf(fout, "%s %d\n", (it->first).c_str(), it->second);
    }
    
    fclose(fout);
    
    return 0;
}

/*
 * Load a word -> id map from a text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<word> <id>" separated by blanks/tabs. Lines that do not split
 * into exactly two tokens are skipped (they still count towards the
 * announced number of entries).
 *
 * wordmapfile : path of the file to read
 * pword2id    : destination map; cleared before anything else happens
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_wordmap(string wordmapfile, mapword2id * pword2id) {
    pword2id->clear();

    FILE * fin = fopen(wordmapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", wordmapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", wordmapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nwords = atoi(buff);

    for (int i = 0; i < nwords; i++) {
        // Stop at end of file instead of re-parsing the previous line when
        // the file holds fewer lines than the header announced.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, " \t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // token 0 is the word, token 1 its numeric id
        pword2id->insert(pair<string, int>(strtok.token(0), atoi(strtok.token(1).c_str())));
    }

    fclose(fin);

    return 0;
}

/*
 * Load an id -> word map from a word-map text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<word> <id>" separated by blanks/tabs. Lines that do not split
 * into exactly two tokens are skipped (they still count towards the
 * announced number of entries).
 *
 * wordmapfile : path of the file to read
 * pid2word    : destination map (keyed by id); cleared before reading
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_wordmap(string wordmapfile, mapid2word * pid2word) {
    pid2word->clear();

    FILE * fin = fopen(wordmapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", wordmapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", wordmapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nwords = atoi(buff);

    for (int i = 0; i < nwords; i++) {
        // A short file ends the loop; the stale buffer is never parsed twice.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, " \t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // the file stores "<word> <id>"; the map is keyed the other way round
        pid2word->insert(pair<int, string>(atoi(strtok.token(1).c_str()), strtok.token(0)));
    }

    fclose(fin);

    return 0;
}




int dataset::write_confmap(string confmapfile, mapconf2id * pconf2id) {
    FILE * fout = fopen(confmapfile.c_str(), "w");
    if (!fout) {
		printf("Cannot open file %s to write!\n", confmapfile.c_str());
		return 1;
    }    
    
    mapconf2id::iterator it;
    fprintf(fout, "%d\n", pconf2id->size());
    for (it = pconf2id->begin(); it != pconf2id->end(); it++) {
		fprintf(fout, "%s\t%d\n", (it->first).c_str(), it->second);
    }
    
    fclose(fout);
    
    return 0;
}

/*
 * Load an id -> conference map from a conference-map text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<conference>\t<id>". Fields are split on tab/CR/LF only and then
 * passed through utils::trimstring(). Lines that do not split into exactly
 * two fields are skipped (they still count towards the announced number of
 * entries).
 *
 * confmapfile : path of the file to read
 * pid2conf    : destination map (keyed by id); cleared before reading
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_id2confmap(string confmapfile, mapid2conf * pid2conf) {
    pid2conf->clear();

    FILE * fin = fopen(confmapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", confmapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", confmapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nconfs = atoi(buff);

    for (int i = 0; i < nconfs; i++) {
        // A short file ends the loop; the stale buffer is never parsed twice.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, "\t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // field 0 is the conference name, field 1 its numeric id
        string conf = utils::trimstring(strtok.token(0));
        string id = utils::trimstring(strtok.token(1));
        pid2conf->insert(pair<int, string>(atoi(id.c_str()), conf.c_str()));
    }

    fclose(fin);

    return 0;
}

/*
 * Load a conference -> id map from a conference-map text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<conference>\t<id>". Fields are split on tab/CR/LF only and then
 * passed through utils::trimstring(). Lines that do not split into exactly
 * two fields are skipped (they still count towards the announced number of
 * entries).
 *
 * confmapfile : path of the file to read
 * pconf2id    : destination map (keyed by name); cleared before reading
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_conf2idmap(string confmapfile, mapconf2id * pconf2id) {
    pconf2id->clear();

    FILE * fin = fopen(confmapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", confmapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", confmapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nconfs = atoi(buff);

    for (int i = 0; i < nconfs; i++) {
        // A short file ends the loop; the stale buffer is never parsed twice.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, "\t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // field 0 is the conference name, field 1 its numeric id
        string conf = utils::trimstring(strtok.token(0));
        string id = utils::trimstring(strtok.token(1));
        pconf2id->insert(pair<string, int>(conf.c_str(), atoi(id.c_str())));
    }

    fclose(fin);

    return 0;
}


int dataset::write_authormap(string authormapfile, mapauthor2id * pauthor2id) {
    FILE * fout = fopen(authormapfile.c_str(), "w");
    if (!fout) {
		printf("Cannot open file %s to write!\n", authormapfile.c_str());
		return 1;
    }    
    
    mapauthor2id::iterator it;
    fprintf(fout, "%d\n", pauthor2id->size());
    for (it = pauthor2id->begin(); it != pauthor2id->end(); it++) {
		fprintf(fout, "%s\t%d\n", (it->first).c_str(), it->second);
    }
    
    fclose(fout);
    
    return 0;
}

/*
 * Load an author -> id map from an author-map text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<author>\t<id>". Fields are split on tab/CR/LF only and then
 * passed through utils::trimstring(). Lines that do not split into exactly
 * two fields are skipped (they still count towards the announced number of
 * entries).
 *
 * authormapfile : path of the file to read
 * pauthor2id    : destination map (keyed by name); cleared before reading
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_authormap(string authormapfile, mapauthor2id * pauthor2id) {
    pauthor2id->clear();

    FILE * fin = fopen(authormapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", authormapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", authormapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nauthors = atoi(buff);

    for (int i = 0; i < nauthors; i++) {
        // A short file ends the loop; the stale buffer is never parsed twice.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, "\t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // field 0 is the author name, field 1 the numeric id
        string author = utils::trimstring(strtok.token(0));
        string id = utils::trimstring(strtok.token(1));
        pauthor2id->insert(pair<string, int>(author, atoi(id.c_str())));
    }

    fclose(fin);

    return 0;
}

/*
 * Load an id -> author map from an author-map text file.
 *
 * Expected layout: the first line is the number of entries; each following
 * line is "<author>\t<id>". Fields are split on tab/CR/LF only and then
 * passed through utils::trimstring(). Lines that do not split into exactly
 * two fields are skipped (they still count towards the announced number of
 * entries).
 *
 * authormapfile : path of the file to read
 * pid2author    : destination map (keyed by id); cleared before reading
 *
 * Returns 0 on success, 1 if the file cannot be opened or its first line
 * cannot be read.
 */
int dataset::read_authormap(string authormapfile, mapid2author * pid2author) {
    pid2author->clear();

    FILE * fin = fopen(authormapfile.c_str(), "r");
    if (!fin) {
        printf("Cannot open file %s to read!\n", authormapfile.c_str());
        return 1;
    }

    char buff[BUFF_SIZE_SHORT];

    // Header line. Without this check an empty file would hand an
    // uninitialised buffer to atoi().
    if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
        printf("Cannot read entry count from file %s!\n", authormapfile.c_str());
        fclose(fin);
        return 1;
    }
    const int nauthors = atoi(buff);

    for (int i = 0; i < nauthors; i++) {
        // A short file ends the loop; the stale buffer is never parsed twice.
        if (fgets(buff, BUFF_SIZE_SHORT - 1, fin) == NULL) {
            break;
        }

        string line = buff;
        strtokenizer strtok(line, "\t\r\n");
        if (strtok.count_tokens() != 2) {
            continue;
        }

        // the file stores "<author>\t<id>"; the map is keyed the other way round
        string author = utils::trimstring(strtok.token(0));
        string id = utils::trimstring(strtok.token(1));
        pid2author->insert(pair<int, string>(atoi(id.c_str()), author));
    }

    fclose(fin);

    return 0;
}


/*
for each document, we suppose that there are three lines:
#@jie tang, jing zhang, juanzi li   //author information, the line starts with an indicator "#@"
#$2005-10-10    //time information, the line starts with an indicator "#$"
abil absenc acquisit acquisit agreem    //document content information

we assume that the author and time information are placed before document content. 
The order of the author and time cannot be reversed. should be author first and time second.
*/
int dataset::read_trndata(string dfile, string wordmapfile, string authormapfile, string confmapfile) {
    mapword2id word2id;
	mapauthor2id author2id;
	mapconf2id conf2id;
    
    FILE * fin = fopen(dfile.c_str(), "r");
    if (!fin) {
		printf("Cannot open file %s to read!\n", dfile.c_str());
		return 1;
    }   
    
    mapword2id::iterator it;    
	mapauthor2id::iterator ait;
	mapconf2id::iterator tit;
    char buff[BUFF_SIZE_LONG];
    string line;
    
	// get the number of documents
    fgets(buff, BUFF_SIZE_LONG - 1, fin);
    M = atoi(buff);
    if (M <= 0) {
		printf("No document available!\n");
		return 1;
    }
	
    
    // allocate memory for corpus
    if (docs) {
		deallocate();
    } else {
		docs = new document*[M];
    }
    
    // set number of words to zero
    V = 0;
	//set number of authors to zero
	A = 0;
	int counter=0;
	string token;
	int i=0, j; 

	document * pdoc = NULL;
    while(true){
		if(fgets(buff, BUFF_SIZE_LONG - 1, fin)==NULL)
			break;

		line = buff;
		if(line.length()<2)// && pdoc==NULL)
			continue;
		
		if(line.at(0) == '#'){//we suppose that title (first), author and time are always written before document content line
			if(line.at(1) == '*'){
				line = line.substr(2, line.length() - 2);
				// allocate new document
				pdoc = new document();
				pdoc->settitle(utils::trimstring(line));
			}else if(line.at(1) == '@'){//author, e.g. "#@jie tang, jing zhang, juanzi li"
				line = line.substr(2, line.length() - 2);
				strtokenizer authortok(line, "\,\n\t"); //maybe I will change it to ";" because of the format.
				int acount = authortok.count_tokens();
				if(acount <=0 ){
					printf("No author information in this document %d \n", counter);
					continue;//return 1;
				}
				
				pdoc->setauthorcount(acount);
				
				for(int k=0; k<acount; k++){
					token = utils::trimstring(authortok.token(k));
					ait = author2id.find(token);
					if(ait == author2id.end()){
						pdoc->authors[k] = author2id.size();						
						author2id.insert(pair<string, int>(token, author2id.size()));
					} else {
						pdoc->authors[k] = ait->second;
					}
				}
			}else if(line.at(1) == 'c'){//conference , 
				line = line.substr(2, line.length() - 3);
				if(line.length()<=0){
					continue;
				}
				token = utils::trimstring(line);

				tit = conf2id.find(token);
				if(tit == conf2id.end()){
					pdoc->confid = conf2id.size();
					conf2id.insert(pair<string, int>(token, conf2id.size()));
				} else {
					pdoc->confid = tit->second;
				}
			} else if(line.at(1) == '!'){//content
				if(pdoc==NULL){
					printf("fatal error!\n train file format error!\n");
					return 1;
				}
				if(pdoc->confid==-1 || pdoc->authorcount<=0){
					pdoc = NULL;
					delete pdoc;
					continue;
				}

				line = line.substr(2, line.length() - 2);
				line = utils::trimstring(line);
				if(pdoc->title.length()>0){
					//line = pdoc->title + " " + pdoc->title + " " + line;
					line = pdoc->title + " " + line;
				}
				strtokenizer strtok(line, " \t\r\n");
				int length = strtok.count_tokens();

				if (length <= 0) {
					printf("Invalid (empty) document!\n");
					deallocate();
					M = V = 0;
					return 1;
				}
				
				pdoc->setwordcount(length);

				for (j = 0; j < length; j++) {
					it = word2id.find(strtok.token(j));
					if (it == word2id.end()) {
						// word not found, i.e., new word
						pdoc->words[j] = word2id.size();
						word2id.insert(pair<string, int>(strtok.token(j), word2id.size()));
					} else {
						pdoc->words[j] = it->second;
					}
				}
				
				// add new doc to the corpus			
				add_doc(pdoc, i);
				i++;
				pdoc = NULL;
			} else {
				continue;
			}
		} else {
			continue;			
		}
    }
	fclose(fin);
    
	M = i;

    // write word map to file
    if (write_wordmap(wordmapfile, &word2id)) {
		return 1;
    }

	// write author map to file
    if (write_authormap(authormapfile, &author2id)) {
		return 1;
    }

	// write author map to file
    if (write_confmap(confmapfile, &conf2id)) {
		return 1;
    }
	    
    // update number of words
    V = word2id.size();
	//update number of authors
	A = author2id.size();
	//update number of confs
	C = conf2id.size();
    
    return 0;
}

int dataset::read_newdata(string dfile, string wordmapfile, string authormapfile, string confmapfile) {
    mapword2id word2id;
	mapauthor2id author2id;
	mapconf2id conf2id;
    	    
    read_wordmap(wordmapfile, &word2id);
    if (word2id.size() <= 0) {
		printf("No word map available!\n");
		return 1;
    }
	read_authormap(authormapfile, &author2id);
	if(author2id.size()<=0){
		printf("No author map available!\n");
		return 1;
	}
	read_conf2idmap(confmapfile, &conf2id);
	if(conf2id.size()<=0){
		printf("No conf map available!\n");
		return 1;
	}

    FILE * fin = fopen(dfile.c_str(), "r");
    if (!fin) {
		printf("Cannot open file %s to read!\n", dfile.c_str());
		return 1;
    }   

    mapword2id::iterator it;
	mapauthor2id::iterator ait;
	mapconf2id::iterator cit;
    map<int, int>::iterator _it;
	map<int, int>::iterator _ait;
	map<int, int>::iterator _cit;

    char buff[BUFF_SIZE_LONG];
    string line;
    
    // get number of new documents
    fgets(buff, BUFF_SIZE_LONG - 1, fin);
    M = atoi(buff);
    if (M <= 0) {
		printf("No document available!\n");
		return 1;
    }
    
    // allocate memory for corpus
    if (docs) {
		deallocate();
    } else {
		docs = new document*[M];
    }
    _docs = new document*[M];
    
    // set number of words to zero
    V = 0;
	A = 0;
	C = 0;

	int counter=0;
	string token;	
    int i=0, j; 
	// allocate memory for new doc
	document * pdoc; //= new document(doc);
	document * _pdoc; //= new document(_doc);
    
	while(true){
		if(fgets(buff, BUFF_SIZE_LONG - 1, fin)==NULL)
			break;
	
		line = buff;

		if(line.length()<2)
			continue;

		if(line.at(0) == '#'){//we suppose that title (first), author and time are always written before document content line
			if(line.at(1) == '*'){
				line = line.substr(2, line.length() - 2);
				// allocate new document
				pdoc = new document();
				_pdoc = new document();
				pdoc->settitle(utils::trimstring(line));		
			}else if(line.at(1) == '@'){//author, e.g. "#@jie tang, jing zhang, juanzi li"
				line = line.substr(2, line.length() - 2);
				strtokenizer authortok(line, "\,\n\t"); //maybe I will change it to ";" because of the format.
				int acount = authortok.count_tokens();
				if(acount <=0 ){
					printf("No author information in this document %d \n", counter);
					return 1;
				}
				
				pdoc->setauthorcount(acount);
				_pdoc->setauthorcount(acount);
				
				int l=0;
				for(int k=0; k<acount; k++){
					token = utils::trimstring(authortok.token(k));
					ait = author2id.find(token);
					if(ait == author2id.end()){
						printf("author not exist\n");
						pdoc->authorcount--;
						_pdoc->authorcount--;
						continue;
						//did nothing
						//the author is not in our data set
					} else {
						int _aid;						

						_ait = aid2_aid.find(ait->second);
						if (_ait == aid2_aid.end()) {
							_aid = aid2_aid.size();
							aid2_aid.insert(pair<int, int>(ait->second, _aid));
							_aid2aid.insert(pair<int, int>(_aid, ait->second));
						} else {
							_aid = _ait->second;
						}
						
						pdoc->authors[l] = ait->second;
						_pdoc->authors[l] = _aid;				
						l++;
					}
				}
			}else if(line.at(1) == 'c'){//time , e.g., "#$2005-10-10"
				line = line.substr(2, line.length() - 3);				
				token = utils::trimstring(line);
				
				cit = conf2id.find(token);
				if(cit == conf2id.end()){
					printf("conference not here\n");
					continue;
					//return 1;
					//did nothing
					
				}else{
					int _cid;
					_cit = cid2_cid.find(cit->second);
					if(_cit == cid2_cid.end()){
						_cid = cid2_cid.size();
						cid2_cid.insert(pair<int, int>(cit->second, _cid));
						_cid2cid.insert(pair<int, int>(_cid, cit->second));
					} else {
						_cid = _cit->second;					
					}
					_pdoc->confid = _cid;
					pdoc->confid = cit->second;
				}
			} else if(line.at(1) == '!'){//content
				if(pdoc==NULL){
					printf("fatal error!\n train file format error!\n");
					return 1;
				}
				if(pdoc->confid==-1 || pdoc->authorcount<=0){
					pdoc = NULL;
					_pdoc = NULL;
					delete pdoc;
					delete _pdoc;
					continue;
				}

				line = line.substr(2, line.length() - 2);
				line = utils::trimstring(line);
				if(pdoc->title.length()>0){
					line = pdoc->title + " " + line;
				}
				
				strtokenizer strtok(line, " \t\r\n");
				int length = strtok.count_tokens();
		
				vector<int> doc;
				vector<int> _doc;
				for (j = 0; j < length; j++) {
					it = word2id.find(strtok.token(j));
					if (it == word2id.end()) {
						printf("word not exist\n");
						continue;
						// word not found, i.e., word unseen in training data
						// do anything? (future decision)
					} else {
						int _id;
						_it = id2_id.find(it->second);
						if (_it == id2_id.end()) {
							_id = id2_id.size();
							id2_id.insert(pair<int, int>(it->second, _id));
							_id2id.insert(pair<int, int>(_id, it->second));
						} else {
							_id = _it->second;
						}
					
						doc.push_back(it->second);
						_doc.push_back(_id);
					}
				}
				pdoc->setwords(doc);
				_pdoc->setwords(_doc);
			
			
				// add new doc
				if(pdoc->authorcount>0 && pdoc->length>0 && pdoc->confid>=0){
					add_doc(pdoc, i);
					_add_doc(_pdoc, i);
					i++;
				}else{
					M--;
				}
				pdoc = NULL;
				_pdoc = NULL;
			} else{
				continue;	
			}
		}else{
			continue;
		}
	}
    
    fclose(fin);
    M = i;
    // update number of new words
    V = id2_id.size();
	A = aid2_aid.size();
	C = cid2_cid.size();
    
    return 0;
}

int dataset::read_newdata_withrawstrs(string dfile, string wordmapfile) {
    mapword2id word2id;
    map<int, int> id2_id;
    
    read_wordmap(wordmapfile, &word2id);
    if (word2id.size() <= 0) {
	printf("No word map available!\n");
	return 1;
    }

    FILE * fin = fopen(dfile.c_str(), "r");
    if (!fin) {
	printf("Cannot open file %s to read!\n", dfile.c_str());
	return 1;
    }   

    mapword2id::iterator it;
    map<int, int>::iterator _it;
    char buff[BUFF_SIZE_LONG];
    string line;
    
    // get number of new documents
    fgets(buff, BUFF_SIZE_LONG - 1, fin);
    M = atoi(buff);
    if (M <= 0) {
	printf("No document available!\n");
	return 1;
    }
    
    // allocate memory for corpus
    if (docs) {
	deallocate();
    } else {
	docs = new document*[M];
    }
    _docs = new document*[M];
    
    // set number of words to zero
    V = 0;
    
    for (int i = 0; i < M; i++) {
	fgets(buff, BUFF_SIZE_LONG - 1, fin);
	line = buff;
	strtokenizer strtok(line, " \t\r\n");
	int length = strtok.count_tokens();
	
	vector<int> doc;
	vector<int> _doc;
	for (int j = 0; j < length - 1; j++) {
	    it = word2id.find(strtok.token(j));
	    if (it == word2id.end()) {
		// word not found, i.e., word unseen in training data
		// do anything? (future decision)
	    } else {
		int _id;
		_it = id2_id.find(it->second);
		if (_it == id2_id.end()) {
		    _id = id2_id.size();
		    id2_id.insert(pair<int, int>(it->second, _id));
		    _id2id.insert(pair<int, int>(_id, it->second));
		} else {
		    _id = _it->second;
		}
		
		doc.push_back(it->second);
		_doc.push_back(_id);
	    }
	}
	
	// allocate memory for new doc
	document * pdoc = new document(doc, line);
	document * _pdoc = new document(_doc, line);
	
	// add new doc
	add_doc(pdoc, i);
	_add_doc(_pdoc, i);
    }
    
    fclose(fin);
    
    // update number of new words
    V = id2_id.size();
    
    return 0;
}

