/*
 * main.cpp
 * classifier
 *
 * daniel wojcik
 *
 */

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <cstring>
#include <cstdlib>
#include <cctype>
#include "algorithms.h"

/*std::map<std::string, termstat> gterms;
std::list<docitem> docs;
std::map docclasses;	//maps class name to its per-class stats (type in algorithms.h)
std::map clusters;	//maps cluster id to its stats (type in algorithms.h)
unsigned int doccount;
unsigned int termcount;*/

globals global;
unsigned int ndocs;
bool learning = false;
unsigned int error;
unsigned int rate, crate;
double sse, misses;

//classification & clustering parameters go here

//void stats();
//void classify(docitem& dcmt);
//void cclassify(docitem& dcmt);
//void characterizeclasses();
//void computeweights();
//void updateweights(docitem* dcmt);
//void cluster();
//void cull();
void save();
void saveclass();
void load();
void loadclass();
docitem* getdoc(int n);
void top20();
//void classprint();
//void clusterprint();

int main (int argc, char** const argv)
{
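	//command line, as parsed below: [-n] [-r rate] [-l | -lc | -c] file...
	//  -n   start with an empty term database (skip loading "brain")
	//  -r   documents to process between incremental saves / re-clustering
	//  -l   learn from documents named on the command line
	//  -lc  learn from documents listed inside the named file (flag name assumed, see parsing below)
	//  -c   classify documents listed inside the named file
	//  with no mode flag, documents named on the command line are classified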
	std::string str, str2;
	global.doccount = 0;
	global.termcount = 0;
	ndocs = 0;
	error = 0;
	misses = 0;
	sse = 0;
	rate = 16;
	crate = 0;
	
	if (argc > 1)
	{
		int d = 2;
		bool read = false, ndb = false;
		if (strcmp(argv[1], "-n") == 0)
		{
			ndb = true;
			d++;
		}
		if (strcmp(argv[d-1], "-r") == 0)
		{
			rate = atoi(argv[d++]);
			d++;
		}
		if (strcmp(argv[d-1], "-l") == 0)
			learning = true;
		else if (strcmp(argv[d-1], "-lc") == 0) //assumed flag; checking "-l" again here would leave this branch unreachable
		{
			read = true;
			learning = true;
		}
		else if (strcmp(argv[d-1], "-c") == 0)
			read = true;
		else
			d = 1;
		//check for -o outfilename, default to something?
		
		if (!ndb)
			load();
		
		while (d < argc)
		{
			std::ifstream argument(argv[d]);
			if (!argument)
				break; //error recovery goes here
			
			while (!argument.eof())
			{
				std::ifstream document;
				str2 = "";
				if (read)
				{
					argument >> str2;
					std::cout << "new file : " << str2 << "\n";
					document.open(str2.c_str());
					if (!document)
					{
						std::cout << "fail\n";
						continue;
					}
				}
				else
					document.open(argv[d]);
				
				docitem dcmt = docitem();
				//docclasses.clear();
			
				//needs to update global counts manually now
				while (!document.eof())
				{
					str = "";
					document >> str;
					
					for (unsigned int i = 0; i < str.size(); i++)
					{
						if (isalpha(str[i]))
							str[i] = tolower(str[i]);
					}
					//std::cout << str << "\n";
					
					//check for meta data
					if (str.compare("> str;
							if (category == 0 && str.compare("name='dc.date'") == 0)
							{
								std::string str1;
								document >> str1;
								str = str1.substr(9,4);
								dcmt.realclass[0] = str;//atoi(str.c_str());
								break;
							}
							else if (category == 1 && str.compare("name='dc.subject'") == 0)
							{
								std::string str1;
								document >> str1;
								str = str1.substr(9,str1.size()-11);
								dcmt.realclass[0] = str;
								break;
							}
							else if (str[str.size()-1] == '>')
								break;
						}
					}
					
					//update the global term table: total count, and document frequency
					//the first time this document contributes the term
					std::map<std::string, termstat>::iterator itr;
					//itr = find(gterms.begin(), gterms.end(), str);
					itr = global.gterms.find(str);
					if (itr != global.gterms.end())
					{
						if (dcmt.getcount(itr->first) == 0)
						{
							dcmt.addterm(itr->first);
							//update inverted index
							itr->second.dcount++;
							itr->second.count++;
							//itr->second.invindex.push_back(doccount);
						}
						else
						{
							dcmt.increment(str);
							itr->second.count++;
						}
					}
					else
					{
						global.gterms[str].count = 1;
						global.gterms[str].dcount = 1;
						//gterms[str].invindex.push_back(doccount);
						dcmt.addterm(str);
					}
					global.termcount++;
				}
				
				//if (!read)
					document.close();
				global.doccount++;
				ndocs++;
				
				if (learning)
				{
					if (read)
					{
						//argument >> dcmt.classification;
						dcmt.classification[0] = dcmt.realclass[0];
					}
					else
					{
						//dcmt.classification = atoi(argv[++d]);
						dcmt.classification[0] = dcmt.realclass[0];
					}
				}
				else
					classify(dcmt, global); //doing it this way means order of documents matters,
									//but otherwise would update weights with no class.
				updateweights(&dcmt, global);
				crate++;
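				//every `rate` documents, rebuild class profiles, re-cluster, and
				//checkpoint the database (deferred to the end when learning)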
				if (!learning && crate >= rate) //wait until the end to do this when learning
				{
					characterizeclasses(global);
					cluster(global);
					save();
					saveclass();
					crate = 0;
				}
				//global.docs.push_back(dcmt);
				
				//some variance checking, mostly only good for date stuff.
				//need to get clustering working to automatically set up date range classes.
				if (category == 0)
				{
					int e = atoi(dcmt.classification[0].c_str());
					e-= atoi(dcmt.realclass[0].c_str());
					//unsigned int e = abs(dcmt.classification[0] - dcmt.realclass[0]);
					error+= abs(e);
				}
				if (dcmt.classification[0] != dcmt.realclass[0])
				{
					//partial miss: the penalty grows the further down the ranked
					//guesses the true class appears
					double denom = classtypes;
					for (unsigned int i = 1; i < classtypes; i++)
					{
						if (dcmt.classification[i] == dcmt.realclass[0])
							break;
						else
							denom--;
					}
					misses+= 1/denom;
				}
				
				if (!read)
					break;	//in direct mode each argument is a single document, so process it once
			}
			argument.close();
			
			d++;
		}
	}
	else
		return 1;
	
	if (category == 0)
		std::cout << "classification error: " << error/(float)ndocs << "\n";
	std::cout << "classification accuracy: " << misses << " misses in " << ndocs;
	std::cout << " documents, " << (ndocs - misses)/(float)ndocs*100 << "%\n";
	characterizeclasses(global);
	//top20();
	//classprint();
	cluster(global);
	save();
	saveclass();
	
    return 0;
}

void top20()
{
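	//print roughly the 20 most frequent terms; the replacement scan below is
	//first-fit rather than keeping a strict top-20, so the list is approximate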
	std::cout << "top 20\n";
	std::pair<std::string, termstat> topterms[20];
	unsigned int terms = 0;
	
	std::map<std::string, termstat>::iterator itr = global.gterms.begin();
	while (itr != global.gterms.end())
	{
		termstat term = itr->second;
		//if (term.idf < minidf || term.count < minkeep*(doccount/supportscale))
		//	gterms.erase(itr++);
		//else
		{
			if (terms < 20)
			{
				std::pair<std::string, termstat> p;
				p.first = itr->first;
				p.second = itr->second;
				topterms[terms] = p;
				terms++;
			}
			else
			{
				for (unsigned int i = 0; i < terms; i++)
				{
					if (itr->second.count > topterms[i].second.count)
					{
						std::pair<std::string, termstat> p;
						p.first = itr->first;
						p.second = itr->second;
						topterms[i] = p;
						break;
					}
				}
			}
			itr++;
		}
	}
	
	for (unsigned int i = 0; i < terms; i++)
	{
		std::cout << topterms[i].first << " " << topterms[i].second.count << 
			" " << topterms[i].second.idf << "\n";
	}
}

//write current state of classification knowledge to a file so
//that it can be loaded later. useful for incremental updates.
void save()
{
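	//"brain" format: "= doccount", "? termcount", then for each kept term a
	//"+ term count dcount idf" line followed by "- class count" lines;
	//terms below the support/idf cutoffs are pruned here instead of written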
	std::cout << "saving\n";
	std::ofstream docout("brain");
	if (!docout)
		return;
	docout<< "= " << global.doccount << "\n";
	docout<< "? " << global.gterms.size() << "\n";
	
	std::map<std::string, termstat>::iterator itr = global.gterms.begin();
	
	while (itr != global.gterms.end())
	{
		termstat term = itr->second;
		if (term.count >= minkeep*(global.doccount/(float)supportscale) && term.idf >= minidf)
		{
			docout<< "+ "<< itr->first<< " "<< term.count<< " "<< term.dcount<< " "<< term.idf<< "\n";
		
			auto witr = term.ccounts.begin();	//per-class counts for this term (mapped type in algorithms.h)
			while (witr != term.ccounts.end())
			{
				docout<< "- "<< witr->first<< " "<< witr->second << "\n";
				witr++;
			}
			
			itr++;
		}
		else
		{
			global.gterms.erase(itr++);
		}
		//itr++;
	}
	docout.close();
}

void saveclass()
{
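	//"characteristics" format: "= classcount", then per class a
	//"+ class termsize dcount cluster point" line followed by one
	//"- term count dcount idf weight" line per characteristic term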
	std::cout << "saving classes\n";
	std::ofstream docout("characteristics");
	if (!docout)
		return;
	docout<< "= " << global.docclasses.size() << "\n";
	
	auto itr = global.docclasses.begin();	//class records (mapped type in algorithms.h)
	while (itr != global.docclasses.end())
	{
		docout<< "+ " << itr->first << " " << itr->second.termsize << " ";
		docout<< itr->second.dcount << " " << itr->second.cluster << " ";
		docout<< itr->second.point << "\n";
		
		std::map<std::string, termstat>::iterator titr = itr->second.charterms.begin();
		while (titr != itr->second.charterms.end())
		{
			docout << "- " << titr->first << " " << titr->second.count << " ";
			docout << titr->second.dcount << " " << titr->second.idf << " ";
			docout << titr->second.count * titr->second.idf << "\n";
			titr++;
		}
		docout << "\n";
		itr++;
	}
	
	docout.close();
}

void load()
{
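	//rebuild the term database from "brain": '=' lines add to the document count,
	//'+' lines restore a term's statistics, '-' lines attach per-class counts
	//to the most recently read term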
	std::ifstream infile("brain");
	if (!infile)
		return;
	
	std::string term = "";
	unsigned int dcount, tcount;
	std::string classf;
	float idf;
	char x;
	
	while (!infile.eof())
	{
		infile >> x;
		if (x == '+')
		{
			infile >> term >> tcount >> dcount >> idf;
			global.gterms[term].count = tcount;
			global.gterms[term].dcount = dcount;
			global.gterms[term].idf = idf;
		}
		else if (x == '-')
		{
			infile >> classf >> tcount;
			global.gterms[term].ccounts[classf] = tcount;
			global.docclasses[classf].seen = true;
		}
		else if (x == '=')
		{
			infile >> tcount;
			global.doccount+= tcount;
		}
	}
	infile.close();
	loadclass();
	//characterizeclasses();
}

void loadclass()
{
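	//rebuild class profiles from "characteristics": '+' lines restore a class
	//record and fold its point into the running cluster mean, '-' lines restore
	//its characteristic terms; clustering is then re-run from the loaded state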
	std::ifstream infile("characteristics");
	if (!infile)
		return;
	
	std::string term = "";
	unsigned int count, dcount, tcount, size, cls;
	double p;
	std::string classf;
	float idf, w;
	char x;
	
	while (!infile.eof())
	{
		infile >> x;
		if (x == '+')
		{
			infile >> classf >> size >> tcount >> cls >> p;
			global.docclasses[classf].termsize = size;
			global.docclasses[classf].dcount = tcount;
			global.docclasses[classf].cluster = cls;
			global.docclasses[classf].point = p;
			global.docclasses[classf].seen = true;
			
			//fold this class's point into its cluster's running mean,
			//creating the cluster entry the first time it is seen
			if (global.clusters.count(cls) != 0)
			{
				double np = global.clusters[cls].meanpoint * global.clusters[cls].count + p;
				global.clusters[cls].count++;
				global.clusters[cls].meanpoint = np/global.clusters[cls].count;
			}
			else
			{
				global.clusters[cls].count = 1;
				global.clusters[cls].meanpoint = p;
			}
		}
		else if (x == '-')
		{
			infile >> term >> count >> dcount >> idf >> w;
			global.docclasses[classf].charterms[term].count = count;
			global.docclasses[classf].charterms[term].dcount = dcount;
			global.docclasses[classf].charterms[term].idf = idf;
		}
	}
	infile.close();
	
	cluster(global);
}

//end of file~