/*
 *  algorithms.cpp
 *  classifier
 *
 *  daniel wojcik
 *
 */

#include "algorithms.h"

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <list>
#include <map>
#include <queue>
#include <string>
#include <utility>

//scoring function for a given term
//was dcount, now requires normalization by
//object-level term counts in caller.
double termtodouble(termshort& old)
{
	return scalar * old.idf * (old.count);// /(double)old.dcount);
}
double termtodouble(termstat& old)
{
	return scalar * old.idf * (old.count);// /(double)old.dcount);
}
double termtodouble(termstat& old, std::string cls)
{
	return scalar * old.idf * (old.ccounts[cls]);// / (double)old.dcount);
}


//dispatcher: routes the document to the classifier selected by
//the global clusmet setting (0=centroid, 1=nearest, 2=svm, 3=ejc).
void classify(docitem& dcmt, globals& global)
{
	if (clusmet == 0)
		cclassify(dcmt, global);
	else if (clusmet == 1)
		nclassify(dcmt, global);
	else if (clusmet == 2)
		svmclassify(dcmt, global);
	else if (clusmet == 3)
		ejcclassify(dcmt, global);
}

//dispatcher: runs the clustering pass for the selected method.
//methods 0 and 3 have no clustering step, so they fall through.
void cluster(globals& global)
{
	if (clusmet == 1)
		nearcluster(global);
	else if (clusmet == 2)
		svmcluster(global);
}

//update global statistics after a document has been classified.
//bumps the document count of the document's primary class and, for
//every known term, refreshes its idf and folds the document's term
//occurrences into that class's per-class counts.
//(restored the map's template arguments, which were lost from this
//file; gterms maps term string -> termstat per its usage here.)
void updateweights(docitem* dcmt, globals& global)
{
	std::string classf = dcmt->classification[0];
	if (global.docclasses.find(classf) == global.docclasses.end())
	{
		//first document of this class: operator[] default-constructs the entry
		global.docclasses[classf].seen = true;
		global.docclasses[classf].dcount = 1;
	}
	else
		global.docclasses[classf].dcount++;
	
	std::map<std::string, termstat>::iterator itr = global.gterms.begin();
	while (itr != global.gterms.end())
	{
		//idf = log10(N / df); assumes dcount >= 1 for every stored term
		itr->second.idf = log10((float)global.doccount / itr->second.dcount);
		itr->second.ccounts[classf]+= dcmt->getcount(itr->first);
		
		++itr;
	}
}

//rebuild each class's characteristic-term list: the top `topk`
//terms ranked by the class-conditional tf-idf score, after first
//culling low-value terms globally.
//NOTE(review): the map iterator declarations below lost their
//template arguments (extraction damage). From usage, gterms maps
//term string -> termstat and charterms maps term string -> termshort;
//the docclasses mapped type is declared elsewhere — restore from
//algorithms.h.
void characterizeclasses(globals& global)
{
	cull(global);
	std::cout<< "characterizing\n";
	
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		std::string classf = itr->first;
		//reset this class's characterization before rebuilding it
		itr->second.termsize = 0;
		itr->second.termcount = 0;
		itr->second.charterms.clear();
		
		std::map::iterator titr = global.gterms.begin();
		while (titr != global.gterms.end())
		{
			termstat term = titr->second;
			if (itr->second.termsize < topk)
			{
				//still room in the top-k list: take the term unconditionally
				itr->second.charterms[titr->first].count = term.ccounts[classf];
				itr->second.charterms[titr->first].dcount = term.dcount;
				itr->second.charterms[titr->first].idf = term.idf;
				itr->second.termsize++;
				itr->second.termcount+= term.ccounts[classf];
			}
			else
			{
				//list is full: find the weakest kept term...
				termshort min;
				min.count = 0;
				min.dcount = 0;
				min.idf = 0.0;
				std::string minstr;
				
				std::map::iterator citr = itr->second.charterms.begin();
				while (citr != itr->second.charterms.end())
				{
					double minscore = termtodouble(min);
					double curscore = termtodouble(citr->second);
					//dcount == 0 marks `min` as not yet initialized
					if (min.dcount == 0 || curscore < minscore)
					{
						min.count = citr->second.count;
						min.idf = citr->second.idf;
						min.dcount = citr->second.dcount;
						minstr = citr->first;
					}
					citr++;
				}
				
				//...and replace it when the candidate scores higher.
				//NOTE(review): termcount is not adjusted on replacement, so
				//it only reflects the first topk terms seen — confirm intended.
				double minscore = termtodouble(min);
				double curscore = termtodouble(term,classf);
				if (minscore < curscore)
				{
					itr->second.charterms.erase(minstr);
					itr->second.charterms[titr->first].count = term.ccounts[classf];
					itr->second.charterms[titr->first].dcount = term.dcount;
					itr->second.charterms[titr->first].idf = term.idf;
				}
			}
			titr++;
		}
		itr++;
	}
}

//generate characteristic terms of the document: keep the topk
//globally-known terms with the highest tf-idf score for this
//document. gterms carries full termstat records while the document
//keeps the lighter termshort records, hence the field-by-field
//copies. (restored the iterator template arguments that were lost
//from this file; both map from the term string, per usage.)
void characterizedocument(docitem& dcmt, globals& global)
{
	dcmt.charterms.clear();
	unsigned int termsize = 0;
	
	std::map<std::string, termstat>::iterator titr = global.gterms.begin();
	while (titr != global.gterms.end())
	{
		//candidate record for this term as it appears in the document
		termshort term;
		term.count = dcmt.getcount(titr->first);
		term.dcount = 1; //a single document: document frequency is 1
		term.idf = titr->second.idf;
		
		if (termsize < topk)
		{
			//room left in the top-k list: keep unconditionally
			dcmt.charterms[titr->first].count = term.count;
			dcmt.charterms[titr->first].dcount = term.dcount;
			dcmt.charterms[titr->first].idf = term.idf;
			termsize++;
		}
		else
		{
			//list is full: locate the lowest-scoring kept term
			termshort min;
			min.count = 0;
			min.dcount = 0;
			min.idf = 0.0;
			std::string minstr;
			
			std::map<std::string, termshort>::iterator citr = dcmt.charterms.begin();
			while (citr != dcmt.charterms.end())
			{
				double minscore = termtodouble(min);
				double curscore = termtodouble(citr->second);
				//dcount == 0 flags `min` as not yet initialized
				if (min.dcount == 0 || curscore < minscore)
				{
					min.count = citr->second.count;
					min.idf = citr->second.idf;
					min.dcount = citr->second.dcount;
					minstr = citr->first;
				}
				citr++;
			}
			
			//replace the weakest kept term if the candidate beats it
			double minscore = termtodouble(min);
			double curscore = termtodouble(term);
			if (minscore < curscore)
			{
				dcmt.charterms.erase(minstr);
				dcmt.charterms[titr->first].count = term.count;
				dcmt.charterms[titr->first].dcount = term.dcount;
				dcmt.charterms[titr->first].idf = term.idf;
			}
		}
		titr++;
	}
}

//centroid-style classification: score the document against every
//class's characteristic terms and assign the best-scoring class.
//if too far away, make new class? doesn't do this currently, but
//should if clustering is enabled.
void cclassify(docitem& dcmt, globals& global)
{
	//TODO(review): template arguments were lost from this declaration;
	//docclasses maps class name -> per-class record (see algorithms.h).
	std::map::iterator itr = global.docclasses.begin();
	std::string classf = "-1";
	double maxw = -1;
	
	while (itr != global.docclasses.end())
	{
		double w = 0;
		std::map<std::string, termshort>::iterator citr = itr->second.charterms.begin();
		while (citr != itr->second.charterms.end())
		{
			//weight function: idf-weighted gap between the term's relative
			//frequency in the class and in the document.
			//currently set to use normalization by term, which doesn't
			//work for this method!
			double cw = citr->second.count / (double)itr->second.termcount;
			double dw = dcmt.getcount(citr->first) / (double)dcmt.termcount;
			//std::fabs: the plain abs() the original called can bind to the
			//integer overload and truncate the fractional gap to zero.
			w+= scalar * citr->second.idf * std::fabs(cw - dw);
			citr++;
		}
		//NOTE(review): w accumulates |cw - dw| (a dissimilarity), yet the
		//LARGEST w wins below — verify max rather than min is intended.
		if (classf.compare("-1") == 0 || maxw < w)
		{
			classf = itr->first;
			maxw = w;
		}
		itr++;
	}

	dcmt.classification[0] = classf;
	
	std::cout << "document : " << dcmt.classification[0] << " " << maxw << "\n";
	std::cout << "real class : " << dcmt.realclass[0] << "\n";
}

//nearest-class classification with secondary assignments: scores the
//document against every class (as in cclassify), then pulls the next
//closest classes from the winning class's cluster, filling
//dcmt.classification with up to classtypes entries sorted by score.
void nclassify(docitem& dcmt, globals& global)
{
	//TODO(review): template arguments were lost from this declaration;
	//docclasses maps class name -> per-class record (see algorithms.h).
	std::map::iterator itr = global.docclasses.begin();
	std::string classf = "-1";
	double maxw = -1;
	
	while (itr != global.docclasses.end())
	{
		double w = 0;
		std::map<std::string, termshort>::iterator citr = itr->second.charterms.begin();
		while (citr != itr->second.charterms.end())
		{
			//weight function (see cclassify): idf-weighted gap of relative
			//term frequencies. currently uses normalization by term, which
			//doesn't work for this method!
			double cw = citr->second.count / (double)itr->second.termcount;
			double dw = dcmt.getcount(citr->first) / (double)dcmt.termcount;
			//std::fabs: plain abs() may bind to the integer overload
			w+= scalar * citr->second.idf * std::fabs(cw - dw);
			citr++;
		}
		if (classf.compare("-1") == 0 || maxw < w)
		{
			classf = itr->first;
			maxw = w;
		}
		itr++;
	}

	dcmt.classification[0] = classf;
	std::pair<std::string, double> classes[classtypes];
	classes[0].first = classf;
	classes[0].second = maxw;
	unsigned int t = 1;
	
	//next closest classes come from the winning class's cluster
	cluster cls = global.clusters[global.docclasses[classf].cluster];
	std::list<std::string>::iterator ditr = cls.classes.begin();
	while (ditr != cls.classes.end())
	{
		std::string c = *ditr;
		if (c.compare(classf) == 0)
		{
			//BUG FIX: the original bare `continue` skipped the ditr++ at
			//the bottom of the loop and spun forever on the winning class.
			ditr++;
			continue;
		}
		double w = 0;
		std::map<std::string, termshort>::iterator citr = global.docclasses[c].charterms.begin();
		while (citr != global.docclasses[c].charterms.end())
		{
			//same weight function as the primary pass above
			double cw = citr->second.count / (double)global.docclasses[c].termcount;
			double dw = dcmt.getcount(citr->first) / (double)dcmt.termcount;
			w+= scalar * citr->second.idf * std::fabs(cw - dw);
			citr++;
		}
		
		//bubble the candidate into the score-sorted classes[] array;
		//slot 0 is reserved for the primary class.
		unsigned int i = 1;
		while (i < t && i < classtypes)
		{
			if (w > classes[i].second)
			{
				std::string temp1 = classes[i].first;
				double temp2 = classes[i].second;
				classes[i].first = c;
				classes[i].second = w;
				c = temp1;
				w = temp2;
			}
			i++;
		}
		if (i < classtypes)
		{
			classes[i].first = c;
			classes[i].second = w;
			t++;
		}
		
		ditr++;
	}
	//assign classes to document
	for (unsigned int i = 0; i < t; i++)
	{
		dcmt.classification[i] = classes[i].first;
		std::cout<< "assigned class [" << i << "] : ";
		std::cout<< classes[i].first << " : " << classes[i].second;
		std::cout<< " : " << dcmt.realclass[0] << "\n";
	}
}

//do clustering on the classes.
//allows classification based on computed class data rather than just
//computing the class from item data; also enables program-defined
//classes and superclasses. slow, so only done when saving data to
//file. could be sped up by only focusing on characteristic words for
//each document, but finding those could be tricky.
//
//three phases: (1) k-nearest-neighbor search over classes using an
//idf-weighted L1 distance on characteristic terms, (2) BFS over the
//neighbor graph to assign cluster ids (collecting pairs of ids that
//turn out to be the same cluster), (3) optional merge of those pairs.
//
//NOTE(review): many declarations below lost their template arguments
//(extraction damage) and L "std::cout<first" lost its "<< itr->"
//operators; restore against algorithms.h. charterms iterators are
//term -> termshort; bfsq holds class-name strings; mergel holds pairs
//of cluster ids.
void nearcluster(globals& global)
{
	cull(global);
	std::cout<<"clustering\n";
	//should do a relevancy check as it goes along.
	//probably the best place to cull excess terms
	//from both the item and class lists.
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		unsigned int k = 0;
		std::cout<first <<"\n";
		itr->second.point = 0;
		bool findpoint = false;
		
		std::map::iterator citr = global.docclasses.begin();
		while (citr != global.docclasses.end())
		{
			if (citr != itr)
			{
				//l1 distance metric
				std::map::iterator titr = itr->second.charterms.begin();
				double d = 0;
				while (titr != itr->second.charterms.end())
				{
					//NOTE(review): findpoint is false only on the first citr
					//pass, so point is accumulated once per REMAINING pass —
					//i.e. (passes - 1) times over the same terms. Looks like
					//it was meant to run exactly once; confirm.
					if (findpoint)
						itr->second.point+= titr->second.count * titr->second.idf;
						
					std::map::iterator titr2 = citr->second.charterms.find(titr->first);
					if (titr2 != citr->second.charterms.end())
					{
						double c1 = titr->second.count / (double)itr->second.termcount;
						double c2 = titr2->second.count / (double)citr->second.termcount;
						d+= abs(c1 - c2) * titr->second.idf;
					}
					else
					{
						//some high value to weight against terms uncommon to them.
						d+= penalty;
					}
					titr++;
				}
				findpoint = true;
				
				if (k < neark)
				{
					//still filling the neighbor list; only accept if close enough.
					//NOTE(review): if fewer than neark classes pass the maxd
					//test, neighbors[] stays partially uninitialized but is
					//read up to neark entries in the BFS phase — confirm.
					if (d < maxd)
					{
						std::pair p;
						p.first = citr->first;
						p.second = d;
						itr->second.neighbors[k++] = p;
					}
				}
				else
				{
					//list full: replace the farthest neighbor if this one is closer
					int r = 0;
					double rd = itr->second.neighbors[r].second;
					for (unsigned int i = 1; i < k; i++)
					{
						if (rd < itr->second.neighbors[i].second)
						{
							r = i;
							rd = itr->second.neighbors[i].second;
						}
					}
					if (rd > d)
					{
						itr->second.neighbors[r].second = d;
						itr->second.neighbors[r].first = citr->first;
					}
				}
			}
			citr++;
		}
		itr->second.cluster = 0;	//0 = "not yet clustered"
		itr++;
	}
	
	//bfs search to determine the clusters
	std::cout<<"clustering2\n";
	itr = global.docclasses.begin();
	unsigned int cls = 1;
	std::queue bfsq;
	std::list > mergel;
	
	while (itr != global.docclasses.end())
	{
		//bfs: skip classes already swept into a cluster
		if (itr->second.cluster != 0)
		{
			itr++;
			continue;
		}
		
		itr->second.cluster = cls;
		//seed the queue with this class's neighbors; neighbors already in
		//another cluster mean cls and that cluster should be merged later.
		for (unsigned int i = 0; i < neark; i++)
		{
			std::string c = itr->second.neighbors[i].first;
			if (global.docclasses[c].cluster != 0)
			{
				//record the (cls, other-cluster) merge pair, chaining through
				//any existing pair whose first id is cls.
				std::pair p;
				std::list >::iterator mitr = mergel.begin();
				while (mitr != mergel.end())
				{
					if (mitr->first == cls)
					{
						if (mitr->second == global.docclasses[c].cluster)
							break;
						else
						{
							p.first = global.docclasses[c].cluster;
							p.second = mitr->second;
							mergel.push_back(p);
							break;
						}
					}
					mitr++;
				}
				if (mitr == mergel.end())
				{
					p.first = cls;
					p.second = global.docclasses[c].cluster;
					mergel.push_back(p);
				}
			}
			else
				bfsq.push(c);
		}
		while (!bfsq.empty())
		{
			std::string c = bfsq.front();
			bfsq.pop();
			if (global.docclasses[c].cluster != 0)
			{
				//already claimed by some cluster: record a merge pair
				//(same bookkeeping as above).
				std::pair p;
				std::list >::iterator mitr = mergel.begin();
				while (mitr != mergel.end())
				{
					if (mitr->first == cls)
					{
						if (mitr->second == global.docclasses[c].cluster)
							break;
						else
						{
							p.first = global.docclasses[c].cluster;
							p.second = mitr->second;
							mergel.push_back(p);
							break;
						}
					}
					mitr++;
				}
				if (mitr == mergel.end())
				{
					p.first = cls;
					p.second = global.docclasses[c].cluster;
					mergel.push_back(p);
				}
				continue;
			}
			
			//fold this class into cluster cls's running means
			double newpoint = global.clusters[cls].meanpoint * global.clusters[cls].count + global.docclasses[c].point;
			double newclass = global.clusters[cls].meanclass * global.clusters[cls].count;// + c;
			global.clusters[cls].count++;
			global.clusters[cls].meanpoint = newpoint / global.clusters[cls].count;
			global.clusters[cls].meanclass = newclass /global. clusters[cls].count;
			global.docclasses[c].cluster = cls;
			for (unsigned int i = 0; i < neark; i++)
				bfsq.push(global.docclasses[c].neighbors[i].first);
		}
		
		cls++;
		itr++;
	}
	
	if (mergeclusters)
	{
		//phase 3: collapse each recorded pair (first -> second), combining
		//the running means weighted by member counts.
		std::cout<<"clustering3\n";
		while (!mergel.empty())
		{
			std::pair p = mergel.back();
			mergel.pop_back();
			
			double newpoint = global.clusters[p.second].meanpoint * global.clusters[p.second].count;
			newpoint+= global.clusters[p.first].meanpoint * global.clusters[p.first].count;
			double newclass = global.clusters[p.second].meanclass * global.clusters[p.second].count;
			newclass+= global.clusters[p.first].meanclass * global.clusters[p.first].count;
			global.clusters[p.second].count+= global.clusters[p.first].count;
			global.clusters[p.second].meanpoint = newpoint / global.clusters[p.second].count;
			global.clusters[p.second].meanclass = newclass / global.clusters[p.second].count;
			
			//repoint every class that belonged to the absorbed cluster
			itr = global.docclasses.begin();
			while (itr != global.docclasses.end())
			{
				if (itr->second.cluster == p.first)
					itr->second.cluster = p.second;
				itr++;
			}
			
			global.clusters.erase(p.first);
		}
	}
	
	//add to cluster objects: rebuild each cluster's member-class list
	std::map::iterator clsitr = global.clusters.begin();
	while (clsitr != global.clusters.end())
	{
		clsitr->second.classes.clear();
		clsitr++;
	}
	
	itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		global.clusters[itr->second.cluster].classes.push_back(itr->first);
		itr++;
	}
		
	//clusterprint(global);
}

//do comparison for each class against all others.
//uses probabilistic decisions to weight each class; perhaps if
//something has roughly equal probabilities in multiple classes,
//it goes in each?
//NOTE(review): several declarations below lost their template
//arguments (extraction damage): classprob maps class name ->
//probability; classes/mp/temp are pairs of class name and
//probability. Restore against algorithms.h.
void svmclassify(docitem& dcmt, globals& global)
{
	std::map classprob;
	sample_type sample;
	
	dcmt.classification[0] = "-1";
	
	//build the sparse sample vector from the document's characteristic
	//terms, normalized by the document's total term count.
	characterizedocument(dcmt, global);
	std::map::iterator ditr = dcmt.charterms.begin();
	while (ditr != dcmt.charterms.end())
	{
		sample[ditr->first] = termtodouble(ditr->second) / dcmt.termcount;
		ditr++;
	}
	
	std::pair classes[classtypes];
	classes[0].first = "-1";
	classes[0].second = -1;
	int t = 0;
	
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		//classify according to probabilistic decision function
		//keep track of probabilities, pick highest classtypes
		//as long as they are above a certain threshhold.
		double p = itr->second.svmtrainer(sample);
		if (p > 0)
		{
			//NOTE(review): classprob is written but never read afterwards —
			//confirm whether it is still needed.
			classprob[itr->first] = p;
			std::pair mp;
			mp.first = itr->first;
			mp.second = p;
			
			//insertion-sort step: bubble mp into the probability-sorted array.
			//NOTE(review): i is unsigned while t is int — harmless since
			//t >= 0 here, but worth normalizing.
			unsigned int i = 0;
			while (i < t) //keeps the classes sorted by max probability
			{
				if (p > classes[i].second)
				{
					std::pair temp;
					temp.first = classes[i].first;
					temp.second = classes[i].second;
					classes[i].first = mp.first;
					classes[i].second = mp.second;
					mp.first = temp.first;
					mp.second = temp.second;
				}
				i++;
			}
			if (i < classtypes)
			{
				classes[i].first = mp.first;
				classes[i].second = mp.second;
				t++;
			}
		}
		
		itr++;
	}
	
	if (classes[0].second <= 0)
	{
		//no class scored positive: just output -1 as the class
		t++;
	}
	
	//assign classes to document
	for (unsigned int i = 0; i < t; i++)
	{
		dcmt.classification[i] = classes[i].first;
		std::cout<< "assigned class [" << i << "] : ";
		std::cout<< classes[i].first << " : " << classes[i].second;
		std::cout<< " : " << dcmt.realclass[0] << "\n";
	}
}

//doesn't actually cluster anything ;>_>
//instead, this handles the training of the svm decision functions.
//unfortunately, it basically has to start from scratch each time,
//but that's what the other clustering functions do too.
//NOTE(review): iterator declarations below lost their template
//arguments (extraction damage); charterms iterators are
//term -> termshort, docclasses maps class name -> per-class record.
void svmcluster(globals& global)
{	
	std::cout<<"svm clustering\n";
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		//reset svm trainer, ensure specified parameters
		itr->second.svmtrainer.clear();
		itr->second.svmtrainer.set_lambda(lambda);
		itr->second.svmtrainer.set_tolerance(tol);
		itr->second.svmtrainer.set_max_num_sv(maxvect);
		
		//positive example: this class's own characteristic-term vector,
		//normalized by the class's total term count.
		sample_type sample;
		
		std::map::iterator titr = itr->second.charterms.begin();
		while (titr != itr->second.charterms.end())
		{
			sample[titr->first] = termtodouble(titr->second) / itr->second.termcount;
			titr++;
		}
		
		//one-vs-all training: +1 on this class's vector, -1 on every other
		//class's vector. NOTE(review): the single positive example is
		//trained once per matching iteration only — confirm one positive
		//among many negatives is the intended balance.
		std::map::iterator citr = global.docclasses.begin();
		while (citr != global.docclasses.end())
		{
			if (itr->first == citr->first)
				itr->second.svmtrainer.train(sample,1);
			else
			{
				sample_type csample;
				titr = citr->second.charterms.begin();
				while (titr != citr->second.charterms.end())
				{
					csample[titr->first] = termtodouble(titr->second) / citr->second.termcount;
					titr++;
				}
				itr->second.svmtrainer.train(csample,-1);
			}
				
			citr++;
		}
	
		itr++;
	}
}

//extended jaccard coefficients (basically, tanimoto coefficients):
//similarity metric for attribute vectors.
//NOTE(review): several declarations below lost their template
//arguments (extraction damage): charterms iterators are
//term -> termshort; results/result are pairs of class name and
//similarity. Restore against algorithms.h.
void ejcclassify(docitem& dcmt, globals& global)
{
	//need to do dot product of sparse vectors...
	//if they don't share the term, it's 0.
	//that makes the dot product of two vectors
	//with no common terms 0, or orthogonal.
	double dm = 0;
	
	//squared magnitude of the (normalized) document vector
	characterizedocument(dcmt, global);
	std::map::iterator ditr = dcmt.charterms.begin();
	while (ditr != dcmt.charterms.end())
	{
		double d = termtodouble(ditr->second) / dcmt.termcount;
		dm+= d*d;
		ditr++;
	}
	
	std::pair results[classtypes];
	unsigned int t = 0;
	
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		double cm = 0;
		double dot = 0;
		std::pair result;
		
		//dot product with, and squared magnitude of, the class vector
		std::map::iterator titr = itr->second.charterms.begin();
		while (titr != itr->second.charterms.end())
		{
			double d1 = termtodouble(titr->second) / itr->second.termcount;
			ditr = dcmt.charterms.find(titr->first);
			double d2 = 0;
			if (ditr != dcmt.charterms.end())
				d2 = termtodouble(ditr->second) / dcmt.termcount;
			dot+= d1*d2;
			cm+= d1*d1;
			
			titr++;
		}
		
		//tanimoto coefficient: dot / (|c|^2 + |d|^2 - dot).
		//NOTE(review): denominator is 0 when both vectors are empty,
		//yielding NaN — confirm charterms is never empty here.
		result.first = itr->first;
		result.second = dot / (cm + dm - dot);
		
		//compare to current results: insertion into the sorted top list
		unsigned int i = 0;
		while (i < t)
		{
			if (result.second > results[i].second)
			{
				std::string temp1 = results[i].first;
				double temp2 = results[i].second;
				results[i].first = result.first;
				results[i].second = result.second;
				result.first = temp1;
				result.second = temp2;
			}
			i++;
		}
		if (i < classtypes && result.second > 0)
		{
			results[i].first = result.first;
			results[i].second = result.second;
			t++;
		}
		
		itr++;
	}
	
	//assign classes to document
	for (unsigned int i = 0; i < t; i++)
	{
		dcmt.classification[i] = results[i].first;
		std::cout<< "assigned class [" << i << "] : ";
		std::cout<< results[i].first << " : " << results[i].second;
		std::cout<< " : " << dcmt.realclass[0] << "\n";
	}
	if (t == 0)
	{
		//nothing scored above zero: report the unknown class "-1"
		dcmt.classification[0] = "-1";
		std::cout<< "assigned class [" << 0 << "] : ";
		std::cout<< dcmt.classification[0] << " : ";
		std::cout<< dcmt.realclass[0] << "\n";
	}
}

//debug dump: print each class name with its assigned cluster id.
//NOTE(review): the iterator lost its template arguments in
//extraction; docclasses maps class name -> per-class record.
void clusterprint(globals& global)
{
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		std::cout<< itr->first << " : " << itr->second.cluster << "\n";
		itr++;
	}
}

//debug dump: for every class, print each characteristic term's
//count, document count, and idf.
//NOTE(review): the iterators lost their template arguments in
//extraction; charterms maps term -> termshort, docclasses maps
//class name -> per-class record.
void classprint(globals& global)
{
	std::map::iterator itr = global.docclasses.begin();
	while (itr != global.docclasses.end())
	{
		std::cout << "class " << itr->first << "\n";
		std::map::iterator titr = itr->second.charterms.begin();
		while (titr != itr->second.charterms.end())
		{
			std::cout << titr->first << " " << titr->second.count << " ";
			std::cout << titr->second.dcount << " " << titr->second.idf << "\n";
			titr++;
		}
		std::cout << "\n";
		itr++;
	}
}

//drop global terms that carry too little signal: any term whose idf
//falls below minidf, or whose raw frequency falls below a support
//threshold scaled by the current document count.
//(restored the map's template arguments, lost from this file, and
//replaced the per-iteration termstat copy with a const reference.)
void cull(globals& global)
{
	std::cout << "culling\n";
	std::map<std::string, termstat>::iterator itr = global.gterms.begin();
	while (itr != global.gterms.end())
	{
		const termstat& term = itr->second;
		double tf = term.count; //was term.count / (double)term.dcount
		double mtf = minkeep * global.doccount / (double)supportscale;
		if (term.idf < minidf || tf < mtf)
			global.gterms.erase(itr++); //erase invalidates only this iterator
		else
			++itr;
	}
}