/* * main.cpp * Classifier * * Daniel Wojcik * */ #include #include #include #include #include #include "algorithms.h" /*std::map gTerms; std::list docs; std::map docClasses; std::map clusters; unsigned int docCount; unsigned int termCount;*/ Globals global; unsigned int nDocs; bool learning = false; unsigned int error; unsigned int rate, cRate; double sse, misses; //Classification & Clustering paramenters go here //void stats(); //void classify(DocItem& dcmt); //void cClassify(DocItem& dcmt); //void characterizeClasses(); //void computeWeights(); //void updateWeights(DocItem* dcmt); //void cluster(); //void cull(); void save(); void saveClass(); void load(); void loadClass(); DocItem* getDoc(int n); void top20(); //void classPrint(); //void clusterPrint(); int main (int argc, char** const argv) { std::string str, str2; global.docCount = 0; global.termCount = 0; nDocs = 0; error = 0; misses = 0; sse = 0; rate = 16; cRate = 0; if (argc > 1) { int d = 2; bool read = false, ndb = false; if (strcmp(argv[1], "-n") == 0) { ndb = true; d++; } if (strcmp(argv[d-1], "-r") == 0) { rate = atoi(argv[d++]); d++; } if (strcmp(argv[d-1], "-l") == 0) learning = true; else if (strcmp(argv[d-1], "-L") == 0) { read = true; learning = true; } else if (strcmp(argv[d-1], "-C") == 0) read = true; else d = 1; //Check for -O outFileName, default to something? if (!ndb) load(); while (d < argc) { std::ifstream argument(argv[d]); if (argument == NULL) break; //Error recovery goes here while (!argument.eof()) { std::ifstream document; str2 = ""; if (read) { argument >> str2; std::cout << "New file : " << str2 << "\n"; document.open(str2.c_str()); if (document == NULL) { continue; std::cout << "Fail\n"; } } else document.open(argv[d]); DocItem dcmt = DocItem(); //docClasses.clear(); //Needs to update global counts manually now while (!document.eof()) { str = ""; document >> str; for (unsigned int i = 0; i < str.size(); i++) { if (isalpha(str[i])) str[i] = tolower(str[i]); } //std::cout << str << "\n"; //Check for meta data if (str.compare("> str; if (category == 0 && str.compare("name='DC.date'") == 0) { std::string str1; document >> str1; str = str1.substr(9,4); dcmt.realClass[0] = str;//atoi(str.c_str()); break; } else if (category == 1 && str.compare("name='DC.subject'") == 0) { std::string str1; document >> str1; str = str1.substr(9,str1.size()-11); dcmt.realClass[0] = str; break; } else if (str[str.size()-1] == '>') break; } } std::map::iterator itr; //itr = find(gTerms.begin(), gTerms.end(), str); itr = global.gTerms.find(str); if (itr != global.gTerms.end()) { if (dcmt.getCount(itr->first) == 0) { dcmt.addTerm(itr->first); //Update inverted index itr->second.dCount++; itr->second.count++; //itr->second.invIndex.push_back(docCount); } else { dcmt.increment(str); itr->second.count++; } } else { global.gTerms[str].count = 1; global.gTerms[str].dCount = 1; //gTerms[str].invIndex.push_back(docCount); dcmt.addTerm(str); } global.termCount++; } //if (!read) document.close(); global.docCount++; nDocs++; if (learning) { if (read) { //argument >> dcmt.classification; dcmt.classification[0] = dcmt.realClass[0]; } else { //dcmt.classification = atoi(argv[++d]); dcmt.classification[0] = dcmt.realClass[0]; } } else classify(dcmt, global); //Doing it this way means order of documents matters, //but otherwise would update weights with no class. updateWeights(&dcmt, global); cRate++; if (!learning && cRate >= rate) //Wait until the end to do this when learning { characterizeClasses(global); cluster(global); save(); saveClass(); cRate = 0; } //global.docs.push_back(dcmt); //Some variance checking, mostly only good for date stuff. //Need to get clustering working to automatically set up date range classes. if (category == 0) { unsigned int e = atoi(dcmt.classification[0].c_str()); e-= atoi(dcmt.realClass[0].c_str()); //unsigned int e = abs(dcmt.classification[0] - dcmt.realClass[0]); error+= abs(e); } if (dcmt.classification[0] != dcmt.realClass[0]) { double d = classTypes; for (unsigned int i = 1; i < classTypes; i++) { if (dcmt.classification[i] == dcmt.realClass[0]) break; else d--; } misses+= 1/d; } } argument.close(); d++; } } else return 1; if (category == 0) std::cout << "Classification error: " << error/(float)nDocs << "\n"; std::cout << "Classification accuracy: " << misses << " misses in " << nDocs; std::cout << " documents, " << (nDocs - misses)/(float)nDocs*100 << "%\n"; characterizeClasses(global); //top20(); //classPrint(); cluster(global); save(); saveClass(); return 0; } void top20() { std::cout << "Top 20\n"; std::pair topTerms[20]; unsigned int terms = 0; std::map::iterator itr = global.gTerms.begin(); while (itr != global.gTerms.end()) { TermStat term = itr->second; //if (term.idf < minIDF || term.count < minKeep*(docCount/supportScale)) // gTerms.erase(itr++); //else { if (terms < 20) { std::pair p; p.first = itr->first; p.second = itr->second; topTerms[terms] = p; terms++; } else { for (unsigned int i = 0; i < terms; i++) { if (itr->second.count > topTerms[i].second.count) { std::pair p; p.first = itr->first; p.second = itr->second; topTerms[i] = p; break; } } } itr++; } } for (unsigned int i = 0; i < 20; i++) { std::cout << topTerms[i].first << " " << topTerms[i].second.count << " " << topTerms[i].second.idf << "\n"; } } //Write current state of classification knowledge to a file so //that it can be loaded later. Useful for incremental updates. void save() { std::cout << "Saving\n"; std::ofstream docOut("brain"); if (docOut == NULL) return; docOut<< "= " << global.docCount << "\n"; docOut<< "? " << global.gTerms.size() << "\n"; std::map::iterator itr = global.gTerms.begin(); while (itr != global.gTerms.end()) { TermStat term = itr->second; if (term.count >= minKeep*(global.docCount/(float)supportScale) && term.idf >= minIDF) { docOut<< "+ "<< itr->first<< " "<< term.count<< " "<< term.dCount<< " "<< term.idf<< "\n"; std::map::iterator wItr = term.cCounts.begin(); while (wItr != term.cCounts.end()) { docOut<< "- "<< wItr->first<< " "<< wItr->second << "\n"; wItr++; } itr++; } else { global.gTerms.erase(itr++); } //itr++; } docOut.close(); } void saveClass() { std::cout << "Saving Classes\n"; std::ofstream docOut("characteristics"); if (docOut == NULL) return; docOut<< "= " << global.docClasses.size() << "\n"; std::map::iterator itr = global.docClasses.begin(); while (itr != global.docClasses.end()) { docOut<< "+ " << itr->first << " " << itr->second.termSize << " "; docOut<< itr->second.dCount << " " << itr->second.cluster << " "; docOut<< itr->second.point << "\n"; std::map::iterator tItr = itr->second.charTerms.begin(); while (tItr != itr->second.charTerms.end()) { docOut << "- " << tItr->first << " " << tItr->second.count << " "; docOut << tItr->second.dCount << " " << tItr->second.idf << " "; docOut << tItr->second.count * tItr->second.idf << "\n"; tItr++; } docOut << "\n"; itr++; } docOut.close(); } void load() { std::ifstream inFile("brain"); if (inFile == NULL) return; std::string term = ""; unsigned int dCount, tCount; std::string classf; float idf; char x; while (!inFile.eof()) { inFile >> x; if (x == '+') { inFile >> term >> tCount >> dCount >> idf; global.gTerms[term].count = tCount; global.gTerms[term].dCount = dCount; global.gTerms[term].idf = idf; } else if (x == '-') { inFile >> classf >> tCount; global.gTerms[term].cCounts[classf] = tCount; global.docClasses[classf].seen = true; } else if (x == '=') { inFile >> tCount; global.docCount+= tCount; } } inFile.close(); loadClass(); //characterizeClasses(); } void loadClass() { std::ifstream inFile("characteristics"); if (inFile == NULL) return; std::string term = ""; unsigned int count, dCount, tCount, size, cls; double p; std::string classf; float idf, w; char x; while (!inFile.eof()) { inFile >> x; if (x == '+') { inFile >> classf >> size >> tCount >> cls >> p; global.docClasses[classf].termSize = size; global.docClasses[classf].dCount = tCount; global.docClasses[classf].cluster = cls; global.docClasses[classf].point = p; global.docClasses[classf].seen = true; if (global.clusters.count(cls) == 0) { double np = global.clusters[cls].meanPoint * global.clusters[cls].count + p; global.clusters[cls].count++; global.clusters[cls].meanPoint = np/global.clusters[cls].count; } else { global.clusters[cls].count = 1; global.clusters[cls].meanPoint = p; } } else if (x == '-') { inFile >> term >> count >> dCount >> idf >> w; global.docClasses[classf].charTerms[term].count = count; global.docClasses[classf].charTerms[term].dCount = dCount; global.docClasses[classf].charTerms[term].idf = idf; } } inFile.close(); cluster(global); } //End of file~