| Is Text English |
|
|
|
|
Apples and Oranges: When you are analysing text, it is best that all the documents in your corpus are in the same language. This utility determines whether a text document is written in English. //================================= // Name : isenglish.h // Author :
This e-mail address is being protected from spam bots, you need JavaScript enabled to view it
// Version : // Copyright : Copyright 2008, Jim Cicon // Description : return 1 if a file is english, return 0 if not //================================= #ifndef __STOP_LIST_H__ #define __STOP_LIST_H__ #include <vector> #include <string> std::string stoplist = "a b c d e f g h i j k l m n o p q r s t u v w x y z about all also although am an and another any anybody anyhow anyone anything anywhere are as at be become been being but by can cannot could did do does doing done each eg either else et etc every ex for from had has have having he hence her hers herself high him himself his how however ie if in inc indeed is it its ltd many may me might more mr mrs ms must my myself no nor not of oh or otherwise ought our ours ourselves per put re self selves shall she should sl so some somehow such sup than that the their theirs them themselves then there therefore these they this those though thus to us very via viz vs was we were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which whichever while whither who whoever whole whom whose why will with within without would yes you your yours yourself yourselves"; void tokenize(const std::string & in_str, std::vector<std::string> & tokens, unsigned int minSize); std::string trim(std::string str); #endif //========================================== // Name : isenglish.cpp // Author : This e-mail address is being protected from spam bots, you need JavaScript enabled to view it // Version : // Copyright : Copyright 2008, Jim Cicon // Description : return 1 if a file is english, return 0 if not //========================================== #include <iostream> #include <fstream> #include <istream> #include "isenglish.h" using namespace std; int main(int argc, char *argv[]) { // Check for correct number of parameters if(5 != argc) { cout << endl; cout << "Incorrect number of commmand line arguments: "; cout << "threshold file-list english-directory nonenglish-directory" << 
endl; return EXIT_FAILURE; } int threshold = atoi(argv[1]); string pFileList = argv[2]; string pEnglishDir = argv[3]; string pNonEnglishDir = argv[4]; // Copy the stoplist into a vector vector<string> tokens; tokenize(stoplist, tokens, threshold); // Open the unknown-language fileList ifstream fileList (pFileList.data(), ifstream::in); if(false == fileList.is_open()) { cout << "Failed to open unknown-language file list: " << pFileList; return EXIT_FAILURE; } // Check each file in the list to see if it is english or nonenglish string word_j, file_i; int totalFiles = 0, englishFiles = 0, nonEnglishFiles = 0; float wordCount, stopListCount; vector<string>::iterator it; while(!fileList.eof()) { totalFiles++; wordCount = 0; stopListCount = 0; // Open the next unknown-language file getline(fileList, file_i); ifstream unknownFile(file_i.data(), ifstream::in); if(false == unknownFile.is_open()) { cout << "Failed to open unknown-language file: " << file_i; continue; } // Read the file a word at a time and compare it to the stop-list // Count how many matches are found, and track total number of words while (getline(unknownFile, word_j, ' ')) { wordCount++; it = tokens.begin(); while(tokens.end() != it) { if(*it == word_j) { stopListCount++; it++; continue; } it++; } } // Copy file to appropriate directory string command; if(stopListCount/wordCount > .1) { command = "cp " + file_i + " " + pEnglishDir; system(command.data()); cout << "ENGLISH: " << file_i; englishFiles++; } else { command = "cp " + file_i + " " + pNonEnglishDir; system(command.data()); cout << "NOT: " << file_i; nonEnglishFiles++; } cout << "(" << stopListCount << ", " << wordCount << ")" << endl; } cout << endl; cout << "Total Files Processed: " << totalFiles << ", English: " << englishFiles << " Non English: " << nonEnglishFiles << endl; return EXIT_SUCCESS; } // Helper function - break string into tokens void tokenize(const string & in_str, vector<string> & tokens, unsigned int minSize) { tokens.clear(); 
string white_space = " \t\r\n"; string delimiter = " "; string str = in_str; size_t token_end; // parse tokens while(str.length()) { // Strip leading whitespace size_t notwhite = str.find_first_not_of(white_space); str.erase(0, notwhite); // find end of token and push token token_end = str.find_first_of(white_space); // only tokenize if word is big enough if(token_end >= minSize) tokens.push_back(trim(str.substr(0, token_end))); // strip off used tokens str.erase(0, token_end); } } // Helper function - Trim whitespace from string string trim(string str) { char const* delims = " \t\r\n"; // trim leading whitespace string::size_type notwhite = str.find_first_not_of(delims); str.erase(0,notwhite); // trim trailing whitespace notwhite = str.find_last_not_of(delims); str.erase(notwhite+1); return str; } |