Is Text English PDF Print E-mail
Apples and Oranges: When you are analysing text, it is best that all the documents in your corpus are in the same language.  This utility determines whether a text document is English text.
 
 
//=================================
// Name        : isenglish.h
// Author      : This e-mail address is being protected from spam bots, you need JavaScript enabled to view it
// Version     :
// Copyright   : Copyright 2008, Jim Cicon
// Description : english stop-word list and string helpers used to decide if a file is english
//=================================
#ifndef __STOP_LIST_H__
#define __STOP_LIST_H__

#include <vector>
#include <string>

// Space-separated English stop words (plus the single letters a-z), all lower
// case.  Declared const so that it has internal linkage: this is a definition
// living in a header, and a non-const global here would be defined once per
// including translation unit and fail at link time.
const std::string stoplist = "a b c d e f g h i j k l m n o p q r s t u v w x y z about all also although am an and another any anybody anyhow anyone anything anywhere are as at be become been being but by can cannot could did do does doing done each eg either else et etc every ex for from had has have having he hence her hers herself high him himself his how however ie if in inc indeed is it its ltd many may me might more mr mrs ms must my myself no nor not of oh or otherwise ought our ours ourselves per put re self selves shall she should sl so some somehow such sup than that the their theirs them themselves then there therefore these they this those though thus to us very via viz vs was we were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which whichever while whither who whoever whole whom whose why will with within without would yes you your yours yourself yourselves";

// Break in_str into whitespace-delimited tokens.  tokens is cleared first and
// then filled in input order; minSize filters out short tokens.
void tokenize(const std::string & in_str, std::vector<std::string> & tokens, unsigned int minSize);

// Return a copy of str with leading and trailing " \t\r\n" removed.
std::string trim(std::string str);

#endif
 
 //==========================================
// Name        : isenglish.cpp
// Author      : This e-mail address is being protected from spam bots, you need JavaScript enabled to view it
// Version     :
// Copyright   : Copyright 2008, Jim Cicon
// Description : copy each listed file into the english or nonenglish directory based on its stop-word ratio
//==========================================
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <istream>
#include <string>
#include <vector>

#include "isenglish.h"

using namespace std;

int main(int argc, char *argv[])
{
    // Check for correct number of parameters
    if(5 != argc)
    {
        cout << endl;
        cout << "Incorrect number of commmand line arguments: ";
        cout << "threshold file-list english-directory nonenglish-directory" << endl;
            
        return EXIT_FAILURE;
    }
    int threshold = atoi(argv[1]);
    string pFileList = argv[2];
    string pEnglishDir = argv[3];
    string pNonEnglishDir = argv[4];
    
    // Copy the stoplist into a vector
    vector<string> tokens;
    tokenize(stoplist, tokens, threshold);
    
    // Open the unknown-language fileList
    ifstream fileList (pFileList.data(), ifstream::in);
    if(false == fileList.is_open())
    {
        cout << "Failed to open unknown-language file list: " << pFileList;
        return EXIT_FAILURE;
    }
    
    // Check each file in the list to see if it is english or nonenglish
    string word_j, file_i;
    int totalFiles = 0, englishFiles = 0, nonEnglishFiles = 0;
    float wordCount, stopListCount;
    vector<string>::iterator it;
    while(!fileList.eof())
    {
        totalFiles++;
        wordCount = 0;
        stopListCount = 0;

        // Open the next unknown-language file
        getline(fileList, file_i);
        ifstream unknownFile(file_i.data(), ifstream::in);
        if(false == unknownFile.is_open())
        {
            cout << "Failed to open unknown-language file: " << file_i;
            continue;
        }
        
        // Read the file a word at a time and compare it to the stop-list
        // Count how many matches are found, and track total number of words
        while (getline(unknownFile, word_j, ' '))
        {
            wordCount++;
        
            it = tokens.begin();
            while(tokens.end() != it)
            {
                if(*it == word_j)
                {
                    stopListCount++;
                    it++;
                    continue;
                }
                it++;
            }
        }
        
        // Copy file to appropriate directory
        string command;
        if(stopListCount/wordCount > .1)
        {
            command = "cp " + file_i + " " + pEnglishDir;
            system(command.data());
            
            cout << "ENGLISH: " << file_i;
            englishFiles++;
        }
        else
        {
            command = "cp " + file_i + " " + pNonEnglishDir;
            system(command.data());

            cout << "NOT: " << file_i;
            nonEnglishFiles++;
        }
        
        cout << "(" << stopListCount << ", " << wordCount << ")" << endl;
        
    }
    
    cout << endl;
    cout << "Total Files Processed: " << totalFiles << ", English: " << englishFiles << " Non English: " << nonEnglishFiles << endl;
    
    return EXIT_SUCCESS;
}

// Helper function - break string into whitespace-delimited tokens
//
//   in_str  : text to split; space, tab, CR and LF act as separators
//   tokens  : output; cleared first, then filled with tokens in input order
//   minSize : tokens shorter than this many characters are dropped
//
// Tokens never contain whitespace and are never empty, so leading, trailing
// or repeated separators produce no extra entries.
void tokenize(const std::string & in_str, std::vector<std::string> & tokens, unsigned int minSize)
{
    static const char white_space[] = " \t\r\n";

    tokens.clear();

    // Walk the input by index instead of copying it and erasing from the front
    std::string::size_type token_begin = in_str.find_first_not_of(white_space);
    while(std::string::npos != token_begin)
    {
        // The token ends at the next separator, or at end of input if none is left
        std::string::size_type token_end = in_str.find_first_of(white_space, token_begin);
        if(std::string::npos == token_end) token_end = in_str.length();

        // only tokenize if word is big enough; measuring the real length means
        // the final token is filtered exactly like every other one
        const std::string::size_type token_size = token_end - token_begin;
        if(token_size >= minSize) tokens.push_back(in_str.substr(token_begin, token_size));

        // skip the separators that follow; npos here ends the loop
        token_begin = in_str.find_first_not_of(white_space, token_end);
    }
}

// Helper function - strip surrounding whitespace
// Returns str without any leading or trailing space, tab, CR or LF
// characters; whitespace inside the string is left alone.
std::string trim(std::string str)
{
    static const char blanks[] = " \t\r\n";

    // Nothing but whitespace (or nothing at all): the result is empty
    const std::string::size_type first = str.find_first_not_of(blanks);
    if(std::string::npos == first)
        return std::string();

    // At least one non-blank exists, so the last one is at or after first
    const std::string::size_type last = str.find_last_not_of(blanks);
    return str.substr(first, last - first + 1);
}