Content Analysis PDF Print E-mail
I wrote this word-counter as the front end for a content analysis engine.  It is blazingly fast and really clean.
 
It is similar to the Linux utility 'wf'; however, instead of writing to standard output, it builds a C++ map that resides in memory, which is much faster than writing to the disk or to the screen.  It also supports serializing this map to disk. I benchmarked it against the Linux 'wf' utility and the two are equal in speed.
 
 
//===================================
// Name        : wordCount.h
// Author      : Jim Cicon
// email       : This e-mail address is being protected from spam bots, you need JavaScript enabled to view it
// Version     :
// Copyright   : copyright Jim Cicon, 2008
// Description :
//===================================

#ifndef WORDCOUNT_H_
#define WORDCOUNT_H_

#include <map>
#include <string>

// Maps each term to the number of times it has been counted.
typedef std::map<std::string, unsigned int> termCountMap;

/**
 * Word counter: holds a term -> occurrence-count map for one input file.
 *
 * The map is filled once, at construction time, from the file whose path is
 * passed to the constructor. It can then be inspected in memory or written
 * out with serializeWordCountMap().
 */
class wordCount
{
public:
    // Takes the path of the file whose terms are to be counted.
    wordCount(std::string);
    virtual ~wordCount();

    // Writes the map out as text under the given name and returns a status
    // flag. NOTE(review): what the default empty name selects is decided by
    // lsaOStream, which is not visible from this header -- confirm there.
    bool serializeWordCountMap(std::string fileName = "");

    // Direct access to the underlying map. The const overload lets read-only
    // holders (const wordCount &) inspect the counts as well.
    termCountMap & getWordCountMap(void) { return m_wordCountMap; }
    const termCountMap & getWordCountMap(void) const { return m_wordCountMap; }

    // Number of distinct terms. The return type stays unsigned int for
    // existing callers, so the narrowing from size_t is spelled out.
    unsigned int size(void) const
    {
        return static_cast<unsigned int>(m_wordCountMap.size());
    }

private:
    // Reads m_fileName and populates m_wordCountMap.
    bool getCounts(void);

    std::string m_fileName;        // path of the input file
    termCountMap m_wordCountMap;   // term -> occurrence count
};

#endif /* WORDCOUNT_H_*/
 
//=========================================
// Name        : wordCount.cpp
// Author      : Jim Cicon
// email       : This e-mail address is being protected from spam bots, you need JavaScript enabled to view it
// Version     :
// Copyright   : copyright Jim Cicon, 2008
// Description :
//=========================================

#include "wordCount.h"

#include <iostream>
#include <fstream>

#include "lsaOStream.h"

using namespace std;

// Stores the input path, starts from an empty map, and counts the file's
// terms straight away so the object is ready to query once constructed.
wordCount::wordCount(string inFile)
    : m_fileName(inFile),
      m_wordCountMap()
{
    getCounts();
}

// Nothing to release by hand: the string and map members clean up themselves.
wordCount::~wordCount() {}

bool wordCount::getCounts(void)
{
    ifstream ifs (m_fileName.data(), ifstream::in);
   
    termCountMap::iterator it;
    string term = "";
    while(!ifs.eof())
    {
        ifs >> term;
       
        it = m_wordCountMap.find(term);
        if(m_wordCountMap.end() != it)
            (*it).second++;
        else
            m_wordCountMap[term] = 1;
    }
   
    return true;
}

/**
 * Writes every entry of the term-count map as one "count<TAB>term" line to
 * the stream that lsaOStream provides for fileName.
 *
 * @param fileName name handed to lsaOStream to select the output.
 * @return false if no stream is available or a write failed, true otherwise.
 *
 * NOTE(review): assumes lsaOStream::getStream() can return a null pointer
 * when no stream could be set up -- confirm against lsaOStream.h.
 */
bool wordCount::serializeWordCountMap(string fileName)
{
    lsaOStream stream(fileName);
    ostream * ofs = stream.getStream();
    if (!ofs)
        return false;   // nothing to write to; avoid dereferencing a null pointer

    for (termCountMap::const_iterator it = m_wordCountMap.begin();
         it != m_wordCountMap.end(); ++it)
    {
        // '\n' rather than endl: endl would force a flush on every line.
        *ofs << it->second << '\t' << it->first << '\n';
    }

    // One flush at the end, so the data is out before we report success.
    ofs->flush();
    return !ofs->fail();
}