#include #include #include #include #include using namespace std; struct Freq { string word; int freq; Freq(string w, int f) : word(w), freq(f) {} }; // // Helpers // static int tospace(int c) { if (!isalpha(c)) return ' '; else return c; } static vector get_stop_words() { string word; vector stop_words; ifstream is("../stop_words.txt"); while (getline(is, word, ',')) { stop_words.push_back(word); } char w[2]; w[1] = '\0'; for (char c : "abcdefghijklmopqrstuvwxyz") { w[0] = c; stop_words.push_back(string(w)); } sort(stop_words.begin(), stop_words.end()); return stop_words; } static bool sort_by_freq(Freq x, Freq y) { return y.freq < x.freq; } // // The functions // /** Takes a path to a file and returns the entire contents of the file as a string */ string read_file(const char* path_to_file) { string data; ifstream is(path_to_file, std::ifstream::binary); if (is) { is.seekg (0, is.end); int length = is.tellg(); is.seekg (0, is.beg); char* buffer = new char [length + 1]; is.read(buffer,length); buffer[length] = '\0'; data = buffer; delete [] buffer; } return data; } /** Takes a string and returns a copy with all nonalphanumeric chars replaced by white space */ string filter_chars(string str_data) { std::transform(str_data.begin(), str_data.end(), str_data.begin(), ::tospace); return str_data; } /** Takes a string and returns a copy with all chars in lower case */ string normalize(string str_data) { std::transform(str_data.begin(), str_data.end(), str_data.begin(), ::tolower); return str_data; } /** Takes a string and scans for words, returning a list of words. */ vector scan(string str_data) { string word; vector words; istringstream is(str_data); while (is >> word) { words.push_back(word); } return words; } /** Takes a list of words and returns a copy with all stop words removed */ vector remove_stop_words(vector words) { vector stop_words = get_stop_words(); vector filtered_list; filtered_list.reserve(words.size()); for (string w : words) { if (!binary_search(stop_words.begin(), stop_words.end(), w)) filtered_list.push_back(w); } return filtered_list; } /** Takes a list of words and returns a dictionary associating words with frequencies of occurrence */ map frequencies(vector words) { map freq; for (string w : words) { map::iterator it = freq.find(w); if (it != freq.end()) { it->second++; } else { freq.insert(pair(w,1)); } } return freq; } /** Takes a dictionary of words and their frequencies and returns a list of pairs where the entries are sorted by frequency */ vector sort(map word_freq) { vector out_list; out_list.reserve(word_freq.size()); for (pair p : word_freq) { out_list.push_back(Freq(p.first, p.second)); } sort(out_list.begin(), out_list.end(), sort_by_freq); return out_list; } // // The main function // int main(int argc, char* argv[]) { vector word_freqs = sort(frequencies(remove_stop_words(scan(normalize(filter_chars(read_file(argv[1]))))))); for (vector::iterator it = word_freqs.begin(); it != word_freqs.begin()+25; it++) cout << it->word << " - " << it->freq << endl; return 0; }