import sys, re, operator, string from abc import ABCMeta # # The classes # class TFExercise(object): __metaclass__ = ABCMeta def info(self): return self.__class__.__name__ + ": No major data structure" class DataStorageManager(TFExercise): """ Models the contents of the file """ _data = '' def __init__(self, path_to_file): f = open(path_to_file) self._data = f.read() f.close() self.__filter_chars() self.__normalize() def __filter_chars(self): """ Takes a string and returns a copy with all nonalphanumeric chars replaced by white space """ pattern = re.compile('[\W_]+') self._data = pattern.sub(' ', self._data) def __normalize(self): """ Takes a string and returns a copy with all characters in lower case """ self._data = self._data.lower() def words(self): """ Returns the list words in storage """ data_str = ''.join(self._data) return data_str.split() def info(self): return self.__class__.__name__ + ": My major data structure is a " + self._data.__class__.__name__ class StopWordManager(TFExercise): """ Models the stop word filter """ _stop_words = [] def __init__(self): f = open('../stop_words.txt') self._stop_words = f.read().split(',') f.close() # add single-letter words self._stop_words.extend(list(string.ascii_lowercase)) def is_stop_word(self, word): return word in self._stop_words def info(self): return self.__class__.__name__ + ": My major data structure is a " + self._stop_words.__class__.__name__ class WordFrequencyManager(TFExercise): """ Keeps the word frequency data """ _word_freqs = {} def increment_count(self, word): if word in self._word_freqs: self._word_freqs[word] += 1 else: self._word_freqs[word] = 1 def sorted(self): return sorted(self._word_freqs.iteritems(), key=operator.itemgetter(1), reverse=True) def info(self): return self.__class__.__name__ + ": My major data structure is a " + self._word_freqs.__class__.__name__ class WordFrequencyController(TFExercise): def __init__(self, path_to_file): self._storage_manager = DataStorageManager(path_to_file) self._stop_word_manager = StopWordManager() self._word_freq_manager = WordFrequencyManager() def run(self): for w in self._storage_manager.words(): if not self._stop_word_manager.is_stop_word(w): self._word_freq_manager.increment_count(w) word_freqs = self._word_freq_manager.sorted() for tf in word_freqs[0:25]: print tf[0], ' - ', tf[1] # # The main function # WordFrequencyController(sys.argv[1]).run()