# exercises-in-programming-style/09-kinds-of-things/tf-09.py

#!/usr/bin/env python

import abc, sys, re, operator, string

#
# The abstract data types
#
class IDataStorage (object):
    """ Models the contents of the file """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def words(self):
        """ Returns the words in storage """
        raise NotImplementedError("Abstract Data Type")

class IStopWordFilter (object):
    """ Models the stop word filter """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def is_stop_word(self, word):
        """ Checks whether the given word is a stop word """
        raise NotImplementedError("Abstract Data Type")

class IWordFrequencyCounter(object):
    """ Keeps the word frequency data """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def increment_count(self, word):
        """ Increments the count for the given word """
        raise NotImplementedError("Abstract Data Type")

    @abc.abstractmethod
    def sorted(self):
        """ Returns the words and their frequencies, sorted by frequency"""
        raise NotImplementedError("Abstract Data Type")

#
# The concrete type implementations
#
class DataStorage:
    """ Implements the contents of the file """
    _data = ''
    def __init__(self, path_to_file):
        f = open(path_to_file)
        self._data = f.read()
        f.close()
        self.__filter_chars_normalize()
        self.__scan()

    def __filter_chars_normalize(self):
        """
        Takes a string and returns a copy with all nonalphanumeric chars
        replaced by white space
        """
        pattern = re.compile('[\W_]+')
        self._data = pattern.sub(' ', self._data).lower()

    def __scan(self):
        self._data = ''.join(self._data).split()

    def words(self):
        """
        Returns the list words in storage
        """
        return self._data

class StopWordFilter:
    """ Implements the stop word filter """
    _stop_words = []
    def __init__(self):
        f = open('../stop_words.txt')
        self._stop_words = f.read().split(',')
        f.close()
        # add single-letter words
        self._stop_words.extend(list(string.ascii_lowercase))

    def is_stop_word(self, word):
        return word in self._stop_words

class WordFrequencyCounter:
    """ Implements the word frequency data """
    _word_freqs = {}

    def increment_count(self, word):
        if word in self._word_freqs:
            self._word_freqs[word] += 1
        else:
            self._word_freqs[word] = 1

    def sorted(self):
        return sorted(self._word_freqs.iteritems(), key=operator.itemgetter(1), reverse=True)


#
# The wiring between ADTs and concrete implementations
#
IDataStorage.register(DataStorage)
IStopWordFilter.register(StopWordFilter)
IWordFrequencyCounter.register(WordFrequencyCounter)

class WordFrequencyApplication:
    def __init__(self, path_to_file):
        self._storage = DataStorage(path_to_file)
        self._stop_word_manager = StopWordFilter()
        self._word_freq_counter = WordFrequencyCounter()

    def run(self):
        for w in self._storage.words():
            if not self._stop_word_manager.is_stop_word(w):
                self._word_freq_counter.increment_count(w)

        word_freqs = self._word_freq_counter.sorted()
        for tf in word_freqs[0:25]:
            print tf[0], ' - ', tf[1]

#
# The main function
#
WordFrequencyApplication(sys.argv[1]).run()