diff --git a/19-plugins/config.ini b/19-plugins/config.ini
new file mode 100644
index 0000000..fa60412
--- /dev/null
+++ b/19-plugins/config.ini
@@ -0,0 +1,5 @@
+[Plugins]
+;; Valid options: plugins/words1.pyc, plugins/words2.pyc
+words = plugins/words1.pyc
+;; Valid options: plugins/frequencies1.pyc, plugins/frequencies2.pyc
+frequencies = plugins/frequencies1.pyc
diff --git a/19-plugins/plugins-src/compile.sh b/19-plugins/plugins-src/compile.sh
new file mode 100644
index 0000000..37dac1a
--- /dev/null
+++ b/19-plugins/plugins-src/compile.sh
@@ -0,0 +1,2 @@
+python -m compileall .
+cp *.pyc ../plugins
diff --git a/19-plugins/plugins-src/frequencies1.py b/19-plugins/plugins-src/frequencies1.py
new file mode 100644
index 0000000..5171cbb
--- /dev/null
+++ b/19-plugins/plugins-src/frequencies1.py
@@ -0,0 +1,18 @@
+import operator
+
+def top25(word_list):
+    """
+    Takes a list of words and returns a list of up to 25
+    (word, count) pairs, ordered by decreasing count.
+
+    Returns an empty list when word_list is not a list or is empty.
+    """
+    # Always hand back a list so callers see a single return type.
+    if not isinstance(word_list, list) or not word_list:
+        return []
+
+    word_freqs = {}
+    for w in word_list:
+        word_freqs[w] = word_freqs.get(w, 0) + 1
+    return sorted(word_freqs.iteritems(), key=operator.itemgetter(1), reverse=True)[:25]
+
diff --git a/19-plugins/plugins-src/frequencies2.py b/19-plugins/plugins-src/frequencies2.py
new file mode 100644
index 0000000..016ce66
--- /dev/null
+++ b/19-plugins/plugins-src/frequencies2.py
@@ -0,0 +1,11 @@
+import operator, collections
+
+def top25(word_list):
+    """
+    Takes a list of words and returns a list of up to 25
+    (word, count) pairs, ordered by decreasing count.
+    """
+    # Counter consumes the list directly; no generator wrapper is needed.
+    counts = collections.Counter(word_list)
+    return counts.most_common(25)
+
diff --git a/19-plugins/plugins-src/words1.py b/19-plugins/plugins-src/words1.py
new file mode 100644
index 0000000..b90594e
--- /dev/null
+++ b/19-plugins/plugins-src/words1.py
@@ -0,0 +1,38 @@
+import sys, re, string
+
+def extract_words(path_to_file):
+    """
+    Takes a path to a file and returns the non-stop
+    words, after properly removing nonalphanumeric chars
+    and normalizing for lower case
+
+    Returns an empty list when path_to_file is not a non-empty
+    string. Also returns an empty list, after printing an error,
+    when the input file or ../stop_words.txt cannot be read.
+    """
+    if not isinstance(path_to_file, str) or not path_to_file:
+        return []
+
+    try:
+        with open(path_to_file) as f:
+            str_data = f.read()
+    except IOError as e:
+        print "I/O error({0}) when opening {1}: {2}".format(e.errno, path_to_file, e.strerror)
+        return []
+
+    # Raw string so that \W reaches the regex engine unescaped.
+    pattern = re.compile(r'[\W_]+')
+    word_list = pattern.sub(' ', str_data).lower().split()
+
+    try:
+        with open('../stop_words.txt') as f:
+            stop_words = set(f.read().split(','))
+    except IOError as e:
+        print "I/O error({0}) when opening ../stop_words.txt: {1}".format(e.errno, e.strerror)
+        return []
+
+    # Single letters count as stop words too; a set keeps each
+    # membership test below constant-time.
+    stop_words.update(string.ascii_lowercase)
+    return [w for w in word_list if w not in stop_words]
+
diff --git a/19-plugins/plugins-src/words2.py b/19-plugins/plugins-src/words2.py
new file mode 100644
index 0000000..f314f22
--- /dev/null
+++ b/19-plugins/plugins-src/words2.py
@@ -0,0 +1,18 @@
+import sys, re, string
+
+def extract_words(path_to_file):
+    """
+    Takes a path to a file and returns the non-stop
+    words, after properly removing nonalphanumeric chars
+    and normalizing for lower case
+
+    Only runs of two or more letters a-z are kept as words.
+    I/O errors are not caught here and propagate to the caller.
+    """
+    # Context managers close both files instead of leaking the handles.
+    with open(path_to_file) as f:
+        words = re.findall(r'[a-z]{2,}', f.read().lower())
+    with open('../stop_words.txt') as f:
+        stopwords = set(f.read().split(','))
+    return [w for w in words if w not in stopwords]
+
diff --git a/19-plugins/plugins/frequencies1.pyc b/19-plugins/plugins/frequencies1.pyc
new file mode 100644
index 0000000..a204d1f
Binary files /dev/null and b/19-plugins/plugins/frequencies1.pyc differ
diff --git a/19-plugins/plugins/frequencies2.pyc b/19-plugins/plugins/frequencies2.pyc
new file mode 100644
index 0000000..30e5286
Binary files /dev/null and b/19-plugins/plugins/frequencies2.pyc differ
diff --git a/19-plugins/plugins/words1.pyc b/19-plugins/plugins/words1.pyc
new file mode 100644
index 0000000..cae656d
Binary files /dev/null and b/19-plugins/plugins/words1.pyc differ
diff --git a/19-plugins/plugins/words2.pyc b/19-plugins/plugins/words2.pyc
new file mode 100644
index 0000000..699a7e5
Binary files /dev/null and b/19-plugins/plugins/words2.pyc differ
diff --git a/19-plugins/tf-19.py b/19-plugins/tf-19.py
new file mode 100644
index 0000000..13d8b46
--- /dev/null
+++ b/19-plugins/tf-19.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+import sys, ConfigParser, imp
+
+def load_plugins():
+    """
+    Reads config.ini and loads the compiled plugins named in its
+    [Plugins] section into the module globals tfwords and tffreqs
+    """
+    config = ConfigParser.ConfigParser()
+    config.read("config.ini")
+    words_plugin = config.get("Plugins", "words")
+    frequencies_plugin = config.get("Plugins", "frequencies")
+    global tfwords, tffreqs
+    tfwords = imp.load_compiled('tfwords', words_plugin)
+    tffreqs = imp.load_compiled('tffreqs', frequencies_plugin)
+
+#
+# The main function
+#
+
+# Fail with a usage message instead of an IndexError traceback.
+if len(sys.argv) < 2:
+    sys.exit("Usage: tf-19.py <path-to-text-file>")
+
+load_plugins()
+word_freqs = tffreqs.top25(tfwords.extract_words(sys.argv[1]))
+
+for tf in word_freqs:
+    print tf[0], ' - ', tf[1]
+