Files
exercises-in-programming-style/12-inverse-multiplexer/tf-12.py
Bruce Adams 5749c2c50f Enhance testing: run all executables
Add a #! line at the beginning of each of the existing Python programs
and change these files to be executable. This sets the stage for having
the test script blindly run anything that is executable, adding support
for testing many programming languages.
2013-09-24 22:09:30 -04:00

110 lines
2.8 KiB
Python
Executable File

#!/usr/bin/env python
import operator
import re
import string
import sys
from functools import reduce
#
# Functions for map reduce
#
def partition(data_str, nlines):
    """
    Generator function that partitions the input data_str (a big string)
    into chunks of nlines.

    The input is split on '\n'; each yielded chunk is up to nlines
    consecutive lines joined back together with '\n'. The final chunk
    holds whatever lines are left over and may be shorter. An empty
    input string yields a single empty chunk.

    Because this is a generator, nothing happens until it is iterated;
    nlines == 0 makes range() raise ValueError at that point, and a
    negative nlines yields no chunks at all.
    """
    lines = data_str.split('\n')
    # range (not xrange) so the function runs on Python 2 and Python 3;
    # it only produces one index per chunk.
    for start in range(0, len(lines), nlines):
        yield '\n'.join(lines[start:start + nlines])
def split_words(data_str):
    """
    Takes a string, filters non alphanumeric characters, normalizes to
    lower case, scans for words, and filters the stop words.
    It returns a list of pairs (word, 1), one for each word in the input, so
    [(w1, 1), (w2, 1), ..., (wn, 1)]

    The stop words are read from '../stop_words.txt' (a comma-separated
    file, resolved against the current working directory) on every call;
    open() raises IOError if that file is missing. All single lower-case
    ASCII letters are treated as stop words as well.
    """
    def _filter_chars(str_data):
        """
        Takes a string and returns a copy with all nonalphanumeric chars
        replaced by white space
        """
        # Raw string: \W must reach the regex engine untouched instead of
        # being parsed as a (invalid) string escape.
        pattern = re.compile(r'[\W_]+')
        return pattern.sub(' ', str_data)

    def _normalize(str_data):
        """
        Takes a string and returns a copy with all characters in lower case
        """
        return str_data.lower()

    def _scan(str_data):
        """
        Takes a string and scans for words, returning
        a list of words.
        """
        return str_data.split()

    def _remove_stop_words(word_list):
        """
        Takes a list of words and returns a copy with the stop words
        (and single-letter words) removed, order preserved.
        """
        # 'with' closes the file even if read() raises.
        with open('../stop_words.txt') as f:
            # A set gives O(1) membership tests in the filter below.
            stop_words = set(f.read().split(','))
        # add single-letter words
        stop_words.update(string.ascii_lowercase)
        return [w for w in word_list if w not in stop_words]

    # The actual work of splitting the input into words
    words = _remove_stop_words(_scan(_normalize(_filter_chars(data_str))))
    return [(w, 1) for w in words]
def count_words(pairs_list_1, pairs_list_2):
    """
    Takes two lists of pairs of the form
    [(w1, n1), ...]
    and returns a list of pairs [(w1, frequency), ...],
    where frequency is the sum of all the reported occurrences
    of that word across both inputs.

    Each word appears exactly once in the result. The order of the
    returned pairs is whatever order the underlying dict produces.
    """
    mapping = {}
    # Fold both inputs the same way so that repeated words are summed no
    # matter which list they come from, and so that the reported count of
    # each pair is honoured rather than assumed to be 1.
    for pairs in (pairs_list_1, pairs_list_2):
        for word, count in pairs:
            mapping[word] = mapping.get(word, 0) + count
    # list(...) so callers get a real list on Python 3 as well, where
    # dict.items() is only a view.
    return list(mapping.items())
#
# Auxiliary functions
#
def read_file(path_to_file):
    """
    Takes a path to a file and returns the entire
    contents of the file as a string

    The file is opened in the default (text, read-only) mode. open()
    raises IOError if the file cannot be opened.
    """
    # 'with' guarantees the handle is closed even if read() raises.
    with open(path_to_file) as f:
        return f.read()
def sort(word_freq):
    """
    Takes an iterable of (word, frequency) pairs and returns a new list
    of those pairs ordered from the highest frequency to the lowest.
    Pairs with equal frequency keep their original relative order, and
    the input itself is left untouched.
    """
    ranked = list(word_freq)
    # Order on the second element of each pair (the frequency), largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
#
# The main function
#
# Map step: every 200-line chunk of the input file (path given as the first
# command-line argument) becomes a list of (word, 1) pairs.
splits = map(split_words, partition(read_file(sys.argv[1]), 200))
# Reduce step: fold the per-chunk lists into one list of (word, frequency)
# pairs. The empty list is passed as reduce's initializer, which gives the
# same fold as inserting [] at the front of splits but also works when
# map() returns an iterator (Python 3).
word_freqs = sort(reduce(count_words, splits, []))
# Report the 25 most frequent words.
for tf in word_freqs[0:25]:
    # One pre-formatted string prints identically under the Python 2 print
    # statement and the Python 3 print function: "word  -  count".
    print('%s  -  %s' % (tf[0], tf[1]))