This commit is contained in:
Crista Lopes
2013-12-27 14:45:38 -08:00
parent a848cd6831
commit 84f1310591

View File

@@ -15,15 +15,11 @@ def partition(data_str, nlines):
def split_words(data_str): def split_words(data_str):
""" """
Takes a string, filters non alphanumeric characters, normalizes to Takes a string, returns a list of pairs (word, 1),
lower case, scans for words, and filters the stop words. one for each word in the input, so
It returns a list of pairs (word, 1), one for each word in the input, so
[(w1, 1), (w2, 1), ..., (wn, 1)] [(w1, 1), (w2, 1), ..., (wn, 1)]
""" """
def _scan(str_data): def _scan(str_data):
"""
Takes a string and returns a list of words
"""
pattern = re.compile('[\W_]+') pattern = re.compile('[\W_]+')
return pattern.sub(' ', str_data).lower().split() return pattern.sub(' ', str_data).lower().split()
@@ -60,7 +56,6 @@ def count_words(pairs_list_1, pairs_list_2):
# #
# Auxiliary functions # Auxiliary functions
# #
def read_file(path_to_file): def read_file(path_to_file):
with open(path_to_file) as f: with open(path_to_file) as f:
data = f.read() data = f.read()