From 1c729d9d638e99e6d28d89964a7fc1abdc5b626c Mon Sep 17 00:00:00 2001 From: Crista Lopes Date: Sun, 3 Nov 2013 10:10:46 -0800 Subject: [PATCH] Added a lot more comments to style 01 --- 01-good-old-times/tf-01.py | 62 ++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/01-good-old-times/tf-01.py b/01-good-old-times/tf-01.py index b6fc0d5..271dc71 100755 --- a/01-good-old-times/tf-01.py +++ b/01-good-old-times/tf-01.py @@ -11,45 +11,50 @@ def touchopen(filename, *args, **kwargs): open(filename, "a").close() # "touch" file return open(filename, *args, **kwargs) -# The constrained memory, which consists of only 1024 cells by constraint +# The constrained memory should have no more than 1024 cells data = [] - -# Overall strategy: -# - read the input file one line at a time -# - filter the characters, normalize to lower case -# - identify words, incrementing corresponding counts (in secondary memory) - - # We're lucky: -# The stop words are only 556 bytes and the lines are all less than -# 80 characters, so we can use that knowledge to simplify the problem: -# we can have the stop words loaded in memory while processing one line -# of the input at a time. -# If these two assumptions didn't hold, the algorithm would need to be -# changed considerably. +# The stop words are only 556 characters and the lines are all +# less than 80 characters, so we can use that knowledge to +# simplify the problem: we can have the stop words loaded in +# memory while processing one line of the input at a time. +# If these two assumptions didn't hold, the algorithm would +# need to be changed considerably. + +# Overall stragety: (PART 1) read the input file, count the +# words, increment/store counts in secondary memory (a file) +# (PART 2) find the 25 most frequent words in secondary memory # Load the list of stop words f = open('../stop_words.txt') data = [f.read(1024).split(',')] # data[0] holds the stop words f.close() -data.append([]) # data[1] is the line -data.append(None) # data[2] is the index of the start_char of a word -data.append(0) # data[3] is an index, i = 0 -data.append(False) # data[4] is a flag indicating where a word was found +# PART 1: +# - read the input file one line at a time +# - filter the characters, normalize to lower case +# - identify words, increment corresponding counts in file + +data.append([]) # data[1] is line (max 80 characters) +data.append(None) # data[2] is index of the start_char of word +data.append(0) # data[3] is index on characters, i = 0 +data.append(False) # data[4] is flag indicating whether word was found data.append('') # data[5] is the word -data.append('') # data[6] is word,NNNN from the word_freqs file, and then just the word +data.append('') # data[6] is word,NNNN from word_freqs file, and then just word data.append(0) # data[7] is frequency +# Open the secondary memory word_freqs = touchopen('word_freqs', 'rb+') +# Open the input file f = open(sys.argv[1]) +# Loop over input file's lines while True: data[1] = [f.readline()] - #print data[1] if data[1] == ['']: # end of input file break data[2] = None data[3] = 0 + # Loop over characters in the line for c in data[1][0]: # elimination of symbol c left as exercise if data[2] == None: if c.isalnum(): @@ -60,14 +65,12 @@ while True: # We found the end of a word. Process it data[4] = False data[5] = data[1][0][data[2]:data[3]].lower() - #print "Looking at " + data[5] # Ignore words with less than 2 characters and stop words if len(data[5]) >= 2 and data[5] not in data[0]: # Let's see if it already exists while True: data[6] = word_freqs.readline().strip() - #print " Comparing to " + data[6] if data[6] == '': break; data[7] = int(data[6].split(',')[1]) @@ -89,35 +92,36 @@ while True: # Let's reset data[2] = None data[3] += 1 - +# We're done with the input file f.close() word_freqs.flush() +# PART 2 # Now we need to find the 25 most frequently occuring words. # We don't need anything from the previous values in memory del data[:] # Let's use the first 25 entries for the top 25 words data = data + [[]]*(25 - len(data)) -data.append('') # data[25] is word,freq read from word_freqs file, and then word +data.append('') # data[25] is word,freq from file, and then word data.append(0) # data[26] is freq +# Loop over secondary memory file while True: data[25] = word_freqs.readline().strip() - if data[25] == '': - break; + if data[25] == '': # EOF + break data[26] = int(data[25].split(',')[1]) data[25] = data[25].split(',')[0].strip() # word, no white space - + # Check if this word has more counts than the ones in memory for i in range(25): # elimination of symbol i left as exercise if data[i] == [] or data[i][1] < data[26]: data.insert(i, [data[25], data[26]]) del data[26] # delete the last element break - for tf in data[0:25]: # elimination of symbol tf left as exercise if len(tf) == 2: print tf[0], ' - ', tf[1] - +# We're done word_freqs.close()