diff --git a/01-good-old-times/tf-01.py b/01-good-old-times/tf-01.py index fc27d22..5b913a5 100755 --- a/01-good-old-times/tf-01.py +++ b/01-good-old-times/tf-01.py @@ -25,22 +25,22 @@ data = [] # words, increment/store counts in secondary memory (a file) # (PART 2) find the 25 most frequent words in secondary memory -# Load the list of stop words -f = open('../stop_words.txt') -data = [f.read(1024).split(',')] # data[0] holds the stop words -f.close() - # PART 1: # - read the input file one line at a time # - filter the characters, normalize to lower case # - identify words, increment corresponding counts in file +# Load the list of stop words +f = open('../stop_words.txt') +data = [f.read(1024).split(',')] # data[0] holds the stop words +f.close() + data.append([]) # data[1] is line (max 80 characters) data.append(None) # data[2] is index of the start_char of word data.append(0) # data[3] is index on characters, i = 0 -data.append(False) # data[4] is flag indicating whether word was found +data.append(False) # data[4] is flag indicating if word was found data.append('') # data[5] is the word -data.append('') # data[6] is word,NNNN from word_freqs file, and then just word +data.append('') # data[6] is word,NNNN and then just word data.append(0) # data[7] is frequency # Open the secondary memory @@ -55,7 +55,7 @@ while True: data[2] = None data[3] = 0 # Loop over characters in the line - for c in data[1][0]: # elimination of symbol c left as exercise + for c in data[1][0]: # elimination of symbol c is exercise if data[2] == None: if c.isalnum(): # We found the start of a word @@ -66,7 +66,7 @@ while True: data[4] = False data[5] = data[1][0][data[2]:data[3]].lower() - # Ignore words with less than 2 characters and stop words + # Ignore words with len < 2, and stop words if len(data[5]) >= 2 and data[5] not in data[0]: # Let's see if it already exists while True: @@ -74,14 +74,15 @@ while True: if data[6] == '': break; data[7] = int(data[6].split(',')[1]) - data[6] = data[6].split(',')[0].strip() # word, no white space + # word, no white space + data[6] = data[6].split(',')[0].strip() if data[5] == data[6]: data[7] += 1 data[4] = True break if not data[4]: - word_freqs.seek(0, 1) # Not needed in Unix, needed in Windows + word_freqs.seek(0, 1) # Needed in Windows word_freqs.writelines("%20s,%04d\n" % (data[5], 1)) else: word_freqs.seek(-26, 1) @@ -112,15 +113,15 @@ while True: if data[25] == '': # EOF break data[26] = int(data[25].split(',')[1]) # Read it as integer - data[25] = data[25].split(',')[0].strip() # word, no white space + data[25] = data[25].split(',')[0].strip() # word # Check if this word has more counts than the ones in memory - for i in range(25): # elimination of symbol i left as exercise + for i in range(25): # elimination of symbol i is exercise if data[i] == [] or data[i][1] < data[26]: data.insert(i, [data[25], data[26]]) del data[26] # delete the last element break -for tf in data[0:25]: # elimination of symbol tf left as exercise +for tf in data[0:25]: # elimination of symbol tf is exercise if len(tf) == 2: print tf[0], ' - ', tf[1] # We're done