From 1c729d9d638e99e6d28d89964a7fc1abdc5b626c Mon Sep 17 00:00:00 2001
From: Crista Lopes <crista@tagide.com>
Date: Sun, 3 Nov 2013 10:10:46 -0800
Subject: [PATCH] Added a lot more comments to style 01

---
 01-good-old-times/tf-01.py | 62 ++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/01-good-old-times/tf-01.py b/01-good-old-times/tf-01.py
index b6fc0d5..271dc71 100755
--- a/01-good-old-times/tf-01.py
+++ b/01-good-old-times/tf-01.py
@@ -11,45 +11,50 @@ def touchopen(filename, *args, **kwargs):
     open(filename, "a").close() # "touch" file
     return open(filename, *args, **kwargs)
 
-# The constrained memory, which consists of only 1024 cells by constraint
+# The constrained memory should have no more than 1024 cells
 data = []
-
-# Overall strategy: 
-# - read the input file one line at a time
-# - filter the characters, normalize to lower case
-# - identify words, incrementing corresponding counts (in secondary memory)
-
-
 # We're lucky:
-# The stop words are only 556 bytes and the lines are all less than
-# 80 characters, so we can use that knowledge to simplify the problem:
-# we can have the stop words loaded in memory while processing one line
-# of the input at a time.
-# If these two assumptions didn't hold, the algorithm would need to be
-# changed considerably.
+# The stop words are only 556 characters and the lines are all 
+# less than 80 characters, so we can use that knowledge to 
+# simplify the problem: we can have the stop words loaded in 
+# memory while processing one line of the input at a time.
+# If these two assumptions didn't hold, the algorithm would 
+# need to be changed considerably.
+
+# Overall strategy: (PART 1) read the input file, count the 
+# words, increment/store counts in secondary memory (a file) 
+# (PART 2) find the 25 most frequent words in secondary memory
 
 # Load the list of stop words
 f = open('../stop_words.txt')
 data = [f.read(1024).split(',')] # data[0] holds the stop words
 f.close()
 
-data.append([])    # data[1] is the line
-data.append(None)  # data[2] is the index of the start_char of a word
-data.append(0)     # data[3] is an index, i = 0
-data.append(False) # data[4] is a flag indicating where a word was found
+# PART 1: 
+# - read the input file one line at a time
+# - filter the characters, normalize to lower case
+# - identify words, increment corresponding counts in file
+
+data.append([])    # data[1] is line (max 80 characters)
+data.append(None)  # data[2] is index of the start_char of word
+data.append(0)     # data[3] is index on characters, i = 0
+data.append(False) # data[4] is flag indicating whether word was found
 data.append('')    # data[5] is the word
-data.append('')    # data[6] is word,NNNN from the word_freqs file, and then just the word
+data.append('')    # data[6] is word,NNNN from word_freqs file, and then just word
 data.append(0)     # data[7] is frequency
 
+# Open the secondary memory
 word_freqs = touchopen('word_freqs', 'rb+')
+# Open the input file
 f = open(sys.argv[1])
+# Loop over input file's lines
 while True:
     data[1] = [f.readline()] 
-    #print data[1]
     if data[1] == ['']: # end of input file
         break
     data[2] = None
     data[3] = 0 
+    # Loop over characters in the line
     for c in data[1][0]: # elimination of symbol c left as exercise
         if data[2] == None:
             if c.isalnum():
@@ -60,14 +65,12 @@ while True:
                 # We found the end of a word. Process it
                 data[4] = False 
                 data[5] = data[1][0][data[2]:data[3]].lower()
-                #print "Looking at " + data[5]
 
                 # Ignore words with less than 2 characters and stop words
                 if len(data[5]) >= 2 and data[5] not in data[0]:
                     # Let's see if it already exists
                     while True:
                         data[6] = word_freqs.readline().strip()
-                        #print "  Comparing to " + data[6]
                         if data[6] == '':
                             break;
                         data[7] = int(data[6].split(',')[1])
@@ -89,35 +92,36 @@ while True:
                 # Let's reset
                 data[2] = None
         data[3] += 1
-
+# We're done with the input file
 f.close()
 word_freqs.flush()
 
+# PART 2
 # Now we need to find the 25 most frequently occuring words.
 # We don't need anything from the previous values in memory
 del data[:]
 
 # Let's use the first 25 entries for the top 25 words
 data = data + [[]]*(25 - len(data))
-data.append('') # data[25] is word,freq read from word_freqs file, and then word
+data.append('') # data[25] is word,freq from file, and then word
 data.append(0)  # data[26] is freq
 
+# Loop over secondary memory file
 while True:
     data[25] = word_freqs.readline().strip()
-    if data[25] == '':
-        break;
+    if data[25] == '': # EOF
+        break
     data[26] = int(data[25].split(',')[1])
     data[25] = data[25].split(',')[0].strip() # word, no white space
-
+    # Check if this word has more counts than the ones in memory
     for i in range(25): # elimination of symbol i left as exercise
         if data[i] == [] or data[i][1] < data[26]:
             data.insert(i, [data[25], data[26]]) 
             del data[26] #  delete the last element
             break
             
-
 for tf in data[0:25]: # elimination of symbol tf left as exercise
     if len(tf) == 2:
         print tf[0], ' - ', tf[1]
-
+# We're done
 word_freqs.close()