More clean up of comments / formatting for style 01

2013-11-04 09:15:29 -08:00
parent 8f32eba166
commit 912864de95
1 changed files with 15 additions and 14 deletions
--- a/01-good-old-times/tf-01.py
+++ b/01-good-old-times/tf-01.py
@@ -25,22 +25,22 @@ data = []
 # words, increment/store counts in secondary memory (a file) 
 # (PART 2) find the 25 most frequent words in secondary memory

-# Load the list of stop words
-f = open('../stop_words.txt')
-data = [f.read(1024).split(',')] # data[0] holds the stop words
-f.close()
-
 # PART 1: 
 # - read the input file one line at a time
 # - filter the characters, normalize to lower case
 # - identify words, increment corresponding counts in file

+# Load the list of stop words
+f = open('../stop_words.txt')
+data = [f.read(1024).split(',')] # data[0] holds the stop words
+f.close()
+
 data.append([])    # data[1] is line (max 80 characters)
 data.append(None)  # data[2] is index of the start_char of word
 data.append(0)     # data[3] is index on characters, i = 0
-data.append(False) # data[4] is flag indicating whether word was found
+data.append(False) # data[4] is flag indicating if word was found
 data.append('')    # data[5] is the word
-data.append('')    # data[6] is word,NNNN from word_freqs file, and then just word
+data.append('')    # data[6] is 'word,NNNN' line, then just the word
 data.append(0)     # data[7] is frequency

 # Open the secondary memory
@@ -55,7 +55,7 @@ while True:
    data[2] = None
    data[3] = 0 
    # Loop over characters in the line
-    for c in data[1][0]: # elimination of symbol c left as exercise
+    for c in data[1][0]: # elimination of symbol c is an exercise
        if data[2] == None:
            if c.isalnum():
                # We found the start of a word
@@ -66,7 +66,7 @@ while True:
                data[4] = False 
                data[5] = data[1][0][data[2]:data[3]].lower()

-                # Ignore words with less than 2 characters and stop words
+                # Ignore words with len < 2, and stop words
                if len(data[5]) >= 2 and data[5] not in data[0]:
                    # Let's see if it already exists
                    while True:
@@ -74,14 +74,15 @@ while True:
                        if data[6] == '':
                            break;
                        data[7] = int(data[6].split(',')[1])
-                        data[6] = data[6].split(',')[0].strip() # word, no white space
+                        # word, no white space
+                        data[6] = data[6].split(',')[0].strip() 
                        if data[5] == data[6]:
                            data[7] += 1
                            data[4] = True
                            break

                    if not data[4]:
-                        word_freqs.seek(0, 1) # Not needed in Unix, needed in Windows
+                        word_freqs.seek(0, 1) # Needed in Windows
                        word_freqs.writelines("%20s,%04d\n" % (data[5], 1))
                    else:
                        word_freqs.seek(-26, 1)
@@ -112,15 +113,15 @@ while True:
    if data[25] == '': # EOF
        break
    data[26] = int(data[25].split(',')[1]) # Read it as integer
-    data[25] = data[25].split(',')[0].strip() # word, no white space
+    data[25] = data[25].split(',')[0].strip() # word
    # Check if this word has more counts than the ones in memory
-    for i in range(25): # elimination of symbol i left as exercise
+    for i in range(25): # elimination of symbol i is an exercise
        if data[i] == [] or data[i][1] < data[26]:
            data.insert(i, [data[25], data[26]]) 
            del data[26] #  delete the last element
            break
            
-for tf in data[0:25]: # elimination of symbol tf left as exercise
+for tf in data[0:25]: # elimination of symbol tf is an exercise
    if len(tf) == 2:
        print tf[0], ' - ', tf[1]
 # We're done