Added a lot more comments to style 01

This commit is contained in:
Crista Lopes
2013-11-03 10:10:46 -08:00
parent 3af7744d5d
commit 1c729d9d63

View File

@@ -11,45 +11,50 @@ def touchopen(filename, *args, **kwargs):
open(filename, "a").close() # "touch" file open(filename, "a").close() # "touch" file
return open(filename, *args, **kwargs) return open(filename, *args, **kwargs)
# The constrained memory, which consists of only 1024 cells by constraint # The constrained memory should have no more than 1024 cells
data = [] data = []
# Overall strategy:
# - read the input file one line at a time
# - filter the characters, normalize to lower case
# - identify words, incrementing corresponding counts (in secondary memory)
# We're lucky: # We're lucky:
# The stop words are only 556 bytes and the lines are all less than # The stop words are only 556 characters and the lines are all
# 80 characters, so we can use that knowledge to simplify the problem: # less than 80 characters, so we can use that knowledge to
# we can have the stop words loaded in memory while processing one line # simplify the problem: we can have the stop words loaded in
# of the input at a time. # memory while processing one line of the input at a time.
# If these two assumptions didn't hold, the algorithm would need to be # If these two assumptions didn't hold, the algorithm would
# changed considerably. # need to be changed considerably.
# Overall strategy: (PART 1) read the input file, count the
# words, increment/store counts in secondary memory (a file)
# (PART 2) find the 25 most frequent words in secondary memory
# Load the list of stop words # Load the list of stop words
f = open('../stop_words.txt') f = open('../stop_words.txt')
data = [f.read(1024).split(',')] # data[0] holds the stop words data = [f.read(1024).split(',')] # data[0] holds the stop words
f.close() f.close()
data.append([]) # data[1] is the line # PART 1:
data.append(None) # data[2] is the index of the start_char of a word # - read the input file one line at a time
data.append(0) # data[3] is an index, i = 0 # - filter the characters, normalize to lower case
data.append(False) # data[4] is a flag indicating where a word was found # - identify words, increment corresponding counts in file
data.append([]) # data[1] is line (max 80 characters)
data.append(None) # data[2] is index of the start_char of word
data.append(0) # data[3] is index on characters, i = 0
data.append(False) # data[4] is flag indicating whether word was found
data.append('') # data[5] is the word data.append('') # data[5] is the word
data.append('') # data[6] is word,NNNN from the word_freqs file, and then just the word data.append('') # data[6] is word,NNNN from word_freqs file, and then just word
data.append(0) # data[7] is frequency data.append(0) # data[7] is frequency
# Open the secondary memory
word_freqs = touchopen('word_freqs', 'rb+') word_freqs = touchopen('word_freqs', 'rb+')
# Open the input file
f = open(sys.argv[1]) f = open(sys.argv[1])
# Loop over input file's lines
while True: while True:
data[1] = [f.readline()] data[1] = [f.readline()]
#print data[1]
if data[1] == ['']: # end of input file if data[1] == ['']: # end of input file
break break
data[2] = None data[2] = None
data[3] = 0 data[3] = 0
# Loop over characters in the line
for c in data[1][0]: # elimination of symbol c left as exercise for c in data[1][0]: # elimination of symbol c left as exercise
if data[2] == None: if data[2] == None:
if c.isalnum(): if c.isalnum():
@@ -60,14 +65,12 @@ while True:
# We found the end of a word. Process it # We found the end of a word. Process it
data[4] = False data[4] = False
data[5] = data[1][0][data[2]:data[3]].lower() data[5] = data[1][0][data[2]:data[3]].lower()
#print "Looking at " + data[5]
# Ignore words with less than 2 characters and stop words # Ignore words with less than 2 characters and stop words
if len(data[5]) >= 2 and data[5] not in data[0]: if len(data[5]) >= 2 and data[5] not in data[0]:
# Let's see if it already exists # Let's see if it already exists
while True: while True:
data[6] = word_freqs.readline().strip() data[6] = word_freqs.readline().strip()
#print " Comparing to " + data[6]
if data[6] == '': if data[6] == '':
break; break;
data[7] = int(data[6].split(',')[1]) data[7] = int(data[6].split(',')[1])
@@ -89,35 +92,36 @@ while True:
# Let's reset # Let's reset
data[2] = None data[2] = None
data[3] += 1 data[3] += 1
# We're done with the input file
f.close() f.close()
word_freqs.flush() word_freqs.flush()
# PART 2
# Now we need to find the 25 most frequently occurring words. # Now we need to find the 25 most frequently occurring words.
# We don't need anything from the previous values in memory # We don't need anything from the previous values in memory
del data[:] del data[:]
# Let's use the first 25 entries for the top 25 words # Let's use the first 25 entries for the top 25 words
data = data + [[]]*(25 - len(data)) data = data + [[]]*(25 - len(data))
data.append('') # data[25] is word,freq read from word_freqs file, and then word data.append('') # data[25] is word,freq from file, and then word
data.append(0) # data[26] is freq data.append(0) # data[26] is freq
# Loop over secondary memory file
while True: while True:
data[25] = word_freqs.readline().strip() data[25] = word_freqs.readline().strip()
if data[25] == '': if data[25] == '': # EOF
break; break
data[26] = int(data[25].split(',')[1]) data[26] = int(data[25].split(',')[1])
data[25] = data[25].split(',')[0].strip() # word, no white space data[25] = data[25].split(',')[0].strip() # word, no white space
# Check if this word has more counts than the ones in memory
for i in range(25): # elimination of symbol i left as exercise for i in range(25): # elimination of symbol i left as exercise
if data[i] == [] or data[i][1] < data[26]: if data[i] == [] or data[i][1] < data[26]:
data.insert(i, [data[25], data[26]]) data.insert(i, [data[25], data[26]])
del data[26] # delete the last element del data[26] # delete the last element
break break
for tf in data[0:25]: # elimination of symbol tf left as exercise for tf in data[0:25]: # elimination of symbol tf left as exercise
if len(tf) == 2: if len(tf) == 2:
print tf[0], ' - ', tf[1] print tf[0], ' - ', tf[1]
# We're done
word_freqs.close() word_freqs.close()