More cleanup of comments/formatting for style 01
This commit is contained in:
@@ -25,22 +25,22 @@ data = []
|
|||||||
# words, increment/store counts in secondary memory (a file)
|
# words, increment/store counts in secondary memory (a file)
|
||||||
# (PART 2) find the 25 most frequent words in secondary memory
|
# (PART 2) find the 25 most frequent words in secondary memory
|
||||||
|
|
||||||
# Load the list of stop words
|
|
||||||
f = open('../stop_words.txt')
|
|
||||||
data = [f.read(1024).split(',')] # data[0] holds the stop words
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
# PART 1:
|
# PART 1:
|
||||||
# - read the input file one line at a time
|
# - read the input file one line at a time
|
||||||
# - filter the characters, normalize to lower case
|
# - filter the characters, normalize to lower case
|
||||||
# - identify words, increment corresponding counts in file
|
# - identify words, increment corresponding counts in file
|
||||||
|
|
||||||
|
# Load the list of stop words
|
||||||
|
f = open('../stop_words.txt')
|
||||||
|
data = [f.read(1024).split(',')] # data[0] holds the stop words
|
||||||
|
f.close()
|
||||||
|
|
||||||
data.append([]) # data[1] is line (max 80 characters)
|
data.append([]) # data[1] is line (max 80 characters)
|
||||||
data.append(None) # data[2] is index of the start_char of word
|
data.append(None) # data[2] is index of the start_char of word
|
||||||
data.append(0) # data[3] is index on characters, i = 0
|
data.append(0) # data[3] is index on characters, i = 0
|
||||||
data.append(False) # data[4] is flag indicating whether word was found
|
data.append(False) # data[4] is flag indicating if word was found
|
||||||
data.append('') # data[5] is the word
|
data.append('') # data[5] is the word
|
||||||
data.append('') # data[6] is word,NNNN from word_freqs file, and then just word
|
data.append('') # data[6] is word,NNNN and then just word
|
||||||
data.append(0) # data[7] is frequency
|
data.append(0) # data[7] is frequency
|
||||||
|
|
||||||
# Open the secondary memory
|
# Open the secondary memory
|
||||||
@@ -55,7 +55,7 @@ while True:
|
|||||||
data[2] = None
|
data[2] = None
|
||||||
data[3] = 0
|
data[3] = 0
|
||||||
# Loop over characters in the line
|
# Loop over characters in the line
|
||||||
for c in data[1][0]: # elimination of symbol c left as exercise
|
for c in data[1][0]: # elimination of symbol c is exercise
|
||||||
if data[2] == None:
|
if data[2] == None:
|
||||||
if c.isalnum():
|
if c.isalnum():
|
||||||
# We found the start of a word
|
# We found the start of a word
|
||||||
@@ -66,7 +66,7 @@ while True:
|
|||||||
data[4] = False
|
data[4] = False
|
||||||
data[5] = data[1][0][data[2]:data[3]].lower()
|
data[5] = data[1][0][data[2]:data[3]].lower()
|
||||||
|
|
||||||
# Ignore words with less than 2 characters and stop words
|
# Ignore words with len < 2, and stop words
|
||||||
if len(data[5]) >= 2 and data[5] not in data[0]:
|
if len(data[5]) >= 2 and data[5] not in data[0]:
|
||||||
# Let's see if it already exists
|
# Let's see if it already exists
|
||||||
while True:
|
while True:
|
||||||
@@ -74,14 +74,15 @@ while True:
|
|||||||
if data[6] == '':
|
if data[6] == '':
|
||||||
break;
|
break;
|
||||||
data[7] = int(data[6].split(',')[1])
|
data[7] = int(data[6].split(',')[1])
|
||||||
data[6] = data[6].split(',')[0].strip() # word, no white space
|
# word, no white space
|
||||||
|
data[6] = data[6].split(',')[0].strip()
|
||||||
if data[5] == data[6]:
|
if data[5] == data[6]:
|
||||||
data[7] += 1
|
data[7] += 1
|
||||||
data[4] = True
|
data[4] = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not data[4]:
|
if not data[4]:
|
||||||
word_freqs.seek(0, 1) # Not needed in Unix, needed in Windows
|
word_freqs.seek(0, 1) # Needed in Windows
|
||||||
word_freqs.writelines("%20s,%04d\n" % (data[5], 1))
|
word_freqs.writelines("%20s,%04d\n" % (data[5], 1))
|
||||||
else:
|
else:
|
||||||
word_freqs.seek(-26, 1)
|
word_freqs.seek(-26, 1)
|
||||||
@@ -112,15 +113,15 @@ while True:
|
|||||||
if data[25] == '': # EOF
|
if data[25] == '': # EOF
|
||||||
break
|
break
|
||||||
data[26] = int(data[25].split(',')[1]) # Read it as integer
|
data[26] = int(data[25].split(',')[1]) # Read it as integer
|
||||||
data[25] = data[25].split(',')[0].strip() # word, no white space
|
data[25] = data[25].split(',')[0].strip() # word
|
||||||
# Check if this word has more counts than the ones in memory
|
# Check if this word has more counts than the ones in memory
|
||||||
for i in range(25): # elimination of symbol i left as exercise
|
for i in range(25): # elimination of symbol i is exercise
|
||||||
if data[i] == [] or data[i][1] < data[26]:
|
if data[i] == [] or data[i][1] < data[26]:
|
||||||
data.insert(i, [data[25], data[26]])
|
data.insert(i, [data[25], data[26]])
|
||||||
del data[26] # delete the last element
|
del data[26] # delete the last element
|
||||||
break
|
break
|
||||||
|
|
||||||
for tf in data[0:25]: # elimination of symbol tf left as exercise
|
for tf in data[0:25]: # elimination of symbol tf is exercise
|
||||||
if len(tf) == 2:
|
if len(tf) == 2:
|
||||||
print tf[0], ' - ', tf[1]
|
print tf[0], ' - ', tf[1]
|
||||||
# We're done
|
# We're done
|
||||||
|
|||||||
Reference in New Issue
Block a user