@@ -21,7 +21,7 @@ def scan(str_data, func):
|
||||
|
||||
def remove_stop_words(word_list, func, stop_words_path='../stop_words.txt'):
    """Filter stop words out of word_list and pass the result on to func.

    The stop-word file is expected to hold comma-separated words. Every
    single lowercase ASCII letter is treated as a stop word as well.

    Args:
        word_list: iterable of words to filter.
        func: callable invoked as ``func(filtered_words, sort)``.
        stop_words_path: location of the stop-word file; defaults to the
            path the function has always used.

    NOTE(review): ``sort`` is not defined in this block; it is presumably
    the next stage defined elsewhere in this module -- confirm.
    """
    with open(stop_words_path) as f:
        # Read the file exactly once: a second read() on the same handle
        # returns '' and would wipe out the stop list. Stripping the
        # trailing newline keeps the last entry from becoming 'word\n'.
        stop_words = set(f.read().strip('\n').split(','))
    # add single-letter words
    stop_words.update(string.ascii_lowercase)
    func([w for w in word_list if w not in stop_words], sort)
|
||||
|
||||
@@ -35,7 +35,7 @@ def scan(str_data):
|
||||
|
||||
def remove_stop_words(word_list, stop_words_path='../stop_words.txt'):
    """Return the words of word_list that are not stop words.

    The stop-word file is expected to hold comma-separated words. Every
    single lowercase ASCII letter is treated as a stop word as well.

    Args:
        word_list: iterable of words to filter.
        stop_words_path: location of the stop-word file; defaults to the
            path the function has always used.

    Returns:
        A new list with the surviving words in their original order.
    """
    with open(stop_words_path) as f:
        # Read the file exactly once: a second read() on the same handle
        # returns '' and would wipe out the stop list. Stripping the
        # trailing newline keeps the last entry from becoming 'word\n'.
        stop_words = set(f.read().strip('\n').split(','))
    # add single-letter words
    stop_words.update(string.ascii_lowercase)
    return [w for w in word_list if w not in stop_words]
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
#!/usr/bin/env python
|
||||
import sys, operator, string
|
||||
import sys
|
||||
import operator
|
||||
import string
|
||||
|
||||
|
||||
def characters(filename):
    """Yield the characters of the file at filename one at a time.

    Line terminators are yielded like any other character. The file is
    read line by line, so the whole content is never held in memory.

    Args:
        filename: path of the text file to read.

    Yields:
        Each character of the file, in order.
    """
    # The context manager closes the handle when the generator is exhausted
    # or closed, instead of leaving it open until garbage collection.
    with open(filename) as source:
        for line in source:
            for c in line:
                yield c
|
||||
|
||||
|
||||
def all_words(filename):
|
||||
start_char = True
|
||||
for c in characters(filename):
|
||||
@@ -15,7 +19,8 @@ def all_words(filename):
|
||||
# We found the start of a word
|
||||
word = c.lower()
|
||||
start_char = False
|
||||
else: pass
|
||||
else:
|
||||
pass
|
||||
else:
|
||||
if c.isalnum():
|
||||
word += c.lower()
|
||||
@@ -24,12 +29,15 @@ def all_words(filename):
|
||||
start_char = True
|
||||
yield word
|
||||
|
||||
|
||||
def non_stop_words(filename):
    """Yield each word from all_words(filename) that is not a stop word.

    Stop words are the comma-separated entries of '../stop_words.txt'
    plus every single lowercase ASCII letter.

    Args:
        filename: path passed through to all_words().

    Yields:
        The words that are not in the stop-word set, in the order
        all_words() produces them.
    """
    # Load the list once and close the file right away; the trailing
    # newline is stripped so the last entry is not stored as 'word\n'.
    with open('../stop_words.txt') as f:
        listed = f.read().strip('\n').split(',')
    stopwords = set(listed + list(string.ascii_lowercase))
    for w in all_words(filename):
        if w not in stopwords:
            yield w
|
||||
|
||||
|
||||
def count_and_sort(filename):
|
||||
freqs, i = {}, 1
|
||||
for w in non_stop_words(filename):
|
||||
@@ -38,6 +46,8 @@ def count_and_sort(filename):
|
||||
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
||||
i = i+1
|
||||
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
||||
|
||||
|
||||
#
|
||||
# The main function
|
||||
#
|
||||
@@ -45,4 +55,3 @@ for word_freqs in count_and_sort(sys.argv[1]):
|
||||
print("-----------------------------")
|
||||
for (w, c) in word_freqs[0:25]:
|
||||
print(w, '-', c)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user