@@ -21,7 +21,7 @@ def scan(str_data, func):
|
|||||||
|
|
||||||
def remove_stop_words(word_list, func):
|
def remove_stop_words(word_list, func):
|
||||||
with open('../stop_words.txt') as f:
|
with open('../stop_words.txt') as f:
|
||||||
stop_words = f.read().split(',')
|
stop_words = f.read().strip('\n').split(',')
|
||||||
# add single-letter words
|
# add single-letter words
|
||||||
stop_words.extend(list(string.ascii_lowercase))
|
stop_words.extend(list(string.ascii_lowercase))
|
||||||
func([w for w in word_list if not w in stop_words], sort)
|
func([w for w in word_list if not w in stop_words], sort)
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def scan(str_data):
|
|||||||
|
|
||||||
def remove_stop_words(word_list):
|
def remove_stop_words(word_list):
|
||||||
with open('../stop_words.txt') as f:
|
with open('../stop_words.txt') as f:
|
||||||
stop_words = f.read().split(',')
|
stop_words = f.read().strip('\n').split(',')
|
||||||
# add single-letter words
|
# add single-letter words
|
||||||
stop_words.extend(list(string.ascii_lowercase))
|
stop_words.extend(list(string.ascii_lowercase))
|
||||||
return [w for w in word_list if not w in stop_words]
|
return [w for w in word_list if not w in stop_words]
|
||||||
|
|||||||
@@ -1,11 +1,15 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys, operator, string
|
import sys
|
||||||
|
import operator
|
||||||
|
import string
|
||||||
|
|
||||||
|
|
||||||
def characters(filename):
|
def characters(filename):
|
||||||
for line in open(filename):
|
for line in open(filename):
|
||||||
for c in line:
|
for c in line:
|
||||||
yield c
|
yield c
|
||||||
|
|
||||||
|
|
||||||
def all_words(filename):
|
def all_words(filename):
|
||||||
start_char = True
|
start_char = True
|
||||||
for c in characters(filename):
|
for c in characters(filename):
|
||||||
@@ -15,7 +19,8 @@ def all_words(filename):
|
|||||||
# We found the start of a word
|
# We found the start of a word
|
||||||
word = c.lower()
|
word = c.lower()
|
||||||
start_char = False
|
start_char = False
|
||||||
else: pass
|
else:
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
if c.isalnum():
|
if c.isalnum():
|
||||||
word += c.lower()
|
word += c.lower()
|
||||||
@@ -24,12 +29,15 @@ def all_words(filename):
|
|||||||
start_char = True
|
start_char = True
|
||||||
yield word
|
yield word
|
||||||
|
|
||||||
|
|
||||||
def non_stop_words(filename):
|
def non_stop_words(filename):
|
||||||
stopwords = set(open('../stop_words.txt').read().split(',') + list(string.ascii_lowercase))
|
stopwords = set(open(
|
||||||
|
'../stop_words.txt').read().strip('\n').split(',') + list(string.ascii_lowercase))
|
||||||
for w in all_words(filename):
|
for w in all_words(filename):
|
||||||
if not w in stopwords:
|
if not w in stopwords:
|
||||||
yield w
|
yield w
|
||||||
|
|
||||||
|
|
||||||
def count_and_sort(filename):
|
def count_and_sort(filename):
|
||||||
freqs, i = {}, 1
|
freqs, i = {}, 1
|
||||||
for w in non_stop_words(filename):
|
for w in non_stop_words(filename):
|
||||||
@@ -38,6 +46,8 @@ def count_and_sort(filename):
|
|||||||
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
||||||
i = i+1
|
i = i+1
|
||||||
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
yield sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# The main function
|
# The main function
|
||||||
#
|
#
|
||||||
@@ -45,4 +55,3 @@ for word_freqs in count_and_sort(sys.argv[1]):
|
|||||||
print("-----------------------------")
|
print("-----------------------------")
|
||||||
for (w, c) in word_freqs[0:25]:
|
for (w, c) in word_freqs[0:25]:
|
||||||
print(w, '-', c)
|
print(w, '-', c)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user