Format to fit in 1 page

This commit is contained in:
Crista Lopes
2019-08-12 20:15:33 -07:00
parent cc86a2cebc
commit d202fea352

View File

@@ -1,11 +1,10 @@
import sys, string
import numpy as np
# Get an array of characters from the file, make sure it starts and ends with a space
# Example input: "Hello  World! "
characters = np.array([' '] + list(open(sys.argv[1]).read()) + [' '])
# Example input: "Hello  World!!"
characters = np.array([' ']+list(open(sys.argv[1]).read())+[' '])
# Result: array([' ', 'H', 'e', 'l', 'l', 'o', ' ', ' ',
# 'W', 'o', 'r', 'l', 'd', '!', ' ', ' '], dtype='<U1')
# 'W', 'o', 'r', 'l', 'd', '!', '!', ' '], dtype='<U1')
# Normalize
characters[~np.char.isalpha(characters)] = ' '
@@ -29,12 +28,12 @@ w_ranges = np.reshape(sp2[1:-1], (-1, 2))
# [13, 14]], dtype=int64)
# Voila! Words are in between spaces, given as pairs of indices
# But remember to skip contiguous spaces (the conditional at the end)
# But skip contiguous spaces (the conditional at the end)
words = [characters[w_ranges[i][0] : w_ranges[i][1]] for i in range(len(w_ranges)) if w_ranges[i][1]-w_ranges[i][0] > 1]
# Result: [array([' ', 'h', 'e', 'l', 'l', 'o'], dtype='<U1'),
# array([' ', 'w', 'o', 'r', 'l', 'd'], dtype='<U1')]
# But this is too much! Let's recode the characters as strings
# Let's recode the characters as strings
swords = np.array([''.join(row).strip() for row in words])
# Result: array(['hello', 'world'], dtype='<U5')
@@ -42,11 +41,11 @@ swords = np.array([''.join(row).strip() for row in words])
stop_words = open('../stop_words.txt').read().split(',')
stop_words.extend(list(string.ascii_lowercase))
stop_words = np.array(list(set(stop_words)))
non_stop_words = swords[~np.isin(swords, stop_words)]
ns_words = swords[~np.isin(swords, stop_words)]
### Finally, count the word occurrences
uniq, counts = np.unique(non_stop_words, axis=0, return_counts=True)
wf_sorted = sorted(zip(uniq, counts), key = lambda t: t[1], reverse=True)
uniq, counts = np.unique(ns_words, axis=0, return_counts=True)
wf_sorted = sorted(zip(uniq, counts), key=lambda t: t[1], reverse=True)
# Print the 25 most frequent words, one "word - count" pair per line.
# wf_sorted is already ordered by descending count, so a slice suffices.
for w, c in wf_sorted[:25]:
    print(w, '-', c)