#!/usr/bin/env python
"""Term-frequency counter written in a Forth-like, stack-machine style.

Every "word" (function) of the program takes its operands from the shared
data ``stack`` and pushes its result back onto it; named intermediate
values live on the ``heap``.  Run as a script with the path of a text file
as the only argument; it prints up to 25 of the most frequent non-stop
words.  The stop-word list is read from ``../stop_words.txt`` relative to
the current working directory.
"""
import sys
import re
import operator
import string

#
# The all-important data stack
#
stack = []

#
# The heap. Let's make it an associative array
# mapping names to data (i.e. variables)
#
heap = {}


#
# The new "words" of our program
#
def read_file():
    """
    Pop a file path off the stack and push the entire contents of that
    file, wrapped in a one-element list: ``[contents]``.

    Raises whatever ``open`` raises if the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(stack.pop()) as f:
        # Push the result onto the stack
        stack.append([f.read()])


def filter_chars():
    """
    Pop ``[text]`` off the stack and push ``[filtered]``, where every run
    of non-alphanumeric characters (underscore included) has been replaced
    by a single space and the text has been lower-cased.
    """
    # This is not in style. RE is too high-level, but using it
    # for doing this fast and short. Push the pattern onto stack.
    # Raw string: '\W' is a regex class, not a string escape.
    stack.append(re.compile(r'[\W_]+'))
    # Push the result onto the stack. The first pop() yields the pattern,
    # the second one the [text] operand underneath it.
    stack.append([stack.pop().sub(' ', stack.pop()[0]).lower()])


def scan():
    """
    Pop ``[text]`` off the stack and push the list of
    whitespace-separated words found in it.
    """
    # Push the result onto the stack.
    # Again, split() is too high-level for this style, but using it
    # for doing this fast and short. Left as exercise.
    stack.append(stack.pop()[0].split())


def remove_stop_words():
    """
    Pop a list of words off the stack and push a copy with all stop
    words removed.

    Stop words are the comma-separated entries of ``../stop_words.txt``
    plus every single lowercase ASCII letter.  They are kept on the heap
    under ``'stop_words'`` as a set.
    """
    with open('../stop_words.txt') as f:
        stack.append(f.read().split(','))
    # Add single-letter words. The stop-word list is on top of the stack,
    # so address it relative to the top rather than by absolute position.
    stack[-1].extend(list(string.ascii_lowercase))
    # A set makes the membership test below constant-time per word.
    heap['stop_words'] = set(stack.pop())
    # Again, this is too high-level for this style, but using it
    # for doing this fast and short. Left as exercise.
    stack.append([w for w in stack.pop() if w not in heap['stop_words']])


def frequencies():
    """
    Pop a list of words off the stack and push a dictionary associating
    each word with its number of occurrences.

    The word list and the resulting dictionary are also left on the heap
    under ``'word_list'`` and ``'word_freqs'``.
    """
    heap['word_list'] = stack.pop()
    heap['word_freqs'] = {}
    # A little flavour of the real Forth style here...
    stack.append(0)  # Word counter; on top of the stack between iterations
    while stack[-1] != len(heap['word_list']):
        # Push the current word (counter is now just below it)
        stack.append(heap['word_list'][stack[-1]])
        # ... but the following line is not in style, because the naive
        # implementation would be too slow, or we'd need to implement
        # faster, hash-based search
        if stack[-1] in heap['word_freqs']:
            # Push the word's current frequency, then add 1 to it
            stack.append(heap['word_freqs'][stack[-1]])
            stack.append(1)
            stack.append(stack.pop() + stack.pop())
        else:
            stack.append(1)  # First occurrence: frequency is 1
        # Load the updated freq back onto the heap. Python evaluates the
        # right-hand side first, so the frequency is popped before the word.
        heap['word_freqs'][stack.pop()] = stack.pop()
        # Increment the counter
        stack.append(1)
        stack.append(stack.pop() + stack.pop())  # Add the operands on the stack
    # Done with iteration. Pop the counter
    stack.pop()
    # Push the result onto the stack
    stack.append(heap['word_freqs'])


def sort():
    """
    Pop a dictionary of words and their frequencies off the stack and
    push a list of ``(word, frequency)`` pairs sorted by decreasing
    frequency.
    """
    # Not in style, left as exercise.
    # items() (rather than iteritems()) works on both Python 2 and 3.
    stack.append(sorted(stack.pop().items(),
                        key=operator.itemgetter(1), reverse=True))


#
# The main function
#
def main():
    """
    Run the whole pipeline on the file named by ``sys.argv[1]`` and print
    up to 25 of the most frequent words as ``word  -  count`` lines.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <path-to-text-file>' % sys.argv[0])

    stack.append(sys.argv[1])
    read_file()
    filter_chars()
    scan()
    remove_stop_words()
    frequencies()
    sort()

    word_freqs = stack.pop()
    # Slicing copes with inputs that have fewer than 25 distinct words.
    for pair in word_freqs[:25]:
        stack.append(pair)
        # Same text the Python 2 statement
        # "print word, ' - ', count" used to emit.
        print('%s  -  %s' % (stack[-1][0], stack[-1][1]))
        stack.pop()


if __name__ == '__main__':
    main()