From 42019f8f4416997b984f2b32949498d96f07c3c1 Mon Sep 17 00:00:00 2001
From: Crista Lopes
Date: Mon, 30 Sep 2013 07:30:33 -0700
Subject: [PATCH] Added good-old-times style, #1

---
 01-good-old-times/README.md |  16 ++++
 01-good-old-times/tf-01.py  | 120 ++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 01-good-old-times/README.md
 create mode 100644 01-good-old-times/tf-01.py

diff --git a/01-good-old-times/README.md b/01-good-old-times/README.md
new file mode 100644
index 0000000..31b21d8
--- /dev/null
+++ b/01-good-old-times/README.md
@@ -0,0 +1,16 @@
+Style #1
+==============================
+
+Constraints:
+
+- Very small amount of primary memory, typically orders of magnitude
+  smaller than the data that needs to be processed/generated.
+  (The example sets the limit to 1024 bytes)
+
+- No symbols -- i.e. no variable names or labels. All we have is
+  addressable memory.
+
+Possible names:
+
+- Good old times
+- Early 50s style
diff --git a/01-good-old-times/tf-01.py b/01-good-old-times/tf-01.py
new file mode 100644
index 0000000..c800542
--- /dev/null
+++ b/01-good-old-times/tf-01.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+import sys, os
+
+# Utility for handling the intermediate 'secondary memory'
+def touchopen(filename, *args, **kwargs):
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+    open(filename, "a").close() # "touch" file
+    return open(filename, *args, **kwargs)
+
+# The constrained memory, limited to 1024 bytes by the style's constraint
+data = []
+
+# Overall strategy:
+# - read the input file one line at a time
+# - filter the characters, normalize to lower case
+# - identify words, incrementing corresponding counts (in secondary memory)
+
+
+# We're lucky:
+# The stop words are only 556 bytes and the lines are all less than
+# 80 characters, so we can use that knowledge to simplify the problem:
+# we can have the stop words loaded in memory while processing one line
+# of the input at a time.
+# If these two assumptions didn't hold, the algorithm would need to be
+# changed considerably.
+
+# Load the list of stop words
+f = open('../stop_words.txt')
+data = [f.read(1024).split(',')] # data[0] holds the stop words
+f.close()
+
+data.append([]) # data[1] is the line
+data.append(None) # data[2] is the index of the start_char of a word
+data.append(0) # data[3] is an index, i = 0
+data.append(False) # data[4] is a flag indicating whether a word was found
+data.append('') # data[5] is the word
+data.append('') # data[6] is word,NNNN from the word_freqs file, and then just the word
+data.append(0) # data[7] is frequency
+
+word_freqs = touchopen('word_freqs', 'r+')
+f = open(sys.argv[1])
+while True:
+    data[1] = [f.readline()]
+    if data[1] == ['']: # end of input file
+        break
+    data[2] = None
+    data[3] = 0
+    for c in data[1][0]: # elimination of symbol c left as exercise
+        if data[2] is None:
+            if c.isalnum():
+                # We found the start of a word
+                data[2] = data[3]
+        else:
+            if not c.isalnum():
+                # We found the end of a word. Process it
+                data[4] = False
+                data[5] = data[1][0][data[2]:data[3]].lower()
+
+                # Ignore words with less than 2 characters and stop words
+                if len(data[5]) >= 2 and data[5] not in data[0]:
+                    # Let's see if it already exists
+                    while True:
+                        data[6] = word_freqs.readline().strip()
+                        if data[6] == '':
+                            break
+                        data[7] = int(data[6].split(',')[1])
+                        data[6] = data[6].split(',')[0].strip() # word, no white space
+                        if data[5] == data[6]:
+                            data[7] += 1
+                            data[4] = True
+                            break
+
+                    if not data[4]:
+                        word_freqs.seek(0, 1) # a seek is required between reading and writing on 'r+' files
+                        word_freqs.write("%20s,%04d\n" % (data[5], 1))
+                    else:
+                        word_freqs.seek(-26, 1) # each record is exactly 26 bytes wide
+                        word_freqs.write("%20s,%04d\n" % (data[5], data[7]))
+
+                    word_freqs.seek(0, 0)
+
+                # Let's reset
+                data[2] = None
+        data[3] += 1
+
+f.close()
+word_freqs.flush()
+
+# Now we need to find the 25 most frequently occurring words.
+# We don't need anything from the previous values in memory
+del data[:]
+
+# Let's use the first 25 entries for the top 25 words
+data = data + [[]]*(25 - len(data)) # aliased empty slots; safe, they are only replaced, never mutated
+data.append('') # data[25] is word,NNNN read from the word_freqs file, and then just the word
+data.append(0) # data[26] is frequency
+
+while True:
+    data[25] = word_freqs.readline().strip()
+    if data[25] == '':
+        break
+    data[26] = int(data[25].split(',')[1])
+    data[25] = data[25].split(',')[0].strip() # word, no white space
+
+    for i in range(25): # elimination of symbol i left as exercise
+        if data[i] == [] or data[i][1] < data[26]:
+            data.insert(i, [data[25], data[26]])
+            del data[26] # delete the stale word; the entry pushed out of the top 25 is overwritten on the next read
+            break
+
+
+for tf in data[0:25]: # elimination of symbol tf left as exercise
+    if len(tf) == 2:
+        print tf[0], ' - ', tf[1]
+
+word_freqs.close()
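
A note on the secondary-memory technique: the update-in-place works only because every record in word_freqs has a fixed width. "%20s,%04d\n" is 20 characters of (padded) word, a comma, 4 digits, and a newline, exactly 26 bytes, so seek(-26, 1) always lands back on the first byte of the record just read; the scheme silently assumes no word exceeds 20 characters. Below is a minimal, self-contained sketch of just that fixed-width-record trick, separate from the patch: the file name demo_freqs and the sample words are made up for illustration, and the file is opened in binary mode so the relative seek also works on Python 3, whereas the patch itself targets Python 2.

    # Each record is exactly 20 + 1 + 4 + 1 = 26 bytes, so a relative
    # seek of -26 steps back to the start of the record just read.
    RECORD = b"%20s,%04d\n"

    # Seed a scratch file with two records (hypothetical demo data)
    with open("demo_freqs", "wb") as f:
        f.write(RECORD % (b"live", 1))
        f.write(RECORD % (b"forever", 1))

    # Re-open for update; find 'forever' and bump its count in place
    f = open("demo_freqs", "rb+")
    while True:
        line = f.readline()
        if line == b"":
            break
        if line.split(b",")[0].strip() == b"forever":
            count = int(line.split(b",")[1])
            f.seek(-26, 1)  # back to the first byte of this record
            f.write(RECORD % (b"forever", count + 1))
            break
    f.close()

To try the script itself: run it from inside 01-good-old-times/ with a stop_words.txt present in the parent directory, e.g. `python tf-01.py some-input.txt`; it prints the 25 most frequent non-stop words, one "word - count" pair per line.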