Files
exercises-in-programming-style/14-tabular/tf-14.py
Bruce Adams 5749c2c50f Enhance testing: run all executables
Add a #! line at the beginning of each of the existing Python programs
and change these files to be executable. This sets the stage for having
the test script blindly run anything that is executable, adding support
for testing many programming languages.
2013-09-24 22:09:30 -04:00

92 lines
2.9 KiB
Python
Executable File

#!/usr/bin/env python
import sys, re, string, sqlite3
#
# The relational database of this problem consists of 3 tables:
# documents, words, characters
#
def create_db_schema(connection):
c = connection.cursor()
c.execute('''CREATE TABLE documents (id INTEGER PRIMARY KEY AUTOINCREMENT, name)''')
c.execute('''CREATE TABLE words (id, doc_id, value)''')
c.execute('''CREATE TABLE characters (id, word_id, value)''')
connection.commit()
c.close()
def load_file_into_database(path_to_file, connection):
""" Takes the path to a file and loads the contents into the database """
def _read_file(path_to_file):
"""
Takes a path to a file and returns the entire contents of the
file as a string
"""
f = open(path_to_file)
data = f.read()
f.close()
return data
def _filter_chars_and_normalize(str_data):
"""
Takes a string and returns a copy with all nonalphanumeric chars
replaced by white space, and all characters lower-cased
"""
pattern = re.compile('[\W_]+')
return pattern.sub(' ', str_data).lower()
def _scan(str_data):
""" Takes a string and scans for words, returning a list of words. """
return str_data.split()
def _remove_stop_words(word_list):
f = open('../stop_words.txt')
stop_words = f.read().split(',')
f.close()
# add single-letter words
stop_words.extend(list(string.ascii_lowercase))
return [w for w in word_list if not w in stop_words]
# The actual work of splitting the input into words
words = _remove_stop_words(_scan(_filter_chars_and_normalize(_read_file(path_to_file))))
# Now let's add data to the database
# Add the document itself to the database
c = connection.cursor()
c.execute("INSERT INTO documents (name) VALUES (?)", (path_to_file,))
c.execute("SELECT id from documents WHERE name=?", (path_to_file,))
doc_id = c.fetchone()[0]
# Add the words to the database
c.execute("SELECT MAX(id) FROM words")
row = c.fetchone()
word_id = row[0]
if word_id == None:
word_id = 0
for w in words:
c.execute("INSERT INTO words VALUES (?, ?, ?)", (word_id, doc_id, w))
# Add the characters to the database
char_id = 0
for char in w:
c.execute("INSERT INTO characters VALUES (?, ?, ?)", (char_id, word_id, char))
char_id += 1
word_id += 1
connection.commit()
c.close()
#
# The main function
#
connection = sqlite3.connect(':memory:')
create_db_schema(connection)
load_file_into_database(sys.argv[1], connection)
# Now, let's query
c = connection.cursor()
c.execute("SELECT value, COUNT(*) as C FROM words GROUP BY value ORDER BY C DESC")
for i in range(25):
row = c.fetchone()
if row != None:
print row[0] + ' - ' + str(row[1])
connection.close()