Changed the name of this style too
This commit is contained in:
8
05-pipeline/Makefile
Normal file
8
05-pipeline/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
# -std=gnu++0x: the C++ source uses C++11 features (range-for, auto).
CPP_FLAGS=-std=gnu++0x

# NOTE(review): every source checked in alongside this Makefile is
# tf-05.*, but the target was named tf-06 — renamed to match the files.
tf-05: tf-05.cpp
	$(CXX) $(CPP_FLAGS) -o $@ $<

.PHONY: clean
clean:
	rm -f tf-05
|
||||
13
05-pipeline/README.md
Normal file
13
05-pipeline/README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
Style #5
|
||||
==============================
|
||||
|
||||
Constraints:
|
||||
|
||||
- Larger problem decomposed into functional abstractions. Functions, according to mathematics, are relations from inputs to outputs.
|
||||
- Larger problem solved as a pipeline of function applications
|
||||
|
||||
Possible names:
|
||||
|
||||
- Candy factory
|
||||
- Functional
|
||||
- Pipeline
|
||||
46
05-pipeline/tf-05.clj
Executable file
46
05-pipeline/tf-05.clj
Executable file
@@ -0,0 +1,46 @@
|
||||
":";exec java -cp "$HOME/.m2/repository/org/clojure/clojure/1.5.1/clojure-1.5.1.jar" clojure.main $0 $*
|
||||
|
||||
; Sort of a hack; Clojure isn't really intended as a scripting language. :-/
|
||||
|
||||
(require '[clojure.string :refer [split]]
|
||||
'[clojure.java.io :refer [reader]]
|
||||
'[clojure.pprint :refer [pprint]])
|
||||
|
||||
(defn stopwords
  "Reads a set of comma-separated stopwords from the given filename."
  [file]
  ;; \s* (not \s+) around the comma: a plain "a,b,c" stop-words file
  ;; has no whitespace next to the separators, and requiring \s+ would
  ;; leave the whole file as one unsplit "stopword".
  (-> file
      slurp
      (split #"\s*,\s*")
      set))
|
||||
|
||||
(defn words
  "Splits a string into a sequence of words (runs of ASCII letters)."
  [string]
  ;; split is already :refer'ed from clojure.string at the top of the file.
  (split string #"[^a-zA-Z]+"))
|
||||
|
||||
(defn normalize
  "Normalizes a split word to a term by lower-casing it."
  [word]
  (clojure.string/lower-case word))
|
||||
|
||||
(defn too-short?
  "Is a word too short (fewer than three characters) for consideration?"
  [word]
  (< (count word) 3))
|
||||
|
||||
; Lazily split the file into lines, explode lines into words, normalize into
; terms, reject unsuitable candidates, compute frequencies, and take the top
; 25. Note the stopword set returned by `stopwords` doubles as the
; membership predicate handed to `remove`.
(with-open [f (reader "../pride-and-prejudice.txt")]
  (->> f
       line-seq
       (mapcat words)
       (map normalize)
       (remove too-short?)
       (remove (stopwords "../stop_words.txt"))
       frequencies
       (sort-by second)
       reverse
       (take 25)
       pprint))
|
||||
180
05-pipeline/tf-05.cpp
Normal file
180
05-pipeline/tf-05.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// A word paired with the number of times it occurred in the text.
struct Freq {
    string word;
    int freq;
    Freq(string w, int f) : word(w), freq(f) {}
};
|
||||
|
||||
|
||||
//
|
||||
// Helpers
|
||||
//
|
||||
|
||||
// Maps any non-alphabetic character to a space; letters pass through.
static int tospace(int c)
{
    return isalpha(c) ? c : ' ';
}
|
||||
|
||||
/** Loads the comma-separated stop words from ../stop_words.txt,
    adds every single-letter word, and returns the list sorted so
    callers can search it with binary_search.
*/
static vector<string> get_stop_words()
{
    string word;
    vector<string> stop_words;
    ifstream is("../stop_words.txt");

    while (getline(is, word, ',')) {
        stop_words.push_back(word);
    }

    // Add every single letter a-z. The previous version spelled the
    // alphabet with 'n' missing, and by ranging over a string literal
    // it also visited the terminating '\0', pushing a stray empty
    // string into the list.
    const string alphabet = "abcdefghijklmnopqrstuvwxyz";
    for (char c : alphabet) {
        stop_words.push_back(string(1, c));
    }

    sort(stop_words.begin(), stop_words.end());

    return stop_words;
}
|
||||
|
||||
// Comparator for std::sort: orders Freq entries highest frequency first.
static bool sort_by_freq(Freq x, Freq y)
{
    return x.freq > y.freq;
}
|
||||
|
||||
//
|
||||
// The functions
|
||||
//
|
||||
|
||||
/** Takes a path to a file and returns the entire
    contents of the file as a string; returns an empty
    string if the file cannot be opened
*/
string read_file(const char* path_to_file)
{
    string data;
    ifstream is(path_to_file, ifstream::binary);

    if (is) {
        // Stream the file straight into string storage. The previous
        // version hand-managed a new[]/delete[] buffer (leaked if the
        // string assignment threw), truncated the length through int,
        // and the char* assignment stopped at the first embedded NUL.
        ostringstream contents;
        contents << is.rdbuf();
        data = contents.str();
    }
    return data;
}
|
||||
|
||||
/** Takes a string and returns a copy with all nonalphanumeric
|
||||
chars replaced by white space
|
||||
*/
|
||||
string filter_chars(string str_data)
|
||||
{
|
||||
std::transform(str_data.begin(), str_data.end(), str_data.begin(), ::tospace);
|
||||
return str_data;
|
||||
}
|
||||
|
||||
|
||||
/** Takes a string and returns a copy with all chars in lower case
*/
string normalize(string str_data)
{
    for (char& c : str_data)
        c = tolower(c);
    return str_data;
}
|
||||
|
||||
|
||||
/** Takes a string and scans for words, returning
    a list of words.
*/
vector<string> scan(string str_data)
{
    vector<string> words;
    istringstream tokens(str_data);

    for (string word; tokens >> word; )
        words.push_back(word);

    return words;
}
|
||||
|
||||
/** Takes a list of words and returns a copy with all stop
|
||||
words removed
|
||||
*/
|
||||
vector<string> remove_stop_words(vector<string> words)
|
||||
{
|
||||
vector<string> stop_words = get_stop_words();
|
||||
vector<string> filtered_list;
|
||||
|
||||
filtered_list.reserve(words.size());
|
||||
for (string w : words) {
|
||||
if (!binary_search(stop_words.begin(), stop_words.end(), w))
|
||||
filtered_list.push_back(w);
|
||||
}
|
||||
|
||||
return filtered_list;
|
||||
}
|
||||
|
||||
/** Takes a list of words and returns a dictionary associating
    words with frequencies of occurrence
*/
map<string,int> frequencies(vector<string> words)
{
    map<string,int> freq;

    // operator[] value-initializes absent entries to 0, so a single
    // increment covers both the first and subsequent occurrences.
    for (const string& w : words)
        ++freq[w];

    return freq;
}
|
||||
|
||||
/** Takes a dictionary of words and their frequencies
|
||||
and returns a list of pairs where the entries are
|
||||
sorted by frequency
|
||||
*/
|
||||
vector<Freq> sort(map<string,int> word_freq)
|
||||
{
|
||||
vector<Freq> out_list;
|
||||
|
||||
out_list.reserve(word_freq.size());
|
||||
for (pair<string,int> p : word_freq) {
|
||||
out_list.push_back(Freq(p.first, p.second));
|
||||
}
|
||||
sort(out_list.begin(), out_list.end(), sort_by_freq);
|
||||
return out_list;
|
||||
}
|
||||
|
||||
//
|
||||
// The main function
|
||||
//
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
vector<Freq> word_freqs = sort(frequencies(remove_stop_words(scan(normalize(filter_chars(read_file(argv[1])))))));
|
||||
|
||||
for (vector<Freq>::iterator it = word_freqs.begin(); it != word_freqs.begin()+25; it++)
|
||||
cout << it->word << " - " << it->freq << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
75
05-pipeline/tf-05.py
Executable file
75
05-pipeline/tf-05.py
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python
|
||||
import sys, re, operator, string
|
||||
|
||||
#
|
||||
# The functions
|
||||
#
|
||||
def read_file(path_to_file):
    """
    Takes a path to a file and returns the entire
    contents of the file as a string
    """
    with open(path_to_file) as f:
        return f.read()
|
||||
|
||||
def filter_chars_and_normalize(str_data):
    """
    Takes a string and returns a lower-cased copy with every run of
    non-alphanumeric chars replaced by a single space
    """
    return re.sub(r'[\W_]+', ' ', str_data).lower()
|
||||
|
||||
def scan(str_data):
    """
    Takes a string and scans for words, returning
    a list of words.
    """
    word_list = str_data.split()
    return word_list
|
||||
|
||||
def remove_stop_words(word_list):
    """
    Takes a list of words and returns a copy with all stop
    words removed
    """
    with open('../stop_words.txt') as f:
        stop_words = f.read().split(',')
    # add single-letter words
    stop_words.extend(list(string.ascii_lowercase))
    # A set makes each membership test O(1); the original scanned the
    # whole stop-word list once per input word.
    stop_words = set(stop_words)
    return [w for w in word_list if w not in stop_words]
|
||||
|
||||
def frequencies(word_list):
    """
    Takes a list of words and returns a dictionary associating
    words with frequencies of occurrence
    """
    word_freqs = {}
    for w in word_list:
        # get() supplies 0 for first-time words, covering both
        # branches of an explicit if/else in one line.
        word_freqs[w] = word_freqs.get(w, 0) + 1
    return word_freqs
|
||||
|
||||
def sort(word_freq):
    """
    Takes a dictionary of words and their frequencies
    and returns a list of pairs where the entries are
    sorted by frequency, highest first
    """
    # items() exists on both Python 2 and 3 with identical results
    # here; iteritems() is Python-2-only and crashes under Python 3.
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
|
||||
|
||||
def print_all(word_freqs):
|
||||
"""
|
||||
Takes a list of pairs where the entries are sorted by frequency and print them recursively.
|
||||
"""
|
||||
if(len(word_freqs) > 0):
|
||||
print word_freqs[0][0], ' - ', word_freqs[0][1]
|
||||
print_all(word_freqs[1:]);
|
||||
|
||||
#
# The main function
#
# Read the file named by the first CLI argument, push it through the
# pipeline of functions above, and print the 25 most frequent terms.
print_all(sort(frequencies(remove_stop_words(scan(filter_chars_and_normalize(read_file(sys.argv[1]))))))[0:25])
|
||||
|
||||
Reference in New Issue
Block a user