Massive renaming!

This commit is contained in:
Crista Lopes
2019-08-12 14:38:16 -07:00
parent e6c1238a56
commit 61d5f74ad9
90 changed files with 0 additions and 0 deletions

8
06-pipeline/Makefile Normal file
View File

@@ -0,0 +1,8 @@
# Compiler flags: C++0x (C++11) with GNU extensions.
CPP_FLAGS=-std=gnu++0x

# Build the tf-06 executable from its single source file.
tf-06: tf-06.cpp
	$(CXX) $(CPP_FLAGS) -o $@ $<

# clean is a command name, not a file that gets produced.
.PHONY: clean
# -f keeps `make clean` from failing when the executable was never built.
clean:
	rm -f tf-06

13
06-pipeline/README.md Normal file
View File

@@ -0,0 +1,13 @@
Style #5
==============================
Constraints:
- Larger problem decomposed into functional abstractions. Functions, in the mathematical sense, are relations from inputs to outputs.
- Larger problem solved as a pipeline of function applications
Possible names:
- Candy factory
- Functional
- Pipeline

46
06-pipeline/tf-05.clj Executable file
View File

@@ -0,0 +1,46 @@
":";exec java -cp "$HOME/.m2/repository/org/clojure/clojure/1.5.1/clojure-1.5.1.jar" clojure.main $0 $*
; Sort of a hack; Clojure isn't really intended as a scripting language. :-/
(require '[clojure.string :refer [split]]
'[clojure.java.io :refer [reader]]
'[clojure.pprint :refer [pprint]])
(defn stopwords
  "Reads a set of comma-separated stopwords from the given filename.
  Whitespace around the commas is optional, and leading/trailing whitespace
  in the file (such as a final newline) is ignored."
  [file]
  (-> file
      slurp
      clojure.string/trim
      ; \s* rather than \s+ so that a plain \"a,b,c\" list is split as well.
      (clojure.string/split #"\s*,\s*")
      set))
(defn words
  "Returns the sequence of maximal runs of ASCII letters found in string.
  Never yields empty strings; returns nil when the string has no letters."
  [string]
  ; Matching the words directly (instead of splitting on the separators)
  ; avoids the empty string that a leading separator would produce.
  (re-seq #"[a-zA-Z]+" string))
(defn normalize
  "Turns a split word into a term by lower-casing it."
  [word]
  (clojure.string/lower-case word))
(defn too-short?
  "True when the word has fewer than three characters."
  [word]
  (< (count word) 3))
; Lazily split the input file into lines, explode lines into words, normalize
; into terms, reject unsuitable candidates, compute frequencies, and print the
; top 25. The input path is the first command-line argument when one is given;
; otherwise it defaults to ../pride-and-prejudice.txt.
(with-open [f (reader (or (first *command-line-args*)
                          "../pride-and-prejudice.txt"))]
  (->> f
       line-seq
       (mapcat words)
       (map normalize)
       (remove too-short?)
       ; A set used as a predicate: truthy exactly for its members.
       (remove (stopwords "../stop_words.txt"))
       frequencies
       ; Ascending by count, then reversed to get the most frequent first.
       (sort-by second)
       reverse
       (take 25)
       pprint))

180
06-pipeline/tf-05.cpp Normal file
View File

@@ -0,0 +1,180 @@
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <regex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
/** A word paired with the number of times it occurs. */
struct Freq {
    std::string word;  // the term itself
    int freq;          // its occurrence count

    // w is taken by value and moved into the member, so constructing from
    // a temporary does not copy the string.
    Freq(std::string w, int f) : word(std::move(w)), freq(f) {}
};
//
// Helpers
//
/** Returns c unchanged when it is an alphabetic character and a space
    otherwise. Used as the per-character operation of std::transform.
*/
static int tospace(int c)
{
    // The <cctype> classifiers are only defined for values representable
    // as unsigned char (or EOF). A char converted to int may be negative,
    // so narrow to unsigned char before classifying.
    return std::isalpha(static_cast<unsigned char>(c)) ? c : ' ';
}
/** Reads the comma-separated stop words from ../stop_words.txt and adds
    every single lowercase letter a-z. The result is sorted so that it can
    be searched with std::binary_search. If the file cannot be opened, only
    the single letters are returned.
*/
static std::vector<std::string> get_stop_words()
{
    std::vector<std::string> stop_words;
    std::ifstream is("../stop_words.txt");
    std::string word;
    while (std::getline(is, word, ',')) {
        // Trim surrounding whitespace (for example a newline at the end of
        // the file) and skip entries that are empty after trimming.
        const std::string::size_type first = word.find_first_not_of(" \t\r\n");
        if (first == std::string::npos)
            continue;
        const std::string::size_type last = word.find_last_not_of(" \t\r\n");
        stop_words.push_back(word.substr(first, last - first + 1));
    }
    // Iterating a std::string (not a char array literal) visits exactly the
    // 26 letters and not the terminating NUL.
    const std::string letters = "abcdefghijklmnopqrstuvwxyz";
    for (char c : letters) {
        stop_words.push_back(std::string(1, c));
    }
    std::sort(stop_words.begin(), stop_words.end());
    return stop_words;
}
/** Ordering predicate: true when x has a strictly higher frequency than y,
    so sorting with it puts the most frequent entries first.
*/
static bool sort_by_freq(Freq x, Freq y)
{
    return x.freq > y.freq;
}
//
// The functions
//
/** Takes a path to a file and returns the entire contents of the file
    as a string. The file is read in binary mode, byte for byte, including
    any NUL bytes. Returns an empty string when the path is null or the
    file cannot be opened.
*/
std::string read_file(const char* path_to_file)
{
    std::string data;
    // Constructing an ifstream from a null pointer is undefined behaviour.
    if (path_to_file == nullptr)
        return data;

    std::ifstream is(path_to_file, std::ifstream::binary);
    if (!is)
        return data;

    // Stream-buffer iterators copy the raw bytes without needing the file
    // size up front and without a manually managed buffer.
    data.assign(std::istreambuf_iterator<char>(is),
                std::istreambuf_iterator<char>());
    return data;
}
/** Takes a string and returns a copy in which every character has been
    passed through tospace, i.e. each non-alphabetic character is replaced
    by a space
*/
std::string filter_chars(std::string str_data)
{
    for (char& ch : str_data) {
        ch = ::tospace(ch);
    }
    return str_data;
}
/** Takes a string and returns a copy with all chars in lower case
*/
std::string normalize(std::string str_data)
{
    // std::tolower is only defined for values representable as unsigned
    // char (or EOF), so each char is received as unsigned char before the
    // call and converted back afterwards.
    std::transform(str_data.begin(), str_data.end(), str_data.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return str_data;
}
/** Splits a string on whitespace and returns the resulting
    tokens, in order, as a list of words.
*/
std::vector<std::string> scan(std::string str_data)
{
    std::istringstream input(str_data);
    std::vector<std::string> tokens;
    for (std::string token; input >> token; ) {
        tokens.push_back(token);
    }
    return tokens;
}
/** Takes a list of words and returns it with every stop word
    (as provided by get_stop_words) taken out; the remaining
    words keep their relative order
*/
std::vector<std::string> remove_stop_words(std::vector<std::string> words)
{
    const std::vector<std::string> stop_words = get_stop_words();
    // words is already a private copy (by-value parameter), so it can be
    // filtered in place. get_stop_words returns a sorted list, which is
    // what binary_search requires.
    words.erase(std::remove_if(words.begin(), words.end(),
                               [&stop_words](const std::string& candidate) {
                                   return std::binary_search(stop_words.begin(),
                                                             stop_words.end(),
                                                             candidate);
                               }),
                words.end());
    return words;
}
/** Counts how many times each word occurs in the list and
    returns the counts as a word -> frequency dictionary
*/
std::map<std::string,int> frequencies(std::vector<std::string> words)
{
    std::map<std::string,int> counts;
    for (const std::string& term : words) {
        // operator[] value-initialises a missing entry to 0 before the
        // increment, so first and repeated occurrences share one path.
        ++counts[term];
    }
    return counts;
}
/** Takes a dictionary of words and their frequencies
    and returns a list of pairs where the entries are
    sorted by frequency, highest first. Entries with equal
    frequency keep the dictionary's (alphabetical) order.
*/
std::vector<Freq> sort(std::map<std::string,int> word_freq)
{
    std::vector<Freq> out_list;
    out_list.reserve(word_freq.size());
    // Bind to the map's real element type by reference; naming
    // pair<string,int> by value would convert and copy every entry.
    for (const std::pair<const std::string,int>& p : word_freq) {
        out_list.push_back(Freq(p.first, p.second));
    }
    // stable_sort keeps the order of equal-frequency entries, so the
    // result is deterministic across standard library implementations.
    std::stable_sort(out_list.begin(), out_list.end(), sort_by_freq);
    return out_list;
}
//
// The main function
//
int main(int argc, char* argv[])
{
vector<Freq> word_freqs = sort(frequencies(remove_stop_words(scan(normalize(filter_chars(read_file(argv[1])))))));
for (vector<Freq>::iterator it = word_freqs.begin(); it != word_freqs.begin()+25; it++)
cout << it->word << " - " << it->freq << endl;
return 0;
}

75
06-pipeline/tf-06.py Executable file
View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
import sys, re, operator, string
#
# The functions
#
def read_file(path_to_file):
    """
    Opens the file at path_to_file for reading and returns
    everything in it as one string
    """
    with open(path_to_file) as source:
        return source.read()
def filter_chars_and_normalize(str_data):
    """
    Takes a string and returns a lower-cased copy in which every run of
    nonalphanumeric chars (underscore included) is replaced by a single
    space
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'[\W_]+')
    return pattern.sub(' ', str_data).lower()
def scan(str_data):
    """
    Breaks a string apart at runs of whitespace and returns
    the pieces, in order, as a list of words.
    """
    words = str_data.split()
    return words
def remove_stop_words(word_list):
    """
    Takes a list of words and returns a copy with all stop
    words removed. Stop words are read from the comma-separated
    file ../stop_words.txt; every single lowercase ASCII letter
    is treated as a stop word as well.
    """
    with open('../stop_words.txt') as f:
        # Strip each entry so a newline at the end of the file does not
        # stick to the last stop word. A set gives constant-time lookups.
        stop_words = {w.strip() for w in f.read().split(',')}
    # add single-letter words
    stop_words.update(string.ascii_lowercase)
    return [w for w in word_list if w not in stop_words]
def frequencies(word_list):
    """
    Counts the occurrences of every word in the list and returns
    the counts as a dictionary mapping word -> frequency
    """
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1
    return counts
def sort(word_freq):
    """
    Turns a word -> frequency dictionary into a list of
    (word, frequency) pairs ordered from the highest
    frequency to the lowest
    """
    pairs = list(word_freq.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def print_all(word_freqs):
    """
    Takes a list of pairs where the entries are sorted by frequency and
    prints one line per pair, in order: the word, ' - ', and the frequency.
    """
    # A plain loop instead of recursion on word_freqs[1:]: no recursion
    # limit on long lists and no repeated copying of the tail.
    for entry in word_freqs:
        print(entry[0], ' - ', entry[1])
#
# The main function
#
# Feed the file named on the command line through each stage in turn, then
# print the 25 most frequent entries.
_data = sys.argv[1]
for _stage in (read_file, filter_chars_and_normalize, scan,
               remove_stop_words, frequencies, sort):
    _data = _stage(_data)
print_all(_data[0:25])

7
06-pipeline/tf-09.sh Executable file
View File

@@ -0,0 +1,7 @@
#!/bin/sh
# Term frequency as a shell pipeline. Prints the 25 most frequent words of
# the file named by the first argument as "word - count" lines.
#
# Stages: extract runs of two or more ASCII letters, lower-case them, drop
# the words listed in ../stop_words.txt (its commas are turned into an
# alternation for grep -E), count the duplicates, sort by count descending,
# keep the first 25 and reformat "count word" as "word - count".
#
# "$1" is quoted so that paths containing spaces or glob characters are
# passed to grep as a single argument.
grep -o "[A-Za-z][A-Za-z][A-Za-z]*" "$1" \
| tr '[:upper:]' '[:lower:]' \
| grep -Ev "^($(sed -e 's/,/|/g' ../stop_words.txt))$" \
| sort | uniq -c | sort -rn | head -25 \
| sed -e 's/^ *\([0-9]*\) *\([a-z]*\)/\2 - \1/'