Massive renaming!

2019-08-12 14:38:16 -07:00
parent e6c1238a56
commit 61d5f74ad9
90 changed files with 0 additions and 0 deletions
--- a/06-pipeline/Makefile
+++ b/06-pipeline/Makefile
@@ -0,0 +1,8 @@
+# Compiler flags for the C++ version of the pipeline style.
+CPP_FLAGS=-std=gnu++0x
+
+# Build the tf-06 binary from tf-06.cpp.
+# NOTE(review): the only C++ source visible in this directory is tf-05.cpp;
+# confirm that tf-06.cpp exists or adjust the names.
+tf-06: tf-06.cpp
+	$(CXX) $(CPP_FLAGS) -o $@ $<
+
+.PHONY: clean
+clean:
+	rm -f tf-06
--- a/06-pipeline/README.md
+++ b/06-pipeline/README.md
@@ -0,0 +1,13 @@
+Style #6
+==============================
+
+Constraints:
+
+- Larger problem decomposed into functional abstractions. Functions, according to Mathematics, are relations from inputs to outputs.
+- Larger problem solved as a pipeline of function applications.
+
+Possible names:
+
+- Candy factory
+- Functional
+- Pipeline
--- a/06-pipeline/tf-05.clj
+++ b/06-pipeline/tf-05.clj
@@ -0,0 +1,46 @@
+":";exec java -cp "$HOME/.m2/repository/org/clojure/clojure/1.5.1/clojure-1.5.1.jar" clojure.main $0 $*
+
+; Sort of a hack; Clojure isn't really intended as a scripting language. :-/
+
+(require '[clojure.string :refer [split]]
+         '[clojure.java.io :refer [reader]]
+         '[clojure.pprint :refer [pprint]])
+
+(defn stopwords
+  "Reads a set of comma-separated stopwords from the given filename.
+  Whitespace around the commas and at either end of the file (such as a
+  trailing newline) is ignored."
+  [file]
+  (-> file
+      slurp
+      clojure.string/trim
+      ; \s* rather than \s+ so that a plain \"a,b,c\" list is split as well
+      (split #"\s*,\s*")
+      set))
+
+(defn words
+  "Breaks a string apart at every run of non-letter characters and returns
+  the pieces (a leading delimiter yields an empty first piece)."
+  [string]
+  (let [delimiter #"[^a-zA-Z]+"]
+    (split string delimiter)))
+
+(defn normalize
+  "Turns a split word into a term by lower-casing it."
+  [word]
+  (clojure.string/lower-case word))
+
+(defn too-short?
+  "True when the word has fewer than three characters."
+  [word]
+  (< (.length word) 3))
+
+; Term-frequency pipeline: read the book line by line, break lines into
+; words, lower-case them, drop short words and stop words, count what is
+; left, and pretty-print the 25 most frequent terms. Everything is consumed
+; inside with-open, so the lazy line sequence is realized before the reader
+; is closed.
+(with-open [f (reader "../pride-and-prejudice.txt")]
+  (let [stop?  (stopwords "../stop_words.txt")
+        terms  (->> (line-seq f)
+                    (mapcat words)
+                    (map normalize)
+                    (remove too-short?)
+                    (remove stop?))
+        ranked (reverse (sort-by second (frequencies terms)))]
+    (pprint (take 25 ranked))))
--- a/06-pipeline/tf-05.cpp
+++ b/06-pipeline/tf-05.cpp
@@ -0,0 +1,180 @@
+#include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+/** A word together with the number of times it occurs. */
+struct Freq {
+  string word;  // the term
+  int freq;     // its occurrence count
+
+  Freq(string text, int count)
+    : word(text),
+      freq(count)
+  {
+  }
+};
+
+
+//
+// Helpers
+//
+
+/** Returns c unchanged when it is an alphabetic character and a space
+    otherwise.
+*/
+static int tospace(int c)
+{
+  // The <cctype> classifiers are only defined for values representable as
+  // unsigned char (or EOF). A plain char holding a non-ASCII byte may arrive
+  // here as a negative int, so convert before classifying.
+  return isalpha(static_cast<unsigned char>(c)) ? c : ' ';
+}
+
+/** Loads the comma-separated stop words from ../stop_words.txt, adds every
+    single lower-case letter a-z, and returns the list sorted so that it can
+    be searched with binary_search. If the file cannot be opened only the
+    single letters are returned.
+*/
+static vector<string> get_stop_words()
+{
+  vector<string> stop_words;
+  ifstream is("../stop_words.txt");
+
+  const string whitespace = " \t\r\n";
+  string word;
+  while (getline(is, word, ',')) {
+    // Trim surrounding whitespace: without this the newline that ends the
+    // file stays attached to the last stop word and it never matches.
+    const string::size_type first = word.find_first_not_of(whitespace);
+    if (first == string::npos)
+      continue;
+    const string::size_type last = word.find_last_not_of(whitespace);
+    stop_words.push_back(word.substr(first, last - first + 1));
+  }
+
+  // Iterate a std::string rather than a string literal: looping over the
+  // literal's array also visits its terminating '\0'.
+  const string letters = "abcdefghijklmnopqrstuvwxyz";
+  for (char c : letters) {
+    stop_words.push_back(string(1, c));
+  }
+
+  sort(stop_words.begin(), stop_words.end());
+
+  return stop_words;
+}
+
+/** Ordering predicate for std::sort: true when x occurs more often than y,
+    which puts the highest frequencies first. Takes its arguments by const
+    reference so that no Freq (and its string) is copied per comparison.
+*/
+static bool sort_by_freq(const Freq& x, const Freq& y)
+{
+  return y.freq < x.freq;
+}
+
+//
+// The functions
+//
+
+/** Takes a path to a file and returns the entire
+    contents of the file as a string.
+
+    Returns an empty string when the path is null, the file cannot be
+    opened, or its size cannot be determined. The bytes are copied verbatim,
+    including any embedded NUL characters.
+*/
+string read_file(const char* path_to_file)
+{
+  string data;
+  if (path_to_file == nullptr)
+    return data;
+
+  ifstream is(path_to_file, std::ifstream::binary);
+  if (!is)
+    return data;
+
+  is.seekg(0, is.end);
+  const streamoff length = is.tellg();
+  is.seekg(0, is.beg);
+
+  // tellg() reports -1 on failure; never size or index a buffer with it.
+  if (length <= 0)
+    return data;
+
+  // Read straight into the string's own storage: no manual new/delete.
+  data.resize(static_cast<string::size_type>(length));
+  is.read(&data[0], static_cast<streamsize>(length));
+  // Keep only what was actually read in case the read came up short.
+  data.resize(static_cast<string::size_type>(is.gcount()));
+  return data;
+}
+
+/** Takes a string and returns a copy in which every character
+    that tospace() does not keep (i.e. every non-alphabetic
+    character, digits included) is replaced by a space
+*/
+string filter_chars(string str_data)
+{
+  for (string::iterator it = str_data.begin(); it != str_data.end(); ++it) {
+    *it = ::tospace(*it);
+  }
+  return str_data;
+}
+
+
+/** Takes a string and returns a copy with all chars in lower case
+*/
+string normalize(string str_data)
+{
+  // tolower() is only defined for values representable as unsigned char
+  // (or EOF); go through unsigned char so that non-ASCII bytes, which may
+  // be negative as plain char, are handled safely.
+  for (char& ch : str_data) {
+    ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+  }
+  return str_data;
+}
+  
+
+/** Splits a string at whitespace and returns the resulting
+    words in their original order.
+*/
+vector<string> scan(string str_data)
+{
+  vector<string> tokens;
+  istringstream input(str_data);
+
+  for (string token; input >> token; )
+    tokens.push_back(token);
+
+  return tokens;
+}
+
+/** Returns the words that are not stop words, in their original
+    order. The stop list comes from get_stop_words(), which returns
+    it sorted, as binary_search requires.
+*/
+vector<string> remove_stop_words(vector<string> words)
+{
+  const vector<string> stop_words = get_stop_words();
+
+  vector<string> kept;
+  kept.reserve(words.size());
+
+  for (vector<string>::const_iterator it = words.begin(); it != words.end(); ++it) {
+    const bool is_stop_word = binary_search(stop_words.begin(), stop_words.end(), *it);
+    if (is_stop_word)
+      continue;
+    kept.push_back(*it);
+  }
+
+  return kept;
+}
+
+/** Counts how many times each word occurs in the list and
+    returns the counts keyed by word.
+*/
+map<string,int> frequencies(vector<string> words)
+{
+  map<string,int> counts;
+
+  // operator[] value-initializes a missing entry to 0 before the increment.
+  for (vector<string>::const_iterator it = words.begin(); it != words.end(); ++it)
+    ++counts[*it];
+
+  return counts;
+}
+
+/** Converts a word -> count dictionary into a list of Freq
+    entries ordered by sort_by_freq (highest count first).
+*/
+vector<Freq> sort(map<string,int> word_freq)
+{
+  vector<Freq> ranked;
+  ranked.reserve(word_freq.size());
+
+  for (map<string,int>::const_iterator it = word_freq.begin(); it != word_freq.end(); ++it)
+    ranked.push_back(Freq(it->first, it->second));
+
+  // Qualified call: this function is itself named sort.
+  std::sort(ranked.begin(), ranked.end(), sort_by_freq);
+  return ranked;
+}
+
+//
+// The main function
+//
+
+/** Runs the whole pipeline on the file named by the first command-line
+    argument and prints up to 25 "word - count" lines, most frequent first.
+    Returns 1 (after printing a usage message) when no file is given.
+*/
+int main(int argc, char* argv[])
+{
+  if (argc < 2) {
+    cerr << "Usage: " << (argc > 0 ? argv[0] : "tf") << " <input-file>" << endl;
+    return 1;
+  }
+
+  vector<Freq> word_freqs = sort(frequencies(remove_stop_words(scan(normalize(filter_chars(read_file(argv[1])))))));
+
+  // Never step past the end: the input may hold fewer than 25 distinct words.
+  vector<Freq>::size_type limit = word_freqs.size();
+  if (limit > 25)
+    limit = 25;
+
+  for (vector<Freq>::size_type i = 0; i < limit; ++i)
+    cout << word_freqs[i].word << " - " << word_freqs[i].freq << endl;
+
+  return 0;
+}
--- a/06-pipeline/tf-06.py
+++ b/06-pipeline/tf-06.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+import sys, re, operator, string
+
+#
+# The functions
+#
+def read_file(path_to_file):
+    """
+    Opens the file at the given path and returns everything
+    in it as a single string
+    """
+    with open(path_to_file) as source:
+        return source.read()
+
+def filter_chars_and_normalize(str_data):
+    """
+    Takes a string and returns a lower-cased copy in which every
+    run of nonalphanumeric chars (underscore included) is replaced
+    by a single space
+    """
+    # raw string: '\W' in a plain literal is an invalid escape sequence
+    pattern = re.compile(r'[\W_]+')
+    return pattern.sub(' ', str_data).lower()
+
+def scan(str_data):
+    """
+    Splits the string on whitespace and returns the
+    resulting list of words.
+    """
+    words = str_data.split()
+    return words
+
+def remove_stop_words(word_list):
+    """
+    Takes a list of words and returns a copy with all stop
+    words removed.
+
+    Stop words are read from '../stop_words.txt' (comma-separated);
+    every single lower-case ASCII letter counts as a stop word too.
+    """
+    with open('../stop_words.txt') as f:
+        # strip each entry so the newline that ends the file does not stay
+        # glued to the last stop word
+        stop_words = set(w.strip() for w in f.read().split(','))
+    # add single-letter words
+    stop_words.update(string.ascii_lowercase)
+    # a set keeps each membership test constant-time
+    return [w for w in word_list if w not in stop_words]
+
+def frequencies(word_list):
+    """
+    Counts the occurrences of each word in the list and
+    returns them as a dictionary keyed by word
+    """
+    counts = {}
+    for word in word_list:
+        counts[word] = counts.get(word, 0) + 1
+    return counts
+
+def sort(word_freq):
+    """
+    Turns a word -> frequency dictionary into a list of
+    (word, frequency) pairs ordered from most to least
+    frequent
+    """
+    pairs = word_freq.items()
+    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
+
+def print_all(word_freqs):
+    """
+    Prints each (word, frequency) pair of the list on its own line,
+    recursing on the remainder of the list after printing its head.
+    """
+    if len(word_freqs) == 0:
+        return
+    head, rest = word_freqs[0], word_freqs[1:]
+    print(head[0], ' - ', head[1])
+    print_all(rest)
+
+#
+# The main function
+#
+# Read the file named on the command line, push it through the pipeline
+# (innermost call runs first) and print the 25 most frequent words.
+print_all(
+    sort(
+        frequencies(
+            remove_stop_words(
+                scan(
+                    filter_chars_and_normalize(
+                        read_file(sys.argv[1])))))
+    )[0:25]
+)
+
--- a/06-pipeline/tf-09.sh
+++ b/06-pipeline/tf-09.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Print the 25 most frequent words of the file given as $1 as "word  -  count".
+# Stages: extract runs of two or more letters, lower-case them, drop the
+# words listed (comma-separated) in ../stop_words.txt, count, rank, format.
+# "$1" is quoted so that paths containing spaces or glob characters work.
+grep -o "[A-Za-z][A-Za-z][A-Za-z]*" "$1" \
+    | tr '[:upper:]' '[:lower:]' \
+    | grep -Ev "^($(sed  -e 's/,/|/g' ../stop_words.txt))$" \
+    | sort | uniq -c | sort -rn | head -25 \
+    | sed -e 's/^ *\([0-9]*\) *\([a-z]*\)/\2  -  \1/'