From c7cb993a56fc8b2474dca127c8d3de8425d4c950 Mon Sep 17 00:00:00 2001
From: Crista Lopes
Date: Tue, 26 Nov 2019 00:38:29 -0800
Subject: [PATCH] Very rough learning to count words

---
 36-dnn/count_words.py    | 302 +++++++++++++++++++++++++++++++++++++++
 36-dnn/generate_words.py |  87 +++++++++++
 2 files changed, 389 insertions(+)
 create mode 100644 36-dnn/count_words.py
 create mode 100644 36-dnn/generate_words.py

diff --git a/36-dnn/count_words.py b/36-dnn/count_words.py
new file mode 100644
index 0000000..93eb8aa
--- /dev/null
+++ b/36-dnn/count_words.py
@@ -0,0 +1,302 @@
+# -*- coding: utf-8 -*-
+'''
+# An implementation of deep learning for counting symbols
+Input: [10, 12, 10, 11, 2, 2, 2, 1, 1]
+Output: words=[2, 10, 1, 12, 11] counts=[3, 2, 2, 1, 1] (Not necessarily in this order)
+
+''' # noqa
+
+from __future__ import print_function
+from keras.models import Sequential
+from keras import layers, metrics
+from keras import backend as K
+from keras.utils import plot_model
+from keras.utils import to_categorical
+import numpy as np
+from six.moves import range
+import string, re, collections, os, sys
+
+# Parameters for the model and dataset.
+# TRAINING_SIZE, VOCAB_SIZE and SAMPLE_SIZE are only placeholders: WordTable()
+# overwrites them with the values measured from the training file.
+TRAINING_SIZE = 50000
+VOCAB_SIZE = 1000
+SAMPLE_SIZE = 100
+TOP = 2
+BATCH_SIZE = 50
+
+data_folder = 'words_data'
+if len(sys.argv) > 1:
+    data_folder = data_folder + '_' + sys.argv[1]
+train_x = os.path.join(data_folder, 'train_x.txt')
+train_y = os.path.join(data_folder, 'train_y.txt')
+val_x = os.path.join(data_folder, 'val_x.txt')
+val_y = os.path.join(data_folder, 'val_y.txt')
+
+
+class WordTable(object):
+    """Given a text file:
+    + Encode the words to a one-hot integer representation
+    + Decode the one-hot or integer representation to their character output
+    + Decode a vector of probabilities to their character output
+    """
+    def __init__(self):
+        """Initialize words table from the `train_x` file.
+
+        Side effect: overwrites the module-level TRAINING_SIZE (number of
+        lines), VOCAB_SIZE (number of distinct words), SAMPLE_SIZE (longest
+        line, in words) and BATCH_SIZE.
+        """
+        global TRAINING_SIZE
+        global VOCAB_SIZE
+        global SAMPLE_SIZE
+        global BATCH_SIZE
+
+        self.words = set()
+        nlines = 0
+        max_words = 0
+        with open(train_x) as f:
+            for line in f:
+                words = line.split()
+                self.words.update(words)
+
+                nlines = nlines + 1
+                if max_words < len(words):
+                    max_words = len(words)
+
+        # Sorted so that the word/index mapping is the same on every run;
+        # the iteration order of a set of strings is not.
+        self.words = sorted(self.words)
+        self.word_indices = dict((w, i) for i, w in enumerate(self.words))
+        self.indices_word = dict((i, w) for i, w in enumerate(self.words))
+
+        TRAINING_SIZE = nlines
+        VOCAB_SIZE = len(self.words)
+        SAMPLE_SIZE = max_words
+        BATCH_SIZE = 50
+
+    def words_to_indices(self, words):
+        """Map a list of words to their indices."""
+        return [self.word_indices[w] for w in words]
+
+    def indices_to_words(self, indices):
+        """Map a list of indices back to their words."""
+        return [self.indices_word[i] for i in indices]
+
+    def encode_one_hot(self, W, forConv=False):
+        """One-hot encode given word, or list of indices, W.
+
+        # Arguments
+            W: either a word or a list of indices, to be encoded.
+            forConv: add a trailing channel axis to an encoded list.
+
+        # Returns
+            A (VOCAB_SIZE,) vector for a word. For a list, a
+            (SAMPLE_SIZE, VOCAB_SIZE) array, or (SAMPLE_SIZE, VOCAB_SIZE, 1)
+            with forConv; indices past SAMPLE_SIZE are dropped.
+        """
+        # `string` is the stdlib module, so `type(W) is string` never matched.
+        if isinstance(W, str):
+            x = np.zeros(VOCAB_SIZE)
+            x[self.word_indices[W]] = 1
+            return x
+        elif isinstance(W, list): # Example: [3, 9, 5]
+            shape = (SAMPLE_SIZE, VOCAB_SIZE, 1) if forConv else (SAMPLE_SIZE, VOCAB_SIZE)
+            x = np.zeros(shape)
+            for i, w in enumerate(W[:SAMPLE_SIZE]):
+                # With forConv this fills the single channel, x[i, w, 0].
+                x[i, w] = 1
+            return x
+        else:
+            raise Exception("Bad type to encode")
+
+    def decode(self, x):
+        """Decode the given vector or 2D array into its TOP-scoring words.
+
+        # Arguments
+            x: A vector of per-word values (probabilities, counts or a
+                one-hot representation), or a 2D array with one such vector
+                per row.
+
+        # Returns
+            For a vector, the TOP words with the largest values, in no
+            particular order; for a 2D array, one such list per row.
+        """
+        if x.ndim == 1: # either a single word, one-hot encoded, or multiple words
+            #one_idxs = [i for i, v in enumerate(x) if v >= 0.5]
+            one_idxs = np.argpartition(x, -TOP)[-TOP:]
+            print(f'Top 2 indices are {one_idxs} and values are ', np.rint(x[one_idxs]))
+            return [self.indices_word[i] for i in one_idxs]
+        elif x.ndim == 2: # a list of words, each one-hot encoded
+            return [self.decode(w) for w in x]
+        else:
+            raise Exception("Bad type to decode")
+
+
+ctable = WordTable()
+print(f'Words table with training size {TRAINING_SIZE}, batch size {BATCH_SIZE}, vocab size {VOCAB_SIZE} and sample size {SAMPLE_SIZE}')
+
+
+def line_x_to_indices(line):
+    """Convert one whitespace-separated query line to word indices."""
+    words = line.split()
+    return ctable.words_to_indices(words)
+
+def line_y_to_indices(line):
+    """Parse one answer line into (word indices, counts).
+
+    Accepts both formats written by generate_words.py: "w1 3,w2 1" (with
+    counts) and "w1,w2" (without counts; every count is then 1). Either way
+    the result is the same pair of lists that input_generator unpacks.
+    """
+    pairs = [p.split() for p in line.split(',') if p.strip()]
+    w_indices = ctable.words_to_indices([p[0] for p in pairs])
+    # An entry with a single token carries no count. The previous test,
+    # len(pairs[0]) < 2, measured the characters of the first entry instead.
+    counts = [int(p[1]) if len(p) > 1 else 1 for p in pairs]
+    return w_indices, counts
+
+def input_generator(nsamples, train=True, forConv=False):
+    """Yield (x, y) batches of `nsamples` samples from the data files.
+
+    x holds the one-hot encoded queries and y, per sample, the count of each
+    vocabulary word. The generator ends with the files; samples left over
+    after the last full batch are not yielded.
+    """
+    print('Generating input for ', 'training' if train else 'validation')
+    f_x, f_y = (train_x, train_y) if train else (val_x, val_y)
+    x_shape = (nsamples, SAMPLE_SIZE, VOCAB_SIZE, 1) if forConv else (nsamples, SAMPLE_SIZE, VOCAB_SIZE)
+    with open(f_x) as fx, open(f_y) as fy:
+        j = 0
+        # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
+        x = np.zeros(x_shape, dtype=int)
+        y = np.zeros((nsamples, VOCAB_SIZE), dtype=np.float64)
+        for line_x, line_y in zip(fx, fy):
+            question = line_x_to_indices(line_x)
+            expected_w, expected_c = line_y_to_indices(line_y)
+            x[j] = ctable.encode_one_hot(question, forConv)
+            y[j][expected_w] = expected_c
+            j = j + 1
+            if j % nsamples == 0:
+                yield x, y
+                j = 0
+                x = np.zeros(x_shape, dtype=int)
+                y = np.zeros((nsamples, VOCAB_SIZE), dtype=np.float64)
+    print("End of ", 'training' if train else 'validation')
+    return x, y
+
+def topcats(y_true, y_pred):
+    """Keras metric: top-TOP categorical accuracy."""
+    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=TOP)
+
+def model_ff():
+    """Build the feed-forward model. Returns (model, epochs, name)."""
+    print('Build model...')
+    epochs = 50
+    model = Sequential()
+    model.add(layers.Dense(VOCAB_SIZE, input_shape=(SAMPLE_SIZE, VOCAB_SIZE)))
+#    model.add(layers.Dense(VOCAB_SIZE, activation='relu'))
+#    model.add(layers.Dropout(0.5))
+#    model.add(layers.Dense(150, activation='relu'))
+    model.add(layers.Flatten())
+    # model.add(layers.Dense(VOCAB_SIZE * 2, activation='relu'))
+    model.add(layers.Dense(VOCAB_SIZE, activation='sigmoid'))
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam',
+                  metrics=['acc', topcats])
+    return model, epochs, "words-ff2-{}b-{}ep".format(BATCH_SIZE, epochs)
+
+def model_convnet1D():
+    """Build the 1D convolutional model. Returns (model, epochs, name)."""
+    print('Build model...')
+    epochs = 1
+    model = Sequential()
+    model.add(layers.Conv1D(32, 10, activation='relu',
+                            input_shape=(SAMPLE_SIZE, VOCAB_SIZE)))
+    model.add(layers.MaxPooling1D(2))
+    model.add(layers.Conv1D(64, 10, activation='relu'))
+    model.add(layers.MaxPooling1D(2))
+    model.add(layers.Conv1D(64, 10, activation='relu'))
+    model.add(layers.GlobalMaxPooling1D())
+    model.add(layers.Dense(VOCAB_SIZE, activation='sigmoid'))
+
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam',
+                  metrics=['acc', topcats])
+
+    return model, epochs, "words-convnet1D-{}b-{}ep".format(BATCH_SIZE, epochs)
+
+def SumPooling2D(x):
+    """Sum x over axis 1 (the word-position axis of the Conv2D output)."""
+    return K.sum(x, axis=1)
+
+def model_convnet2D():
+    """Build the 2D convolutional model. Returns (model, epochs, name).
+
+    Each (1, VOCAB_SIZE) kernel sees one one-hot word at a time; summing the
+    responses over the positions leaves one value per filter.
+    """
+    print('Build model...')
+    epochs = 150
+    model = Sequential()
+    model.add(layers.Conv2D(VOCAB_SIZE, (1, VOCAB_SIZE),
+                            input_shape=(SAMPLE_SIZE, VOCAB_SIZE, 1)))
+    model.add(layers.Lambda(SumPooling2D))
+    model.add(layers.Reshape((VOCAB_SIZE,)))
+    # model.add(layers.Flatten())
+#    model.add(layers.Dense(VOCAB_SIZE))
+
+    model.compile(loss='mean_squared_error',
+                  optimizer='adam',
+                  metrics=['acc', topcats])
+
+#    model.compile(loss='binary_crossentropy',
+#                  optimizer='adam',
+#                  metrics=['acc', topcategories])
+
+    return model, epochs, "words-convnet2D-{}b-{}ep".format(BATCH_SIZE, epochs)
+
+
+model, epochs, name = model_convnet2D()
+model.summary()
+plot_model(model, to_file=name + '.png', show_shapes=True)
+
+# Train the model each generation and show predictions against the validation
+# dataset.
+val_gen_2 = input_generator(5, train=False, forConv=True)
+# range(1, epochs) stopped one iteration short (none at all for epochs == 1).
+for iteration in range(1, epochs + 1):
+    print()
+    print('-' * 50)
+    print('Iteration', iteration)
+    input_gen = input_generator(BATCH_SIZE, forConv=True)
+    val_gen = input_generator(BATCH_SIZE, False, forConv=True)
+    model.fit_generator(input_gen,
+                epochs = 1,
+                steps_per_epoch = 20,
+                validation_data = val_gen,
+                validation_steps = 10, workers=1)
+    # Show the guesses for the next 5 validation samples so we can visualize
+    # errors.
+#    print(batch_y)
+#    print(preds)
+    batch = next(val_gen_2, None)
+    if batch is None:
+        # Validation file exhausted: start reading it again from the top.
+        val_gen_2 = input_generator(5, train=False, forConv=True)
+        batch = next(val_gen_2)
+    batch_x, batch_y = batch
+    # One predict() call per batch; it used to be repeated for every sample.
+    preds = model.predict(batch_x)
+    for expected, prediction in zip(batch_y, preds):
+        #print(preds)
+#        preds[preds>=0.5] = 1
+#        preds[preds<0.5] = 0
+
+        correct = ctable.decode(expected)
+        guess = ctable.decode(prediction)
+        print('T', correct, ' G', guess)
+
+model.summary()
+model.save(name + '.h5')
+
diff --git a/36-dnn/generate_words.py b/36-dnn/generate_words.py
new file mode 100644
index 0000000..85f1930
--- /dev/null
+++ b/36-dnn/generate_words.py
@@ -0,0 +1,87 @@
+import os, sys
+import collections
+import numpy as np
+import re
+
+SAMPLE_SIZE = 80
+VOCAB_SIZE = 10
+TOP = 5
+
+# Read the corpus with `with` so both files are closed once consumed.
+with open('../stop_words.txt') as f:
+    stopwords = set(f.read().split(','))
+with open('../pride-and-prejudice.txt') as f:
+    all_words = re.findall('[a-z]{2,}', f.read().lower())
+words = list(set([w for w in all_words if w not in stopwords]))
+
+def generate_pair(with_counts):
+    """Build one sample: a query of SAMPLE_SIZE words and its answer.
+
+    The query is a random slice of the book in which every word outside the
+    first VOCAB_SIZE entries of `words` is replaced by a known word. The
+    answer lists the distinct query words, most frequent first, each followed
+    by " <count>" when with_counts is true.
+    """
+    # Grab a slice of the input file of size SAMPLE_SIZE
+    index = np.random.randint(0, len(all_words) - SAMPLE_SIZE)
+    querytmp = all_words[index:index+SAMPLE_SIZE]
+    # Replace unknown words with known ones
+    known = set(words[:VOCAB_SIZE])
+    # Integer bound for randint: VOCAB_SIZE/2 is a float under Python 3.
+    nfillers = max(1, min(VOCAB_SIZE, len(words)) // 2)
+    replacement = {}
+    for w in querytmp:
+        if w not in known and w not in replacement:
+            # Replace ALL occurrences in query with the same replacement word
+            replacement[w] = words[np.random.randint(0, nfillers)]
+    query = [replacement.get(w, w) for w in querytmp]
+
+    counts = collections.Counter(query)
+    top = counts.most_common()
+    if not with_counts:
+        ans = [word for word, _ in top]
+    else:
+        ans = [word + " " + str(count) for word, count in top]
+    return query, ans
+
+
+def _write_samples(path_x, path_y, nsamples, with_counts):
+    """Write nsamples queries to path_x and their answers to path_y."""
+    with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
+        for _ in range(nsamples):
+            query, ans = generate_pair(with_counts)
+            fx.write(' '.join(query) + '\n')
+            fy.write(','.join(ans) + '\n')
+
+
+def generate_data(data_folder, ntrain, nval, vocab_size, with_counts):
+    """Write the training and validation files into data_folder.
+
+    Sets the module-level VOCAB_SIZE to vocab_size before generating.
+    """
+    train_x = os.path.join(data_folder, 'train_x.txt')
+    train_y = os.path.join(data_folder, 'train_y.txt')
+    val_x = os.path.join(data_folder, 'val_x.txt')
+    val_y = os.path.join(data_folder, 'val_y.txt')
+
+    if not os.path.exists(data_folder):
+        os.makedirs(data_folder)
+
+    global VOCAB_SIZE
+    VOCAB_SIZE = vocab_size
+    _write_samples(train_x, train_y, ntrain, with_counts)
+    _write_samples(val_x, val_y, nval, with_counts)
+
+def main():
+    # [1]: number of samples in training set
+    # [2]: number of samples in validation set
+    # [3]: vocabulary size
+    # [4]: output with (1) or without (0) counts
+    if len(sys.argv) < 5:
+        sys.exit('Usage: generate_words.py NTRAIN NVAL VOCAB_SIZE WITH_COUNTS(0|1)')
+    data_folder = 'words_data' + "_" + sys.argv[3]
+    generate_data(data_folder, int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), bool(int(sys.argv[4])))
+
+if __name__ == "__main__":
+    main()
+