Very rough learning to count words

This commit is contained in:
Crista Lopes
2019-11-26 00:38:29 -08:00
parent a18c4c6980
commit c7cb993a56
2 changed files with 335 additions and 0 deletions

266
36-dnn/count_words.py Normal file
View File

@@ -0,0 +1,266 @@
# -*- coding: utf-8 -*-
'''
# An implementation of deep learning for counting symbols
Input: [10, 12, 10, 11, 2, 2, 2, 1, 1]
Output: words=[2, 10, 1, 12, 11] counts=[3, 2, 2, 1, 1] (Not necessarily in this order)
''' # noqa
from __future__ import print_function
from keras.models import Sequential
from keras import layers, metrics
from keras import backend as K
from keras.utils import plot_model
from keras.utils import to_categorical
import numpy as np
from six.moves import range
import string, re, collections, os, sys
# Parameters for the model and dataset.
TRAINING_SIZE = 50000  # number of training samples (overwritten by WordTable from the data file)
VOCAB_SIZE = 1000      # vocabulary size (overwritten by WordTable from the data file)
SAMPLE_SIZE = 100      # max words per input sample (overwritten by WordTable from the data file)
TOP = 2                # how many top-scoring words to report/score in decoding and metrics
BATCH_SIZE = 50        # samples per generated batch
# Data lives in 'words_data', or 'words_data_<suffix>' when a CLI argument is given
# (matches the folder naming used by generate_words.py).
data_folder = 'words_data'
if len(sys.argv) > 1:
    data_folder = data_folder + '_' + sys.argv[1]
train_x = os.path.join(data_folder, 'train_x.txt')  # training inputs, one sample per line
train_y = os.path.join(data_folder, 'train_y.txt')  # training labels, comma-separated
val_x = os.path.join(data_folder, 'val_x.txt')      # validation inputs
val_y = os.path.join(data_folder, 'val_y.txt')      # validation labels
class WordTable(object):
    """Vocabulary table built from a text file.

    + Encode the words to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self):
        """Initialize the words table from the training-input file.

        Scans `train_x` line by line to build the vocabulary and the
        word<->index maps, and overwrites the module-level dataset
        parameters with values derived from the data.
        """
        global TRAINING_SIZE
        global VOCAB_SIZE
        global SAMPLE_SIZE
        global BATCH_SIZE
        self.words = set()
        nlines = 0
        max_words = 0
        with open(train_x) as f:
            for line in f:
                words = line.split()
                self.words.update(words)
                nlines = nlines + 1
                if max_words < len(words):
                    max_words = len(words)
        self.words = list(self.words)
        # Bidirectional word <-> index maps over the (arbitrary) set order.
        self.word_indices = dict((w, i) for i, w in enumerate(self.words))
        self.indices_word = dict((i, w) for i, w in enumerate(self.words))
        TRAINING_SIZE = nlines
        VOCAB_SIZE = len(self.words)
        SAMPLE_SIZE = max_words
        BATCH_SIZE = 50

    def words_to_indices(self, words):
        """Map a sequence of words to their vocabulary indices."""
        return [self.word_indices[w] for w in words]

    def indices_to_words(self, indices):
        """Map a sequence of vocabulary indices back to words."""
        return [self.indices_word[i] for i in indices]

    def encode_one_hot(self, W, forConv=False):
        """One-hot encode given word, or list of indices, W.

        # Arguments
            W: either a word (str) or a list of vocabulary indices.
            forConv: if True, add a trailing channel axis (for Conv2D input).
        """
        # BUG FIX: was `type(W) is string`, which compared against the imported
        # `string` MODULE and was therefore always False (a str W fell through
        # to the "Bad type" branch).
        if isinstance(W, str):
            x = np.zeros(VOCAB_SIZE)
            x[self.word_indices[W]] = 1
            return x
        elif isinstance(W, list):  # Example: [3, 9, 5]
            x = np.zeros((SAMPLE_SIZE, VOCAB_SIZE)) if not forConv else np.zeros((SAMPLE_SIZE, VOCAB_SIZE, 1))
            for i, w in enumerate(W):
                if i >= SAMPLE_SIZE:  # truncate samples longer than SAMPLE_SIZE
                    break
                if not forConv:
                    x[i, w] = 1
                else:
                    x[i, w, 0] = 1
            return x
        else:
            raise Exception("Bad type to encode")

    def decode(self, x):
        """Decode the given vector or 1D array to their character output.

        # Arguments
            x: a vector of probabilities/one-hot values (returns the TOP
               highest-scoring words), or a 2D array of such vectors
               (returns a list of decodings, one per row).
        """
        if x.ndim == 1:  # a single score/one-hot vector over the vocabulary
            # Indices of the TOP largest entries (unordered among themselves).
            one_idxs = np.argpartition(x, -TOP)[-TOP:]
            # BUG FIX: message previously hard-coded "Top 2" regardless of TOP.
            print(f'Top {TOP} indices are {one_idxs} and values are ', np.rint(x[one_idxs]))
            return [self.indices_word[i] for i in one_idxs]
        elif x.ndim == 2:  # a list of words, each one-hot encoded
            words = []
            for w in x:
                words.append(self.decode(w))
            return words
        else:
            raise Exception("Bad type to decode")
# Build the vocabulary table from the training file; constructing it also
# overwrites TRAINING_SIZE, VOCAB_SIZE and SAMPLE_SIZE from the data.
ctable = WordTable()
print(f'Words table with training size {TRAINING_SIZE}, batch size {BATCH_SIZE}, vocab size {VOCAB_SIZE} and sample size {SAMPLE_SIZE}')
def line_x_to_indices(line):
    """Split an input line into words and map each to its vocabulary index."""
    return ctable.words_to_indices(line.split())
def line_y_to_indices(line):
    """Parse an expected-output line into (word_indices, counts).

    The line is comma-separated; each field is either a bare word (no
    counts in the data, each word counted once) or a "word count" pair.

    # Returns
        A pair of parallel lists: vocabulary indices and their counts.
    """
    pairs = line.split(',')
    # BUG FIX: the old check `len(pairs[0]) < 2` measured the CHARACTER length
    # of the first field, not whether it contains a count; and the no-counts
    # branch returned a list of (index, 1) tuples while the caller unpacks two
    # parallel lists.  Both branches now return (indices, counts) and strip
    # the trailing newline via split().
    words = [p.split()[0] for p in pairs]
    w_indices = ctable.words_to_indices(words)
    if len(pairs[0].split()) < 2:  # no counts in the data: each word once
        counts = [1 for _ in range(len(pairs))]
    else:
        counts = [int(p.split()[1]) for p in pairs]
    return w_indices, counts
def input_generator(nsamples, train=True, forConv=False):
    """Yield (x, y) batches read from the training or validation files.

    # Arguments
        nsamples: batch size (samples per yielded pair).
        train: read the training files if True, else the validation files.
        forConv: produce a trailing channel axis (for Conv2D models).

    x has shape (nsamples, SAMPLE_SIZE, VOCAB_SIZE[, 1]) of one-hot rows;
    y has shape (nsamples, VOCAB_SIZE) with the expected count per word.
    """
    def new_batch():
        # Fresh zeroed arrays for the next batch.  BUG FIX: `np.int` was
        # deprecated in NumPy 1.20 and removed in 1.24; it was an alias
        # for the builtin `int`, which is used directly here.
        shape_x = (nsamples, SAMPLE_SIZE, VOCAB_SIZE, 1) if forConv else (nsamples, SAMPLE_SIZE, VOCAB_SIZE)
        return np.zeros(shape_x, dtype=int), np.zeros((nsamples, VOCAB_SIZE), dtype=np.float64)

    print('Generating input for ', 'training' if train else 'validation')
    f_x, f_y = (train_x, train_y) if train else (val_x, val_y)
    with open(f_x) as fx, open(f_y) as fy:
        j = 0
        x, y = new_batch()
        for line_x, line_y in zip(fx, fy):
            question = line_x_to_indices(line_x)
            expected_w, expected_c = line_y_to_indices(line_y)
            x[j] = ctable.encode_one_hot(question, forConv)
            y[j][expected_w] = expected_c  # scatter counts at the word indices
            j = j + 1
            if j % nsamples == 0:  # batch full: hand it out, start a new one
                yield x, y
                j = 0
                x, y = new_batch()
        print("End of ", 'training' if train else 'validation')
        return x, y
def topcats(y_true, y_pred):
    # Custom Keras metric: top-k categorical accuracy with k fixed to TOP
    # (fraction of samples whose true class is among the TOP highest scores).
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=TOP)
def model_ff():
    """Build a simple feed-forward model over the one-hot word sequence.

    # Returns
        (model, epochs, name): the compiled model, the number of training
        iterations to run, and a tag used for the diagram/weights files.
    """
    print('Build model...')
    epochs = 50
    net = Sequential()
    net.add(layers.Dense(VOCAB_SIZE, input_shape=(SAMPLE_SIZE, VOCAB_SIZE)))
    net.add(layers.Flatten())
    net.add(layers.Dense(VOCAB_SIZE, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['acc', topcats])
    tag = "words-ff2-{}b-{}ep".format(BATCH_SIZE, epochs)
    return net, epochs, tag
def model_convnet1D():
    """Build a 1D-convolutional model over the one-hot word sequence.

    # Returns
        (model, epochs, name): the compiled model, the number of training
        iterations to run, and a tag used for the diagram/weights files.
    """
    print('Build model...')
    epochs = 1
    stack = [
        layers.Conv1D(32, 10, activation='relu',
                      input_shape=(SAMPLE_SIZE, VOCAB_SIZE)),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 10, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 10, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(VOCAB_SIZE, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['acc', topcats])
    tag = "words-convnet1D-{}b-{}ep".format(BATCH_SIZE, epochs)
    return net, epochs, tag
def SumPooling2D(x):
    # Sum the tensor over axis 1; used via layers.Lambda in model_convnet2D
    # to collapse the per-position activations into per-word totals.
    return K.sum(x, axis=1)
def model_convnet2D():
    """Build a 2D-convolutional counting model.

    A single (1 x VOCAB_SIZE) convolution is applied per word position,
    summed over the position axis (SumPooling2D) and reshaped to one value
    per vocabulary word; trained with mean squared error against counts.

    # Returns
        (model, epochs, name): the compiled model, the number of training
        iterations to run, and a tag used for the diagram/weights files.
    """
    print('Build model...')
    epochs = 150
    net = Sequential()
    net.add(layers.Conv2D(VOCAB_SIZE, (1, VOCAB_SIZE),
                          input_shape=(SAMPLE_SIZE, VOCAB_SIZE, 1)))
    net.add(layers.Lambda(SumPooling2D))
    net.add(layers.Reshape((VOCAB_SIZE,)))
    net.compile(loss='mean_squared_error',
                optimizer='adam',
                metrics=['acc', topcats])
    tag = "words-convnet2D-{}b-{}ep".format(BATCH_SIZE, epochs)
    return net, epochs, tag
# Build the chosen model and record an architecture diagram next to the script.
model, epochs, name = model_convnet2D()
model.summary()
plot_model(model, to_file=name + '.png', show_shapes=True)
# Train the model each generation and show predictions against the validation
# dataset.
# Small generator (batches of 5) used only for the visual check below.
val_gen_2 = input_generator(5, train=False, forConv=True)
for iteration in range(1, epochs):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Fresh generators each iteration so reading restarts at the top of the files.
    input_gen = input_generator(BATCH_SIZE, forConv=True)
    val_gen = input_generator(BATCH_SIZE, False, forConv=True)
    model.fit_generator(input_gen,
                        epochs = 1,
                        steps_per_epoch = 20,
                        validation_data = val_gen,
                        validation_steps = 10, workers=1)
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    # print(batch_y)
    # print(preds)
    batch_x, batch_y = next(val_gen_2)
    for i in range(len(batch_x)):
        # NOTE(review): predict() runs on the whole batch for EVERY i — it
        # looks hoistable above this loop; left as-is here.
        preds = model.predict(batch_x)
        query = batch_x[i]
        expected = batch_y[i]
        prediction = preds[i]
        #print(preds)
        # preds[preds>=0.5] = 1
        # preds[preds<0.5] = 0
        #q = ctable.decode(query)
        correct = ctable.decode(expected)
        guess = ctable.decode(prediction)
        print('T', correct, ' G', guess)
# Final architecture recap and trained weights on disk.
model.summary()
model.save(name + '.h5')

69
36-dnn/generate_words.py Normal file
View File

@@ -0,0 +1,69 @@
import os, sys
import collections
import numpy as np
import re
# Defaults; VOCAB_SIZE is overwritten by generate_data's vocab_size argument.
SAMPLE_SIZE = 80  # words per generated query
VOCAB_SIZE = 10   # size of the vocabulary drawn from the corpus
TOP = 5           # NOTE(review): appears unused in this file
# Corpus: Pride and Prejudice, lower-cased words of 2+ letters.  Stop words
# stay in all_words (the raw stream) but are excluded from the vocabulary list.
stopwords = set(open('../stop_words.txt').read().split(','))
all_words = re.findall('[a-z]{2,}', open('../pride-and-prejudice.txt').read().lower())
words = list(set([w for w in all_words if w not in stopwords]))
def generate_pair(with_counts):
    """Generate one (query, answer) pair.

    # Arguments
        with_counts: if True, answer entries are "word count" strings;
            otherwise they are bare words.

    # Returns
        (query, ans): a SAMPLE_SIZE-word list and its answer, the words
        ordered by descending frequency (Counter.most_common order).
    """
    # Grab a random slice of the input file of size SAMPLE_SIZE.
    index = np.random.randint(0, len(all_words) - SAMPLE_SIZE)
    querytmp = all_words[index:index + SAMPLE_SIZE]
    # Replace unknown words with known ones.  PERF FIX: build the known-word
    # set once instead of scanning the words[:VOCAB_SIZE] list slice for
    # every word of the query.
    known = set(words[:VOCAB_SIZE])
    query = querytmp
    for i, w in enumerate(querytmp):
        # query[i] == w guards against words already replaced earlier.
        if w not in known and query[i] == w:
            # Replace ALL occurrences in query with the same replacement word.
            # BUG FIX: VOCAB_SIZE/2 is a float in Python 3; use floor division.
            other = words[np.random.randint(0, VOCAB_SIZE // 2)]
            query = [other if v == w else v for v in query]
    top = collections.Counter(query).most_common()
    if with_counts:
        ans = [t[0] + " " + str(t[1]) for t in top]
    else:
        ans = [t[0] for t in top]  # equivalent to the old zip(*top) trick
    return query, ans
def _write_split(path_x, path_y, nsamples, with_counts):
    # Helper: write nsamples (query, answer) pairs to one x/y file pair,
    # one sample per line (queries space-separated, answers comma-separated).
    with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
        for _ in range(0, nsamples):
            query, ans = generate_pair(with_counts)
            fx.write(' '.join(query) + '\n')
            fy.write(','.join(ans) + '\n')

def generate_data(data_folder, ntrain, nval, vocab_size, with_counts):
    """Generate training and validation files under data_folder.

    # Arguments
        data_folder: output directory (created if missing).
        ntrain: number of training samples.
        nval: number of validation samples.
        vocab_size: vocabulary size; overwrites the module-level VOCAB_SIZE
            used by generate_pair.
        with_counts: whether answer lines include per-word counts.
    """
    global VOCAB_SIZE
    train_x = os.path.join(data_folder, 'train_x.txt')
    train_y = os.path.join(data_folder, 'train_y.txt')
    val_x = os.path.join(data_folder, 'val_x.txt')
    val_y = os.path.join(data_folder, 'val_y.txt')
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    VOCAB_SIZE = vocab_size
    # DRY: the two identical write loops now share one helper.
    _write_split(train_x, train_y, ntrain, with_counts)
    _write_split(val_x, val_y, nval, with_counts)
def main():
    """CLI entry point for data generation.

    Command-line arguments:
      [1] number of samples in the training set
      [2] number of samples in the validation set
      [3] vocabulary size (also appended to the data-folder name)
      [4] output with (1) or without (0) counts
    """
    folder = 'words_data'
    if len(sys.argv) > 3:
        folder = folder + "_" + sys.argv[3]
    ntrain = int(sys.argv[1])
    nval = int(sys.argv[2])
    vocab = int(sys.argv[3])
    with_counts = bool(int(sys.argv[4]))
    generate_data(folder, ntrain, nval, vocab, with_counts)

if __name__ == "__main__":
    main()