Some renaming

This commit is contained in:
Crista Lopes
2019-12-26 10:40:03 -08:00
parent 3412e6fe95
commit 5a2b5975c7
8 changed files with 112 additions and 0 deletions

264
37-dnn/count_words.py Normal file
View File

@@ -0,0 +1,264 @@
# -*- coding: utf-8 -*-
'''
# An implementation of deep learning for counting symbols
Input: [10, 12, 10, 11, 2, 2, 2, 1, 1]
Output: words=[2, 10, 1, 12, 11] counts=[3, 2, 2, 1, 1] (Not necessarily in this order)
'''
from __future__ import print_function
from keras.models import Sequential
from keras import layers, metrics
from keras import backend as K
from keras.utils import plot_model
from keras.utils import to_categorical
import numpy as np
from six.moves import range
import string, re, collections, os, sys
# Parameters for the model and dataset. TRAINING_SIZE, VOCAB_SIZE and
# SAMPLE_SIZE are placeholders: WordTable.__init__ rebinds them from the
# actual training data.
TRAINING_SIZE = 50000
VOCAB_SIZE = 1000
SAMPLE_SIZE = 100
TOP = 2          # how many top-scoring words to report when decoding
BATCH_SIZE = 50
# Optional CLI argument 1 selects a suffixed data folder (e.g. words_data_100).
data_folder = 'words_data'
if len(sys.argv) > 1:
    data_folder = data_folder + '_' + sys.argv[1]
train_x = os.path.join(data_folder, 'train_x.txt')
train_y = os.path.join(data_folder, 'train_y.txt')
val_x = os.path.join(data_folder, 'val_x.txt')
val_y = os.path.join(data_folder, 'val_y.txt')
class WordTable(object):
    """Vocabulary built from the training file `train_x`.

    + Encode a word, or a list of word indices, to a one-hot representation
    + Decode a one-hot or probability vector (or a 2D array of them) to words

    NOTE: constructing the table also rebinds the module-level
    TRAINING_SIZE, VOCAB_SIZE, SAMPLE_SIZE and BATCH_SIZE from the data.
    """
    def __init__(self):
        """Scan `train_x` and build the word <-> index maps."""
        global TRAINING_SIZE
        global VOCAB_SIZE
        global SAMPLE_SIZE
        global BATCH_SIZE
        self.words = set()
        nlines = 0
        max_words = 0
        with open(train_x) as f:
            for line in f:
                words = line.split()
                self.words.update(words)
                nlines = nlines + 1
                if max_words < len(words):
                    max_words = len(words)
        self.words = list(self.words)
        # Bidirectional word <-> index maps.
        self.word_indices = dict((w, i) for i, w in enumerate(self.words))
        self.indices_word = dict((i, w) for i, w in enumerate(self.words))
        TRAINING_SIZE = nlines
        VOCAB_SIZE = len(self.words)
        SAMPLE_SIZE = max_words  # longest training line, in words
        BATCH_SIZE = 50

    def words_to_indices(self, words):
        """Map words to their integer indices (KeyError on unknown words)."""
        return [self.word_indices[w] for w in words]

    def indices_to_words(self, indices):
        """Map integer indices back to their words."""
        return [self.indices_word[i] for i in indices]

    def encode_one_hot(self, W, forConv=False):
        """One-hot encode given word, or list of indices, W.

        # Arguments
            W: either a word (str) or a list of indices, to be encoded.
            forConv: if True, add a trailing channel axis for Conv2D input.
        """
        # BUG FIX: this used to be `type(W) is string`, which compares the
        # type against the `string` *module* and is always False, so single
        # words always raised "Bad type to encode".
        if isinstance(W, str):
            x = np.zeros(VOCAB_SIZE)
            x[self.word_indices[W]] = 1
            return x
        elif isinstance(W, list):  # Example: [3, 9, 5]
            x = np.zeros((SAMPLE_SIZE, VOCAB_SIZE)) if not forConv else np.zeros((SAMPLE_SIZE, VOCAB_SIZE, 1))
            for i, w in enumerate(W):
                if i >= SAMPLE_SIZE:
                    break  # truncate over-long samples
                if not forConv:
                    x[i, w] = 1
                else:
                    x[i, w, 0] = 1
            return x
        else:
            raise Exception("Bad type to encode")

    def decode(self, x):
        """Decode probabilities/one-hot representations to words.

        # Arguments
            x: a 1D vector (single prediction) or a 2D array of them.
        """
        if x.ndim == 1:  # either a single word, one-hot encoded, or multiple words
            # Indices of the TOP highest-scoring vocabulary entries.
            one_idxs = np.argpartition(x, -TOP)[-TOP:]
            # Message generalized: it previously said "Top 2" even when TOP != 2.
            print(f'Top {TOP} indices are {one_idxs} and values are ', np.rint(x[one_idxs]))
            return [self.indices_word[i] for i in one_idxs]
        elif x.ndim == 2:  # a list of words, each one-hot encoded
            words = []
            for w in x:
                words.append(self.decode(w))
            return words
        else:
            raise Exception("Bad type to decode")
# Building the table reads train_x and rebinds the size globals printed below.
ctable = WordTable()
print(f'Words table with training size {TRAINING_SIZE}, batch size {BATCH_SIZE}, vocab size {VOCAB_SIZE} and sample size {SAMPLE_SIZE}')
def line_x_to_indices(line):
    """Map a whitespace-separated line of words to vocabulary indices."""
    return ctable.words_to_indices(line.split())
def line_y_to_indices(line):
    """Parse a comma-separated answer line into (word_indices, counts).

    Each pair is either "word count" (counts present) or just "word"
    (counts absent, in which case every count defaults to 1).
    """
    pairs = line.split(',')
    # BUG FIX: the old test was `len(pairs[0]) < 2`, which can never detect
    # the no-counts format (words are always 2+ characters); test for the
    # presence of a second whitespace-separated field instead.
    if len(pairs[0].split()) < 2:  # no counts here
        # BUG FIX: this branch used to return a list of (index, 1) pairs,
        # while the other branch returns a (indices, counts) 2-tuple; the
        # caller unpacks two values, so both branches must agree. Also strip
        # the trailing newline that split(',') leaves on the last word.
        w_indices = ctable.words_to_indices([p.strip() for p in pairs])
        return w_indices, [1 for _ in range(len(pairs))]
    else:
        words = [p.split()[0] for p in pairs]
        counts = [int(p.split()[1]) for p in pairs]
        w_indices = ctable.words_to_indices(words)
        return w_indices, counts
def input_generator(nsamples, train=True, forConv=False):
    """Yield (x, y) batches of `nsamples` encoded samples from the data files.

    # Arguments
        nsamples: batch size.
        train: read the training files if True, else the validation files.
        forConv: add a trailing channel axis to x for Conv2D models.
    """
    print('Generating input for ', 'training' if train else 'validation')
    f_x, f_y = (train_x, train_y) if train else (val_x, val_y)

    def _new_batch():
        # BUG FIX: `np.int` was removed in NumPy 1.24; plain `int` is the
        # documented equivalent dtype.
        if forConv:
            x = np.zeros((nsamples, SAMPLE_SIZE, VOCAB_SIZE, 1), dtype=int)
        else:
            x = np.zeros((nsamples, SAMPLE_SIZE, VOCAB_SIZE), dtype=int)
        y = np.zeros((nsamples, VOCAB_SIZE), dtype=np.float64)
        return x, y

    with open(f_x) as fx, open(f_y) as fy:
        j = 0
        x, y = _new_batch()
        for line_x, line_y in zip(fx, fy):
            question = line_x_to_indices(line_x)
            expected_w, expected_c = line_y_to_indices(line_y)
            x[j] = ctable.encode_one_hot(question, forConv)
            y[j][expected_w] = expected_c  # target: per-word counts
            j = j + 1
            if j % nsamples == 0:
                yield x, y
                j = 0
                x, y = _new_batch()
    print("End of ", 'training' if train else 'validation')
    return x, y
def topcats(y_true, y_pred):
    """Keras metric: top-K categorical accuracy with K fixed to the global TOP."""
    accuracy = metrics.top_k_categorical_accuracy(y_true, y_pred, k=TOP)
    return accuracy
def model_ff():
    """Feed-forward counting model: Dense -> Flatten -> Dense(sigmoid).

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 50
    net = Sequential()
    for layer in (
            layers.Dense(VOCAB_SIZE, input_shape=(SAMPLE_SIZE, VOCAB_SIZE)),
            layers.Flatten(),
            layers.Dense(VOCAB_SIZE, activation='sigmoid')):
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-ff2-{}b-{}ep".format(BATCH_SIZE, epochs)
def model_convnet1D():
    """1D convnet over the one-hot word sequence, global-max-pooled.

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 1
    net = Sequential()
    for layer in (
            layers.Conv1D(32, 10, activation='relu',
                          input_shape=(SAMPLE_SIZE, VOCAB_SIZE)),
            layers.MaxPooling1D(2),
            layers.Conv1D(64, 10, activation='relu'),
            layers.MaxPooling1D(2),
            layers.Conv1D(64, 10, activation='relu'),
            layers.GlobalMaxPooling1D(),
            layers.Dense(VOCAB_SIZE, activation='sigmoid')):
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-convnet1D-{}b-{}ep".format(BATCH_SIZE, epochs)
def SumPooling2D(x):
    # Used as a Keras Lambda layer: sums over axis 1 of the batched tensor
    # (presumably the word-position axis of the Conv2D output, turning
    # per-position detections into counts — confirm against model_convnet2D).
    return K.sum(x, axis=1)
def model_convnet2D():
    """2D conv counting model: one Conv2D whose kernel spans the whole
    vocabulary axis, then a sum over positions (SumPooling2D).

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 150
    net = Sequential()
    net.add(layers.Conv2D(VOCAB_SIZE, (1, VOCAB_SIZE),
                          input_shape=(SAMPLE_SIZE, VOCAB_SIZE, 1)))
    net.add(layers.Lambda(SumPooling2D))
    net.add(layers.Reshape((VOCAB_SIZE,)))
    net.compile(loss='mean_squared_error', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-convnet2D-{}b-{}ep".format(BATCH_SIZE, epochs)
# Build, train, and periodically visualize predictions against validation data.
model, epochs, name = model_convnet2D()
model.summary()
plot_model(model, to_file=name + '.png', show_shapes=True)

# Train the model each generation and show predictions against the validation
# dataset.
val_gen_2 = input_generator(5, train=False, forConv=True)
for iteration in range(1, epochs):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE, forConv=True)
    val_gen = input_generator(BATCH_SIZE, False, forConv=True)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select a few samples from the validation set so we can visualize errors.
    batch_x, batch_y = next(val_gen_2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_x)):
        correct = ctable.decode(batch_y[i])
        guess = ctable.decode(preds[i])
        print('T', correct, ' G', guess)
model.summary()
model.save(name + '.h5')

View File

@@ -0,0 +1,74 @@
import os, sys
import collections
import numpy as np
import re, string
MAX_LINE_SIZE = 80       # characters per generated query line
MAX_WORDS_IN_LINE = 20   # NOTE(review): appears unused in this file
# Load the whole corpus as one string, with newlines flattened to spaces.
all_chars = ""
with open('pride-and-prejudice.txt') as f:
    all_chars = f.read().replace('\n', ' ')
# Every lowercase word occurrence of 2+ letters in the corpus.
all_words = re.findall('[a-z]{2,}', all_chars.lower())
# Unique words, in arbitrary (set) order.
words = list(set(all_words))
def generate_pair():
    """Return (query_chars, query_words) for one random corpus slice.

    Out-of-vocabulary words in the slice are replaced (all occurrences by
    the same replacement) with words drawn from the top half of the
    vocabulary. Relies on the module-level VOCAB_SIZE set by generate_data.
    """
    # Grab a slice of the input file of size MAX_LINE_SIZE
    index = np.random.randint(0, len(all_chars) - MAX_LINE_SIZE)
    cquery = ' ' + all_chars[index:index + MAX_LINE_SIZE - 2] + ' '
    # Replace unknown words with known ones.
    # PERF FIX: membership is tested per word; use a set (O(1)) instead of
    # scanning the words list (O(n)) on every test.
    vocab = set(words[:VOCAB_SIZE])
    wquery = set(re.findall('[a-z]{2,}', cquery.lower()))
    for w in wquery:
        if w not in vocab:
            # Replace ALL occurrences in query with the same replacement word.
            # BUG FIX: randint's high bound must be an int; VOCAB_SIZE/2 is a
            # float and is rejected by modern NumPy — use floor division.
            other = words[np.random.randint(0, VOCAB_SIZE // 2)]
            exp = '[^a-z]' + w + '[^a-z]'
            indices = [(m.start() + 1, m.end() - 1) for m in re.finditer(exp, cquery.lower())]
            # Replace right-to-left so earlier indices stay valid.
            for b, e in reversed(indices):
                cquery = cquery[0:b] + other + cquery[e:]
    # Make sure the size of all chars is less than MAX_LINE_SIZE
    if len(cquery) >= MAX_LINE_SIZE:
        last_sp = cquery[:MAX_LINE_SIZE].rfind(' ')
        cquery = cquery[:last_sp] + ' ' * (MAX_LINE_SIZE - last_sp)
    # OK, now that we have the sequence of chars, find its sequence of words
    # [TODO] Remember to remove stop words
    list_of_words = re.findall('[a-z]{2,}', cquery.lower())
    return cquery.strip(), list_of_words
def generate_data(ntrain, nval, vocab_size, data_folder, train_x, train_y, val_x, val_y):
    """Write training and validation (chars -> words) files.

    Each x line is a query string; the matching y line is its comma-joined
    word list. Rebinds the module-level VOCAB_SIZE used by generate_pair.
    """
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    global VOCAB_SIZE
    VOCAB_SIZE = vocab_size

    def _write_split(nsamples, path_x, path_y):
        # One sample per line-pair across the two files.
        with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
            for _ in range(0, nsamples):
                query, ans = generate_pair()
                fx.write(query + '\n')
                fy.write(','.join(ans) + '\n')

    # The two loops were duplicated verbatim; factored into one helper.
    _write_split(ntrain, train_x, train_y)
    _write_split(nval, val_x, val_y)
def main():
    """CLI entry point.

    argv: [1] number of training samples, [2] number of validation samples,
          [3] vocabulary size (also used as a data-folder suffix).
    """
    data_folder = 'c2w_data'
    if len(sys.argv) > 3:
        data_folder = data_folder + "_" + sys.argv[3]
    paths = [os.path.join(data_folder, stem + '.txt')
             for stem in ('train_x', 'train_y', 'val_x', 'val_y')]
    ntrain, nval, vocab = (int(a) for a in sys.argv[1:4])
    generate_data(ntrain, nval, vocab, data_folder, *paths)


if __name__ == "__main__":
    main()

69
37-dnn/generate_words.py Normal file
View File

@@ -0,0 +1,69 @@
import os, sys
import collections
import numpy as np
import re
SAMPLE_SIZE = 80   # words per generated sample
VOCAB_SIZE = 10    # placeholder; generate_data rebinds it from its argument
TOP = 5            # NOTE(review): appears unused in this file
# Stop words to exclude from the vocabulary.
stopwords = set(open('../stop_words.txt').read().split(','))
# Every word occurrence (2+ letters) in the corpus, lowercased.
all_words = re.findall('[a-z]{2,}', open('../pride-and-prejudice.txt').read().lower())
# Unique non-stop words, in arbitrary (set) order.
words = list(set([w for w in all_words if w not in stopwords]))
def generate_pair(with_counts):
    """Return (query_words, answer) for one random corpus slice.

    The answer lists the slice's words by descending frequency, either as
    bare words or as "word count" strings when with_counts is True.
    """
    # Grab a slice of the input file of size SAMPLE_SIZE
    index = np.random.randint(0, len(all_words) - SAMPLE_SIZE)
    querytmp = all_words[index:index + SAMPLE_SIZE]
    # Replace unknown words with known ones.
    query = querytmp
    # PERF FIX: hoist the vocabulary into a set once; the original scanned
    # the words list (O(n)) on every iteration.
    vocab = set(words[:VOCAB_SIZE])
    for i, w in enumerate(querytmp):
        # `query[i] == w` means w was not already replaced by an earlier pass.
        if w not in vocab and query[i] == w:
            # Replace ALL occurrences in query with the same replacement word.
            # BUG FIX: randint's high bound must be an int; VOCAB_SIZE/2 is a
            # float and is rejected by modern NumPy — use floor division.
            other = words[np.random.randint(0, VOCAB_SIZE // 2)]
            query = [other if v == w else v for v in query]
    counts = collections.Counter(query)
    top = counts.most_common()
    if not with_counts:
        ans = list(list(zip(*top))[0])
    else:
        ans = [t[0] + " " + str(t[1]) for t in top]
    return query, ans
def generate_data(data_folder, ntrain, nval, vocab_size, with_counts):
    """Write training and validation word-sequence files under data_folder.

    Each x line is a space-joined word sequence; the matching y line is the
    comma-joined answer from generate_pair. Rebinds the module-level
    VOCAB_SIZE used by generate_pair.
    """
    train_x = os.path.join(data_folder, 'train_x.txt')
    train_y = os.path.join(data_folder, 'train_y.txt')
    val_x = os.path.join(data_folder, 'val_x.txt')
    val_y = os.path.join(data_folder, 'val_y.txt')
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    global VOCAB_SIZE
    VOCAB_SIZE = vocab_size

    def _write_split(nsamples, path_x, path_y):
        # One sample per line-pair across the two files.
        with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
            for _ in range(0, nsamples):
                query, ans = generate_pair(with_counts)
                fx.write(' '.join(query) + '\n')
                fy.write(','.join(ans) + '\n')

    # The two loops were duplicated verbatim; factored into one helper.
    _write_split(ntrain, train_x, train_y)
    _write_split(nval, val_x, val_y)
def main():
    """CLI entry point.

    argv: [1] number of training samples, [2] number of validation samples,
          [3] vocabulary size, [4] 1 to include counts in the output, else 0.
    """
    folder = 'words_data'
    if len(sys.argv) > 3:
        folder = folder + "_" + sys.argv[3]
    ntrain, nval, vocab = (int(a) for a in sys.argv[1:4])
    with_counts = bool(int(sys.argv[4]))
    generate_data(folder, ntrain, nval, vocab, with_counts)


if __name__ == "__main__":
    main()

102
37-dnn/normalize-char-ff.py Normal file
View File

@@ -0,0 +1,102 @@
from keras.models import Model, Sequential
from keras import layers
from keras.layers import Input, Dense
from keras.utils import plot_model
import numpy as np
import sys, os, string, random
# All printable characters, sorted so index assignment is stable across runs.
characters = sorted(string.printable)
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))
INPUT_VOCAB_SIZE = len(characters)  # len(string.printable) == 100
BATCH_SIZE = 200
def encode_one_hot(c):
    """Return a one-hot vector of length INPUT_VOCAB_SIZE for character c."""
    vec = np.zeros((INPUT_VOCAB_SIZE))
    vec[char_indices[c]] = 1
    return vec
def decode_one_hot(x):
    """Return the character whose entry in the one-hot/probability vector x
    is largest."""
    return indices_char[np.argmax(x)]
def build_model():
    """A single softmax Dense layer mapping one one-hot char to another."""
    print('Build model...')
    net = Sequential()
    normalizer = layers.Dense(INPUT_VOCAB_SIZE,
                              input_shape=(INPUT_VOCAB_SIZE, ),
                              activation='softmax')
    net.add(normalizer)
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
def input_generator(nsamples):
    """Endlessly yield (inputs, targets) batches of one-hot encoded chars.

    The target for a letter is its lowercase form; every other character
    normalizes to a space.
    """
    def random_char():
        c = random.choice(characters)
        target = c.lower() if c in string.ascii_letters else ' '
        return c, target

    while True:
        batch_in = np.zeros((nsamples, INPUT_VOCAB_SIZE))
        batch_out = np.zeros((nsamples, INPUT_VOCAB_SIZE))
        for row in range(nsamples):
            c, target = random_char()
            batch_in[row] = encode_one_hot(c)
            batch_out[row] = encode_one_hot(target)
        yield batch_in, batch_out
# Build, train, and periodically visualize predictions.
model = build_model()
model.summary()
plot_model(model, to_file='normalization.png', show_shapes=True)

# Train the model each generation and show predictions against a dataset.
val_gen2 = input_generator(4)
for iteration in range(1, 500):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE)
    val_gen = input_generator(BATCH_SIZE)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select samples from the a set at random so we can visualize errors.
    batch_x, batch_y = next(val_gen2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_y)):
        correct = decode_one_hot(batch_y[i])
        guess = decode_one_hot(preds[i])
        print('T', correct)
        print('G', guess)
#with open(sys.argv[1]) as f:
# for line in f:
# if line.isspace(): continue
# onehots = encode_one_hot(line)
# data = [[] for _ in range(LINE_SIZE)]
# for i, c in enumerate(onehots):
# data[i].append(c)
# for j in range(len(onehots), LINE_SIZE):
# data[j].append(np.zeros((INPUT_VOCAB_SIZE)))
# inputs = [np.array(e) for e in data]
# preds = model.predict(inputs)
# normal = decode_one_hot(preds[0])
# print(decode_one_hot(onehots))
# print(normal)

142
37-dnn/normalize-chars.py Normal file
View File

@@ -0,0 +1,142 @@
from keras.models import Model
from keras import layers, metrics
from keras.layers import Input, Dense
from keras.utils import plot_model
import numpy as np
import sys, os, string, random
# All printable characters, sorted so index assignment is stable across runs.
characters = sorted(string.printable)
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))
INPUT_VOCAB_SIZE = len(characters)  # len(string.printable) == 100
LINE_SIZE = 100   # fixed number of character positions per line
BATCH_SIZE = 200
def encode_one_hot(s):
    """One-hot encode all characters of the given string.

    Returns a list of len(s) vectors, each of length INPUT_VOCAB_SIZE with
    a single 1 at the character's index.
    """
    # Renamed the accumulator from `all`, which shadowed the builtin all().
    vectors = []
    for c in s:
        x = np.zeros((INPUT_VOCAB_SIZE))
        index = char_indices[c]
        x[index] = 1
        vectors.append(x)
    return vectors
def encode_one_hot2(s):
    """One-hot encode s into a fixed (LINE_SIZE, INPUT_VOCAB_SIZE) matrix.

    Rows past len(s) are left all-zero (padding).
    """
    mat = np.zeros((LINE_SIZE, INPUT_VOCAB_SIZE))
    for row, c in enumerate(s):
        mat[row, char_indices[c]] = 1
    return mat
def decode_one_hot(x):
    """Decode a (rows, INPUT_VOCAB_SIZE) matrix to a string, taking the
    argmax character of each row."""
    return ''.join(indices_char[np.argmax(row)] for row in x)
def build_model():
    """LINE_SIZE parallel one-hot inputs share one Dense softmax layer;
    the per-position outputs are concatenated and reshaped back to a
    (LINE_SIZE, INPUT_VOCAB_SIZE) tensor."""
    print('Build model...')
    # Normalize every character position with one shared dense layer.
    shared = Dense(INPUT_VOCAB_SIZE, activation="softmax")
    inputs = []
    outputs = []
    for _ in range(0, LINE_SIZE):
        inp = Input(shape=(INPUT_VOCAB_SIZE, ))
        inputs.append(inp)
        outputs.append(shared(inp))
    merged = layers.concatenate(outputs, axis=-1)
    stacked = layers.Reshape((LINE_SIZE, INPUT_VOCAB_SIZE, ))(merged)
    net = Model(inputs=inputs, outputs=stacked)
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
def input_generator(nsamples):
    """Endlessly yield (list-of-position-arrays, targets) batches.

    Inputs are LINE_SIZE arrays of shape (nsamples, INPUT_VOCAB_SIZE), one
    per character position, zero-padded past the end of each random line.
    The target line lowercases letters and blanks everything else.
    """
    def random_line():
        raw = [random.choice(characters) for _ in range(random.randint(1, LINE_SIZE))]
        norm = [c.lower() if c in string.ascii_letters else ' ' for c in raw]
        return raw, norm

    while True:
        columns = [[] for _ in range(LINE_SIZE)]
        targets = np.zeros((nsamples, LINE_SIZE, INPUT_VOCAB_SIZE))
        for n in range(nsamples):
            raw, norm = random_line()
            encoded = encode_one_hot(raw)
            for pos, vec in enumerate(encoded):
                columns[pos].append(vec)
            for pos in range(len(encoded), LINE_SIZE):
                columns[pos].append(np.zeros((INPUT_VOCAB_SIZE)))
            targets[n] = encode_one_hot2(norm)
        yield [np.array(col) for col in columns], targets
# Build, train, visualize, then normalize the file given on the command line.
model = build_model()
#model.summary()
plot_model(model, to_file='normalization.png', show_shapes=True)

# Train the model each generation and show predictions against the validation
# dataset.
val_gen2 = input_generator(1)
for iteration in range(1, 12):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE)
    val_gen = input_generator(BATCH_SIZE)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select samples from the a set at random so we can visualize errors.
    batch_x, batch_y = next(val_gen2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_y)):
        correct = decode_one_hot(batch_y[i])
        guess = decode_one_hot(preds[i])
        print('T:', correct)
        print('G:', guess)

# Inference: normalize each line of the input file.
# NOTE(review): lines longer than LINE_SIZE would overflow `data` below —
# the input is presumably pre-wrapped; confirm before reuse.
with open(sys.argv[1]) as f:
    for line in f:
        if line.isspace():
            continue
        onehots = encode_one_hot(line)
        data = [[] for _ in range(LINE_SIZE)]
        for i, c in enumerate(onehots):
            data[i].append(c)
        # Zero-pad the remaining character positions.
        for j in range(len(onehots), LINE_SIZE):
            data[j].append(np.zeros((INPUT_VOCAB_SIZE)))
        inputs = [np.array(e) for e in data]
        preds = model.predict(inputs)
        normal = decode_one_hot(preds[0])
        print(decode_one_hot(onehots))
        print(normal)