Some renaming

This commit is contained in:
Crista Lopes
2019-12-26 10:40:03 -08:00
parent 3412e6fe95
commit 5a2b5975c7
8 changed files with 112 additions and 0 deletions

264
37-dnn/count_words.py Normal file
View File

@@ -0,0 +1,264 @@
# -*- coding: utf-8 -*-
'''
# An implementation of deep learning for counting symbols
Input: [10, 12, 10, 11, 2, 2, 2, 1, 1]
Output: words=[2, 10, 1, 12, 11] counts=[3, 2, 2, 1, 1] (Not necessarily in this order)
'''
from __future__ import print_function
from keras.models import Sequential
from keras import layers, metrics
from keras import backend as K
from keras.utils import plot_model
from keras.utils import to_categorical
import numpy as np
from six.moves import range
import string, re, collections, os, sys
# Parameters for the model and dataset. TRAINING_SIZE, VOCAB_SIZE and
# SAMPLE_SIZE are placeholders: WordTable.__init__ rebinds them from the
# actual training data.
TRAINING_SIZE = 50000
VOCAB_SIZE = 1000
SAMPLE_SIZE = 100
TOP = 2          # how many top-scoring words to report when decoding
BATCH_SIZE = 50
# Optional CLI argument 1 selects a suffixed data folder (e.g. words_data_100).
data_folder = 'words_data'
if len(sys.argv) > 1:
    data_folder = data_folder + '_' + sys.argv[1]
train_x = os.path.join(data_folder, 'train_x.txt')
train_y = os.path.join(data_folder, 'train_y.txt')
val_x = os.path.join(data_folder, 'val_x.txt')
val_y = os.path.join(data_folder, 'val_y.txt')
class WordTable(object):
    """Vocabulary built from the training file `train_x`.

    + Encode a word, or a list of word indices, to a one-hot representation
    + Decode a one-hot or probability vector (or a 2D array of them) to words

    NOTE: constructing the table also rebinds the module-level
    TRAINING_SIZE, VOCAB_SIZE, SAMPLE_SIZE and BATCH_SIZE from the data.
    """
    def __init__(self):
        """Scan `train_x` and build the word <-> index maps."""
        global TRAINING_SIZE
        global VOCAB_SIZE
        global SAMPLE_SIZE
        global BATCH_SIZE
        self.words = set()
        nlines = 0
        max_words = 0
        with open(train_x) as f:
            for line in f:
                words = line.split()
                self.words.update(words)
                nlines = nlines + 1
                if max_words < len(words):
                    max_words = len(words)
        self.words = list(self.words)
        # Bidirectional word <-> index maps.
        self.word_indices = dict((w, i) for i, w in enumerate(self.words))
        self.indices_word = dict((i, w) for i, w in enumerate(self.words))
        TRAINING_SIZE = nlines
        VOCAB_SIZE = len(self.words)
        SAMPLE_SIZE = max_words  # longest training line, in words
        BATCH_SIZE = 50

    def words_to_indices(self, words):
        """Map words to their integer indices (KeyError on unknown words)."""
        return [self.word_indices[w] for w in words]

    def indices_to_words(self, indices):
        """Map integer indices back to their words."""
        return [self.indices_word[i] for i in indices]

    def encode_one_hot(self, W, forConv=False):
        """One-hot encode given word, or list of indices, W.

        # Arguments
            W: either a word (str) or a list of indices, to be encoded.
            forConv: if True, add a trailing channel axis for Conv2D input.
        """
        # BUG FIX: this used to be `type(W) is string`, which compares the
        # type against the `string` *module* and is always False, so single
        # words always raised "Bad type to encode".
        if isinstance(W, str):
            x = np.zeros(VOCAB_SIZE)
            x[self.word_indices[W]] = 1
            return x
        elif isinstance(W, list):  # Example: [3, 9, 5]
            x = np.zeros((SAMPLE_SIZE, VOCAB_SIZE)) if not forConv else np.zeros((SAMPLE_SIZE, VOCAB_SIZE, 1))
            for i, w in enumerate(W):
                if i >= SAMPLE_SIZE:
                    break  # truncate over-long samples
                if not forConv:
                    x[i, w] = 1
                else:
                    x[i, w, 0] = 1
            return x
        else:
            raise Exception("Bad type to encode")

    def decode(self, x):
        """Decode probabilities/one-hot representations to words.

        # Arguments
            x: a 1D vector (single prediction) or a 2D array of them.
        """
        if x.ndim == 1:  # either a single word, one-hot encoded, or multiple words
            # Indices of the TOP highest-scoring vocabulary entries.
            one_idxs = np.argpartition(x, -TOP)[-TOP:]
            # Message generalized: it previously said "Top 2" even when TOP != 2.
            print(f'Top {TOP} indices are {one_idxs} and values are ', np.rint(x[one_idxs]))
            return [self.indices_word[i] for i in one_idxs]
        elif x.ndim == 2:  # a list of words, each one-hot encoded
            words = []
            for w in x:
                words.append(self.decode(w))
            return words
        else:
            raise Exception("Bad type to decode")
# Building the table reads train_x and rebinds the size globals printed below.
ctable = WordTable()
print(f'Words table with training size {TRAINING_SIZE}, batch size {BATCH_SIZE}, vocab size {VOCAB_SIZE} and sample size {SAMPLE_SIZE}')
def line_x_to_indices(line):
    """Map a whitespace-separated line of words to vocabulary indices."""
    return ctable.words_to_indices(line.split())
def line_y_to_indices(line):
    """Parse a comma-separated answer line into (word_indices, counts).

    Each pair is either "word count" (counts present) or just "word"
    (counts absent, in which case every count defaults to 1).
    """
    pairs = line.split(',')
    # BUG FIX: the old test was `len(pairs[0]) < 2`, which can never detect
    # the no-counts format (words are always 2+ characters); test for the
    # presence of a second whitespace-separated field instead.
    if len(pairs[0].split()) < 2:  # no counts here
        # BUG FIX: this branch used to return a list of (index, 1) pairs,
        # while the other branch returns a (indices, counts) 2-tuple; the
        # caller unpacks two values, so both branches must agree. Also strip
        # the trailing newline that split(',') leaves on the last word.
        w_indices = ctable.words_to_indices([p.strip() for p in pairs])
        return w_indices, [1 for _ in range(len(pairs))]
    else:
        words = [p.split()[0] for p in pairs]
        counts = [int(p.split()[1]) for p in pairs]
        w_indices = ctable.words_to_indices(words)
        return w_indices, counts
def input_generator(nsamples, train=True, forConv=False):
    """Yield (x, y) batches of `nsamples` encoded samples from the data files.

    # Arguments
        nsamples: batch size.
        train: read the training files if True, else the validation files.
        forConv: add a trailing channel axis to x for Conv2D models.
    """
    print('Generating input for ', 'training' if train else 'validation')
    f_x, f_y = (train_x, train_y) if train else (val_x, val_y)

    def _new_batch():
        # BUG FIX: `np.int` was removed in NumPy 1.24; plain `int` is the
        # documented equivalent dtype.
        if forConv:
            x = np.zeros((nsamples, SAMPLE_SIZE, VOCAB_SIZE, 1), dtype=int)
        else:
            x = np.zeros((nsamples, SAMPLE_SIZE, VOCAB_SIZE), dtype=int)
        y = np.zeros((nsamples, VOCAB_SIZE), dtype=np.float64)
        return x, y

    with open(f_x) as fx, open(f_y) as fy:
        j = 0
        x, y = _new_batch()
        for line_x, line_y in zip(fx, fy):
            question = line_x_to_indices(line_x)
            expected_w, expected_c = line_y_to_indices(line_y)
            x[j] = ctable.encode_one_hot(question, forConv)
            y[j][expected_w] = expected_c  # target: per-word counts
            j = j + 1
            if j % nsamples == 0:
                yield x, y
                j = 0
                x, y = _new_batch()
    print("End of ", 'training' if train else 'validation')
    return x, y
def topcats(y_true, y_pred):
    """Keras metric: top-K categorical accuracy with K fixed to the global TOP."""
    accuracy = metrics.top_k_categorical_accuracy(y_true, y_pred, k=TOP)
    return accuracy
def model_ff():
    """Feed-forward counting model: Dense -> Flatten -> Dense(sigmoid).

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 50
    net = Sequential()
    for layer in (
            layers.Dense(VOCAB_SIZE, input_shape=(SAMPLE_SIZE, VOCAB_SIZE)),
            layers.Flatten(),
            layers.Dense(VOCAB_SIZE, activation='sigmoid')):
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-ff2-{}b-{}ep".format(BATCH_SIZE, epochs)
def model_convnet1D():
    """1D convnet over the one-hot word sequence, global-max-pooled.

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 1
    net = Sequential()
    for layer in (
            layers.Conv1D(32, 10, activation='relu',
                          input_shape=(SAMPLE_SIZE, VOCAB_SIZE)),
            layers.MaxPooling1D(2),
            layers.Conv1D(64, 10, activation='relu'),
            layers.MaxPooling1D(2),
            layers.Conv1D(64, 10, activation='relu'),
            layers.GlobalMaxPooling1D(),
            layers.Dense(VOCAB_SIZE, activation='sigmoid')):
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-convnet1D-{}b-{}ep".format(BATCH_SIZE, epochs)
def SumPooling2D(x):
    # Used as a Keras Lambda layer: sums over axis 1 of the batched tensor
    # (presumably the word-position axis of the Conv2D output, turning
    # per-position detections into counts — confirm against model_convnet2D).
    return K.sum(x, axis=1)
def model_convnet2D():
    """2D conv counting model: one Conv2D whose kernel spans the whole
    vocabulary axis, then a sum over positions (SumPooling2D).

    Returns (model, epochs, run_name).
    """
    print('Build model...')
    epochs = 150
    net = Sequential()
    net.add(layers.Conv2D(VOCAB_SIZE, (1, VOCAB_SIZE),
                          input_shape=(SAMPLE_SIZE, VOCAB_SIZE, 1)))
    net.add(layers.Lambda(SumPooling2D))
    net.add(layers.Reshape((VOCAB_SIZE,)))
    net.compile(loss='mean_squared_error', optimizer='adam',
                metrics=['acc', topcats])
    return net, epochs, "words-convnet2D-{}b-{}ep".format(BATCH_SIZE, epochs)
# Build, train, and periodically visualize predictions against validation data.
model, epochs, name = model_convnet2D()
model.summary()
plot_model(model, to_file=name + '.png', show_shapes=True)

# Train the model each generation and show predictions against the validation
# dataset.
val_gen_2 = input_generator(5, train=False, forConv=True)
for iteration in range(1, epochs):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE, forConv=True)
    val_gen = input_generator(BATCH_SIZE, False, forConv=True)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select a few samples from the validation set so we can visualize errors.
    batch_x, batch_y = next(val_gen_2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_x)):
        correct = ctable.decode(batch_y[i])
        guess = ctable.decode(preds[i])
        print('T', correct, ' G', guess)
model.summary()
model.save(name + '.h5')

View File

@@ -0,0 +1,74 @@
import os, sys
import collections
import numpy as np
import re, string
MAX_LINE_SIZE = 80       # characters per generated query line
MAX_WORDS_IN_LINE = 20   # NOTE(review): appears unused in this file
# Load the whole corpus as one string, with newlines flattened to spaces.
all_chars = ""
with open('pride-and-prejudice.txt') as f:
    all_chars = f.read().replace('\n', ' ')
# Every lowercase word occurrence of 2+ letters in the corpus.
all_words = re.findall('[a-z]{2,}', all_chars.lower())
# Unique words, in arbitrary (set) order.
words = list(set(all_words))
def generate_pair():
    """Return (query_chars, query_words) for one random corpus slice.

    Out-of-vocabulary words in the slice are replaced (all occurrences by
    the same replacement) with words drawn from the top half of the
    vocabulary. Relies on the module-level VOCAB_SIZE set by generate_data.
    """
    # Grab a slice of the input file of size MAX_LINE_SIZE
    index = np.random.randint(0, len(all_chars) - MAX_LINE_SIZE)
    cquery = ' ' + all_chars[index:index + MAX_LINE_SIZE - 2] + ' '
    # Replace unknown words with known ones.
    # PERF FIX: membership is tested per word; use a set (O(1)) instead of
    # scanning the words list (O(n)) on every test.
    vocab = set(words[:VOCAB_SIZE])
    wquery = set(re.findall('[a-z]{2,}', cquery.lower()))
    for w in wquery:
        if w not in vocab:
            # Replace ALL occurrences in query with the same replacement word.
            # BUG FIX: randint's high bound must be an int; VOCAB_SIZE/2 is a
            # float and is rejected by modern NumPy — use floor division.
            other = words[np.random.randint(0, VOCAB_SIZE // 2)]
            exp = '[^a-z]' + w + '[^a-z]'
            indices = [(m.start() + 1, m.end() - 1) for m in re.finditer(exp, cquery.lower())]
            # Replace right-to-left so earlier indices stay valid.
            for b, e in reversed(indices):
                cquery = cquery[0:b] + other + cquery[e:]
    # Make sure the size of all chars is less than MAX_LINE_SIZE
    if len(cquery) >= MAX_LINE_SIZE:
        last_sp = cquery[:MAX_LINE_SIZE].rfind(' ')
        cquery = cquery[:last_sp] + ' ' * (MAX_LINE_SIZE - last_sp)
    # OK, now that we have the sequence of chars, find its sequence of words
    # [TODO] Remember to remove stop words
    list_of_words = re.findall('[a-z]{2,}', cquery.lower())
    return cquery.strip(), list_of_words
def generate_data(ntrain, nval, vocab_size, data_folder, train_x, train_y, val_x, val_y):
    """Write training and validation (chars -> words) files.

    Each x line is a query string; the matching y line is its comma-joined
    word list. Rebinds the module-level VOCAB_SIZE used by generate_pair.
    """
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    global VOCAB_SIZE
    VOCAB_SIZE = vocab_size

    def _write_split(nsamples, path_x, path_y):
        # One sample per line-pair across the two files.
        with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
            for _ in range(0, nsamples):
                query, ans = generate_pair()
                fx.write(query + '\n')
                fy.write(','.join(ans) + '\n')

    # The two loops were duplicated verbatim; factored into one helper.
    _write_split(ntrain, train_x, train_y)
    _write_split(nval, val_x, val_y)
def main():
    """CLI entry point.

    argv: [1] number of training samples, [2] number of validation samples,
          [3] vocabulary size (also used as a data-folder suffix).
    """
    data_folder = 'c2w_data'
    if len(sys.argv) > 3:
        data_folder = data_folder + "_" + sys.argv[3]
    paths = [os.path.join(data_folder, stem + '.txt')
             for stem in ('train_x', 'train_y', 'val_x', 'val_y')]
    ntrain, nval, vocab = (int(a) for a in sys.argv[1:4])
    generate_data(ntrain, nval, vocab, data_folder, *paths)


if __name__ == "__main__":
    main()

69
37-dnn/generate_words.py Normal file
View File

@@ -0,0 +1,69 @@
import os, sys
import collections
import numpy as np
import re
SAMPLE_SIZE = 80   # words per generated sample
VOCAB_SIZE = 10    # placeholder; generate_data rebinds it from its argument
TOP = 5            # NOTE(review): appears unused in this file
# Stop words to exclude from the vocabulary.
stopwords = set(open('../stop_words.txt').read().split(','))
# Every word occurrence (2+ letters) in the corpus, lowercased.
all_words = re.findall('[a-z]{2,}', open('../pride-and-prejudice.txt').read().lower())
# Unique non-stop words, in arbitrary (set) order.
words = list(set([w for w in all_words if w not in stopwords]))
def generate_pair(with_counts):
    """Return (query_words, answer) for one random corpus slice.

    The answer lists the slice's words by descending frequency, either as
    bare words or as "word count" strings when with_counts is True.
    """
    # Grab a slice of the input file of size SAMPLE_SIZE
    index = np.random.randint(0, len(all_words) - SAMPLE_SIZE)
    querytmp = all_words[index:index + SAMPLE_SIZE]
    # Replace unknown words with known ones.
    query = querytmp
    # PERF FIX: hoist the vocabulary into a set once; the original scanned
    # the words list (O(n)) on every iteration.
    vocab = set(words[:VOCAB_SIZE])
    for i, w in enumerate(querytmp):
        # `query[i] == w` means w was not already replaced by an earlier pass.
        if w not in vocab and query[i] == w:
            # Replace ALL occurrences in query with the same replacement word.
            # BUG FIX: randint's high bound must be an int; VOCAB_SIZE/2 is a
            # float and is rejected by modern NumPy — use floor division.
            other = words[np.random.randint(0, VOCAB_SIZE // 2)]
            query = [other if v == w else v for v in query]
    counts = collections.Counter(query)
    top = counts.most_common()
    if not with_counts:
        ans = list(list(zip(*top))[0])
    else:
        ans = [t[0] + " " + str(t[1]) for t in top]
    return query, ans
def generate_data(data_folder, ntrain, nval, vocab_size, with_counts):
    """Write training and validation word-sequence files under data_folder.

    Each x line is a space-joined word sequence; the matching y line is the
    comma-joined answer from generate_pair. Rebinds the module-level
    VOCAB_SIZE used by generate_pair.
    """
    train_x = os.path.join(data_folder, 'train_x.txt')
    train_y = os.path.join(data_folder, 'train_y.txt')
    val_x = os.path.join(data_folder, 'val_x.txt')
    val_y = os.path.join(data_folder, 'val_y.txt')
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    global VOCAB_SIZE
    VOCAB_SIZE = vocab_size

    def _write_split(nsamples, path_x, path_y):
        # One sample per line-pair across the two files.
        with open(path_x, 'w') as fx, open(path_y, 'w') as fy:
            for _ in range(0, nsamples):
                query, ans = generate_pair(with_counts)
                fx.write(' '.join(query) + '\n')
                fy.write(','.join(ans) + '\n')

    # The two loops were duplicated verbatim; factored into one helper.
    _write_split(ntrain, train_x, train_y)
    _write_split(nval, val_x, val_y)
def main():
    """CLI entry point.

    argv: [1] number of training samples, [2] number of validation samples,
          [3] vocabulary size, [4] 1 to include counts in the output, else 0.
    """
    folder = 'words_data'
    if len(sys.argv) > 3:
        folder = folder + "_" + sys.argv[3]
    ntrain, nval, vocab = (int(a) for a in sys.argv[1:4])
    with_counts = bool(int(sys.argv[4]))
    generate_data(folder, ntrain, nval, vocab, with_counts)


if __name__ == "__main__":
    main()

102
37-dnn/normalize-char-ff.py Normal file
View File

@@ -0,0 +1,102 @@
from keras.models import Model, Sequential
from keras import layers
from keras.layers import Input, Dense
from keras.utils import plot_model
import numpy as np
import sys, os, string, random
# All printable characters, sorted so index assignment is stable across runs.
characters = sorted(string.printable)
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))
INPUT_VOCAB_SIZE = len(characters)  # len(string.printable) == 100
BATCH_SIZE = 200
def encode_one_hot(c):
    """Return a one-hot vector of length INPUT_VOCAB_SIZE for character c."""
    vec = np.zeros((INPUT_VOCAB_SIZE))
    vec[char_indices[c]] = 1
    return vec
def decode_one_hot(x):
    """Return the character whose entry in the one-hot/probability vector x
    is largest."""
    return indices_char[np.argmax(x)]
def build_model():
    """A single softmax Dense layer mapping one one-hot char to another."""
    print('Build model...')
    net = Sequential()
    normalizer = layers.Dense(INPUT_VOCAB_SIZE,
                              input_shape=(INPUT_VOCAB_SIZE, ),
                              activation='softmax')
    net.add(normalizer)
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
def input_generator(nsamples):
    """Endlessly yield (inputs, targets) batches of one-hot encoded chars.

    The target for a letter is its lowercase form; every other character
    normalizes to a space.
    """
    def random_char():
        c = random.choice(characters)
        target = c.lower() if c in string.ascii_letters else ' '
        return c, target

    while True:
        batch_in = np.zeros((nsamples, INPUT_VOCAB_SIZE))
        batch_out = np.zeros((nsamples, INPUT_VOCAB_SIZE))
        for row in range(nsamples):
            c, target = random_char()
            batch_in[row] = encode_one_hot(c)
            batch_out[row] = encode_one_hot(target)
        yield batch_in, batch_out
# Build, train, and periodically visualize predictions.
model = build_model()
model.summary()
plot_model(model, to_file='normalization.png', show_shapes=True)

# Train the model each generation and show predictions against a dataset.
val_gen2 = input_generator(4)
for iteration in range(1, 500):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE)
    val_gen = input_generator(BATCH_SIZE)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select samples from the a set at random so we can visualize errors.
    batch_x, batch_y = next(val_gen2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_y)):
        correct = decode_one_hot(batch_y[i])
        guess = decode_one_hot(preds[i])
        print('T', correct)
        print('G', guess)
#with open(sys.argv[1]) as f:
# for line in f:
# if line.isspace(): continue
# onehots = encode_one_hot(line)
# data = [[] for _ in range(LINE_SIZE)]
# for i, c in enumerate(onehots):
# data[i].append(c)
# for j in range(len(onehots), LINE_SIZE):
# data[j].append(np.zeros((INPUT_VOCAB_SIZE)))
# inputs = [np.array(e) for e in data]
# preds = model.predict(inputs)
# normal = decode_one_hot(preds[0])
# print(decode_one_hot(onehots))
# print(normal)

142
37-dnn/normalize-chars.py Normal file
View File

@@ -0,0 +1,142 @@
from keras.models import Model
from keras import layers, metrics
from keras.layers import Input, Dense
from keras.utils import plot_model
import numpy as np
import sys, os, string, random
# All printable characters, sorted so index assignment is stable across runs.
characters = sorted(string.printable)
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))
INPUT_VOCAB_SIZE = len(characters)  # len(string.printable) == 100
LINE_SIZE = 100   # fixed number of character positions per line
BATCH_SIZE = 200
def encode_one_hot(s):
    """One-hot encode all characters of the given string.

    Returns a list of len(s) vectors, each of length INPUT_VOCAB_SIZE with
    a single 1 at the character's index.
    """
    # Renamed the accumulator from `all`, which shadowed the builtin all().
    vectors = []
    for c in s:
        x = np.zeros((INPUT_VOCAB_SIZE))
        index = char_indices[c]
        x[index] = 1
        vectors.append(x)
    return vectors
def encode_one_hot2(s):
    """One-hot encode s into a fixed (LINE_SIZE, INPUT_VOCAB_SIZE) matrix.

    Rows past len(s) are left all-zero (padding).
    """
    mat = np.zeros((LINE_SIZE, INPUT_VOCAB_SIZE))
    for row, c in enumerate(s):
        mat[row, char_indices[c]] = 1
    return mat
def decode_one_hot(x):
    """Decode a (rows, INPUT_VOCAB_SIZE) matrix to a string, taking the
    argmax character of each row."""
    return ''.join(indices_char[np.argmax(row)] for row in x)
def build_model():
    """LINE_SIZE parallel one-hot inputs share one Dense softmax layer;
    the per-position outputs are concatenated and reshaped back to a
    (LINE_SIZE, INPUT_VOCAB_SIZE) tensor."""
    print('Build model...')
    # Normalize every character position with one shared dense layer.
    shared = Dense(INPUT_VOCAB_SIZE, activation="softmax")
    inputs = []
    outputs = []
    for _ in range(0, LINE_SIZE):
        inp = Input(shape=(INPUT_VOCAB_SIZE, ))
        inputs.append(inp)
        outputs.append(shared(inp))
    merged = layers.concatenate(outputs, axis=-1)
    stacked = layers.Reshape((LINE_SIZE, INPUT_VOCAB_SIZE, ))(merged)
    net = Model(inputs=inputs, outputs=stacked)
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net
def input_generator(nsamples):
    """Endlessly yield (list-of-position-arrays, targets) batches.

    Inputs are LINE_SIZE arrays of shape (nsamples, INPUT_VOCAB_SIZE), one
    per character position, zero-padded past the end of each random line.
    The target line lowercases letters and blanks everything else.
    """
    def random_line():
        raw = [random.choice(characters) for _ in range(random.randint(1, LINE_SIZE))]
        norm = [c.lower() if c in string.ascii_letters else ' ' for c in raw]
        return raw, norm

    while True:
        columns = [[] for _ in range(LINE_SIZE)]
        targets = np.zeros((nsamples, LINE_SIZE, INPUT_VOCAB_SIZE))
        for n in range(nsamples):
            raw, norm = random_line()
            encoded = encode_one_hot(raw)
            for pos, vec in enumerate(encoded):
                columns[pos].append(vec)
            for pos in range(len(encoded), LINE_SIZE):
                columns[pos].append(np.zeros((INPUT_VOCAB_SIZE)))
            targets[n] = encode_one_hot2(norm)
        yield [np.array(col) for col in columns], targets
# Build, train, visualize, then normalize the file given on the command line.
model = build_model()
#model.summary()
plot_model(model, to_file='normalization.png', show_shapes=True)

# Train the model each generation and show predictions against the validation
# dataset.
val_gen2 = input_generator(1)
for iteration in range(1, 12):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    input_gen = input_generator(BATCH_SIZE)
    val_gen = input_generator(BATCH_SIZE)
    model.fit_generator(input_gen,
                        epochs=1,
                        steps_per_epoch=20,
                        validation_data=val_gen,
                        validation_steps=10, workers=1)
    # Select samples from the a set at random so we can visualize errors.
    batch_x, batch_y = next(val_gen2)
    # PERF FIX: predict once per batch; it used to be recomputed (identically)
    # inside the per-sample loop below.
    preds = model.predict(batch_x)
    for i in range(len(batch_y)):
        correct = decode_one_hot(batch_y[i])
        guess = decode_one_hot(preds[i])
        print('T:', correct)
        print('G:', guess)

# Inference: normalize each line of the input file.
# NOTE(review): lines longer than LINE_SIZE would overflow `data` below —
# the input is presumably pre-wrapped; confirm before reuse.
with open(sys.argv[1]) as f:
    for line in f:
        if line.isspace():
            continue
        onehots = encode_one_hot(line)
        data = [[] for _ in range(LINE_SIZE)]
        for i, c in enumerate(onehots):
            data[i].append(c)
        # Zero-pad the remaining character positions.
        for j in range(len(onehots), LINE_SIZE):
            data[j].append(np.zeros((INPUT_VOCAB_SIZE)))
        inputs = [np.array(e) for e in data]
        preds = model.predict(inputs)
        normal = decode_one_hot(preds[0])
        print(decode_one_hot(onehots))
        print(normal)