# Recovered from a git patch whose line breaks had been collapsed:
#   commit   df289f0d2c46d9dceb6da9af7cc609cfd5b6918e
#   author   Crista Lopes
#   date     Wed, 1 Jan 2020 12:38:17 -0800
#   subject  oh look, one more style. Couldn't resist
# The patch adds this file as 38-sliding-window/tf-38.py and renames
# 38-recurrent/tf-38.py to 39-recurrent/tf-39.py (100% similarity).
"""Normalize text with a hand-wired dense layer over a sliding window.

Every character of an input line is looked at together with its left and
right neighbour (a window of WINDOW_SIZE one-hot vectors).  A single Dense
layer whose weights are set by hand, not trained, maps that window to one
output character: letters are lower-cased, non-letters become spaces, and
letters standing alone between two non-letters become spaces as well.
"""
import os
import string
import sys

import numpy as np

characters = string.printable
char_indices = {c: i for i, c in enumerate(characters)}
indices_char = {i: c for i, c in enumerate(characters)}

INPUT_VOCAB_SIZE = len(characters)
WINDOW_SIZE = 3


def encode_one_hot(line):
    """Return a (len(line) + 1, INPUT_VOCAB_SIZE) one-hot matrix for *line*.

    A space is prepended so that the first real character has a left
    neighbour.  Characters outside ``string.printable`` are encoded as a
    space.
    """
    line = " " + line
    x = np.zeros((len(line), INPUT_VOCAB_SIZE))
    space = char_indices[' ']
    for i, c in enumerate(line):
        x[i, char_indices.get(c, space)] = 1
    return x


def decode_one_hot(x):
    """Return the string formed by the highest-scoring character of each row."""
    return ''.join(indices_char[int(np.argmax(row))] for row in x)


def prepare_for_window(x):
    """Return every WINDOW_SIZE-row slice of *x*, each flattened to one row.

    *x* is a 2-d array with one row per character.  The result has shape
    (x.shape[0] - WINDOW_SIZE + 1, WINDOW_SIZE * x.shape[1]).  When *x* has
    fewer than WINDOW_SIZE rows the result is an empty batch with zero rows
    (the original code raised IndexError in that case).
    """
    n_windows = max(x.shape[0] - WINDOW_SIZE + 1, 0)
    # Row k of `ind` holds the row numbers k, k+1, ..., k+WINDOW_SIZE-1.
    ind = np.arange(n_windows)[:, None] + np.arange(WINDOW_SIZE)
    x_window = x[ind]
    # Reshape it back to a 2-d tensor
    return x_window.reshape(n_windows, WINDOW_SIZE * x.shape[1])


def build_normalization_weights():
    """Return the ``(kernel, bias)`` pair implementing the normalization.

    The kernel has one row per input unit.  An input is the concatenation of
    the one-hot vectors of the left, centre and right character, so rows
    ``0 .. V-1`` belong to the left character, ``V .. 2V-1`` to the centre
    and ``2V .. 3V-1`` to the right one (V = INPUT_VOCAB_SIZE).
    """
    vocab = INPUT_VOCAB_SIZE
    left, center, right = 0, vocab, 2 * vocab
    w = np.zeros((WINDOW_SIZE * vocab, vocab), dtype=np.float32)
    b = np.zeros(vocab, dtype=np.float32)
    sp_idx = char_indices[' ']
    letters = set(string.ascii_letters)
    for c in characters:
        i = char_indices[c]
        if c in letters:
            # Lower case letters go through; capitals map to lower case.
            w[center + i, char_indices[c.lower()]] = 1
        else:
            # Map all non-letters to space
            w[center + i, sp_idx] = 1
            # Map single letters to space: a non-letter neighbour votes 0.75
            # for space.  One such neighbour loses to the centre letter's
            # 1.0; two of them (1.5) win.
            w[left + i, sp_idx] = 0.75
            w[right + i, sp_idx] = 0.75
    return w, b


def normalization_layer_set_weights(n_layer):
    """Install the hand-built normalization weights on *n_layer*; return it."""
    w, b = build_normalization_weights()
    n_layer.set_weights([w, b])
    return n_layer


def build_model():
    """Return a one-layer model mapping a character window to one character."""
    # Imported here, not at module level, so the NumPy-only helpers above
    # can be imported without Keras installed.
    from keras.layers import Dense
    from keras.models import Sequential

    # Normalize characters using a dense layer
    model = Sequential()
    dense_layer = Dense(INPUT_VOCAB_SIZE,
                        input_shape=(WINDOW_SIZE*INPUT_VOCAB_SIZE,),
                        activation='softmax')
    model.add(dense_layer)
    return model


def windows_for_line(line):
    """Return the window batch for *line*, one row per character to output.

    The line terminator serves as the right-hand neighbour of the last
    character.  A final line without a terminator would otherwise lose its
    last character, so a missing newline is added first.
    """
    if not line.endswith("\n"):
        line += "\n"
    return prepare_for_window(encode_one_hot(line))


def main(argv=None):
    """Print the normalized form of every non-blank line of the input file.

    *argv* holds the command-line arguments without the program name and
    defaults to ``sys.argv[1:]``; its first element is the input file path.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: tf-38.py <input-file>")

    model = build_model()
    model.summary()
    normalization_layer_set_weights(model.layers[0])

    with open(args[0]) as f:
        for line in f:
            if line.isspace():
                continue
            preds = model.predict(windows_for_line(line))
            print(decode_one_hot(preds))


if __name__ == "__main__":
    main()