Oh look, one more style. Couldn't resist.

This commit is contained in:
Crista Lopes
2020-01-01 12:38:17 -08:00
parent a6d5ee0d13
commit df289f0d2c
2 changed files with 84 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import sys, os, string
# Vocabulary: every printable ASCII character.
characters = string.printable
# Bidirectional lookup tables between characters and their one-hot indices.
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))
# Width of a one-hot character vector (one slot per printable character).
INPUT_VOCAB_SIZE = len(characters)
# Characters per sliding window fed to the dense layer
# (center character plus one neighbor on each side).
WINDOW_SIZE = 3
def encode_one_hot(line):
    """Encode *line* as a one-hot matrix, with one pad space prepended.

    Returns an array of shape (len(line) + 1, INPUT_VOCAB_SIZE); any
    character outside the vocabulary is encoded as a space.
    """
    padded = " " + line
    encoded = np.zeros((len(padded), INPUT_VOCAB_SIZE))
    space_index = char_indices[' ']
    for position, character in enumerate(padded):
        if character in characters:
            encoded[position][char_indices[character]] = 1
        else:
            encoded[position][space_index] = 1
    return encoded
def decode_one_hot(x):
    """Map each one-hot (or probability) row of *x* back to the character
    with the highest score, and join them into a string."""
    return ''.join(indices_char[np.argmax(row)] for row in x)
def prepare_for_window(x):
    """Turn *x* into a batch of flattened sliding windows.

    Every contiguous slice of WINDOW_SIZE rows of *x* becomes one row of
    the result: shape (x.shape[0] - WINDOW_SIZE + 1, WINDOW_SIZE * x.shape[1]).
    """
    n_windows = x.shape[0] - WINDOW_SIZE + 1
    # Broadcast start positions against in-window offsets to get all indices.
    starts = np.arange(n_windows, dtype=np.int32)[:, None]
    offsets = np.arange(WINDOW_SIZE, dtype=np.int32)
    windows = x[starts + offsets]
    # Flatten each window back to a single row.
    return windows.reshape(n_windows, -1)
def normalization_layer_set_weights(n_layer):
    """Hand-set the dense layer's weights so it normalizes text rather than learning to.

    The layer's input is a flattened window of WINDOW_SIZE (= 3) one-hot
    characters with V = INPUT_VOCAB_SIZE: rows [0, V) of `w` are the left
    neighbor, [V, 2V) the center character, and [2V, 3V) the right neighbor.
    The output one-hot is the normalized center character: lower-cased,
    non-letters collapsed to space, and isolated letters (non-letters on
    both sides) also mapped to space.
    """
    wb = []
    w = np.zeros((WINDOW_SIZE*INPUT_VOCAB_SIZE, INPUT_VOCAB_SIZE), dtype=np.float32)
    b = np.zeros((INPUT_VOCAB_SIZE), dtype=np.float32)
    # Let lower case letters go through
    # (weight 1 from the center slot to the same character).
    for c in string.ascii_lowercase:
        i = char_indices[c]
        w[INPUT_VOCAB_SIZE+i, i] = 1
    # Map capitals to lower case
    # (weight 1 from the center slot to the lower-case output).
    for c in string.ascii_uppercase:
        i = char_indices[c]
        il = char_indices[c.lower()]
        w[INPUT_VOCAB_SIZE+i, il] = 1
    # Map all non-letters to space
    sp_idx = char_indices[' ']
    for c in [c for c in list(string.printable) if c not in list(string.ascii_letters)]:
        i = char_indices[c]
        w[INPUT_VOCAB_SIZE+i, sp_idx] = 1
    # Map single letters to space: each non-letter neighbor (left slot and
    # right slot) contributes 0.75 toward space, so two non-letter neighbors
    # total 1.5 and outweigh the center letter's weight of 1 at the argmax.
    for c in [c for c in list(string.printable) if c not in list(string.ascii_letters)]:
        i = char_indices[c]
        w[i, sp_idx] = 0.75
        w[INPUT_VOCAB_SIZE*2+i, sp_idx] = 0.75
    wb.append(w)
    wb.append(b)
    n_layer.set_weights(wb)
    return n_layer
def build_model():
    """Build the one-layer network: a softmax dense layer that maps a
    flattened character window to a single normalized character."""
    normalizer = Dense(INPUT_VOCAB_SIZE,
                       input_shape=(WINDOW_SIZE*INPUT_VOCAB_SIZE,),
                       activation='softmax')
    model = Sequential()
    model.add(normalizer)
    return model
# Build the network and install the hand-crafted normalization weights.
model = build_model()
model.summary()
normalization_layer_set_weights(model.layers[0])

# Normalize the file named on the command line, one line at a time.
with open(sys.argv[1]) as input_file:
    for raw_line in input_file:
        if raw_line.isspace():
            continue
        window_batch = prepare_for_window(encode_one_hot(raw_line))
        predictions = model.predict(window_batch)
        print(decode_one_hot(predictions))