Add a version of this that works on single characters, and batches on the line. It's much simpler to explain.

2019-12-26 16:42:33 -08:00
parent 997c2827b1
commit b855384437
1 changed files with 71 additions and 0 deletions
--- a/35-dumb-filters/tf-35.py
+++ b/35-dumb-filters/tf-35.py
@@ -0,0 +1,71 @@
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.utils import plot_model
+import numpy as np
+import sys, os, string
+
+characters = string.printable  # vocabulary: every printable ASCII character
+char_indices = {ch: idx for idx, ch in enumerate(characters)}  # char -> one-hot slot
+indices_char = dict(enumerate(characters))  # one-hot slot -> char
+
+INPUT_VOCAB_SIZE = len(characters)  # width of each one-hot vector
+LINE_SIZE = 100  # not referenced by the code shown in this file
+
+def encode_one_hot(line):
+    """Return a (len(line), INPUT_VOCAB_SIZE) array holding one one-hot row per
+    character of `line`; characters outside the vocabulary become a space."""
+    space_index = char_indices[' ']
+    encoded = np.zeros((len(line), INPUT_VOCAB_SIZE))
+    for row, ch in zip(encoded, line):
+        # Unknown characters fall back to the space slot
+        row[char_indices.get(ch, space_index)] = 1
+    return encoded
+
+def decode_one_hot(x):
+    """Turn rows of scores back into text.
+
+    Each row of `x` contributes the character whose index holds the row's
+    largest value (np.argmax picks the first such index on ties)."""
+    return ''.join(indices_char[np.argmax(scores)] for scores in x)
+    
+def normalization_layer_set_weights(n_layer):
+    """Load hand-built weights into `n_layer` so it normalizes one-hot characters.
+
+    The kernel is an INPUT_VOCAB_SIZE x INPUT_VOCAB_SIZE 0/1 matrix indexed as
+    [input character, output character]: lowercase letters map to themselves,
+    uppercase letters to their lowercase form, and every other printable
+    character to a space.  The bias is all zeros.
+
+    Calls n_layer.set_weights([kernel, bias]) and returns the same layer.
+    """
+    kernel = np.zeros((INPUT_VOCAB_SIZE, INPUT_VOCAB_SIZE), dtype=np.float32)
+    bias = np.zeros((INPUT_VOCAB_SIZE), dtype=np.float32)
+    space = char_indices[' ']
+    for ch in string.printable:
+        src = char_indices[ch]
+        if ch in string.ascii_letters:
+            # Letters come out lowercase (lowercase ones are unchanged)
+            kernel[src, char_indices[ch.lower()]] = 1
+        else:
+            # Digits, punctuation and whitespace all collapse to a space
+            kernel[src, space] = 1
+    n_layer.set_weights([kernel, bias])
+    return n_layer
+
+def build_model():
+    """Build a Sequential model whose only layer is the hand-weighted Dense normalizer."""
+    normalizer = Dense(INPUT_VOCAB_SIZE, input_shape=(INPUT_VOCAB_SIZE,))
+    net = Sequential()
+    net.add(normalizer)  # the layer joins the model before its weights are replaced
+    normalization_layer_set_weights(normalizer)
+    return net
+
+model = build_model()
+
+# Normalize the file named by the first command-line argument, one line per batch
+with open(sys.argv[1]) as f:
+    for line in f:
+        if not line.isspace():
+            batch = encode_one_hot(line)  # one one-hot row per character
+            normal = decode_one_hot(model.predict(batch))
+            print(normal)