This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new 9a14494 Add TF training code for name finder
9a14494 is described below
commit 9a144940383f0ecaf81b9f06b05301a0e3bab8d1
Author: Jörn Kottmann <[email protected]>
AuthorDate: Thu May 24 14:53:42 2018 +0200
Add TF training code for name finder
---
tf-ner-poc/src/main/python/namefinder.py | 402 +++++++++++++++++++++++++++++++
1 file changed, 402 insertions(+)
diff --git a/tf-ner-poc/src/main/python/namefinder.py b/tf-ner-poc/src/main/python/namefinder.py
new file mode 100644
index 0000000..c55d835
--- /dev/null
+++ b/tf-ner-poc/src/main/python/namefinder.py
@@ -0,0 +1,402 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This PoC is based on source code taken from:
+# https://github.com/guillaumegenthial/sequence_tagging
+
+from math import floor
+
+import tensorflow as tf
+import re
+import numpy as np
+
+# Parse the OpenNLP Name Finder format into begin, end, type triples
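+# e.g. "<START:person> Pierre Vinken <END> is chairman ." yields one name
+# with type "person" covering the first two tokens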
+class NameSample:
+
+ def __init__(self, line):
+ self.tokens = []
+ self.names = []
+ start_regex = re.compile("<START(:([^:>\\s]*))?>")
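+ # matches "<START>" or "<START:type>"; group 2 captures the optional type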
+ parts = line.split()
+ start_index = -1
+ word_index = 0
+ for i in range(0, len(parts)):
+ match = start_regex.match(parts[i])
+ if match:
+ start_index = word_index
+ name_type = match.group(2)
+ if name_type is None:
+ name_type = "default"
+ elif parts[i] == "<END>":
+ self.names.append((start_index, word_index, name_type))
+ else:
+ self.tokens.append(parts[i])
+ word_index += 1
+
+class NameFinder:
+
+ def __init__(self):
+ self.label_dict = {}
+
+ def load_glove(self, glove_file):
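+ # Assumes the textual GloVe/fastText format: one token per line followed
+ # by its embedding values, e.g. "the 0.418 0.24968 ..."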
+ with open(glove_file) as f:
+
+ word_dict = {}
+ embeddings = []
+
+ for line in f:
+ parts = line.strip().split(" ")
+ word_dict[parts[0]] = len(word_dict)
+ embeddings.append(np.array(parts[1:], dtype=np.float32))
+
+ # Create a reverse word dict
+ rev_word_dict = {}
+ for word, word_id in word_dict.items():
+ rev_word_dict[word_id] = word
+
+ return word_dict, rev_word_dict, np.asarray(embeddings)
+
+ def load_data(self, word_dict, file):
+ with open(file) as f:
+ raw_data = f.readlines()
+
+ sentences = []
+ labels = []
+ chars_set = set()
+
+ for line in raw_data:
+ name_sample = NameSample(line)
+ sentence = []
+
+ if len(name_sample.tokens) == 0:
+ continue
+
+ for token in name_sample.tokens:
+ # Map the token to its embedding id; unknown tokens fall back to id 0
+ sentence.append(word_dict.get(token, 0))
+
+ for c in token:
+ chars_set.add(c)
+
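+ # Build BIO-style labels: "B-<type>" marks the first token of a name,
+ # "I-<type>" the following tokens, "other" everything else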
+ label = ["other"] * len(name_sample.tokens)
+ for name in name_sample.names:
+ label[name[0]] = "B-" + name[2]
+ for i in range(name[0] + 1, name[1]):
+ label[i] = "I-" + name[2]
+ sentences.append(sentence)
+ labels.append(label)
+
+ for label_string in label:
+ if label_string not in self.label_dict:
+ self.label_dict[label_string] = len(self.label_dict)
+
+ return sentences, labels, chars_set
+
+ def encode_labels(self, labels):
+ label_ids = []
+ for label in labels:
+ label_ids.append(self.label_dict[label])
+
+ return label_ids
+
+
+ def mini_batch(self, rev_word_dict, sentences, labels, batch_size, batch_index):
+ begin = batch_size * batch_index
+ end = min(batch_size * (batch_index + 1), len(labels))
+
+ # Determine the max sentence length in the batch
+ max_length = 0
+ for i in range(begin, end):
+ length = len(sentences[i])
+ if length > max_length:
+ max_length = length
+
+ sb = []
+ lb = []
+ seq_length = []
+ for i in range(begin, end):
+ sb.append(sentences[i] + [0] * max(max_length - len(sentences[i]), 0))
+ lb.append(self.encode_labels(labels[i]) + [0] * max(max_length - len(labels[i]), 0))
+ seq_length.append(len(sentences[i]))
+
+ # Determine the max word length in the batch
+ max_word_length = 0
+ for i in range(begin, end):
+ for word in sentences[i]:
+ length = len(rev_word_dict[word])
+ if length > max_word_length:
+ max_word_length = length
+
+ cb = []
+ wlb = []
+ for i in range(begin, end):
+ sentence_word_length = []
+ sentence_word_chars = []
+ for word in sentences[i]:
+
+ word_chars = []
+ for c in rev_word_dict[word]:
+ word_chars.append(ord(c))
+
+ sentence_word_length.append(len(word_chars))
+ word_chars = word_chars + [0] * max(max_word_length - len(word_chars), 0)
+ sentence_word_chars.append(word_chars)
+
+ for _ in range(max(max_length - len(sentence_word_chars), 0)):
+ sentence_word_chars.append([0] * max_word_length)
+
+ cb.append(sentence_word_chars)
+ wlb.append(sentence_word_length + [0] * max(max_length - len(sentence_word_length), 0))
+
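+ # sb: token ids (batch, max_len), cb: char ids (batch, max_len, max_word_len),
+ # wlb: word lengths (batch, max_len), lb: label ids (batch, max_len)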
+ return sb, cb, wlb, lb, seq_length
+
+
+ def create_graph(self, nchars, embedding_dict): # probably not necessary to pass in the embedding_dict, can be passed to init directly
+
+
+ with tf.variable_scope("chars"):
+ # shape = (batch size, max length of sentence, max length of word)
+ char_ids = tf.placeholder(tf.int32, shape=[None, None, None])
+
+ # shape = (batch_size, max_length of sentence)
+ word_lengths_ph = tf.placeholder(tf.int32, shape=[None, None])
+
+ dim_char = 100
+
+ # 1. get character embeddings
+ K = tf.get_variable(name="char_embeddings", dtype=tf.float32,
+ shape=[nchars, dim_char])
+
+ # shape = (batch, sentence, word, dim of char embeddings)
+ char_embeddings = tf.nn.embedding_lookup(K, char_ids)
+
+ # 2. put the time dimension on axis=1 for dynamic_rnn
+ s = tf.shape(char_embeddings) # store old shape
+ # shape = (batch x sentence, word, dim of char embeddings)
+ char_embeddings = tf.reshape(char_embeddings, shape=[s[0]*s[1], s[-2], dim_char])
+ word_lengths = tf.reshape(word_lengths_ph, shape=[s[0]*s[1]])
+
+ # 3. bi lstm on chars
+ char_hidden_size = 100
+ cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
+ cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
+
+ _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(
+ cell_fw, cell_bw, char_embeddings,
+ sequence_length=word_lengths, dtype=tf.float32)
+ # shape = (batch x sentence, 2 x char_hidden_size)
+ output = tf.concat([output_fw, output_bw], axis=-1)
+
+ # shape = (batch, sentence, 2 x char_hidden_size)
+ char_rep = tf.reshape(output, shape=[-1, s[1], 2*char_hidden_size])
+
+ with tf.variable_scope("words"):
+ token_ids = tf.placeholder(tf.int32, shape=[None, None])
+ sequence_lengths = tf.placeholder(tf.int32, shape=[None])
+
+ # This is a hack to make it load an embedding matrix larger than 2GB
+ # Don't hardcode the embedding dimension (100 here; glove.840B.300d would need 300)
+ embedding_placeholder = tf.placeholder(dtype=tf.float32, name="embedding_placeholder",
+ shape=(len(embedding_dict), 100))
+ embedding_matrix = tf.Variable(embedding_placeholder, dtype=tf.float32, trainable=False, name="glove_embeddings")
+
+ token_embeddings = tf.nn.embedding_lookup(embedding_matrix, token_ids)
+
+ # shape = (batch, sentence, 2 x char_hidden_size + word_vector_size)
+ word_embeddings = tf.concat([token_embeddings, char_rep], axis=-1)
+
+ word_embeddings = tf.nn.dropout(word_embeddings, 0.5)
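+ # Note: dropout is applied unconditionally; for inference the keep
+ # probability should be 1.0 (left as a fixed 0.5 in this PoC)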
+
+ hidden_size = 300
+
+ # Let's add a char lstm layer to reproduce the state of the art results ...
+
+ with tf.variable_scope("bi-lstm"):
+ # Add LSTM layer
+ cell_fw = tf.contrib.rnn.LSTMCell(hidden_size)
+ cell_bw = tf.contrib.rnn.LSTMCell(hidden_size)
+
+ (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, word_embeddings,
+ sequence_length=sequence_lengths, dtype=tf.float32)
+
+ context_rep = tf.concat([output_fw, output_bw], axis=-1)
+
+ context_rep = tf.nn.dropout(context_rep, 0.5)
+
+ labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")
+
+ ntags = 7 # TODO: compute this from the label dict instead of hardcoding it
+
+ W = tf.get_variable("W", shape=[2*hidden_size, ntags], dtype=tf.float32)
+ b = tf.get_variable("b", shape=[ntags], dtype=tf.float32, initializer=tf.zeros_initializer())
+ ntime_steps = tf.shape(context_rep)[1]
+ context_rep_flat = tf.reshape(context_rep, [-1, 2*hidden_size])
+ pred = tf.matmul(context_rep_flat, W) + b
+ self.logits = tf.reshape(pred, [-1, ntime_steps, ntags])
+
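+ # Linear-chain CRF on top of the logits: crf_log_likelihood returns the
+ # log-likelihood of the gold tag sequences and the learned transition matrix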
+ log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
+ self.logits, labels, sequence_lengths)
+
+ self.transition_params = transition_params
+
+ loss = tf.reduce_mean(-log_likelihood)
+
+ train_op = tf.train.AdamOptimizer().minimize(loss)
+
+ return embedding_placeholder, token_ids, char_ids, word_lengths_ph, \
+ sequence_lengths, labels, train_op
+
+ def predict_batch(self, sess, token_ids_ph, char_ids_ph, word_lengths_ph,
+ sequence_lengths_ph, sentences, char_ids, word_length, lengths):
+
+ feed_dict = {token_ids_ph: sentences, char_ids_ph: char_ids,
+ word_lengths_ph: word_length, sequence_lengths_ph: lengths}
+
+ viterbi_sequences = []
+ logits, trans_params = sess.run([self.logits, self.transition_params], feed_dict=feed_dict)
+
+ for logit, sequence_length in zip(logits, lengths):
+ if sequence_length != 0:
+ logit = logit[:sequence_length] # keep only the valid steps
+ viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, trans_params)
+ viterbi_sequences += [viterbi_seq]
+ else:
+ # keep the output aligned with the input batch for empty sequences
+ viterbi_sequences += [[]]
+
+ return viterbi_sequences, lengths
+
+def get_chunk_type(tok, idx_to_tag):
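+ # e.g. a tag id that maps to "B-person" yields ("B", "person")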
+ tag_name = idx_to_tag[tok]
+ tag_class = tag_name.split('-')[0]
+ tag_type = tag_name.split('-')[-1]
+ return tag_class, tag_type
+
+def get_chunks(seq, tags):
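+ # Collapse a tag id sequence into (type, start, end) spans, e.g. the label
+ # sequence [other, B-person, I-person, other] yields [("person", 1, 3)]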
+ default = tags["other"]
+ idx_to_tag = {idx: tag for tag, idx in tags.items()}
+ chunks = []
+ chunk_type, chunk_start = None, None
+ for i, tok in enumerate(seq):
+ # End of a chunk 1
+ if tok == default and chunk_type is not None:
+ # Add a chunk.
+ chunk = (chunk_type, chunk_start, i)
+ chunks.append(chunk)
+ chunk_type, chunk_start = None, None
+
+ # End of a chunk + start of a chunk!
+ elif tok != default:
+ tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
+ if chunk_type is None:
+ chunk_type, chunk_start = tok_chunk_type, i
+ elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
+ chunk = (chunk_type, chunk_start, i)
+ chunks.append(chunk)
+ chunk_type, chunk_start = tok_chunk_type, i
+
+ # end condition
+ if chunk_type is not None:
+ chunk = (chunk_type, chunk_start, len(seq))
+ chunks.append(chunk)
+
+ return chunks
+
+def main():
+
+ name_finder = NameFinder()
+
+ # word_dict, rev_word_dict, embeddings = name_finder.load_glove("/home/burn/Downloads/glove.840B.300d.txt")
+ word_dict, rev_word_dict, embeddings = name_finder.load_glove("/home/blue/Downloads/fastText/memorial.vec")
+ sentences, labels, char_set = name_finder.load_data(word_dict, "train.txt")
+ #sentences_test, labels_test, char_set_test = name_finder.load_data(word_dict, "conll03.testa")
+ sentences_test, labels_test, char_set_test = name_finder.load_data(word_dict, "dev.txt")
+
+
+ embedding_ph, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph, labels_ph, train_op \
+ = name_finder.create_graph(len(char_set | char_set_test), embeddings)
+
+ sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
+ log_device_placement=True))
+
+ with sess.as_default():
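+ # Feed the pretrained embeddings once at variable initialization
+ # (see the >2GB placeholder hack in create_graph)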
+ init = tf.global_variables_initializer()
+ sess.run(init, feed_dict={embedding_ph: embeddings})
+
+ batch_size = 20
+ for epoch in range(100):
+ print("Epoch " + str(epoch))
+
+ for batch_index in range(floor(len(sentences) / batch_size)):
+ if batch_index % 200 == 0:
+ print("batch_index " + str(batch_index))
+
+ # mini_batch returns the padded token ids, char ids, word lengths and labels
+ sentences_batch, chars_batch, word_length_batch, labels_batch, lengths = \
+ name_finder.mini_batch(rev_word_dict, sentences, labels, batch_size, batch_index)
+
+ feed_dict = {token_ids_ph: sentences_batch, char_ids_ph: chars_batch,
+ word_lengths_ph: word_length_batch, sequence_lengths_ph: lengths, labels_ph: labels_batch}
+
+ sess.run(train_op, feed_dict=feed_dict)
+
+
+ accs = []
+ correct_preds, total_correct, total_preds = 0., 0., 0.
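+ # correct_preds/total_preds/total_correct count exact (type, span) chunk
+ # matches, giving chunk-level precision, recall and F1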
+ for batch_index in range(floor(len(sentences_test) / batch_size)):
+ sentences_test_batch, chars_batch_test, word_length_batch_test, \
+ labels_test_batch, length_test = name_finder.mini_batch(
+ rev_word_dict, sentences_test, labels_test, batch_size, batch_index)
+
+ labels_pred, sequence_lengths = name_finder.predict_batch(
+ sess, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph,
+ sentences_test_batch, chars_batch_test, word_length_batch_test, length_test)
+
+ for lab, lab_pred, length in zip(labels_test_batch, labels_pred, sequence_lengths):
+ lab = lab[:length]
+ lab_pred = lab_pred[:length]
+ accs += [a==b for (a, b) in zip(lab, lab_pred)]
+
+ lab_chunks = set(get_chunks(lab, name_finder.label_dict))
+ lab_pred_chunks = set(get_chunks(lab_pred, name_finder.label_dict))
+
+ correct_preds += len(lab_chunks & lab_pred_chunks)
+ total_preds += len(lab_pred_chunks)
+ total_correct += len(lab_chunks)
+
+ p = correct_preds / total_preds if correct_preds > 0 else 0
+ r = correct_preds / total_correct if correct_preds > 0 else 0
+ f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
+ acc = np.mean(accs)
+
+ print("ACC " + str(acc))
+ print("F1 " + str(f1) + " P " + str(p) + " R " + str(r))
+
+ # TODO: Store the model, load it with java ...
+
+if __name__ == "__main__":
+ main()
--
To stop receiving notification emails like this one, please contact
[email protected].