opringle commented on issue #8663: terminate called after throwing an instance of 'std::out_of_range'
URL: https://github.com/apache/incubator-mxnet/issues/8663#issuecomment-356503080

Also getting the same error while trying to implement bucketing with variable-length data and labels. Steps to reproduce are below.

Data preparation script:

```python
# modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# custom modules
from custom_methods import save_obj
import config

# read in csv of NER training data
df = pd.read_csv("../data/ner_dataset.csv", encoding="ISO-8859-1")

# rename columns
df = df.rename(columns={"Sentence #": "utterance_id",
                        "Word": "token",
                        "POS": "POS_tag",
                        "Tag": "BILOU_tag"})

# clean utterance_id column
df.loc[:, "utterance_id"] = df["utterance_id"].str.replace('Sentence: ', '')

# fill np.nan utterance IDs with the last valid entry
df = df.fillna(method='ffill')
df.loc[:, "utterance_id"] = df["utterance_id"].apply(int)

# melt BILOU tags and tokens into a list per utterance
df1 = df.groupby("utterance_id")["BILOU_tag"].apply(list).to_frame().reset_index()
df2 = df.groupby("utterance_id")["token"].apply(list).to_frame().reset_index()

# join the results on utterance id
df = df1.merge(df2, how="left", on="utterance_id")

# pad sentences so they are at least as long as the smallest bucket size
min_sentence_length = min(config.buckets)

def pad(x, max_l):
    pads = max_l - len(x)
    if pads > 0:
        padded_sentence = x + [""] * pads
        padded_tags = x + ["O"] * pads
    else:
        padded_sentence = x
        padded_tags = x
    return padded_sentence, padded_tags

df["token"] = df["token"].apply(lambda x: pad(x, min_sentence_length)[0])
df["BILOU_tag"] = df["BILOU_tag"].apply(lambda x: pad(x, min_sentence_length)[1])

print(df.head(3))
print(df.iloc[2, 1])
print(df.iloc[2, 2])

# get dictionary mapping BILOU tags to indices and save it
unique_tags = list(set([a for b in df.BILOU_tag.tolist() for a in b]))
tag_indices = list(range(len(unique_tags)))
tag_index_dict = dict(zip(unique_tags, tag_indices))
save_obj(tag_index_dict, "../data/tag_index_dict")

# get dictionary mapping unique words to indices and save it
unique_words = list(set([a for b in df.token.tolist() for a in b]))
word_indices = list(range(len(unique_words)))
word_index_dict = dict(zip(unique_words, word_indices))
save_obj(word_index_dict, "../data/word_index_dict")

# index padded tag lists and padded utterances
df["indexed_tags"] = df["BILOU_tag"].apply(lambda x: [tag_index_dict.get(tag) for tag in x])
df["indexed_utterance"] = df["token"].apply(lambda x: [word_index_dict.get(word) for word in x])

# get a list of lists of int for data and labels
data = df.indexed_utterance.values.tolist()
label = df.indexed_tags.values.tolist()

# split into training and test sets
split_index = int(config.split[0] * len(data))
x_train = data[:split_index]
x_test = data[split_index:]
y_train = label[:split_index]
y_test = label[split_index:]

# save to file
file = open('../data/x_train.txt', 'w')
for item in x_train:
    file.write("%s\n" % item)
file = open('../data/x_test.txt', 'w')
for item in x_test:
    file.write("%s\n" % item)
file = open('../data/y_train.txt', 'w')
for item in y_train:
    file.write("%s\n" % item)
file = open('../data/y_test.txt', 'w')
for item in y_test:
    file.write("%s\n" % item)
```
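(The `config` module imported above is not included in this report, so the actual bucket sizes, batch size, etc. are unknown. A hypothetical `config.py` along the following lines is assumed so the scripts can be read end to end; every value below is a placeholder, not something taken from the original code.)

```python
# config.py -- hypothetical placeholder, NOT the reporter's actual settings
import mxnet as mx

buckets = [10, 20, 30, 40, 50]        # assumed bucket lengths
split = [0.8, 0.2]                    # assumed train/test split fractions
batch_size = 32                       # assumed batch size
lstm_state_size = 128                 # assumed LSTM hidden size
word_embedding_vector_length = 50     # assumed embedding width
learning_rate = 0.1                   # assumed SGD learning rate
num_epoch = 5                         # assumed number of epochs
context = mx.cpu()                    # assumed training context
```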
`custom_methods.py` (the bucketing iterator used below):

```python
import pickle
import bisect
import random

import numpy as np
import mxnet as mx
from mxnet.io import DataIter, DataBatch, DataDesc
from mxnet import ndarray


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


class NERIter(DataIter):
    """Simple bucketing iterator for named entity recognition.
    Input features have the same shape as output labels.

    Parameters
    ----------
    data : list of list of int
        Encoded data.
    label : list of list of int
        Encoded entity tags.
    batch_size : int
    invalid_label : int, optional
        Key for invalid label. The default is -1.
    dtype : str, optional
        Data type of the encoding. The default data type is 'float32'.
    buckets : list of int, optional
        Size of the data buckets. Automatically generated if None.
    data_name : str, optional
        Name of the data. The default name is 'data'.
    label_name : str, optional
        Name of the label. The default name is 'softmax_label'.
    layout : str, optional
        Format of data and label. 'NT' means (batch_size, length)
        and 'TN' means (length, batch_size).
    """
    def __init__(self, data, label, batch_size, buckets=None, invalid_label=-1,
                 data_name='data', label_name='softmax_label', dtype='float32',
                 layout='NT'):
        super(NERIter, self).__init__()
        if not buckets:
            buckets = [i for i, j in enumerate(np.bincount([len(s) for s in data]))
                       if j >= batch_size]
        buckets.sort()

        ndiscard = 0
        self.data = [[] for _ in buckets]
        for i, sent in enumerate(data):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.data[buck].append(buff)
        self.data = [np.asarray(i, dtype=dtype) for i in self.data]
        print("WARNING: discarded %d data longer than the largest bucket." % ndiscard)

        ndiscard = 0
        self.label = [[] for _ in buckets]
        for i, sent in enumerate(label):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.label[buck].append(buff)
        self.label = [np.asarray(i, dtype=dtype) for i in self.label]
        print("WARNING: discarded %d labels longer than the largest bucket." % ndiscard)

        self.batch_size = batch_size
        self.buckets = buckets
        self.data_name = data_name
        self.label_name = label_name
        self.dtype = dtype
        self.invalid_label = invalid_label
        self.nddata = []
        self.ndlabel = []
        self.major_axis = layout.find('N')
        self.layout = layout
        self.default_bucket_key = max(buckets)

        if self.major_axis == 0:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
        elif self.major_axis == 1:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
        else:
            raise ValueError("Invalid layout %s: must be NT (batch major) or TN (time major)" % layout)

        self.idx = []
        for i, buck in enumerate(self.data):
            self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
        self.curr_idx = 0
        self.reset()

    def reset(self):
        """Resets the iterator to the beginning of the data."""
        self.curr_idx = 0
        random.shuffle(self.idx)
        for buck in self.data:
            np.random.shuffle(buck)

        self.nddata = []
        self.ndlabel = []
        for buck in self.data:
            self.nddata.append(ndarray.array(buck, dtype=self.dtype))
        for label in self.label:
            self.ndlabel.append(ndarray.array(label, dtype=self.dtype))

    def next(self):
        """Returns the next batch of data."""
        if self.curr_idx == len(self.idx):
            raise StopIteration
        i, j = self.idx[self.curr_idx]
        self.curr_idx += 1

        if self.major_axis == 1:
            data = self.nddata[i][j:j + self.batch_size].T
            label = self.ndlabel[i][j:j + self.batch_size].T
        else:
            data = self.nddata[i][j:j + self.batch_size]
            label = self.ndlabel[i][j:j + self.batch_size]

        return DataBatch([data], [label], pad=0,
                         bucket_key=self.buckets[i],
                         provide_data=[DataDesc(name=self.data_name,
                                                shape=data.shape,
                                                layout=self.layout)],
                         provide_label=[DataDesc(name=self.label_name,
                                                 shape=label.shape,
                                                 layout=self.layout)])
```
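A quick way to exercise the iterator in isolation is to drive it on toy input (a sketch, assuming the class above is importable as `custom_methods.NERIter`; the toy sequences and the single bucket size are made up):

```python
# Sketch: drive NERIter on toy data and print what each batch delivers.
# Everything here is illustrative; it is not part of the original report.
from custom_methods import NERIter

toy_data  = [[1, 2, 3, 4, 5], [6, 7, 8, 0, 0], [2, 2, 2, 2, 2], [9, 8, 7, 6, 5]]
toy_label = [[0, 1, 0, 2, 2], [1, 0, 2, 2, 2], [0, 0, 1, 0, 2], [1, 1, 1, 1, 1]]

it = NERIter(data=toy_data, label=toy_label, batch_size=2, buckets=[5],
             data_name='seq_data', label_name='seq_label')

for batch in it:
    # expect bucket_key 5 and (2, 5) shapes for both data and label ('NT' layout)
    print(batch.bucket_key, batch.data[0].shape, batch.label[0].shape)
it.reset()
```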
Training script:

```python
# modules
import mxnet as mx
import numpy as np
import sys
import os
import ast

# custom modules
import config
from custom_methods import load_obj, NERIter

# load the preprocessed data
with open("../data/x_train.txt") as f:
    x_train = f.readlines()
x_train = [ast.literal_eval(x.strip()) for x in x_train]

with open("../data/x_test.txt") as f:
    x_test = f.readlines()
x_test = [ast.literal_eval(x.strip()) for x in x_test]

with open("../data/y_train.txt") as f:
    y_train = f.readlines()
y_train = [ast.literal_eval(x.strip()) for x in y_train]

with open("../data/y_test.txt") as f:
    y_test = f.readlines()
y_test = [ast.literal_eval(x.strip()) for x in y_test]

x_train = x_train[:2000]
x_test = x_test[:200]
y_train = y_train[:2000]
y_test = y_test[:200]

print("\ntraining examples: ", len(x_train), "\n\ntest examples: ", len(x_test), "\n")

# create custom data iterators for training and testing
train_iter = NERIter(data=x_train,
                     label=y_train,
                     batch_size=config.batch_size,
                     buckets=config.buckets,
                     data_name='seq_data',
                     label_name='seq_label')

val_iter = NERIter(data=x_test,
                   label=y_test,
                   batch_size=config.batch_size,
                   buckets=config.buckets,
                   data_name='seq_data',
                   label_name='seq_label')

# print the first few input batches for a sanity check
# for i, batch in enumerate(train_iter):
#     if batch.bucket_key == 5:
#         print("\nbatch ", i, " data: ", batch.data, "\nbatch ", i, " label: ", batch.label,
#               "\nbucket size: ", batch.bucket_key)
#     continue
# train_iter.reset()
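# --- Added for this write-up (not in the original script): another check that
# --- can be uncommented. It compares what the iterator advertises with what the
# --- first batch actually contains, since these names and shapes are what the
# --- BucketingModule below binds against.
# print("provide_data: ", train_iter.provide_data)
# print("provide_label: ", train_iter.provide_label)
# first_batch = next(iter(train_iter))
# print("first bucket_key: ", first_batch.bucket_key,
#       " data shape: ", first_batch.data[0].shape,
#       " label shape: ", first_batch.label[0].shape)
# train_iter.reset()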
# create a bidirectional lstm cell
# https://mxnet.incubator.apache.org/api/python/rnn.html
# http://colah.github.io/posts/2015-08-Understanding-LSTMs/
bi_cell = mx.rnn.BidirectionalCell(
    l_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="forward_"),
    r_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="backward_"))

# the architecture is defined in a function, to allow variable length input sequences
def sym_gen(seq_len):
    """Creates a network graph for the given sequence length."""
    print("-" * 50)

    # define hyperparameters from data folder
    vocab_size = len(load_obj("../data/word_index_dict"))
    num_labels = len(load_obj("../data/tag_index_dict"))

    input_feature_shape = (config.batch_size, seq_len)
    input_label_shape = (config.batch_size, seq_len)

    # data placeholders: we are inputting a sequence of data each time
    seq_data = mx.symbol.Variable('seq_data')
    seq_label = mx.sym.Variable('seq_label')
    print("\ninput data shape: ", seq_data.infer_shape(seq_data=input_feature_shape)[1][0])
    print("\ninput label shape: ", seq_label.infer_shape(seq_label=input_label_shape)[1][0])

    # create an embedding layer
    embed_layer = mx.sym.Embedding(data=seq_data,
                                   input_dim=vocab_size,
                                   output_dim=config.word_embedding_vector_length,
                                   name='vocab_embed')
    print("\nembedding layer shape: ", embed_layer.infer_shape(seq_data=input_feature_shape)[1][0])

    # unroll the lstm cell in time, obtaining a concatenated symbol for each
    # time step (forward and backward)
    bi_cell.reset()
    outputs, states = bi_cell.unroll(length=seq_len,
                                     inputs=embed_layer,
                                     merge_outputs=False,
                                     layout="NTC")
    print("\nindividual concatenated forward and backward cell shape: ",
          outputs[0].infer_shape(seq_data=input_feature_shape)[1][0])
    print("\nnumber of recurrent cell unrolls: ", len(outputs))

    # for each timestep, add a fully connected layer with num_neurons = num_possible_tags
    step_outputs = []
    for i, step_output in enumerate(outputs):
        fc = mx.sym.FullyConnected(data=step_output, num_hidden=num_labels)
        reshaped_fc = mx.sym.Reshape(data=fc, shape=(config.batch_size, num_labels, 1))
        step_outputs.append(reshaped_fc)
    print("\nshape after each cell output passes through fully connected layer: ",
          reshaped_fc.infer_shape(seq_data=input_feature_shape)[1][0])

    # concatenate fully connected layers for each timestep
    sm_input = mx.sym.concat(*step_outputs, dim=2)
    print("\nshape after concatenating outputs: ",
          sm_input.infer_shape(seq_data=input_feature_shape)[1][0])

    # apply softmax cross entropy loss to each column of each training example
    # (shape = (num_labels, tokens))
    sm = mx.sym.SoftmaxOutput(data=sm_input, label=seq_label, name='softmax', multi_output=True)
    print("\nshape after loss function: ", sm.infer_shape(seq_data=input_feature_shape)[1][0])

    # set lstm pointer to back of network
    lstm = sm

    return lstm, ('seq_data',), ('seq_label',)

# create a trainable bucketing module
model = mx.mod.BucketingModule(sym_gen=sym_gen,
                               default_bucket_key=train_iter.default_bucket_key,
                               context=config.context)

################
# fit the model (not working right now)
################
model.fit(
    train_data=train_iter,
    eval_data=val_iter,
    eval_metric='accuracy',
    optimizer='sgd',
    optimizer_params={"learning_rate": config.learning_rate},
    num_epoch=config.num_epoch)

###############
# to debug issues
###############
# # allocate memory given the input data and label shapes
# model.bind(data_shapes=train_iter.provide_data,
#            label_shapes=train_iter.provide_label)
# # initialize parameters by uniform random numbers
# model.init_params(initializer=mx.init.Uniform(scale=.1))
# # use SGD with learning rate 0.1 to train
# model.init_optimizer(optimizer='sgd',
#                      optimizer_params=(('learning_rate', config.learning_rate), ))
# # use accuracy as the metric
# metric = mx.metric.create('acc')
# # train 5 epochs, i.e. going over the data iter one pass
# for epoch in range(config.num_epoch):
#     train_iter.reset()
#     metric.reset()
#     for batch in train_iter:
#         model.forward(batch, is_train=True)       # compute predictions
#         model.update_metric(metric, batch.label)  # accumulate prediction accuracy
#         model.backward()                          # compute gradients
#         model.update()                            # update parameters
#     print('Epoch %d, Training %s' % (epoch, metric.get()))

########################
# predict to check shape
########################
print("\nmodel predictions are of shape: ", model.predict(val_iter).shape)
```
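One way to narrow this down might be to keep the `NERIter`/`BucketingModule` wiring identical (the 'seq_data'/'seq_label' names and the bucket keys) but swap the bidirectional LSTM for a trivial per-token classifier, so problems in the iterator/bucketing plumbing can be separated from problems inside the unrolled graph. A sketch, with made-up toy data and sizes, and assuming the `NERIter` class above is saved as `custom_methods.py`:

```python
# Standalone sketch: same bucketing mechanics, trivial per-token model.
# Everything below is illustrative and not taken from the original scripts.
import numpy as np
import mxnet as mx
from custom_methods import NERIter

vocab_size, num_labels, batch_size = 50, 5, 4
buckets = [6, 10]

rng = np.random.RandomState(0)
lengths = rng.choice(buckets, 40)                                 # toy sequence lengths
data = [list(rng.randint(1, vocab_size, size=l)) for l in lengths]
label = [list(rng.randint(0, num_labels, size=l)) for l in lengths]

toy_iter = NERIter(data=data, label=label, batch_size=batch_size, buckets=buckets,
                   data_name='seq_data', label_name='seq_label')

def toy_sym_gen(seq_len):
    # per-token classifier: embedding followed by one FC shared across time steps,
    # so parameter shapes do not depend on seq_len
    seq_data = mx.sym.Variable('seq_data')
    seq_label = mx.sym.Variable('seq_label')
    embed = mx.sym.Embedding(seq_data, input_dim=vocab_size, output_dim=8,
                             name='embed')                        # (N, T, 8)
    flat = mx.sym.Reshape(embed, shape=(-1, 8))                   # (N*T, 8)
    fc = mx.sym.FullyConnected(flat, num_hidden=num_labels, name='fc')  # (N*T, L)
    scores = mx.sym.Reshape(fc, shape=(batch_size, seq_len, num_labels))
    scores = mx.sym.transpose(scores, axes=(0, 2, 1))             # (N, L, T)
    sm = mx.sym.SoftmaxOutput(scores, label=seq_label, multi_output=True, name='softmax')
    return sm, ('seq_data',), ('seq_label',)

mod = mx.mod.BucketingModule(sym_gen=toy_sym_gen,
                             default_bucket_key=toy_iter.default_bucket_key,
                             context=mx.cpu())
mod.fit(toy_iter, eval_metric='accuracy', optimizer='sgd',
        optimizer_params={'learning_rate': 0.1}, num_epoch=1)
```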
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at: [email protected]

With regards,
Apache Git Services
