opringle commented on issue #8663: terminate called after throwing an instance of 'std::out_of_range'
URL: https://github.com/apache/incubator-mxnet/issues/8663#issuecomment-356503080

I'm also getting the same error while trying to implement bucketing with variable-length data/labels:

# Steps to reproduce

```python
import ast
import bisect
import pickle
import random

import mxnet as mx
import numpy as np
from mxnet import ndarray
from mxnet.io import DataIter, DataBatch, DataDesc

import config  # local settings module (batch_size, buckets, lstm_state_size, ...)


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


class NERIter(DataIter):
    """Simple bucketing iterator for named entity recognition.

    Input features have the same shape as output labels.

    Parameters
    ----------
    data : list of list of int
        Encoded data.
    label : list of list of int
        Encoded entity tags.
    batch_size : int
    invalid_label : int, optional
        Key for invalid label, e.g. `<unk>`. The default is -1.
    dtype : str, optional
        Data type of the encoding. The default data type is 'float32'.
    buckets : list of int, optional
        Size of the data buckets. Automatically generated if None.
    data_name : str, optional
        Name of the data. The default name is 'data'.
    label_name : str, optional
        Name of the label. The default name is 'softmax_label'.
    layout : str, optional
        Format of data and label. 'NT' means (batch_size, length)
        and 'TN' means (length, batch_size).
    """

    def __init__(self, data, label, batch_size, buckets=None, invalid_label=-1,
                 data_name='data', label_name='softmax_label', dtype='float32',
                 layout='NT'):
        super(NERIter, self).__init__()
        if not buckets:
            buckets = [i for i, j in enumerate(np.bincount([len(s) for s in data]))
                       if j >= batch_size]
        buckets.sort()

        # pad each sentence to the length of its bucket; discard sentences
        # longer than the largest bucket
        ndiscard = 0
        self.data = [[] for _ in buckets]
        for i, sent in enumerate(data):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.data[buck].append(buff)
        self.data = [np.asarray(i, dtype=dtype) for i in self.data]
        print("WARNING: discarded %d data longer than the largest bucket." % ndiscard)

        # same for the labels
        ndiscard = 0
        self.label = [[] for _ in buckets]
        for i, sent in enumerate(label):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.label[buck].append(buff)
        self.label = [np.asarray(i, dtype=dtype) for i in self.label]
        print("WARNING: discarded %d labels longer than the largest bucket." % ndiscard)

        self.batch_size = batch_size
        self.buckets = buckets
        self.data_name = data_name
        self.label_name = label_name
        self.dtype = dtype
        self.invalid_label = invalid_label
        self.nddata = []
        self.ndlabel = []
        self.major_axis = layout.find('N')
        self.layout = layout
        self.default_bucket_key = max(buckets)

        if self.major_axis == 0:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
        elif self.major_axis == 1:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
        else:
            raise ValueError(
                "Invalid layout %s: Must be NT (batch major) or TN (time major)" % layout)

        self.idx = []
        for i, buck in enumerate(self.data):
            self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
        self.curr_idx = 0

        self.reset()

    def reset(self):
        """Resets the iterator to the beginning of the data."""
        self.curr_idx = 0
        random.shuffle(self.idx)
        # shuffle within each bucket with one permutation so that data and
        # label rows stay aligned
        for i in range(len(self.data)):
            perm = np.random.permutation(len(self.data[i]))
            self.data[i] = self.data[i][perm]
            self.label[i] = self.label[i][perm]

        self.nddata = []
        self.ndlabel = []
        for buck in self.data:
            self.nddata.append(ndarray.array(buck, dtype=self.dtype))
        for label in self.label:
            self.ndlabel.append(ndarray.array(label, dtype=self.dtype))

    def next(self):
        """Returns the next batch of data."""
        if self.curr_idx == len(self.idx):
            raise StopIteration
        i, j = self.idx[self.curr_idx]
        self.curr_idx += 1

        if self.major_axis == 1:
            data = self.nddata[i][j:j + self.batch_size].T
            label = self.ndlabel[i][j:j + self.batch_size].T
        else:
            data = self.nddata[i][j:j + self.batch_size]
            label = self.ndlabel[i][j:j + self.batch_size]

        return DataBatch([data], [label], pad=0,
                         bucket_key=self.buckets[i],
                         provide_data=[DataDesc(name=self.data_name, shape=data.shape,
                                                layout=self.layout)],
                         provide_label=[DataDesc(name=self.label_name, shape=label.shape,
                                                 layout=self.layout)])


# load the encoded sentences and tag sequences from text files
with open("../data/x_train.txt") as f:
    x_train = f.readlines()
x_train = [ast.literal_eval(x.strip()) for x in x_train]

with open("../data/x_test.txt") as f:
    x_test = f.readlines()
x_test = [ast.literal_eval(x.strip()) for x in x_test]

with open("../data/y_train.txt") as f:
    y_train = f.readlines()
y_train = [ast.literal_eval(x.strip()) for x in y_train]

with open("../data/y_test.txt") as f:
    y_test = f.readlines()
y_test = [ast.literal_eval(x.strip()) for x in y_test]

x_train = x_train[:2000]
x_test = x_test[:200]
y_train = y_train[:2000]
y_test = y_test[:200]

print("\ntraining examples: ", len(x_train), "\n\ntest examples: ", len(x_test), "\n")

# create custom data iterators for training and testing
train_iter = NERIter(data=x_train,
                     label=y_train,
                     batch_size=config.batch_size,
                     buckets=config.buckets,
                     data_name='seq_data',
                     label_name='seq_label')

val_iter = NERIter(data=x_test,
                   label=y_test,
                   batch_size=config.batch_size,
                   buckets=config.buckets,
                   data_name='seq_data',
                   label_name='seq_label')

# print the first few input batches for a sanity check
# for i, batch in enumerate(train_iter):
#     if batch.bucket_key == 5:
#         print("\nbatch ", i, " data: ", batch.data,
#               "\nbatch ", i, " label: ", batch.label,
#               "\nbucket size: ", batch.bucket_key)
#     continue
# train_iter.reset()

# create a bidirectional lstm cell
# https://mxnet.incubator.apache.org/api/python/rnn.html &
# http://colah.github.io/posts/2015-08-Understanding-LSTMs/
bi_cell = mx.rnn.BidirectionalCell(
    l_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="forward_"),
    r_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="backward_"))


# architecture is defined in a function, to allow variable-length input sequences
def sym_gen(seq_len):
    """Creates a network graph for the given sequence length."""
    print("-" * 50)

    # define hyperparameters from the data folder
    vocab_size = len(load_obj("../data/word_index_dict"))
    num_labels = len(load_obj("../data/tag_index_dict"))
    input_feature_shape = (config.batch_size, seq_len)
    input_label_shape = (config.batch_size, seq_len)

    # data placeholders: we are inputting a sequence of data each time
    seq_data = mx.symbol.Variable('seq_data')
    seq_label = mx.sym.Variable('seq_label')
    print("\ninput data shape: ",
          seq_data.infer_shape(seq_data=input_feature_shape)[1][0])
    print("\ninput label shape: ",
          seq_label.infer_shape(seq_label=input_label_shape)[1][0])

    # create an embedding layer
    embed_layer = mx.sym.Embedding(data=seq_data, input_dim=vocab_size,
                                   output_dim=config.word_embedding_vector_length,
                                   name='vocab_embed')
    print("\nembedding layer shape: ",
          embed_layer.infer_shape(seq_data=input_feature_shape)[1][0])

    # unroll the lstm cell in time, obtaining a concatenated symbol
    # (forward and backward) for each time step
    bi_cell.reset()
    outputs, states = bi_cell.unroll(length=seq_len, inputs=embed_layer,
                                     merge_outputs=False, layout="NTC")
    print("\nindividual concatenated forward and backward cell shape: ",
          outputs[0].infer_shape(seq_data=input_feature_shape)[1][0])
    print("\nnumber of recurrent cell unrolls: ", len(outputs))

    # for each timestep, add a fully connected layer with num_hidden = num_labels
    step_outputs = []
    for i, step_output in enumerate(outputs):
        fc = mx.sym.FullyConnected(data=step_output, num_hidden=num_labels)
        reshaped_fc = mx.sym.Reshape(data=fc, shape=(config.batch_size, num_labels, 1))
        step_outputs.append(reshaped_fc)
    print("\nshape after each cell output passes through fully connected layer: ",
          reshaped_fc.infer_shape(seq_data=input_feature_shape)[1][0])

    # concatenate the fully connected layers for each timestep
    sm_input = mx.sym.concat(*step_outputs, dim=2)
    print("\nshape after concatenating outputs: ",
          sm_input.infer_shape(seq_data=input_feature_shape)[1][0])

    # apply softmax cross-entropy loss to each column of each training example
    # (shape = (num_labels, tokens))
    sm = mx.sym.SoftmaxOutput(data=sm_input, label=seq_label, name='softmax',
                              multi_output=True)
    print("\nshape after loss function: ",
          sm.infer_shape(seq_data=input_feature_shape)[1][0])

    # set lstm pointer to the back of the network
    lstm = sm

    return lstm, ('seq_data',), ('seq_label',)


# create a trainable bucketing module
model = mx.mod.BucketingModule(sym_gen=sym_gen,
                               default_bucket_key=train_iter.default_bucket_key,
                               context=config.context)

################################
# fit the model (not working right now)
################################
model.fit(
    train_data=train_iter,
    eval_data=val_iter,
    eval_metric='accuracy',
    optimizer='sgd',
    optimizer_params={"learning_rate": config.learning_rate},
    num_epoch=config.num_epoch)

################################
# to debug issues
################################
# # allocate memory given the input data and label shapes
# model.bind(data_shapes=train_iter.provide_data,
#            label_shapes=train_iter.provide_label)
# # initialize parameters by uniform random numbers
# model.init_params(initializer=mx.init.Uniform(scale=.1))
# # use SGD to train
# model.init_optimizer(optimizer='sgd', optimizer_params=(
#     ('learning_rate', config.learning_rate), ))
# # use accuracy as the metric
# metric = mx.metric.create('acc')
# # train num_epoch epochs, i.e. one pass over the data iter per epoch
# for epoch in range(config.num_epoch):
#     train_iter.reset()
#     metric.reset()
#     for batch in train_iter:
#         model.forward(batch, is_train=True)       # compute predictions
#         model.update_metric(metric, batch.label)  # accumulate prediction accuracy
#         model.backward()                          # compute gradients
#         model.update()                            # update parameters
#     print('Epoch %d, Training %s' % (epoch, metric.get()))

################################
# predict to check shape
################################
print("\nmodel predictions are of shape: ", model.predict(val_iter).shape)
```
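For anyone who wants to reproduce without my `../data` files and `config` module, here is a minimal smoke test of the `NERIter` class above on synthetic data; the toy vocabulary size, tag count, bucket sizes, and batch size below are made up for illustration. It only verifies that each batch comes out padded to its bucket length with matching data/label shapes, independent of the training crash:

```python
import random

# NERIter is the class defined in the script above

# toy corpus: 600 random sentences of length 3-12 over a 50-token vocabulary,
# with one of 5 possible tags per token
random.seed(0)
x_toy = [[random.randint(1, 49) for _ in range(random.randint(3, 12))]
         for _ in range(600)]
y_toy = [[random.randint(0, 4) for _ in range(len(sent))] for sent in x_toy]

toy_iter = NERIter(data=x_toy, label=y_toy, batch_size=32,
                   buckets=[5, 10, 12],
                   data_name='seq_data', label_name='seq_label')

for batch in toy_iter:
    # every batch should be padded to its bucket length, with data and label
    # shapes identical: (32, bucket_key)
    assert batch.data[0].shape == batch.label[0].shape == (32, batch.bucket_key)
print("all", len(toy_iter.idx), "batches have consistent shapes")
```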
