opringle commented on issue #8663: terminate called after throwing an instance of 'std::out_of_range'
URL: https://github.com/apache/incubator-mxnet/issues/8663#issuecomment-356503080

Also getting the same error while trying to implement bucketing with variable-length data and labels. Steps to reproduce are below.

Data preparation script:

```python
# modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# custom modules
from custom_methods import save_obj
import config

# read in csv of NER training data
df = pd.read_csv("../data/ner_dataset.csv", encoding="ISO-8859-1")

# rename columns
df = df.rename(columns={"Sentence #": "utterance_id",
                        "Word": "token",
                        "POS": "POS_tag",
                        "Tag": "BILOU_tag"})

# clean utterance_id column
df.loc[:, "utterance_id"] = df["utterance_id"].str.replace('Sentence: ', '')

# fill np.nan utterance IDs with the last valid entry
df = df.fillna(method='ffill')
df.loc[:, "utterance_id"] = df["utterance_id"].apply(int)

# melt BILOU tags and tokens into a list per utterance
df1 = df.groupby("utterance_id")["BILOU_tag"].apply(list).to_frame().reset_index()
df2 = df.groupby("utterance_id")["token"].apply(list).to_frame().reset_index()

# join the results on utterance id
df = df1.merge(df2, how="left", on="utterance_id")

# pad sentences so they are at least as long as the smallest bucket size
min_sentence_length = min(config.buckets)

def pad(x, max_l):
    pads = max_l - len(x)
    if pads > 0:
        padded_sentence = x + [""] * pads
        padded_tags = x + ["O"] * pads
    else:
        padded_sentence = x
        padded_tags = x
    return padded_sentence, padded_tags

df["token"] = df["token"].apply(lambda x: pad(x, min_sentence_length)[0])
df["BILOU_tag"] = df["BILOU_tag"].apply(lambda x: pad(x, min_sentence_length)[1])

print(df.head(3))
print(df.iloc[2, 1])
print(df.iloc[2, 2])

# get dictionary mapping BILOU tags to indices and save it
unique_tags = list(set([a for b in df.BILOU_tag.tolist() for a in b]))
tag_indices = list(range(len(unique_tags)))
tag_index_dict = dict(zip(unique_tags, tag_indices))
save_obj(tag_index_dict, "../data/tag_index_dict")

# get dictionary mapping unique words to indices and save it
unique_words = list(set([a for b in df.token.tolist() for a in b]))
word_indices = list(range(len(unique_words)))
word_index_dict = dict(zip(unique_words, word_indices))
save_obj(word_index_dict, "../data/word_index_dict")

# index padded tag lists and padded utterances
df["indexed_tags"] = df["BILOU_tag"].apply(lambda x: [tag_index_dict.get(tag) for tag in x])
df["indexed_utterance"] = df["token"].apply(lambda x: [word_index_dict.get(word) for word in x])

# get a list of lists of int for data and labels
data = df.indexed_utterance.values.tolist()
label = df.indexed_tags.values.tolist()

# split into training and test sets
split_index = int(config.split[0] * len(data))
x_train = data[:split_index]
x_test = data[split_index:]
y_train = label[:split_index]
y_test = label[split_index:]

# save to file
file = open('../data/x_train.txt', 'w')
for item in x_train:
    file.write("%s\n" % item)
file = open('../data/x_test.txt', 'w')
for item in x_test:
    file.write("%s\n" % item)
file = open('../data/y_train.txt', 'w')
for item in y_train:
    file.write("%s\n" % item)
file = open('../data/y_test.txt', 'w')
for item in y_test:
    file.write("%s\n" % item)
```
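(The `config` module imported above is not included in this report, so the actual bucket sizes, batch size, etc. are unknown. A hypothetical `config.py` along the following lines is assumed so the scripts can be read end to end; every value below is a placeholder, not something taken from the original code.)

```python
# config.py -- hypothetical placeholder, NOT the reporter's actual settings
import mxnet as mx

buckets = [10, 20, 30, 40, 50]        # assumed bucket lengths
split = [0.8, 0.2]                    # assumed train/test split fractions
batch_size = 32                       # assumed batch size
lstm_state_size = 128                 # assumed LSTM hidden size
word_embedding_vector_length = 50     # assumed embedding width
learning_rate = 0.1                   # assumed SGD learning rate
num_epoch = 5                         # assumed number of epochs
context = mx.cpu()                    # assumed training context
```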
`custom_methods.py` (the bucketing iterator used below):

```python
import pickle
import bisect
import random

import numpy as np
import mxnet as mx
from mxnet.io import DataIter, DataBatch, DataDesc
from mxnet import ndarray


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


class NERIter(DataIter):
    """Simple bucketing iterator for named entity recognition.
    Input features have the same shape as output labels.

    Parameters
    ----------
    data : list of list of int
        Encoded data.
    label : list of list of int
        Encoded entity tags.
    batch_size : int
    invalid_label : int, optional
        Key for invalid label. The default is -1.
    dtype : str, optional
        Data type of the encoding. The default data type is 'float32'.
    buckets : list of int, optional
        Size of the data buckets. Automatically generated if None.
    data_name : str, optional
        Name of the data. The default name is 'data'.
    label_name : str, optional
        Name of the label. The default name is 'softmax_label'.
    layout : str, optional
        Format of data and label. 'NT' means (batch_size, length)
        and 'TN' means (length, batch_size).
    """
    def __init__(self, data, label, batch_size, buckets=None, invalid_label=-1,
                 data_name='data', label_name='softmax_label', dtype='float32',
                 layout='NT'):
        super(NERIter, self).__init__()
        if not buckets:
            buckets = [i for i, j in enumerate(np.bincount([len(s) for s in data]))
                       if j >= batch_size]
        buckets.sort()

        ndiscard = 0
        self.data = [[] for _ in buckets]
        for i, sent in enumerate(data):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.data[buck].append(buff)
        self.data = [np.asarray(i, dtype=dtype) for i in self.data]
        print("WARNING: discarded %d data longer than the largest bucket." % ndiscard)

        ndiscard = 0
        self.label = [[] for _ in buckets]
        for i, sent in enumerate(label):
            buck = bisect.bisect_left(buckets, len(sent))
            if buck == len(buckets):
                ndiscard += 1
                continue
            buff = np.full((buckets[buck],), invalid_label, dtype=dtype)
            buff[:len(sent)] = sent
            self.label[buck].append(buff)
        self.label = [np.asarray(i, dtype=dtype) for i in self.label]
        print("WARNING: discarded %d labels longer than the largest bucket." % ndiscard)

        self.batch_size = batch_size
        self.buckets = buckets
        self.data_name = data_name
        self.label_name = label_name
        self.dtype = dtype
        self.invalid_label = invalid_label
        self.nddata = []
        self.ndlabel = []
        self.major_axis = layout.find('N')
        self.layout = layout
        self.default_bucket_key = max(buckets)

        if self.major_axis == 0:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(batch_size, self.default_bucket_key),
                layout=self.layout)]
        elif self.major_axis == 1:
            self.provide_data = [DataDesc(
                name=self.data_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
            self.provide_label = [DataDesc(
                name=self.label_name,
                shape=(self.default_bucket_key, batch_size),
                layout=self.layout)]
        else:
            raise ValueError("Invalid layout %s: must be NT (batch major) or TN (time major)" % layout)

        self.idx = []
        for i, buck in enumerate(self.data):
            self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
        self.curr_idx = 0
        self.reset()

    def reset(self):
        """Resets the iterator to the beginning of the data."""
        self.curr_idx = 0
        random.shuffle(self.idx)
        for buck in self.data:
            np.random.shuffle(buck)

        self.nddata = []
        self.ndlabel = []
        for buck in self.data:
            self.nddata.append(ndarray.array(buck, dtype=self.dtype))
        for label in self.label:
            self.ndlabel.append(ndarray.array(label, dtype=self.dtype))

    def next(self):
        """Returns the next batch of data."""
        if self.curr_idx == len(self.idx):
            raise StopIteration
        i, j = self.idx[self.curr_idx]
        self.curr_idx += 1

        if self.major_axis == 1:
            data = self.nddata[i][j:j + self.batch_size].T
            label = self.ndlabel[i][j:j + self.batch_size].T
        else:
            data = self.nddata[i][j:j + self.batch_size]
            label = self.ndlabel[i][j:j + self.batch_size]

        return DataBatch([data], [label], pad=0,
                         bucket_key=self.buckets[i],
                         provide_data=[DataDesc(name=self.data_name,
                                                shape=data.shape,
                                                layout=self.layout)],
                         provide_label=[DataDesc(name=self.label_name,
                                                 shape=label.shape,
                                                 layout=self.layout)])
```
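A quick way to exercise the iterator in isolation is to drive it on toy input (a sketch, assuming the class above is importable as `custom_methods.NERIter`; the toy sequences and the single bucket size are made up):

```python
# Sketch: drive NERIter on toy data and print what each batch delivers.
# Everything here is illustrative; it is not part of the original report.
from custom_methods import NERIter

toy_data  = [[1, 2, 3, 4, 5], [6, 7, 8, 0, 0], [2, 2, 2, 2, 2], [9, 8, 7, 6, 5]]
toy_label = [[0, 1, 0, 2, 2], [1, 0, 2, 2, 2], [0, 0, 1, 0, 2], [1, 1, 1, 1, 1]]

it = NERIter(data=toy_data, label=toy_label, batch_size=2, buckets=[5],
             data_name='seq_data', label_name='seq_label')

for batch in it:
    # expect bucket_key 5 and (2, 5) shapes for both data and label ('NT' layout)
    print(batch.bucket_key, batch.data[0].shape, batch.label[0].shape)
it.reset()
```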
Training script:

```python
# modules
import mxnet as mx
import numpy as np
import sys
import os
import ast

# custom modules
import config
from custom_methods import load_obj, NERIter

# load the preprocessed data
with open("../data/x_train.txt") as f:
    x_train = f.readlines()
x_train = [ast.literal_eval(x.strip()) for x in x_train]

with open("../data/x_test.txt") as f:
    x_test = f.readlines()
x_test = [ast.literal_eval(x.strip()) for x in x_test]

with open("../data/y_train.txt") as f:
    y_train = f.readlines()
y_train = [ast.literal_eval(x.strip()) for x in y_train]

with open("../data/y_test.txt") as f:
    y_test = f.readlines()
y_test = [ast.literal_eval(x.strip()) for x in y_test]

x_train = x_train[:2000]
x_test = x_test[:200]
y_train = y_train[:2000]
y_test = y_test[:200]

print("\ntraining examples: ", len(x_train), "\n\ntest examples: ", len(x_test), "\n")

# create custom data iterators for training and testing
train_iter = NERIter(data=x_train,
                     label=y_train,
                     batch_size=config.batch_size,
                     buckets=config.buckets,
                     data_name='seq_data',
                     label_name='seq_label')

val_iter = NERIter(data=x_test,
                   label=y_test,
                   batch_size=config.batch_size,
                   buckets=config.buckets,
                   data_name='seq_data',
                   label_name='seq_label')

# print the first few input batches for a sanity check
# for i, batch in enumerate(train_iter):
#     if batch.bucket_key == 5:
#         print("\nbatch ", i, " data: ", batch.data, "\nbatch ", i, " label: ", batch.label,
#               "\nbucket size: ", batch.bucket_key)
#     continue
# train_iter.reset()
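# --- Added for this write-up (not in the original script): another check that
# --- can be uncommented. It compares what the iterator advertises with what the
# --- first batch actually contains, since these names and shapes are what the
# --- BucketingModule below binds against.
# print("provide_data: ", train_iter.provide_data)
# print("provide_label: ", train_iter.provide_label)
# first_batch = next(iter(train_iter))
# print("first bucket_key: ", first_batch.bucket_key,
#       " data shape: ", first_batch.data[0].shape,
#       " label shape: ", first_batch.label[0].shape)
# train_iter.reset()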
# create a bidirectional lstm cell
# https://mxnet.incubator.apache.org/api/python/rnn.html
# http://colah.github.io/posts/2015-08-Understanding-LSTMs/
bi_cell = mx.rnn.BidirectionalCell(
    l_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="forward_"),
    r_cell=mx.rnn.LSTMCell(num_hidden=config.lstm_state_size, prefix="backward_"))

# the architecture is defined in a function, to allow variable length input sequences
def sym_gen(seq_len):
    """Creates a network graph for the given sequence length."""
    print("-" * 50)

    # define hyperparameters from data folder
    vocab_size = len(load_obj("../data/word_index_dict"))
    num_labels = len(load_obj("../data/tag_index_dict"))

    input_feature_shape = (config.batch_size, seq_len)
    input_label_shape = (config.batch_size, seq_len)

    # data placeholders: we are inputting a sequence of data each time
    seq_data = mx.symbol.Variable('seq_data')
    seq_label = mx.sym.Variable('seq_label')
    print("\ninput data shape: ", seq_data.infer_shape(seq_data=input_feature_shape)[1][0])
    print("\ninput label shape: ", seq_label.infer_shape(seq_label=input_label_shape)[1][0])

    # create an embedding layer
    embed_layer = mx.sym.Embedding(data=seq_data,
                                   input_dim=vocab_size,
                                   output_dim=config.word_embedding_vector_length,
                                   name='vocab_embed')
    print("\nembedding layer shape: ", embed_layer.infer_shape(seq_data=input_feature_shape)[1][0])

    # unroll the lstm cell in time, obtaining a concatenated symbol for each
    # time step (forward and backward)
    bi_cell.reset()
    outputs, states = bi_cell.unroll(length=seq_len,
                                     inputs=embed_layer,
                                     merge_outputs=False,
                                     layout="NTC")
    print("\nindividual concatenated forward and backward cell shape: ",
          outputs[0].infer_shape(seq_data=input_feature_shape)[1][0])
    print("\nnumber of recurrent cell unrolls: ", len(outputs))

    # for each timestep, add a fully connected layer with num_neurons = num_possible_tags
    step_outputs = []
    for i, step_output in enumerate(outputs):
        fc = mx.sym.FullyConnected(data=step_output, num_hidden=num_labels)
        reshaped_fc = mx.sym.Reshape(data=fc, shape=(config.batch_size, num_labels, 1))
        step_outputs.append(reshaped_fc)
    print("\nshape after each cell output passes through fully connected layer: ",
          reshaped_fc.infer_shape(seq_data=input_feature_shape)[1][0])

    # concatenate fully connected layers for each timestep
    sm_input = mx.sym.concat(*step_outputs, dim=2)
    print("\nshape after concatenating outputs: ",
          sm_input.infer_shape(seq_data=input_feature_shape)[1][0])

    # apply softmax cross entropy loss to each column of each training example
    # (shape = (num_labels, tokens))
    sm = mx.sym.SoftmaxOutput(data=sm_input, label=seq_label, name='softmax', multi_output=True)
    print("\nshape after loss function: ", sm.infer_shape(seq_data=input_feature_shape)[1][0])

    # set lstm pointer to back of network
    lstm = sm

    return lstm, ('seq_data',), ('seq_label',)

# create a trainable bucketing module
model = mx.mod.BucketingModule(sym_gen=sym_gen,
                               default_bucket_key=train_iter.default_bucket_key,
                               context=config.context)

################
# fit the model (not working right now)
################
model.fit(
    train_data=train_iter,
    eval_data=val_iter,
    eval_metric='accuracy',
    optimizer='sgd',
    optimizer_params={"learning_rate": config.learning_rate},
    num_epoch=config.num_epoch)

###############
# to debug issues
###############
# # allocate memory given the input data and label shapes
# model.bind(data_shapes=train_iter.provide_data,
#            label_shapes=train_iter.provide_label)
# # initialize parameters by uniform random numbers
# model.init_params(initializer=mx.init.Uniform(scale=.1))
# # use SGD with learning rate 0.1 to train
# model.init_optimizer(optimizer='sgd',
#                      optimizer_params=(('learning_rate', config.learning_rate), ))
# # use accuracy as the metric
# metric = mx.metric.create('acc')
# # train 5 epochs, i.e. going over the data iter one pass
# for epoch in range(config.num_epoch):
#     train_iter.reset()
#     metric.reset()
#     for batch in train_iter:
#         model.forward(batch, is_train=True)       # compute predictions
#         model.update_metric(metric, batch.label)  # accumulate prediction accuracy
#         model.backward()                          # compute gradients
#         model.update()                            # update parameters
#     print('Epoch %d, Training %s' % (epoch, metric.get()))

########################
# predict to check shape
########################
print("\nmodel predictions are of shape: ", model.predict(val_iter).shape)
```
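One way to narrow this down might be to keep the `NERIter`/`BucketingModule` wiring identical (the 'seq_data'/'seq_label' names and the bucket keys) but swap the bidirectional LSTM for a trivial per-token classifier, so problems in the iterator/bucketing plumbing can be separated from problems inside the unrolled graph. A sketch, with made-up toy data and sizes, and assuming the `NERIter` class above is saved as `custom_methods.py`:

```python
# Standalone sketch: same bucketing mechanics, trivial per-token model.
# Everything below is illustrative and not taken from the original scripts.
import numpy as np
import mxnet as mx
from custom_methods import NERIter

vocab_size, num_labels, batch_size = 50, 5, 4
buckets = [6, 10]

rng = np.random.RandomState(0)
lengths = rng.choice(buckets, 40)                                 # toy sequence lengths
data = [list(rng.randint(1, vocab_size, size=l)) for l in lengths]
label = [list(rng.randint(0, num_labels, size=l)) for l in lengths]

toy_iter = NERIter(data=data, label=label, batch_size=batch_size, buckets=buckets,
                   data_name='seq_data', label_name='seq_label')

def toy_sym_gen(seq_len):
    # per-token classifier: embedding followed by one FC shared across time steps,
    # so parameter shapes do not depend on seq_len
    seq_data = mx.sym.Variable('seq_data')
    seq_label = mx.sym.Variable('seq_label')
    embed = mx.sym.Embedding(seq_data, input_dim=vocab_size, output_dim=8,
                             name='embed')                        # (N, T, 8)
    flat = mx.sym.Reshape(embed, shape=(-1, 8))                   # (N*T, 8)
    fc = mx.sym.FullyConnected(flat, num_hidden=num_labels, name='fc')  # (N*T, L)
    scores = mx.sym.Reshape(fc, shape=(batch_size, seq_len, num_labels))
    scores = mx.sym.transpose(scores, axes=(0, 2, 1))             # (N, L, T)
    sm = mx.sym.SoftmaxOutput(scores, label=seq_label, multi_output=True, name='softmax')
    return sm, ('seq_data',), ('seq_label',)

mod = mx.mod.BucketingModule(sym_gen=toy_sym_gen,
                             default_bucket_key=toy_iter.default_bucket_key,
                             context=mx.cpu())
mod.fit(toy_iter, eval_metric='accuracy', optimizer='sgd',
        optimizer_params={'learning_rate': 0.1}, num_epoch=1)
```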
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at: [email protected]

With regards,
Apache Git Services
