ThomasDelteil commented on issue #10042: [MXNET-86] Gluon dataloader crash on speech recognition training URL: https://github.com/apache/incubator-mxnet/issues/10042#issuecomment-374378588 Thanks @Jerryzcn here is a reproducible example: Play with the `NUM_WORKERS` and `optimized` parameters to surface the problems i mentioned above ```python import mxnet as mx print(mx.__version__) from mxnet import nd, autograd, gluon import os import pandas as pd from mxnet.gluon.data import ArrayDataset from mxnet.gluon.data import DataLoader import numpy as np import multiprocessing import wget if not os.path.isfile('pickleddata.pkl'): wget.download('https://s3.us-east-2.amazonaws.com/tdelteil-test-mxnet/pickleddata.pkl') data = pd.read_pickle('pickleddata.pkl') # /!\ The important bit: NUM_WORKERS = multiprocessing.cpu_count() # number of workers used in the data loading optimized = True categories = [ 'Home_and_Kitchen', 'Books', 'CDs_and_Vinyl', 'Movies_and_TV', 'Cell_Phones_and_Accessories', 'Sports_and_Outdoors', 'Clothing_Shoes_and_Jewelry' ] ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}") # The 69 characters as specified in the paper ALPHABET_INDEX = {letter: index for index, letter in enumerate(ALPHABET)} # { a: 0, b: 1, etc} FEATURE_LEN = 1014 # max-length in characters for one document BATCH_SIZE = 128 # number of documents per batch def encode(text): encoded = np.zeros([len(ALPHABET), FEATURE_LEN], dtype='float32') review = text.lower()[:FEATURE_LEN-1:-1] i = 0 for letter in text: if i >= FEATURE_LEN: break; if letter in ALPHABET_INDEX: encoded[ALPHABET_INDEX[letter]][i] = 1 i += 1 return encoded class AmazonDataSet(ArrayDataset): # We pre-process the documents on the fly def __getitem__(self, idx): return encode(self._data[0][idx]), self._data[1][idx] # Data loaders: split = 0.8 split_index = int(split*len(data)) train_data_X = data['X'][:split_index].as_matrix() train_data_Y = data['Y'][:split_index].as_matrix() 
test_data_X = data['X'][split_index:].as_matrix()
test_data_Y = data['Y'][split_index:].as_matrix()

train_dataset = AmazonDataSet(train_data_X, train_data_Y)
test_dataset = AmazonDataSet(test_data_X, test_data_Y)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, last_batch='discard')
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE,
                             num_workers=NUM_WORKERS, last_batch='discard')

# context:
ctx = mx.gpu()  # to run on GPU

# build network
NUM_FILTERS = 256  # number of convolutional filters per convolutional layer
NUM_OUTPUTS = len(categories)  # number of classes
FULLY_CONNECTED = 1024  # number of unit in the fully connected dense layer
DROPOUT_RATE = 0.5  # probability of node drop out
LEARNING_RATE = 0.01  # learning rate of the gradient
MOMENTUM = 0.9  # momentum of the gradient
WDECAY = 0.00001  # regularization term to limit size of weights

net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
    net.add(gluon.nn.Dropout(DROPOUT_RATE))
    net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
    net.add(gluon.nn.Dropout(DROPOUT_RATE))
    net.add(gluon.nn.Dense(NUM_OUTPUTS))

hybridize = True  # for speed improvement, compile the network but no in-depth debugging possible
# BUG FIX: the flag above was defined but net.hybridize() was never called,
# so the "compile the network" comment was not actually honored.
if hybridize:
    net.hybridize()
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) # loss softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() # optimizer trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': LEARNING_RATE, 'wd':WDECAY, 'momentum':MOMENTUM}) # Training Loop import time start_epoch = 6 number_epochs = 7 smoothing_constant = .01 tick = time.time() for e in range(start_epoch, number_epochs): for i, (review, label) in enumerate(train_dataloader): review = review.as_in_context(ctx) label = label.as_in_context(ctx) with autograd.record(): output = net(review) loss = softmax_cross_entropy(output, label) loss.backward() trainer.step(review.shape[0]) # moving average of the loss if optimized: curr_loss = nd.mean(loss) else: curr_loss = nd.mean(loss).asscalar() moving_loss = (curr_loss if (i == 0) else (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss) if (i%100 == 0): tock = time.time() if optimized: print('Batch {}:{},{},{} seconds for 100 batches'.format(i, curr_loss.asscalar(),moving_loss.asscalar(), tock-tick)) else: print('Batch {}:{},{},{} seconds for 100 batches'.format(i, curr_loss, moving_loss, tick-tock)) tick = tock print("Epoch %s. Loss: %s, Test_acc %s" % (e, moving_loss.asscalar(), test_accuracy)) ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
