Marcovaldong commented on issue #14796: The model gets stuck after several thousand batch_size
URL: https://github.com/apache/incubator-mxnet/issues/14796#issuecomment-487273345

@lanking520 The code is as follows:

```
from tqdm import tqdm
import argparse
import logging
import math
import os
import time
import collections

import kaldi_io
import mxnet as mx
import numpy as np
from mxnet import autograd, gluon
from mxnet.gluon import utils as gutils
from mxnet.gluon import contrib
from DataLoader import SequentialLoader, TokenAcc, phone
from DataLoader import SpectrogramDataset, AudioDataLoader, BucketingSampler
from DataLoader import _batchify_fn
from mxnet.gluon.data import DataLoader
from model import Transducer

parser = argparse.ArgumentParser(description='MXNet Autograd RNN/LSTM Acoustic Model on youdao.')
parser.add_argument('--train_manifest', metavar='DIR',
                    help='path to train manifest csv', default='data/manifest.huiting.pinyin.train')  # 'data/train_manifest.csv')
parser.add_argument('--val_manifest', metavar='DIR',
                    help='path to validation manifest csv', default='data/manifest.huiting.pinyin.test')  # 'data/val.csv')
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--num_workers', default=32, type=int, help='Number of workers used in data-loading')
parser.add_argument('--labels_path', default='labels', help='Contains all characters for transcription')
parser.add_argument('--window_size', default=.02, type=float, help='Window size for spectrogram in seconds')
parser.add_argument('--window_stride', default=.01, type=float, help='Window stride for spectrogram in seconds')
parser.add_argument('--window', default='hamming', help='Window type for spectrogram generation')
parser.add_argument('--noise_dir', default=None,
                    help='Directory to inject noise into audio. If default, noise Inject not added')
parser.add_argument('--noise_prob', default=0.4, help='Probability of noise being added per sample')
parser.add_argument('--noise_min', default=0.0, type=float,
                    help='Minimum noise level to sample from. (1.0 means all noise, not original signal)')
parser.add_argument('--noise_max', default=0.5, type=float,
                    help='Maximum noise levels to sample from. Maximum 1.0')
parser.add_argument('--lr', type=float, default=1e-3, help='initial learning rate')
parser.add_argument('--epochs', type=int, default=200, help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=1, metavar='N', help='batch size')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--bi', default=True, action='store_true', help='whether use bidirectional lstm')
parser.add_argument('--noise', type=float, default=0, help='add gaussian noise to inputs')
parser.add_argument('--log-interval', type=int, default=50,  # metavar='N',
                    help='report interval')
parser.add_argument('--out', type=str, default='exp/rnnt_lr1e-3', help='path to save the final model')
parser.add_argument('--stdout', default=False, action='store_true')
parser.add_argument('--init', type=str, default='', help='Initial am & pm parameters')
parser.add_argument('--begin_epoch', default=1, type=int, help='the epoch number from which to train')
parser.add_argument('--initam', type=str, default='', help='Initial am parameters')
parser.add_argument('--initpm', type=str, default='', help='Initial pm parameters')
parser.add_argument('--gradclip', type=float, default=0)
parser.add_argument('--schedule', default=True, help='whether to annealing the learning rate')
parser.add_argument('--tmp', default=1000, help='how many epoch to save params for preventing crash')
args = parser.parse_args()

os.makedirs(args.out, exist_ok=True)
with open(os.path.join(args.out, 'args'), 'w') as f:
    f.write(str(args))
if args.stdout:
    logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S", level=logging.INFO)
else:
    logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S",
                        filename=os.path.join(args.out, 'train.log'), level=logging.INFO)

context = [mx.gpu(i) for i in [0, 1, 2, 3]]
# context = [mx.gpu(i) for i in [6, 7]]

# Dataset
audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size,
                  window_stride=args.window_stride,
                  window=args.window,
                  noise_dir=args.noise_dir,
                  noise_prob=args.noise_prob,
                  noise_levels=(args.noise_min, args.noise_max))
# trainset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.train_manifest, batch_size=args.batch_size,
#                             normalize=True, augment=False)
# devset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.val_manifest, batch_size=args.batch_size,
#                           normalize=True, augment=False)
train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest,
                                   normalize=True, augment=False)
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest,
                                  normalize=True, augment=False)
train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
train_loader = DataLoader(train_dataset, batchify_fn=_batchify_fn,
                          num_workers=args.num_workers, batch_sampler=train_sampler)
# train_loader = DataLoader(train_dataset, batch_size=args.batch_size, batchify_fn=_batchify_fn,
#                           num_workers=args.num_workers, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size//4, batchify_fn=_batchify_fn,
                         num_workers=args.num_workers)

###############################################################################
# Build the model
###############################################################################
model = Transducer(len(phone), 250, 3, args.dropout, bidirectional=args.bi)
# model = Transducer(input_size=320, vocab_size=len(phone), num_hidden=250, num_layers=3, dropout=args.dropout,
#                    blank=0, bidirectional=args.bi)
if args.init:
    model.collect_params().load(args.init, context)
elif args.initam or args.initpm:
    model.initialize(mx.init.Uniform(0.1), ctx=context)
    # NOTE only use lstm layer
    if args.initam:
        model.collect_params('transducer0_rnnmodel0_lstm0').load(args.initam, context, True, True)
    if args.initpm:
        model.collect_params('transducer0_lstm0').load(args.initpm, context, True, True)
else:
    model.initialize(mx.init.Uniform(0.1), ctx=context)

# trainer = gluon.Trainer(model.collect_params(), 'sgd',
#                         {'learning_rate': args.lr,
#                          'momentum': 0.9})
trainer = gluon.Trainer(model.collect_params(), 'adam',
                        {'learning_rate': args.lr, 'beta1': 0.4})


def evaluate(model):
    losses = []
    for (data) in tqdm(test_loader):
        xs, ys, xlen, ylen = data
        gpu_xs = gutils.split_and_load(xs, ctx_list=context)
        gpu_ys = gutils.split_and_load(ys, ctx_list=context)
        gpu_xlens = gutils.split_and_load(xlen, ctx_list=context)
        gpu_ylens = gutils.split_and_load(ylen, ctx_list=context)
        # with autograd.record():
        loss_gpus = [model(gpu_x, gpu_y, gpu_xlen, gpu_ylen)
                     for gpu_x, gpu_y, gpu_xlen, gpu_ylen in zip(gpu_xs, gpu_ys, gpu_xlens, gpu_ylens)]
        losses.append(sum([float(loss.sum().asscalar()) for loss in loss_gpus]))
    return sum(losses) / len(test_dataset)


def train():
    best_model = None
    prev_loss = 1000
    for epoch in range(args.begin_epoch, args.epochs+1):
        print('Train the model for the %d th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
        logging.info('Train the model for the %d th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
        losses = []
        totl0 = 0
        start_time = time.time()
        tic = time.time()
        for i, (xs, ys, xlen, ylen) in enumerate(train_loader, 1):
            gpu_xs = gutils.split_and_load(xs, ctx_list=context)
            gpu_ys = gutils.split_and_load(ys, ctx_list=context)
            gpu_xlens = gutils.split_and_load(xlen, ctx_list=context)
            gpu_ylens = gutils.split_and_load(ylen, ctx_list=context)
            if args.noise > 0:
                xs += mx.nd.normal(0, args.noise, xs.shape[-1], ctx=xs.context)
            with autograd.record():
                loss_gpus = [model(gpu_x, gpu_y, gpu_xlen, gpu_ylen)
                             for gpu_x, gpu_y, gpu_xlen, gpu_ylen in zip(gpu_xs, gpu_ys, gpu_xlens, gpu_ylens)]
                for loss in loss_gpus:
                    loss.backward()
            losses.append(sum([float(loss.sum().asscalar()) for loss in loss_gpus]))
            # gradient clip
            if args.gradclip > 0:
                grads = [p.grad(context) for p in model.collect_params().values()]
                gluon.utils.clip_global_norm(grads, args.gradclip)
            trainer.step(args.batch_size)  # , ignore_stale_grad=True)
            # mx.nd.waitall()
            totl0 += losses[-1]
            if i % args.log_interval == 0:
                l0 = totl0 / args.batch_size / args.log_interval
                toc = time.time()
                print("Epoch [%d / %d][%d / %d] loss %.2f time %.2f s" %
                      (epoch, args.epochs, i, len(train_loader), l0, toc - tic))
                logging.info('[Epoch %d Batch %d] loss %.2f' % (epoch, i, l0))
                totl0 = 0
                tic = time.time()
            if i % args.tmp == 0:
                tmp_path = "{}/params_tmp_epoch{:03d}".format(args.out, epoch)
                model.collect_params().save(tmp_path)
        losses = sum(losses) / len(train_dataset)
        val_l = evaluate(model)
        print('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
            epoch, time.time()-start_time, losses, val_l, trainer.learning_rate))
        logging.info('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
            epoch, time.time()-start_time, losses, val_l, trainer.learning_rate))
        if val_l < prev_loss:
            prev_loss = val_l
            best_model = '{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}'.format(args.out, epoch, losses, val_l)
            model.collect_params().save(best_model)
            flag = 0
        else:
            model.collect_params().save('{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}_rejected'.format(args.out, epoch, losses, val_l))
            model.collect_params().load(best_model, context)
            if args.schedule:
                trainer.set_learning_rate(trainer.learning_rate / 5)
            flag = 1
        if args.schedule and not flag:
            trainer.set_learning_rate(trainer.learning_rate / 2)


if __name__ == '__main__':
    train()
```
