burness commented on issue #11763: When Train SSD, It hold on during read the data
URL: https://github.com/apache/incubator-mxnet/issues/11763#issuecomment-405783889
 
 
   Yes, I made my own dataset in VOC format, and here is my train_ssd.py (a rough sketch of the custom own_voc_detection module follows the script):
   
   ```
   """Train SSD"""
   import argparse
   import os
   import logging
   import time
   import numpy as np
   import mxnet as mx
   from mxnet import nd
   from mxnet import gluon
   from mxnet import autograd
   import gluoncv as gcv
   from gluoncv import data as gdata
   from gluoncv import utils as gutils
   from gluoncv.model_zoo import get_model
   from gluoncv.data.batchify import Tuple, Stack, Pad
   from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
   from gluoncv.data.transforms.presets.ssd import SSDDefaultValTransform
   from gluoncv.utils.metrics.voc_detection import VOC07MApMetric
   from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
   from gluoncv.utils.metrics.accuracy import Accuracy
   from own_voc_detection import VOCDetection
   
   def parse_args():
       parser = argparse.ArgumentParser(description='Train SSD networks.')
       parser.add_argument('--network', type=str, default='vgg16_atrous',
                           help="Base network name which serves as feature extraction base.")
       parser.add_argument('--data-shape', type=int, default=300,
                           help="Input data shape, use 300, 512.")
       parser.add_argument('--batch-size', type=int, default=32,
                           help='Training mini-batch size')
       parser.add_argument('--dataset', type=str, default='voc',
                           help='Training dataset. Currently supports voc.')
       parser.add_argument('--num-workers', '-j', dest='num_workers', type=int,
                           default=4, help='Number of data workers; you can use a larger '
                           'number to accelerate data loading if your CPU and GPUs are powerful.')
       parser.add_argument('--gpus', type=str, default='0',
                           help='Training with GPUs, you can specify 1,3 for example.')
       parser.add_argument('--epochs', type=int, default=240,
                           help='Training epochs.')
       parser.add_argument('--resume', type=str, default='',
                           help='Resume from previously saved parameters if not None. '
                           'For example, you can resume from ./ssd_xxx_0123.params')
       parser.add_argument('--start-epoch', type=int, default=0,
                           help='Starting epoch for resuming, default is 0 for new training. '
                           'You can specify 100, for example, to start from epoch 100.')
       parser.add_argument('--lr', type=float, default=0.001,
                           help='Learning rate, default is 0.001')
       parser.add_argument('--lr-decay', type=float, default=0.1,
                           help='decay rate of learning rate. default is 0.1.')
       parser.add_argument('--lr-decay-epoch', type=str, default='160,200',
                           help='Epochs at which the learning rate decays; default is 160,200.')
       parser.add_argument('--momentum', type=float, default=0.9,
                           help='SGD momentum, default is 0.9')
       parser.add_argument('--wd', type=float, default=0.0005,
                           help='Weight decay, default is 5e-4')
       parser.add_argument('--log-interval', type=int, default=100,
                           help='Logging mini-batch interval. Default is 100.')
       parser.add_argument('--save-prefix', type=str, default='',
                           help='Saving parameter prefix')
       parser.add_argument('--save-interval', type=int, default=10,
                           help='Epoch interval for saving parameters; the best model will always be saved.')
       parser.add_argument('--val-interval', type=int, default=1,
                           help='Epoch interval for validation; increasing the number will reduce the '
                                'training time if validation is slow.')
       parser.add_argument('--seed', type=int, default=233,
                           help='Random seed to be fixed.')
       args = parser.parse_args()
       return args
   
   def get_dataset(dataset, args):
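       """Return the train/val datasets and a matching validation metric."""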
       if dataset.lower() == 'voc':
           train_dataset = VOCDetection(
               splits=[(2007, 'trainval')])
           val_dataset = gdata.VOCDetection(
               splits=[(2007, 'test')])
           val_metric = VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)
       elif dataset.lower() == 'coco':
           train_dataset = gdata.COCODetection(splits='instances_train2017')
           val_dataset = gdata.COCODetection(splits='instances_val2017', skip_empty=False)
           val_metric = COCODetectionMetric(
               val_dataset, args.save_prefix + '_eval', cleanup=True,
               data_shape=(args.data_shape, args.data_shape))
           # coco validation is slow, consider increasing the validation interval
           if args.val_interval == 1:
               args.val_interval = 10
       else:
           raise NotImplementedError('Dataset: {} not implemented.'.format(dataset))
       return train_dataset, val_dataset, val_metric
   
   def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_workers):
       """Get dataloader."""
       width, height = data_shape, data_shape
       # use fake data to generate fixed anchors for target generation
       with autograd.train_mode():
           _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
       batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
       train_loader = gluon.data.DataLoader(
           train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
           batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
       val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
       val_loader = gluon.data.DataLoader(
           val_dataset.transform(SSDDefaultValTransform(width, height)),
           batch_size, False, batchify_fn=val_batchify_fn, last_batch='keep', num_workers=num_workers)
       return train_loader, val_loader
   
   def save_params(net, best_map, current_map, epoch, save_interval, prefix):
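       """Save parameters, tracking the best mAP and writing periodic snapshots."""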
       current_map = float(current_map)
       if current_map > best_map[0]:
           best_map[0] = current_map
           net.save_params('{:s}_best.params'.format(prefix, epoch, current_map))
           with open(prefix+'_best_map.log', 'a') as f:
               f.write('\n{:04d}:\t{:.4f}'.format(epoch, current_map))
       if save_interval and epoch % save_interval == 0:
           net.save_params('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))
   
   def validate(net, val_data, ctx, eval_metric):
       """Test on validation dataset."""
       eval_metric.reset()
       # set nms threshold and topk constraint
       net.set_nms(nms_thresh=0.45, nms_topk=400)
       net.hybridize()
       for batch in val_data:
           data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
           label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
           det_bboxes = []
           det_ids = []
           det_scores = []
           gt_bboxes = []
           gt_ids = []
           gt_difficults = []
           for x, y in zip(data, label):
               # get prediction results
               ids, scores, bboxes = net(x)
               det_ids.append(ids)
               det_scores.append(scores)
               # clip to image size
               det_bboxes.append(bboxes.clip(0, batch[0].shape[2]))
               # split ground truths
               gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5))
               gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4))
               gt_difficults.append(y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None)
   
           # update metric
           eval_metric.update(det_bboxes, det_ids, det_scores, gt_bboxes, gt_ids, gt_difficults)
       return eval_metric.get()
   
   def train(net, train_data, val_data, eval_metric, args):
       """Training pipeline"""
       net.collect_params().reset_ctx(ctx)
       trainer = gluon.Trainer(
           net.collect_params(), 'sgd',
           {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
   
       # lr decay policy
       lr_decay = float(args.lr_decay)
       lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
   
       mbox_loss = gcv.loss.SSDMultiBoxLoss()
       ce_metric = mx.metric.Loss('CrossEntropy')
       smoothl1_metric = mx.metric.Loss('SmoothL1')
   
       # set up logger
       logging.basicConfig()
       logger = logging.getLogger()
       logger.setLevel(logging.INFO)
       log_file_path = args.save_prefix + '_train.log'
       log_dir = os.path.dirname(log_file_path)
       if log_dir and not os.path.exists(log_dir):
           os.makedirs(log_dir)
       fh = logging.FileHandler(log_file_path)
       logger.addHandler(fh)
       logger.info(args)
       logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
       best_map = [0]
       for epoch in range(args.start_epoch, args.epochs):
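           # step the learning rate down once for every decay epoch already reached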
           while lr_steps and epoch >= lr_steps[0]:
               new_lr = trainer.learning_rate * lr_decay
               lr_steps.pop(0)
               trainer.set_learning_rate(new_lr)
               logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
           ce_metric.reset()
           smoothl1_metric.reset()
           tic = time.time()
           btic = time.time()
           net.hybridize()
           for i, batch in enumerate(train_data):
               batch_size = batch[0].shape[0]
               data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
               cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
               box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
               with autograd.record():
                   cls_preds = []
                   box_preds = []
                   for x in data:
                       cls_pred, box_pred, _ = net(x)
                       cls_preds.append(cls_pred)
                       box_preds.append(box_pred)
                   sum_loss, cls_loss, box_loss = mbox_loss(
                       cls_preds, box_preds, cls_targets, box_targets)
                   autograd.backward(sum_loss)
               # since we have already normalized the loss, we don't want to normalize
               # by batch-size anymore
               trainer.step(1)
               ce_metric.update(0, [l * batch_size for l in cls_loss])
               smoothl1_metric.update(0, [l * batch_size for l in box_loss])
               if args.log_interval and not (i + 1) % args.log_interval:
                   name1, loss1 = ce_metric.get()
                   name2, loss2 = smoothl1_metric.get()
                   logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                       epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2))
               btic = time.time()
   
           name1, loss1 = ce_metric.get()
           name2, loss2 = smoothl1_metric.get()
           logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
               epoch, (time.time()-tic), name1, loss1, name2, loss2))
           if not (epoch + 1) % args.val_interval:
               # consider reduce the frequency of validation to save time
               map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
               val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
               logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
               current_map = float(mean_ap[-1])
           else:
               current_map = 0.
           save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
   
   if __name__ == '__main__':
       args = parse_args()
       # fix seed for mxnet, numpy and python builtin random generator.
       gutils.random.seed(args.seed)
   
       # training contexts
       ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()]
       ctx = ctx if ctx else [mx.cpu()]
   
       # network
       net_name = '_'.join(('ssd', str(args.data_shape), args.network, args.dataset))
       args.save_prefix += net_name
       net = get_model(net_name, pretrained_base=True)
       if args.resume.strip():
           net.load_params(args.resume.strip())
       else:
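           # initialize only parameters that were not already set by the pretrained base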
           for param in net.collect_params().values():
               if param._data is not None:
                   continue
               param.initialize()
   
       # training data
       train_dataset, val_dataset, eval_metric = get_dataset(args.dataset, args)
       train_data, val_data = get_dataloader(
           net, train_dataset, val_dataset, args.data_shape, args.batch_size, args.num_workers)
   
       # training
       train(net, train_data, val_data, eval_metric, args)
   
   ```
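
   own_voc_detection is not shown here; a custom VOC-format dataset is typically built by subclassing gluoncv.data.VOCDetection and overriding its class list. A minimal sketch of that pattern, with placeholder class names and dataset root (not my real values):

   ```
   """own_voc_detection.py: sketch of a VOC-format dataset with custom classes."""
   from gluoncv import data as gdata

   class VOCDetection(gdata.VOCDetection):
       # Placeholder class names; replace with the labels used in your annotations.
       CLASSES = ('class_a', 'class_b')

       def __init__(self, root='path/to/VOCdevkit',  # placeholder dataset root
                    splits=((2007, 'trainval'),), **kwargs):
           super(VOCDetection, self).__init__(root=root, splits=splits, **kwargs)

       @property
       def classes(self):
           # return the custom labels instead of the 20 standard VOC classes
           return type(self).CLASSES
   ```

   If the loader hangs instead of crashing, one way to narrow it down is to run once with --num-workers 0 so data loading stays in the main process; errors from a custom dataset (for example a label name missing from CLASSES) then show up directly instead of stalling the worker processes.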

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services
