Cypher42 opened a new issue #15320: Error when calling asnumpy() or exporting the weight of darknet53 while training URL: https://github.com/apache/incubator-mxnet/issues/15320 I derived a training script for YoloNet from the training script provided by GluonCV. After each batch of validation data is queued I request the label information using `label.asnumpy()`, and do the same for the model prediction. The system throws an MXNetError at the first (and only the first) iteration of the validation loop. The error can't be reproduced when running in debug mode. 21-Jun-2019 19:08:48 | [ERROR] ERROR - training_and_evaluation.train_test - [19:04:30] src/operator/tensor/./matrix_op-inl.h:670: Check failed: b < len (4176 vs. 3549) slicing with begin[1]=4176 exceends limit of 3549 -- | -- 21-Jun-2019 19:08:48 | [ERROR] 21-Jun-2019 19:08:48 | [ERROR] Stack trace returned 10 entries: 21-Jun-2019 19:08:48 | [ERROR] [bt] (0) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40b29a) [0x7fe337d9d29a] 21-Jun-2019 19:08:48 | [ERROR] [bt] (1) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40b8b1) [0x7fe337d9d8b1] 21-Jun-2019 19:08:48 | [ERROR] [bt] (2) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29a351d) [0x7fe33a33551d] 21-Jun-2019 19:08:48 | [ERROR] [bt] (3) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29f68cd) [0x7fe33a3888cd] 21-Jun-2019 19:08:48 | [ERROR] [bt] (4) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x2e8) [0x7fe33a700fc8] 21-Jun-2019 19:08:48 | [ERROR] [bt] (5) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cc07c9) [0x7fe33a6527c9] 21-Jun-2019 19:08:48 | [ERROR] [bt] (6) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cca104) [0x7fe33a65c104] 21-Jun-2019 19:08:48 | [ERROR] [bt] (7) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cce292) [0x7fe33a660292] 21-Jun-2019 19:08:48 | [ERROR] [bt] (8) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cca834) [0x7fe33a65c834] 21-Jun-2019 19:08:48 | [ERROR] [bt] (9) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/bin/../lib/libstdc++.so.6(+0xc8421) [0x7fe3ee63c421] 21-Jun-2019 19:08:48 | [ERROR] 21-Jun-2019 19:08:48 | [ERROR] 
21-Jun-2019 19:08:48 | [ERROR] ERROR - training_and_evaluation.train_test - Error in mxbase while transforming validation labels The error occurs regardless of the machine it is run on; it is also independent of the device(s) the tensors are stored on. Saving the network also crashes with a similar error: 21-Jun-2019 19:16:26 | [ERROR] INFO - /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/src/main/python/scripts/train_yolonet.py - Saving network -- | -- 21-Jun-2019 19:16:26 | [ERROR] ERROR - /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/src/main/python/scripts/train_yolonet.py - [19:08:10] src/operator/tensor/./matrix_op-inl.h:670: Check failed: b < len (10338 vs. 3549) slicing with begin[1]=10338 exceends limit of 3549 21-Jun-2019 19:16:26 | [ERROR] 21-Jun-2019 19:16:26 | [ERROR] Stack trace returned 10 entries: 21-Jun-2019 19:16:26 | [ERROR] [bt] (0) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40b29a) [0x7fe337d9d29a] 21-Jun-2019 19:16:26 | [ERROR] [bt] (1) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40b8b1) [0x7fe337d9d8b1] 21-Jun-2019 19:16:26 | [ERROR] [bt] (2) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29a351d) [0x7fe33a33551d] 21-Jun-2019 19:16:26 | [ERROR] [bt] (3) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29f68cd) [0x7fe33a3888cd] 21-Jun-2019 19:16:26 | [ERROR] [bt] (4) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, 
mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x2e8) [0x7fe33a700fc8] 21-Jun-2019 19:16:26 | [ERROR] [bt] (5) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cc07c9) [0x7fe33a6527c9] 21-Jun-2019 19:16:26 | [ERROR] [bt] (6) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cca104) [0x7fe33a65c104] 21-Jun-2019 19:16:26 | [ERROR] [bt] (7) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cce292) [0x7fe33a660292] 21-Jun-2019 19:16:26 | [ERROR] [bt] (8) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cca834) [0x7fe33a65c834] 21-Jun-2019 19:16:26 | [ERROR] [bt] (9) /opt/bambooagent0/bamboo-agent-home/xml-data/build-dir/CLIP-SYD0-JOB1/target/pymaven/envs/builder/bin/../lib/libstdc++.so.6(+0xc8421) [0x7fe3ee63c421] 21-Jun-2019 19:16:26 | [ERROR] Here is the code of the 
training and validation loop: `import pytest from tempfile import mkdtemp from os.path import join, exists from shutil import rmtree from model_zoo.yolonet import yolo_gen1 from gluoncv.data.transforms.presets import yolo as yoloaug from gluoncv import utils from mxnet import autograd from mxnet.gluon import Trainer import mxnet as mx import gluoncv as gcv from gluoncv.data import VOCDetection import mxnet as mx from mxnet import gluon import numpy as np from gluoncv.model_zoo.yolo.yolo3 import YOLOV3 from types import FunctionType from typing import List from gluoncv.utils import LRScheduler, LRSequential import time import logging import os import sys from typing import Tuple logging.basicConfig() logging.basicConfig(stream=sys.stdout, level=logging.INFO) LOG = logging.getLogger(__name__) def get_bbox_prediction_and_cast_to_numpy(boxes: mx.ndarray, confidence: mx.ndarray, class_ids: mx.ndarray, classes: List[int] = [0]) -> Tuple[np.array, np.array, np.array]: """ clean bouding box prediction. 
The input must be sorted according to the confidence :param boxes: predicted boxed :param confidence: confidences for each box :param class_ids: score for each box :param classes: class indices :return: """ pred_boxes = boxes.asnumpy() confidence_score = np.squeeze(confidence.asnumpy()) classification_result = np.squeeze(class_ids.asnumpy()) pred_boxes = pred_boxes[confidence_score >= 0] class_ids = classification_result[confidence_score >= 0] confidence_score = confidence_score[confidence_score >= 0] class_sorted_boxes = [] for class_id in classes: if not class_id in class_ids: class_specific_boxes = np.array([]) else: class_specific_boxes = pred_boxes[class_ids == class_id] boxes_with_confidence_mask = np.zeros((len(class_specific_boxes), 5)) boxes_with_confidence_mask[:,:4] = class_specific_boxes boxes_with_confidence_mask[:,-1] = confidence_score[class_ids == class_id] class_specific_boxes = boxes_with_confidence_mask class_sorted_boxes.append(class_specific_boxes) return np.asarray(class_sorted_boxes), classes def cast_label_to_numpy(label: mx.ndarray, classes: List[int]=[0]) -> np.array: """ Cast labels from the generator into aimmetrics processable shape :param label: labels from the generator :return: labels processable by aimmetrics object detection metrics """ np_label = label[0].asnumpy() class_ids = np_label[:,-1].squeeze() out_labels = [] for cls in range(np.max(classes)): if len(class_ids[class_ids == cls]) == 0: out_labels.append(np.array([])) else: specific_cls_labels = np_label[class_ids == cls] out_labels.append(specific_cls_labels) return out_labels def validate(val_data_loader: mx.gluon.data.DataLoader, net: YOLOV3, ctx: List, val_metrics: List[FunctionType], postprocessing: FunctionType = get_bbox_prediction_and_cast_to_numpy, label_postprocessing: FunctionType = cast_label_to_numpy): net.set_nms(nms_thresh=0.6, nms_topk=400, post_nms=10) mx.nd.waitall() net.hybridize() all_pred_boxes = [] all_pred_ids = [] all_gt_boxes = [] for i, (data, 
label) in enumerate(val_data_loader): LOG.info('Evaluating: {} of {} batches'.format(i, len(val_data_loader))) data = gluon.utils.split_and_load(data, ctx_list=ctx, batch_axis=0, even_split=False) #label = gluon.utils.split_and_load(label, ctx_list=ctx, batch_axis=0, even_split=False) try: for x in data: # get prediction pred_ids, pred_scores, pred_bboxes = net(x) # get ground truth mx.nd.waitall() pred_bboxes, pred_ids = postprocessing(pred_bboxes[0], pred_scores[0], pred_ids[0]) all_pred_boxes.append(pred_bboxes) all_pred_ids.append(pred_ids) gt_boxes = label_postprocessing(label) except mx.base.MXNetError as e: LOG.error(e) LOG.error('Error in mxbase while transforming validation labels') continue all_gt_boxes.append(gt_boxes) result = [] for metric in val_metrics: try: tmp = metric(all_pred_boxes, all_gt_boxes) except ValueError: tmp = 1.0 result.append(tmp) return result def train(train_data_loader: mx.gluon.data.DataLoader, val_data_loader: mx.gluon.data.DataLoader, net: YOLOV3, metrics: List[FunctionType], metrics_names : List[str], epochs: int, check_point_intervall: int, ctx: List, lr_decay_period: int, warmup_epochs: int, batch_size: int, num_samples: int, lr_mode: str, lr_decay: float, lr: float, wd: float = 0.0005, momentum: float = 0.9, val_func: FunctionType = validate, sacred_logging: FunctionType = lambda metric, name: LOG.info(f'{name}:\t{metric}'), articact_logging: FunctionType = lambda net, epoch: LOG.warning('No artifact logging function set')): LOG.info('Deploying on device {}'.format(ctx)) net.collect_params().reset_ctx(ctx) # setting up when to decay the lr if lr_decay_period > 0: LOG.info('Enabling learning rate decay with period {}'.format(lr_decay_period)) lr_decay_epoch = list(range(lr_decay_period, epochs, lr_decay_period)) else: LOG.info('Disabling learning rate decay, since value {} is an invalid length of period'.format(lr_decay_period)) lr_decay_epoch = epochs+1 #ToDo: valid?? 
lr_decay_epoch = [e - warmup_epochs for e in lr_decay_epoch] num_batches = num_samples // batch_size LOG.info('Training will last for {} epochs with {} batches each'.format(epochs, num_batches)) # setting up the lr scheduler lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=lr, nepochs=warmup_epochs, iters_per_epoch=num_batches), LRScheduler(lr_mode, base_lr=lr, nepochs=epochs - warmup_epochs, iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=lr_decay, power=2), ]) LOG.info('Setting up learning rate scheduler complete. Training for {} warmup and {} normal epochs'.format(warmup_epochs, epochs)) LOG.info('Learning rate scheduling is set to {}'.format(lr_mode)) # setting up the trainer trainer = gluon.Trainer( net.collect_params(), 'sgd', {'wd': wd, 'momentum': momentum, 'lr_scheduler': lr_scheduler}, kvstore='local') LOG.info('Trainer initialized') LOG.info('Start training with {} Epochs'.format(epochs)) #best_map = [0] for epoch in range(epochs): LOG.info(f'Start training epoch {epoch} / {epochs}') mx.nd.waitall() net.hybridize() for i, batch in enumerate(train_data_loader): LOG.info(f'{i} / {num_batches}') if num_batches <= i: break batch_size = batch[0].shape[0] data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) # objectness, center_targets, scale_targets, weights, class_targets fixed_targets = [gluon.utils.split_and_load(batch[i], ctx_list=ctx, batch_axis=0) for i in range(1, 6)] gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0) sum_losses = [] obj_losses = [] center_losses = [] scale_losses = [] cls_losses = [] with autograd.record(): for ix, x in enumerate(data): obj_loss, center_loss, scale_loss, cls_loss = net(x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets]) sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss) obj_losses.append(obj_loss) center_losses.append(center_loss) scale_losses.append(scale_loss) cls_losses.append(cls_loss) #mx.nd.waitall() 
autograd.backward(sum_losses) trainer.step(batch_size) LOG.info('Epoch completed, start validating') result = val_func(val_data_loader=val_data_loader, net=net, ctx=ctx, val_metrics=metrics) for metric, name in zip(result, metrics_names): sacred_logging(metric, name) articact_logging(net, epoch) `
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
