roywei commented on a change in pull request #14629: [MXNET-1333] Estimator and Fit API URL: https://github.com/apache/incubator-mxnet/pull/14629#discussion_r280602388
########## File path: python/mxnet/gluon/contrib/estimator/estimator.py ########## @@ -0,0 +1,376 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import, unused-variable +"""Gluon Estimator""" + +import copy +import warnings +import weakref + +from .event_handler import MetricHandler, ValidationHandler, LoggingHandler +from .event_handler import TrainBegin, EpochBegin, BatchBegin, BatchEnd, EpochEnd, TrainEnd +from .... 
import gluon, autograd +from ....context import Context, cpu, gpu, num_gpus +from ....metric import EvalMetric, Loss, Accuracy + +__all__ = ['Estimator'] + + +class Estimator(object): + """Estimator Class for easy model training + + :py:class:`Estimator` can be used to facilitate the training & validation process + + + Parameters + ---------- + loss : gluon.loss.Loss or list of gluon.loss.Loss + Loss(objective functions) to calculate during training + metrics : EvalMetric or list of EvalMetric + Metrics for evaluating models + initializer : Initializer + initializer to initialize the network + trainer : Trainer + Trainer to apply optimizer on network parameters + context : Context or list of Context + device(s) to run the training on + """ + + def __init__(self, net, + loss, + metrics=None, + initializer=None, + trainer=None, + context=None): + + self.net = net + self.loss = self._check_loss(loss) + self.train_metrics = self._check_metrics(metrics) + + self.context = self._check_context(context) + self._initialize(initializer) + self.trainer = self._check_trainer(trainer) + + def _check_loss(self, loss): + if isinstance(loss, gluon.loss.Loss): + loss = [loss] + elif isinstance(loss, list) or all([isinstance(l, gluon.loss.Loss) for l in loss]): + loss = loss + else: + raise ValueError("loss must be a Loss or a list of Loss, " + "refer to gluon.loss.Loss:{}".format(loss)) + return loss + + def _check_metrics(self, metrics): + if isinstance(metrics, EvalMetric): + metrics = [metrics] + else: + metrics = metrics or [] + if not all([isinstance(metric, EvalMetric) for metric in metrics]): + raise ValueError("metrics must be a Metric or a list of Metric, " + "refer to mxnet.metric.EvalMetric:{}".format(metrics)) + return metrics + + def _check_context(self, context): + # infer available context + gpus = num_gpus() + available_gpus = [gpu(i) for i in range(gpus)] + + if context: + # check context values, only accept Context or a list of Context + if isinstance(context, 
Context): + context = [context] + elif isinstance(context, list) and all([isinstance(c, Context) for c in context]): + context = context + else: + raise ValueError("context must be a Context or a list of Context, " + "for example mx.cpu() or [mx.gpu(0), mx.gpu(1)], " + "refer to mxnet.Context:{}".format(context)) + for ctx in context: + assert ctx in available_gpus or str(ctx).startswith('cpu'), \ + "%s is not available, please make sure " \ + "your context is in one of: mx.cpu(), %s" % \ + (ctx, ", ".join([str(ctx) for ctx in available_gpus])) + else: + # provide default context + if gpus > 0: + # only use 1 GPU by default + if gpus > 1: + warnings.warn("You have multiple GPUs, gpu(0) will be used by default." + "To utilize all your GPUs, specify context as a list of gpus, " + "e.g. context=[mx.gpu(0), mx.gpu(1)] ") + context = [gpu(0)] + else: + context = [cpu()] + return context + + def _initialize(self, initializer): + # initialize the network + if initializer: + if self._is_initialized(): + # if already initialized, re-init with user specified initializer + warnings.warn("Network already initialized, re-initializing with %s. " + "You don't need to pass initializer if you already " + "initialized your net." 
% type(initializer).__name__) + self.net.initialize(init=initializer, ctx=self.context, force_reinit=True) + else: + # initialize with user specified initializer + self.net.initialize(init=initializer, ctx=self.context, force_reinit=False) + else: + if not self._is_initialized(): + self.net.initialize(ctx=self.context) + + def _check_trainer(self, trainer): + # handle trainer + if not trainer: + warnings.warn("No trainer specified, default SGD optimizer " + "with learning rate 0.001 is used.") + trainer = gluon.Trainer(self.net.collect_params(), + 'sgd', {'learning_rate': 0.001}) + elif not isinstance(trainer, gluon.Trainer): + raise ValueError("Trainer must be a Gluon Trainer instance, refer to " + "gluon.Trainer:{}".format(trainer)) + return trainer + + def _is_initialized(self): + param_dict = self.net.collect_params() + for param in param_dict: + try: + param_dict[param].list_ctx() + except RuntimeError: + return False + return True + + def _get_data_and_label(self, batch, ctx): + data = batch[0] + label = batch[1] + data = gluon.utils.split_and_load(data, ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load(label, ctx_list=ctx, batch_axis=0) + return data, label + + def prepare_loss_and_metrics(self): + """ + Based on loss functions and training metrics in estimator + Create metric wrappers to record loss values, + Create copies of train loss/metric objects to record validation values + Returns train_metrics and val_metrics + + """ + if any(not hasattr(self, attribute) for attribute in + ['train_metrics', 'val_metrics']): + # Use default mx.metric.Accuracy() for gluon.loss.SoftmaxCrossEntropyLoss() + if not self.train_metrics and any([isinstance(l, gluon.loss.SoftmaxCrossEntropyLoss) for l in self.loss]): + self.train_metrics = [Accuracy()] + self.val_metrics = [] + for loss in self.loss: + # remove trailing numbers from loss name to avoid confusion + self.train_metrics.append(Loss(loss.name.rstrip('1234567890'))) Review comment: bydefault, 
loss.name will have a name-scope counter appended by default; I'm removing the trailing digits. ``` >>>loss=mx.gluon.loss.L2Loss() >>> loss.name 'l2loss0' ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
