Github user JeremyNixon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/11335#discussion_r54183781
  
    --- Diff: python/pyspark/ml/tuning.py ---
    @@ -288,6 +289,172 @@ def copy(self, extra=None):
             return CrossValidatorModel(self.bestModel.copy(extra))
     
     
    +class TrainValidationSplit(Estimator, HasSeed):
    +
    +    estimator = Param(Params._dummy(), "estimator", "estimator to be 
tested")
    +    estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps", 
"estimator param maps")
    +    evaluator = Param(
    +        Params._dummy(), "evaluator",
    +        "evaluator used to select hyper-parameters that maximize the 
metric")
    +    trainRatio = Param(Params._dummy(), "trainRatio", "proportion for 
train-validation ratio")
    +
    +    @keyword_only
    +    def __init__(self, estimator=None, estimatorParamMaps=None, 
evaluator=None, trainRatio=0.75,
    +                 seed=None):
    +        """
    +        __init__(self, estimator=None, estimatorParamMaps=None, 
evaluator=None, trainRatio=0.75,\
    +                 seed=None)
    +        """
    +        super(TrainValidationSplit, self).__init__()
    +        self._setDefault(trainRatio=0.75)
    +        kwargs = self.__init__._input_kwargs
    +        self._set(**kwargs)
    +
    +    @since("2.0.0")
    +    @keyword_only
    +    def setParams(self, estimator=None, estimatorParamMaps=None, 
evaluator=None, trainRatio=0.75,
    +                  seed=None):
    +        """
    +        setParams(self, estimator=None, estimatorParamMaps=None, 
evaluator=None, trainRatio=0.75,\
    +                  seed=None)
    +        Sets params for the train validation split.
    +        """
    +        kwargs = self.setParams._input_kwargs
    +        return self._set(**kwargs)
    +
    +    @since("2.0.0")
    +    def setEstimator(self, value):
    +        """
    +        Sets the value of :py:attr:`estimator`.
    +        """
    +        self._paramMap[self.estimator] = value
    +        return self
    +
    +    @since("2.0.0")
    +    def getEstimator(self):
    +        """
    +        Gets the value of estimator or its default value.
    +        """
    +        return self.getOrDefault(self.estimator)
    +
    +    @since("2.0.0")
    +    def setEstimatorParamMaps(self, value):
    +        """
    +        Sets the value of :py:attr:`estimatorParamMaps`.
    +        """
    +        self._paramMap[self.estimatorParamMaps] = value
    +        return self
    +
    +    @since("2.0.0")
    +    def getEstimatorParamMaps(self):
    +        """
    +        Gets the value of estimatorParamMaps or its default value.
    +        """
    +        return self.getOrDefault(self.estimatorParamMaps)
    +
    +    @since("2.0.0")
    +    def setEvaluator(self, value):
    +        """
    +        Sets the value of :py:attr:`evaluator`.
    +        """
    +        self._paramMap[self.evaluator] = value
    +        return self
    +
    +    @since("2.0.0")
    +    def getEvaluator(self):
    +        """
    +        Gets the value of evaluator or its default value.
    +        """
    +        return self.getOrDefault(self.evaluator)
    +
    +    @since("2.0.0")
    +    def setTrainRatio(self, value):
    +        """
    +        Sets the value of :py:attr:`trainRatio`.
    +        """
    +        self._paramMap[self.trainRatio] = value
    +        return self
    +
    +    @since("2.0.0")
    +    def getTrainRatio(self):
    +        """
    +        Gets the value of trainRatio or its default value.
    +        """
    +        return self.getOrDefault(self.trainRatio)
    +
    +    def _fit(self, dataset):
    +        est = self.getOrDefault(self.estimator)
    +        epm = self.getOrDefault(self.estimatorParamMaps)
    +        numModels = len(epm)
    +        eva = self.getOrDefault(self.evaluator)
    +        tRatio = self.getOrDefault(self.trainRatio)
    +        seed = self.getOrDefault(self.seed)
    +        randCol = self.uid + "_rand"
    +        df = dataset.select("*", rand(seed).alias(randCol))
    +        metrics = np.zeros(numModels)
    +        condition = (df[randCol] >= tRatio)
    +        validation = df.filter(condition)
    +        train = df.filter(~condition)
    +        for j in range(numModels):
    +            model = est.fit(train, epm[j])
    +            metric = eva.evaluate(model.transform(validation, epm[j]))
    +            metrics[j] += metric
    +        if eva.isLargerBetter():
    +            bestIndex = np.argmax(metrics)
    +        else:
    +            bestIndex = np.argmin(metrics)
    +        bestModel = est.fit(dataset, epm[bestIndex])
    +        return TrainValidationSplitModel(bestModel)
    +
    +    @since("2.0.0")
    +    def copy(self, extra=None):
    +        """
    +        Creates a copy of this instance with a randomly generated uid
    +        and some extra params. This copy creates a deep copy of
    +        the embedded paramMap, and copies the embedded and extra 
parameters over.
    +
    +        :param extra: Extra parameters to copy to the new instance
    +        :return: Copy of this instance
    +        """
    +        if extra is None:
    +            extra = dict()
    +        newCV = Params.copy(self, extra)
    --- End diff --
    
    Updated.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to