[GitHub] spark pull request: [SPARK-4435] [MLlib] [PySpark] improve classif...

mengxr Mon, 17 Nov 2014 12:26:50 -0800

Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/3305#discussion_r20462384
  
    --- Diff: python/pyspark/mllib/classification.py ---
    @@ -29,47 +30,96 @@
                'SVMWithSGD', 'NaiveBayesModel', 'NaiveBayes']
     
     
    -class LogisticRegressionModel(LinearModel):
    +class ClassificationModel(LinearModel):
    +    """
    +    :: Experimental ::
    +
    +    Represents a classification model that predicts to which of a set of 
categories an example
    +    belongs. The categories are represented by double values: 0.0, 1.0, 
2.0, etc.
    +    """
    +    def __init__(self, weights, intercept):
    +        super(ClassificationModel, self).__init__(weights, intercept)
    +        self._threshold = 0.5
    +
    +    def setThreshold(self, value):
    +        """
    +        :: Experimental ::
    +
    +        Sets the threshold that separates positive predictions from 
negative predictions. An example
    +        with prediction score greater than or equal to this threshold is 
identified as a positive,
    +        and negative otherwise. The default value is 0.5.
    +        """
    +        self._threshold = value
    +
    +    def clearThreshold(self):
    +        """
    +        :: Experimental ::
    +
    +        Clears the threshold so that `predict` will output raw prediction 
scores.
    +        """
    +        self._threshold = None
    +
    +    def predict(self, test):
    +        """
    +        Predict values for a single data point or an RDD of points using 
the model trained.
    +        """
    +        raise NotImplementedError
    +
    +
    +class LogisticRegressionModel(ClassificationModel):
     
         """A linear binary classification model derived from logistic 
regression.
     
         >>> data = [
    -    ...     LabeledPoint(0.0, [0.0]),
    -    ...     LabeledPoint(1.0, [1.0]),
    -    ...     LabeledPoint(1.0, [2.0]),
    -    ...     LabeledPoint(1.0, [3.0])
    +    ...     LabeledPoint(0.0, [0.0, 1.0]),
    +    ...     LabeledPoint(1.0, [1.0, 0.0]),
         ... ]
         >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
    -    >>> lrm.predict(array([1.0])) > 0
    -    True
    -    >>> lrm.predict(array([0.0])) <= 0
    -    True
    +    >>> lrm.predict([1.0, 0.0])
    +    1
    +    >>> lrm.predict([0.0, 1.0])
    +    0
    +    >>> lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
    +    [1, 0]
    +    >>> lrm.clearThreshold()
    +    >>> lrm.predict([0.0, 1.0])
    +    0.123...
    +
         >>> sparse_data = [
         ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    -    ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
    +    ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
         ... ]
         >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
    -    >>> lrm.predict(array([0.0, 1.0])) > 0
    -    True
    -    >>> lrm.predict(array([0.0, 0.0])) <= 0
    -    True
    -    >>> lrm.predict(SparseVector(2, {1: 1.0})) > 0
    -    True
    -    >>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0
    -    True
    +    >>> lrm.predict(array([0.0, 1.0]))
    +    1
    +    >>> lrm.predict(array([1.0, 0.0]))
    +    0
    +    >>> lrm.predict(SparseVector(2, {1: 1.0}))
    +    1
    +    >>> lrm.predict(SparseVector(2, {0: 1.0}))
    +    0
         """
     
         def predict(self, x):
    +        """
    +        Predict values for a single data point or an RDD of points using 
the model trained.
    +        """
    +        if isinstance(x, RDD):
    +            return x.map(lambda v: self.predict(v))
    +
             x = _convert_to_vector(x)
             margin = self.weights.dot(x) + self._intercept
             if margin > 0:
    -            prob = 1 / (1 + exp(-margin))
    +            prob = 1 / (1.0 + exp(-margin))
             else:
                 exp_margin = exp(margin)
                 prob = exp_margin / (1 + exp_margin)
    -        return 1 if prob > 0.5 else 0
    +        if self._threshold is None:
    +            return prob
    +        else:
    +            return 1 if prob >= self._threshold else 0
    --- End diff --
    
    That was a mistake. Let's update the Scala side as well.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-4435] [MLlib] [PySpark] improve classif...

Reply via email to