[GitHub] spark pull request: [SPARK-4435] [MLlib] [PySpark] improve classif...

mengxr Mon, 17 Nov 2014 15:37:12 -0800

Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/3305#discussion_r20474914
  
    --- Diff: python/pyspark/mllib/classification.py ---
    @@ -29,47 +30,98 @@
                'SVMWithSGD', 'NaiveBayesModel', 'NaiveBayes']
     
     
    -class LogisticRegressionModel(LinearModel):
    +class LinearBinaryClassificationModel(LinearModel):
    +    """
    +    Represents a linear binary classification model that predicts to which
    +    of a set of categories an example belongs. The categories are 
represented
    +    by double values: 0.0, 1.0, 2.0, etc.
    +    """
    +    def __init__(self, weights, intercept):
    +        super(LinearBinaryClassificationModel, self).__init__(weights, 
intercept)
    +        self._threshold = None
    +
    +    def setThreshold(self, value):
    +        """
    +        :: Experimental ::
    +
    +        Sets the threshold that separates positive predictions from 
negative predictions. An example
    +        with prediction score greater than or equal to this threshold is 
identified as a positive,
    +        and negative otherwise. The default value is 0.5.
    +        """
    +        self._threshold = value
    +
    +    def clearThreshold(self):
    +        """
    +        :: Experimental ::
    +
    +        Clears the threshold so that `predict` will output raw prediction 
scores.
    +        """
    +        self._threshold = None
    +
    +    def predict(self, test):
    +        """
    +        Predict values for a single data point or an RDD of points using 
the model trained.
    +        """
    +        raise NotImplementedError
    +
    +
    +class LogisticRegressionModel(LinearBinaryClassificationModel):
     
         """A linear binary classification model derived from logistic 
regression.
     
         >>> data = [
    -    ...     LabeledPoint(0.0, [0.0]),
    -    ...     LabeledPoint(1.0, [1.0]),
    -    ...     LabeledPoint(1.0, [2.0]),
    -    ...     LabeledPoint(1.0, [3.0])
    +    ...     LabeledPoint(0.0, [0.0, 1.0]),
    +    ...     LabeledPoint(1.0, [1.0, 0.0]),
         ... ]
         >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
    -    >>> lrm.predict(array([1.0])) > 0
    -    True
    -    >>> lrm.predict(array([0.0])) <= 0
    -    True
    +    >>> lrm.predict([1.0, 0.0])
    +    1
    +    >>> lrm.predict([0.0, 1.0])
    +    0
    +    >>> lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
    +    [1, 0]
    +    >>> lrm.clearThreshold()
    +    >>> lrm.predict([0.0, 1.0])
    +    0.123...
    +
         >>> sparse_data = [
         ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    -    ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
    +    ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
         ... ]
         >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
    -    >>> lrm.predict(array([0.0, 1.0])) > 0
    -    True
    -    >>> lrm.predict(array([0.0, 0.0])) <= 0
    -    True
    -    >>> lrm.predict(SparseVector(2, {1: 1.0})) > 0
    -    True
    -    >>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0
    -    True
    +    >>> lrm.predict(array([0.0, 1.0]))
    +    1
    +    >>> lrm.predict(array([1.0, 0.0]))
    +    0
    +    >>> lrm.predict(SparseVector(2, {1: 1.0}))
    +    1
    +    >>> lrm.predict(SparseVector(2, {0: 1.0}))
    +    0
         """
    +    def __init__(self, weights, intercept):
    +        super(LogisticRegressionModel, self).__init__(weights, intercept)
    +        self._threshold = 0.5
     
         def predict(self, x):
    +        """
    +        Predict values for a single data point or an RDD of points using 
the model trained.
    +        """
    +        if isinstance(x, RDD):
    +            return x.map(lambda v: self.predict(v))
    +
             x = _convert_to_vector(x)
             margin = self.weights.dot(x) + self._intercept
             if margin > 0:
    -            prob = 1 / (1 + exp(-margin))
    +            prob = 1 / (1.0 + exp(-margin))
    --- End diff --
    
    Either `1` or `1.0` is fine here; let's just try to be consistent. In this 
case, let's keep the original `1`.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-4435] [MLlib] [PySpark] improve classif...

Reply via email to