Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/3305#discussion_r20474914
--- Diff: python/pyspark/mllib/classification.py ---
@@ -29,47 +30,98 @@
'SVMWithSGD', 'NaiveBayesModel', 'NaiveBayes']
-class LogisticRegressionModel(LinearModel):
+class LinearBinaryClassificationModel(LinearModel):
+ """
+ Represents a linear binary classification model that predicts to which
+ of a set of categories an example belongs. The categories are
represented
+ by double values: 0.0, 1.0, 2.0, etc.
+ """
+ def __init__(self, weights, intercept):
+ super(LinearBinaryClassificationModel, self).__init__(weights,
intercept)
+ self._threshold = None
+
+ def setThreshold(self, value):
+ """
+ :: Experimental ::
+
+ Sets the threshold that separates positive predictions from
negative predictions. An example
+ with prediction score greater than or equal to this threshold is
identified as positive,
+ and negative otherwise. The default value is 0.5.
+ """
+ self._threshold = value
+
+ def clearThreshold(self):
+ """
+ :: Experimental ::
+
+ Clears the threshold so that `predict` will output raw prediction
scores.
+ """
+ self._threshold = None
+
+ def predict(self, test):
+ """
+ Predict values for a single data point or an RDD of points using
the model trained.
+ """
+ raise NotImplementedError
+
+
+class LogisticRegressionModel(LinearBinaryClassificationModel):
"""A linear binary classification model derived from logistic
regression.
>>> data = [
- ... LabeledPoint(0.0, [0.0]),
- ... LabeledPoint(1.0, [1.0]),
- ... LabeledPoint(1.0, [2.0]),
- ... LabeledPoint(1.0, [3.0])
+ ... LabeledPoint(0.0, [0.0, 1.0]),
+ ... LabeledPoint(1.0, [1.0, 0.0]),
... ]
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
- >>> lrm.predict(array([1.0])) > 0
- True
- >>> lrm.predict(array([0.0])) <= 0
- True
+ >>> lrm.predict([1.0, 0.0])
+ 1
+ >>> lrm.predict([0.0, 1.0])
+ 0
+ >>> lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
+ [1, 0]
+ >>> lrm.clearThreshold()
+ >>> lrm.predict([0.0, 1.0])
+ 0.123...
+
>>> sparse_data = [
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
- ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
... ]
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
- >>> lrm.predict(array([0.0, 1.0])) > 0
- True
- >>> lrm.predict(array([0.0, 0.0])) <= 0
- True
- >>> lrm.predict(SparseVector(2, {1: 1.0})) > 0
- True
- >>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0
- True
+ >>> lrm.predict(array([0.0, 1.0]))
+ 1
+ >>> lrm.predict(array([1.0, 0.0]))
+ 0
+ >>> lrm.predict(SparseVector(2, {1: 1.0}))
+ 1
+ >>> lrm.predict(SparseVector(2, {0: 1.0}))
+ 0
"""
+ def __init__(self, weights, intercept):
+ super(LogisticRegressionModel, self).__init__(weights, intercept)
+ self._threshold = 0.5
def predict(self, x):
+ """
+ Predict values for a single data point or an RDD of points using
the model trained.
+ """
+ if isinstance(x, RDD):
+ return x.map(lambda v: self.predict(v))
+
x = _convert_to_vector(x)
margin = self.weights.dot(x) + self._intercept
if margin > 0:
- prob = 1 / (1 + exp(-margin))
+ prob = 1 / (1.0 + exp(-margin))
--- End diff --
Either `1` or `1.0` is fine here, but let's try to be consistent. In this
case, let's keep the original `1`.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]