Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/3305#discussion_r20462384
--- Diff: python/pyspark/mllib/classification.py ---
@@ -29,47 +30,96 @@
'SVMWithSGD', 'NaiveBayesModel', 'NaiveBayes']
-class LogisticRegressionModel(LinearModel):
+class ClassificationModel(LinearModel):
+ """
+ :: Experimental ::
+
+ Represents a classification model that predicts to which of a set of
categories an example
+ belongs. The categories are represented by double values: 0.0, 1.0,
2.0, etc.
+ """
+ def __init__(self, weights, intercept):
+ super(ClassificationModel, self).__init__(weights, intercept)
+ self._threshold = 0.5
+
+ def setThreshold(self, value):
+ """
+ :: Experimental ::
+
+ Sets the threshold that separates positive predictions from
negative predictions. An example
+ with prediction score greater than or equal to this threshold is
identified as a positive,
+ and negative otherwise. The default value is 0.5.
+ """
+ self._threshold = value
+
+ def clearThreshold(self):
+ """
+ :: Experimental ::
+
+ Clears the threshold so that `predict` will output raw prediction
scores.
+ """
+ self._threshold = None
+
+ def predict(self, test):
+ """
+ Predict values for a single data point or an RDD of points using
the model trained.
+ """
+ raise NotImplementedError
+
+
+class LogisticRegressionModel(ClassificationModel):
"""A linear binary classification model derived from logistic
regression.
>>> data = [
- ... LabeledPoint(0.0, [0.0]),
- ... LabeledPoint(1.0, [1.0]),
- ... LabeledPoint(1.0, [2.0]),
- ... LabeledPoint(1.0, [3.0])
+ ... LabeledPoint(0.0, [0.0, 1.0]),
+ ... LabeledPoint(1.0, [1.0, 0.0]),
... ]
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data))
- >>> lrm.predict(array([1.0])) > 0
- True
- >>> lrm.predict(array([0.0])) <= 0
- True
+ >>> lrm.predict([1.0, 0.0])
+ 1
+ >>> lrm.predict([0.0, 1.0])
+ 0
+ >>> lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
+ [1, 0]
+ >>> lrm.clearThreshold()
+ >>> lrm.predict([0.0, 1.0])
+ 0.123...
+
>>> sparse_data = [
... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
- ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
... ]
>>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data))
- >>> lrm.predict(array([0.0, 1.0])) > 0
- True
- >>> lrm.predict(array([0.0, 0.0])) <= 0
- True
- >>> lrm.predict(SparseVector(2, {1: 1.0})) > 0
- True
- >>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0
- True
+ >>> lrm.predict(array([0.0, 1.0]))
+ 1
+ >>> lrm.predict(array([1.0, 0.0]))
+ 0
+ >>> lrm.predict(SparseVector(2, {1: 1.0}))
+ 1
+ >>> lrm.predict(SparseVector(2, {0: 1.0}))
+ 0
"""
def predict(self, x):
+ """
+ Predict values for a single data point or an RDD of points using
the model trained.
+ """
+ if isinstance(x, RDD):
+ return x.map(lambda v: self.predict(v))
+
x = _convert_to_vector(x)
margin = self.weights.dot(x) + self._intercept
if margin > 0:
- prob = 1 / (1 + exp(-margin))
+ prob = 1 / (1.0 + exp(-margin))
else:
exp_margin = exp(margin)
prob = exp_margin / (1 + exp_margin)
- return 1 if prob > 0.5 else 0
+ if self._threshold is None:
+ return prob
+ else:
+ return 1 if prob >= self._threshold else 0
--- End diff --
That was a mistake. Let's update the Scala side as well.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]