Repository: spark
Updated Branches:
refs/heads/master c1b74df60 -> 435337381
[SPARK-6256] [MLlib] MLlib Python API parity check for regression
MLlib Python API parity check for Regression; the following major disparities
need to be added to the Python API:
```scala
LinearRegressionWithSGD
setValidateData
LassoWithSGD
setIntercept
setValidateData
RidgeRegressionWithSGD
setIntercept
setValidateData
```
setFeatureScaling is a private MLlib function, so it does not need to be
exposed in PySpark.
Author: Yanbo Liang <[email protected]>
Closes #4997 from yanboliang/spark-6256 and squashes the following commits:
102f498 [Yanbo Liang] fix intercept issue & add doc test
1fb7b4f [Yanbo Liang] change 'intercept' to 'addIntercept'
de5ecbc [Yanbo Liang] MLlib Python API parity check for regression
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43533738
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43533738
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43533738
Branch: refs/heads/master
Commit: 435337381f093f95248c8f0204e60c0b366edc81
Parents: c1b74df
Author: Yanbo Liang <[email protected]>
Authored: Wed Mar 25 13:38:33 2015 -0700
Committer: Joseph K. Bradley <[email protected]>
Committed: Wed Mar 25 13:38:33 2015 -0700
----------------------------------------------------------------------
.../spark/mllib/api/python/PythonMLLibAPI.scala | 16 ++++++--
python/pyspark/mllib/regression.py | 43 ++++++++++++++++----
2 files changed, 49 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/43533738/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 15ca254..e391567 100644
---
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -111,9 +111,11 @@ private[python] class PythonMLLibAPI extends Serializable {
initialWeights: Vector,
regParam: Double,
regType: String,
- intercept: Boolean): JList[Object] = {
+ intercept: Boolean,
+ validateData: Boolean): JList[Object] = {
val lrAlg = new LinearRegressionWithSGD()
lrAlg.setIntercept(intercept)
+ .setValidateData(validateData)
lrAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
@@ -135,8 +137,12 @@ private[python] class PythonMLLibAPI extends Serializable {
stepSize: Double,
regParam: Double,
miniBatchFraction: Double,
- initialWeights: Vector): JList[Object] = {
+ initialWeights: Vector,
+ intercept: Boolean,
+ validateData: Boolean): JList[Object] = {
val lassoAlg = new LassoWithSGD()
+ lassoAlg.setIntercept(intercept)
+ .setValidateData(validateData)
lassoAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
@@ -157,8 +163,12 @@ private[python] class PythonMLLibAPI extends Serializable {
stepSize: Double,
regParam: Double,
miniBatchFraction: Double,
- initialWeights: Vector): JList[Object] = {
+ initialWeights: Vector,
+ intercept: Boolean,
+ validateData: Boolean): JList[Object] = {
val ridgeAlg = new RidgeRegressionWithSGD()
+ ridgeAlg.setIntercept(intercept)
+ .setValidateData(validateData)
ridgeAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
http://git-wip-us.apache.org/repos/asf/spark/blob/43533738/python/pyspark/mllib/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/regression.py
b/python/pyspark/mllib/regression.py
index 414a0ad..209f1ee 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -140,6 +140,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data),
iterations=100, step=1.0,
+ ... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1,
regType="l2",
+ ... intercept=True, validateData=True)
+ >>> abs(lrm.predict(array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model =
sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel(
@@ -173,7 +180,8 @@ class LinearRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
- initialWeights=None, regParam=0.0, regType=None,
intercept=False):
+ initialWeights=None, regParam=0.0, regType=None, intercept=False,
+ validateData=True):
"""
Train a linear regression model on the given data.
@@ -195,15 +203,18 @@ class LinearRegressionWithSGD(object):
(default: None)
- @param intercept: Boolean parameter which indicates the use
+ :param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
are activated or not). (default: False)
+ :param validateData: Boolean parameter which indicates if the
+ algorithm should validate data before
training.
+ (default: True)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd,
int(iterations),
float(step), float(miniBatchFraction), i,
float(regParam),
- regType, bool(intercept))
+ regType, bool(intercept), bool(validateData))
return _regression_train_wrapper(train, LinearRegressionModel, data,
initialWeights)
@@ -253,6 +264,13 @@ class LassoModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100,
step=1.0,
+ ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]),
intercept=True,
+ ... validateData=True)
+ >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel(
@@ -273,11 +291,13 @@ class LassoWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
- miniBatchFraction=1.0, initialWeights=None):
+ miniBatchFraction=1.0, initialWeights=None, intercept=False,
+ validateData=True):
"""Train a Lasso regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainLassoModelWithSGD", rdd,
int(iterations), float(step),
- float(regParam), float(miniBatchFraction), i)
+ float(regParam), float(miniBatchFraction), i,
bool(intercept),
+ bool(validateData))
return _regression_train_wrapper(train, LassoModel, data,
initialWeights)
@@ -327,6 +347,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data),
iterations=100, step=1.0,
+ ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]),
intercept=True,
+ ... validateData=True)
+ >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model =
sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel(
@@ -347,11 +374,13 @@ class RidgeRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
- miniBatchFraction=1.0, initialWeights=None):
+ miniBatchFraction=1.0, initialWeights=None, intercept=False,
+ validateData=True):
"""Train a ridge regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainRidgeModelWithSGD", rdd,
int(iterations), float(step),
- float(regParam), float(miniBatchFraction), i)
+ float(regParam), float(miniBatchFraction), i,
bool(intercept),
+ bool(validateData))
return _regression_train_wrapper(train, RidgeRegressionModel, data,
initialWeights)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]