spark git commit: [SPARK-6519][ML] Add spark.ml API for bisecting k-means
Repository: spark Updated Branches: refs/heads/master 8e4f894e9 -> 9376ae723 [SPARK-6519][ML] Add spark.ml API for bisecting k-means Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Closes #9604 from yu-iskw/SPARK-6519. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9376ae72 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9376ae72 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9376ae72 Branch: refs/heads/master Commit: 9376ae723e4ec0515120c488541617a0538f8879 Parents: 8e4f894 Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Authored: Wed Jan 20 10:48:10 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jan 20 10:48:10 2016 -0800 -- .../spark/ml/clustering/BisectingKMeans.scala | 196 +++ .../ml/clustering/BisectingKMeansSuite.scala| 85 2 files changed, 281 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9376ae72/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala new file mode 100644 index 000..0b47cbb --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.clustering + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params} +import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.mllib.clustering. + {BisectingKMeans => MLlibBisectingKMeans, BisectingKMeansModel => MLlibBisectingKMeansModel} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{IntegerType, StructType} + + +/** + * Common params for BisectingKMeans and BisectingKMeansModel + */ +private[clustering] trait BisectingKMeansParams extends Params + with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol { + + /** + * Set the number of clusters to create (k). Must be > 1. Default: 2. 
+ * @group param + */ + @Since("2.0.0") + final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1) + + /** @group getParam */ + @Since("2.0.0") + def getK: Int = $(k) + + /** @group expertParam */ + @Since("2.0.0") + final val minDivisibleClusterSize = new Param[Double]( +this, +"minDivisibleClusterSize", +"the minimum number of points (if >= 1.0) or the minimum proportion", +(value: Double) => value > 0) + + /** @group expertGetParam */ + @Since("2.0.0") + def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize) + + /** + * Validates and transforms the input schema. + * @param schema input schema + * @return output schema + */ + protected def validateAndTransformSchema(schema: StructType): StructType = { +SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) +SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType) + } +} + +/** + * :: Experimental :: + * Model fitted by BisectingKMeans. + * + * @param parentModel a model trained by spark.mllib.clustering.BisectingKMeans. + */ +@Since("2.0.0") +@Experimental +class BisectingKMeansModel private[ml] ( +@Since("2.0.0") override val uid: String, +private val parentModel: MLlibBisectingKMeansModel + ) extends Model[BisectingKMeansModel] with BisectingKMeansParams { + + @Since("2.0.0") + override def copy(extra: ParamMap): BisectingKMe
spark git commit: [SPARK-12230][ML] WeightedLeastSquares.fit() should handle division by zero properly if standard deviation of target variable is zero.
Repository: spark Updated Branches: refs/heads/master 9bb35c5b5 -> 9753835cf [SPARK-12230][ML] WeightedLeastSquares.fit() should handle division by zero properly if standard deviation of target variable is zero. This fixes the behavior of WeightedLeastSquares.fit() when the standard deviation of the target variable is zero. If the fitIntercept is true, there is no need to train. Author: Imran Younus <iyou...@us.ibm.com> Closes #10274 from iyounus/SPARK-12230_bug_fix_in_weighted_least_squares. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9753835c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9753835c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9753835c Branch: refs/heads/master Commit: 9753835cf3acc135e61bf668223046e29306c80d Parents: 9bb35c5 Author: Imran Younus <iyou...@us.ibm.com> Authored: Wed Jan 20 11:16:59 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jan 20 11:16:59 2016 -0800 -- .../spark/ml/optim/WeightedLeastSquares.scala | 21 +- .../ml/optim/WeightedLeastSquaresSuite.scala| 69 ++-- 2 files changed, 83 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9753835c/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 8617722..797870e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -86,6 +86,24 @@ private[ml] class WeightedLeastSquares( val aaBar = summary.aaBar val aaValues = aaBar.values +if (bStd == 0) { + if (fitIntercept) { +logWarning(s"The standard deviation of the label is zero, so the coefficients will be " + + s"zeros and the intercept will be the mean of the label; as a result, " + + 
s"training is not needed.") +val coefficients = new DenseVector(Array.ofDim(k-1)) +val intercept = bBar +val diagInvAtWA = new DenseVector(Array(0D)) +return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) + } else { +require(!(regParam > 0.0 && standardizeLabel), + "The standard deviation of the label is zero. " + +"Model cannot be regularized with standardization=true") +logWarning(s"The standard deviation of the label is zero. " + + "Consider setting fitIntercept=true.") + } +} + // add regularization to diagonals var i = 0 var j = 2 @@ -94,8 +112,7 @@ private[ml] class WeightedLeastSquares( if (standardizeFeatures) { lambda *= aVar(j - 2) } - if (standardizeLabel) { -// TODO: handle the case when bStd = 0 + if (standardizeLabel && bStd != 0) { lambda /= bStd } aaValues(i) += lambda http://git-wip-us.apache.org/repos/asf/spark/blob/9753835c/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index b542ba3..0b58a98 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.rdd.RDD class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext { private var instances: RDD[Instance] = _ + private var instancesConstLabel: RDD[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() @@ -43,6 +44,20 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(29.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2) + +/* + R code: + + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) + b.const <- c(17, 17, 17, 17) + w <- c(1, 2, 3, 4) + */ +instancesConstLabel = sc.parallelize(Seq( + Instance(17.0, 1.0, 
Vectors.dense(0.0, 5.0).toSparse), + Instance(17.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(17.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(17.0, 4.0, Vectors.dense(3.0, 13.0)) +), 2) } test("WLS against lm&
spark git commit: [SPARK-11295][PYSPARK] Add packages to JUnit output for Python tests
Repository: spark Updated Branches: refs/heads/master 9376ae723 -> 9bb35c5b5 [SPARK-11295][PYSPARK] Add packages to JUnit output for Python tests This is #9263 from gliptak (improving grouping/display of test case results) with a small fix of bisecting k-means unit test. Author: Gábor Lipták <glip...@gmail.com> Author: Xiangrui Meng <m...@databricks.com> Closes #10850 from mengxr/SPARK-11295. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bb35c5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bb35c5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bb35c5b Branch: refs/heads/master Commit: 9bb35c5b59e58dbebbdc6856d611bff73dd35a91 Parents: 9376ae7 Author: Gábor Lipták <glip...@gmail.com> Authored: Wed Jan 20 11:11:10 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jan 20 11:11:10 2016 -0800 -- python/pyspark/ml/tests.py| 1 + python/pyspark/mllib/tests.py | 26 +++--- python/pyspark/sql/tests.py | 1 + python/pyspark/streaming/tests.py | 1 + python/pyspark/tests.py | 1 + 5 files changed, 19 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9bb35c5b/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 4eb17bf..9ea639d 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -394,6 +394,7 @@ class CrossValidatorTests(PySparkTestCase): if __name__ == "__main__": +from pyspark.ml.tests import * if xmlrunner: unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports')) else: http://git-wip-us.apache.org/repos/asf/spark/blob/9bb35c5b/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 32ed48e..79ce495 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -77,21 +77,24 @@ except: pass ser = PickleSerializer() -sc = SparkContext('local[4]', "MLlib 
tests") class MLlibTestCase(unittest.TestCase): def setUp(self): -self.sc = sc +self.sc = SparkContext('local[4]', "MLlib tests") + +def tearDown(self): +self.sc.stop() class MLLibStreamingTestCase(unittest.TestCase): def setUp(self): -self.sc = sc +self.sc = SparkContext('local[4]', "MLlib tests") self.ssc = StreamingContext(self.sc, 1.0) def tearDown(self): self.ssc.stop(False) +self.sc.stop() @staticmethod def _eventually(condition, timeout=30.0, catch_assertions=False): @@ -423,7 +426,7 @@ class ListTests(MLlibTestCase): from pyspark.mllib.clustering import BisectingKMeans data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2) bskm = BisectingKMeans() -model = bskm.train(sc.parallelize(data, 2), k=4) +model = bskm.train(self.sc.parallelize(data, 2), k=4) p = array([0.0, 0.0]) rdd_p = self.sc.parallelize([p]) self.assertEqual(model.predict(p), model.predict(rdd_p).first()) @@ -1166,7 +1169,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase): clusterWeights=[1.0, 1.0, 1.0, 1.0]) predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]] -predict_data = [sc.parallelize(batch, 1) for batch in predict_data] +predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data] predict_stream = self.ssc.queueStream(predict_data) predict_val = stkm.predictOn(predict_stream) @@ -1197,7 +1200,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase): # classification based in the initial model would have been 0 # proving that the model is updated. 
batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]] -batches = [sc.parallelize(batch) for batch in batches] +batches = [self.sc.parallelize(batch) for batch in batches] input_stream = self.ssc.queueStream(batches) predict_results = [] @@ -1230,7 +1233,7 @@ class LinearDataGeneratorTests(MLlibTestCase): self.assertEqual(len(point.features), 3) linear_data = LinearDataGenerator.generateLinearRDD( -sc=sc, nexamples=6, nfeatures=2, eps=0.1, +sc=self.sc, nexamples=6, nfeatures=2, eps=0.1, nParts=2, intercept=0.0).collect() self.assertEqual(len(linear_data), 6) for point in linear_data: @@ -1406,7 +1409,7 @@ class StreamingLinearRegressionWithTests(MLLib
spark git commit: Revert "[SPARK-11295] Add packages to JUnit output for Python tests"
Repository: spark Updated Branches: refs/heads/master 3ac648289 -> beda90142 Revert "[SPARK-11295] Add packages to JUnit output for Python tests" This reverts commit c6f971b4aeca7265ab374fa46c5c452461d9b6a7. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/beda9014 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/beda9014 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/beda9014 Branch: refs/heads/master Commit: beda9014220be77dd735e6af1903e7d93dceb110 Parents: 3ac6482 Author: Xiangrui Meng <m...@databricks.com> Authored: Tue Jan 19 16:51:17 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Jan 19 16:51:17 2016 -0800 -- python/pyspark/ml/tests.py| 1 - python/pyspark/mllib/tests.py | 24 ++-- python/pyspark/sql/tests.py | 1 - python/pyspark/streaming/tests.py | 1 - python/pyspark/tests.py | 1 - 5 files changed, 10 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/beda9014/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 9ea639d..4eb17bf 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -394,7 +394,6 @@ class CrossValidatorTests(PySparkTestCase): if __name__ == "__main__": -from pyspark.ml.tests import * if xmlrunner: unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports')) else: http://git-wip-us.apache.org/repos/asf/spark/blob/beda9014/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index ea7d297..32ed48e 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -77,24 +77,21 @@ except: pass ser = PickleSerializer() +sc = SparkContext('local[4]', "MLlib tests") class MLlibTestCase(unittest.TestCase): def setUp(self): -self.sc = SparkContext('local[4]', "MLlib tests") - -def tearDown(self): -self.sc.stop() +self.sc = sc class 
MLLibStreamingTestCase(unittest.TestCase): def setUp(self): -self.sc = SparkContext('local[4]', "MLlib tests") +self.sc = sc self.ssc = StreamingContext(self.sc, 1.0) def tearDown(self): self.ssc.stop(False) -self.sc.stop() @staticmethod def _eventually(condition, timeout=30.0, catch_assertions=False): @@ -1169,7 +1166,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase): clusterWeights=[1.0, 1.0, 1.0, 1.0]) predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]] -predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data] +predict_data = [sc.parallelize(batch, 1) for batch in predict_data] predict_stream = self.ssc.queueStream(predict_data) predict_val = stkm.predictOn(predict_stream) @@ -1200,7 +1197,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase): # classification based in the initial model would have been 0 # proving that the model is updated. batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]] -batches = [self.sc.parallelize(batch) for batch in batches] +batches = [sc.parallelize(batch) for batch in batches] input_stream = self.ssc.queueStream(batches) predict_results = [] @@ -1233,7 +1230,7 @@ class LinearDataGeneratorTests(MLlibTestCase): self.assertEqual(len(point.features), 3) linear_data = LinearDataGenerator.generateLinearRDD( -sc=self.sc, nexamples=6, nfeatures=2, eps=0.1, +sc=sc, nexamples=6, nfeatures=2, eps=0.1, nParts=2, intercept=0.0).collect() self.assertEqual(len(linear_data), 6) for point in linear_data: @@ -1409,7 +1406,7 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase): for i in range(10): batch = LinearDataGenerator.generateLinearInput( 0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1) -batches.append(self.sc.parallelize(batch)) +batches.append(sc.parallelize(batch)) input_stream = self.ssc.queueStream(batches) slr.trainOn(input_stream) @@ -1433,7 +1430,7 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase): for i in range(10): batch = 
LinearDataGenerator.generateLinearInput( 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1) -batches.append(self.sc.par
spark git commit: [SPARK-12346][ML] Missing attribute names in GLM for vector-type features
Repository: spark Updated Branches: refs/heads/branch-1.6 53184ce77 -> 8c2b67f55 [SPARK-12346][ML] Missing attribute names in GLM for vector-type features Currently `summary()` fails on a GLM model fitted over a vector feature missing ML attrs, since the output feature attrs will also have no name. We can avoid this situation by forcing `VectorAssembler` to make up suitable names when inputs are missing names. cc mengxr Author: Eric Liang <e...@databricks.com> Closes #10323 from ericl/spark-12346. (cherry picked from commit 5e492e9d5bc0992cbcffe64a9aaf3b334b173d2c) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c2b67f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c2b67f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c2b67f5 Branch: refs/heads/branch-1.6 Commit: 8c2b67f55416562a0f1fafeefb073f79701c9cc9 Parents: 53184ce Author: Eric Liang <e...@databricks.com> Authored: Mon Jan 18 12:50:58 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jan 18 12:51:06 2016 -0800 -- .../spark/ml/feature/VectorAssembler.scala | 6 ++-- .../apache/spark/ml/feature/RFormulaSuite.scala | 38 .../spark/ml/feature/VectorAssemblerSuite.scala | 4 +-- 3 files changed, 43 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c2b67f5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 801096f..ec7ead5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -70,19 +70,19 @@ class VectorAssembler(override val uid: String) val group = AttributeGroup.fromStructField(field) if (group.attributes.isDefined) { // If 
attributes are defined, copy them with updated names. -group.attributes.get.map { attr => +group.attributes.get.zipWithIndex.map { case (attr, i) => if (attr.name.isDefined) { // TODO: Define a rigorous naming scheme. attr.withName(c + "_" + attr.name.get) } else { -attr +attr.withName(c + "_" + i) } } } else { // Otherwise, treat all attributes as numeric. If we cannot get the number of attributes // from metadata, check the first row. val numAttrs = group.numAttributes.getOrElse(first.getAs[Vector](index).size) -Array.fill(numAttrs)(NumericAttribute.defaultAttr) +Array.tabulate(numAttrs)(i => NumericAttribute.defaultAttr.withName(c + "_" + i)) } case otherType => throw new SparkException(s"VectorAssembler does not support the $otherType type") http://git-wip-us.apache.org/repos/asf/spark/blob/8c2b67f5/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index dc20a5e..16e565d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -143,6 +143,44 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext { assert(attrs === expectedAttrs) } + test("vector attribute generation") { +val formula = new RFormula().setFormula("id ~ vec") +val original = sqlContext.createDataFrame( + Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0))) +).toDF("id", "vec") +val model = formula.fit(original) +val result = model.transform(original) +val attrs = AttributeGroup.fromStructField(result.schema("features")) +val expectedAttrs = new AttributeGroup( + "features", + Array[Attribute]( +new NumericAttribute(Some("vec_0"), Some(1)), +new NumericAttribute(Some("vec_1"), Some(2 +assert(attrs === expectedAttrs) + } + + test("vector attribute generation with unnamed input attrs") { +val formula 
= new RFormula().setFormula("id ~ vec2") +val base = sqlContext.createDataFrame( + Seq((1, Vectors.dense(0.0,
spark git commit: [SPARK-12346][ML] Missing attribute names in GLM for vector-type features
Repository: spark Updated Branches: refs/heads/master 44fcf992a -> 5e492e9d5 [SPARK-12346][ML] Missing attribute names in GLM for vector-type features Currently `summary()` fails on a GLM model fitted over a vector feature missing ML attrs, since the output feature attrs will also have no name. We can avoid this situation by forcing `VectorAssembler` to make up suitable names when inputs are missing names. cc mengxr Author: Eric Liang <e...@databricks.com> Closes #10323 from ericl/spark-12346. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e492e9d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e492e9d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e492e9d Branch: refs/heads/master Commit: 5e492e9d5bc0992cbcffe64a9aaf3b334b173d2c Parents: 44fcf99 Author: Eric Liang <e...@databricks.com> Authored: Mon Jan 18 12:50:58 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jan 18 12:50:58 2016 -0800 -- .../spark/ml/feature/VectorAssembler.scala | 6 ++-- .../apache/spark/ml/feature/RFormulaSuite.scala | 38 .../spark/ml/feature/VectorAssemblerSuite.scala | 4 +-- 3 files changed, 43 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e492e9d/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 716bc63..7ff5ad1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -70,19 +70,19 @@ class VectorAssembler(override val uid: String) val group = AttributeGroup.fromStructField(field) if (group.attributes.isDefined) { // If attributes are defined, copy them with updated names. 
-group.attributes.get.map { attr => +group.attributes.get.zipWithIndex.map { case (attr, i) => if (attr.name.isDefined) { // TODO: Define a rigorous naming scheme. attr.withName(c + "_" + attr.name.get) } else { -attr +attr.withName(c + "_" + i) } } } else { // Otherwise, treat all attributes as numeric. If we cannot get the number of attributes // from metadata, check the first row. val numAttrs = group.numAttributes.getOrElse(first.getAs[Vector](index).size) -Array.fill(numAttrs)(NumericAttribute.defaultAttr) +Array.tabulate(numAttrs)(i => NumericAttribute.defaultAttr.withName(c + "_" + i)) } case otherType => throw new SparkException(s"VectorAssembler does not support the $otherType type") http://git-wip-us.apache.org/repos/asf/spark/blob/5e492e9d/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index dc20a5e..16e565d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -143,6 +143,44 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext { assert(attrs === expectedAttrs) } + test("vector attribute generation") { +val formula = new RFormula().setFormula("id ~ vec") +val original = sqlContext.createDataFrame( + Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0))) +).toDF("id", "vec") +val model = formula.fit(original) +val result = model.transform(original) +val attrs = AttributeGroup.fromStructField(result.schema("features")) +val expectedAttrs = new AttributeGroup( + "features", + Array[Attribute]( +new NumericAttribute(Some("vec_0"), Some(1)), +new NumericAttribute(Some("vec_1"), Some(2 +assert(attrs === expectedAttrs) + } + + test("vector attribute generation with unnamed input attrs") { +val formula = new RFormula().setFormula("id ~ vec2") +val base = 
sqlContext.createDataFrame( + Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0))) +).toDF("id", "vec") +val metadata = new AttributeGroup( + &q
svn commit: r1723237 - in /spark: mllib/index.md site/mllib/index.html site/news/index.html
Author: meng Date: Wed Jan 6 06:27:47 2016 New Revision: 1723237 URL: http://svn.apache.org/viewvc?rev=1723237&view=rev Log: list bisecting k-means and AFT regression on mllib page Modified: spark/mllib/index.md spark/site/mllib/index.html spark/site/news/index.html Modified: spark/mllib/index.md URL: http://svn.apache.org/viewvc/spark/mllib/index.md?rev=1723237&r1=1723236&r2=1723237&view=diff == --- spark/mllib/index.md (original) +++ spark/mllib/index.md Wed Jan 6 06:27:47 2016 @@ -90,8 +90,9 @@ subproject: MLlib classification and regression tree random forest and gradient-boosted trees recommendation via alternating least squares (ALS) - clustering via k-means, Gaussian mixtures (GMM), and power iteration clustering + clustering via k-means, bisecting k-means, Gaussian mixtures (GMM), and power iteration clustering topic modeling via latent Dirichlet allocation (LDA) + survival analysis via accelerated failure time model singular value decomposition (SVD) and QR decomposition principal component analysis (PCA) linear regression with L1, L2, and elastic-net regularization Modified: spark/site/mllib/index.html URL: http://svn.apache.org/viewvc/spark/site/mllib/index.html?rev=1723237&r1=1723236&r2=1723237&view=diff == --- spark/site/mllib/index.html (original) +++ spark/site/mllib/index.html Wed Jan 6 06:27:47 2016 @@ -257,8 +257,9 @@ classification and regression tree random forest and gradient-boosted trees recommendation via alternating least squares (ALS) - clustering via k-means, Gaussian mixtures (GMM), and power iteration clustering + clustering via k-means, bisecting k-means, Gaussian mixtures (GMM), and power iteration clustering topic modeling via latent Dirichlet allocation (LDA) + survival analysis via accelerated failure time model singular value decomposition (SVD) and QR decomposition principal component analysis (PCA) linear regression with L1, L2, and elastic-net regularization Modified: spark/site/news/index.html URL: 
http://svn.apache.org/viewvc/spark/site/news/index.html?rev=1723237&r1=1723236&r2=1723237&view=diff == --- spark/site/news/index.html (original) +++ spark/site/news/index.html Wed Jan 6 06:27:47 2016 @@ -275,7 +275,7 @@ With this release the Spark community co One month to Spark Summit 2015 in San Francisco - May 15, 2015 + May 16, 2015 There is one month left until <a href="https://spark-summit.org/2015/">Spark Summit 2015</a>, which will be held in San Francisco on June 15th to 17th. @@ -287,7 +287,7 @@ The Summit will contain Announcing Spark Summit Europe - May 15, 2015 + May 16, 2015 Abstract submissions are now open for the first ever <a href="https://www.prevalentdesignevents.com/sparksummit2015/europe/speaker/">Spark Summit Europe</a>. The event will take place on October 27th to 29th in Amsterdam. Submissions are welcome across a variety of Spark related topics, including use cases and ongoing development. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
[SPARK-11551][DOC] Replace example code in ml-features.md using include_example PR on behalf of somideshmukh, thanks! Author: Xusen Yin <yinxu...@gmail.com> Author: somideshmukh <somi...@us.ibm.com> Closes #10219 from yinxusen/SPARK-11551. (cherry picked from commit 051c6a066f7b5fcc7472412144c15b50a5319bd5) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfb42013 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfb42013 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfb42013 Branch: refs/heads/branch-1.6 Commit: bfb4201395c6a1905c6eb46de4ea3eefe8d17309 Parents: ee0a6e7 Author: Xusen Yin <yinxu...@gmail.com> Authored: Wed Dec 9 12:00:48 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 9 12:01:00 2015 -0800 -- docs/ml-features.md | 1112 +- .../spark/examples/ml/JavaBinarizerExample.java | 68 ++ .../examples/ml/JavaBucketizerExample.java | 71 ++ .../spark/examples/ml/JavaDCTExample.java | 65 + .../ml/JavaElementwiseProductExample.java | 75 ++ .../examples/ml/JavaMinMaxScalerExample.java| 51 + .../spark/examples/ml/JavaNGramExample.java | 71 ++ .../examples/ml/JavaNormalizerExample.java | 54 + .../examples/ml/JavaOneHotEncoderExample.java | 78 ++ .../spark/examples/ml/JavaPCAExample.java | 71 ++ .../ml/JavaPolynomialExpansionExample.java | 71 ++ .../spark/examples/ml/JavaRFormulaExample.java | 69 ++ .../examples/ml/JavaStandardScalerExample.java | 54 + .../ml/JavaStopWordsRemoverExample.java | 65 + .../examples/ml/JavaStringIndexerExample.java | 66 ++ .../spark/examples/ml/JavaTokenizerExample.java | 75 ++ .../examples/ml/JavaVectorAssemblerExample.java | 67 ++ .../examples/ml/JavaVectorIndexerExample.java | 61 + .../examples/ml/JavaVectorSlicerExample.java| 73 ++ .../src/main/python/ml/binarizer_example.py | 43 + .../src/main/python/ml/bucketizer_example.py| 43 + 
.../python/ml/elementwise_product_example.py| 39 + examples/src/main/python/ml/n_gram_example.py | 42 + .../src/main/python/ml/normalizer_example.py| 43 + .../main/python/ml/onehot_encoder_example.py| 48 + examples/src/main/python/ml/pca_example.py | 42 + .../python/ml/polynomial_expansion_example.py | 43 + examples/src/main/python/ml/rformula_example.py | 44 + .../main/python/ml/standard_scaler_example.py | 43 + .../main/python/ml/stopwords_remover_example.py | 40 + .../main/python/ml/string_indexer_example.py| 39 + .../src/main/python/ml/tokenizer_example.py | 44 + .../main/python/ml/vector_assembler_example.py | 42 + .../main/python/ml/vector_indexer_example.py| 40 + .../spark/examples/ml/BinarizerExample.scala| 48 + .../spark/examples/ml/BucketizerExample.scala | 52 + .../apache/spark/examples/ml/DCTExample.scala | 54 + .../examples/ml/ElementWiseProductExample.scala | 52 + .../spark/examples/ml/MinMaxScalerExample.scala | 50 + .../apache/spark/examples/ml/NGramExample.scala | 47 + .../spark/examples/ml/NormalizerExample.scala | 52 + .../examples/ml/OneHotEncoderExample.scala | 58 + .../apache/spark/examples/ml/PCAExample.scala | 53 + .../ml/PolynomialExpansionExample.scala | 51 + .../spark/examples/ml/RFormulaExample.scala | 49 + .../examples/ml/StandardScalerExample.scala | 52 + .../examples/ml/StopWordsRemoverExample.scala | 48 + .../examples/ml/StringIndexerExample.scala | 48 + .../spark/examples/ml/TokenizerExample.scala| 54 + .../examples/ml/VectorAssemblerExample.scala| 49 + .../examples/ml/VectorIndexerExample.scala | 54 + .../spark/examples/ml/VectorSlicerExample.scala | 58 + 52 files changed, 2820 insertions(+), 1061 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 55e4012..7ad7c4e 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea and 
the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) for more details on the API. -{% highlight scala %} -import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} - -val sentenceDataFrame = sqlContext.createDataFrame(Seq( - (0, "Hi I heard about Spark"), - (1, "I wish Java could use case classes"), - (2, "Logistic,regression,models,are,neat")
[3/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
[SPARK-11551][DOC] Replace example code in ml-features.md using include_example PR on behalf of somideshmukh, thanks! Author: Xusen Yin <yinxu...@gmail.com> Author: somideshmukh <somi...@us.ibm.com> Closes #10219 from yinxusen/SPARK-11551. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/051c6a06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/051c6a06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/051c6a06 Branch: refs/heads/master Commit: 051c6a066f7b5fcc7472412144c15b50a5319bd5 Parents: 1eb7c22 Author: Xusen Yin <yinxu...@gmail.com> Authored: Wed Dec 9 12:00:48 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 9 12:00:48 2015 -0800 -- docs/ml-features.md | 1112 +- .../spark/examples/ml/JavaBinarizerExample.java | 68 ++ .../examples/ml/JavaBucketizerExample.java | 71 ++ .../spark/examples/ml/JavaDCTExample.java | 65 + .../ml/JavaElementwiseProductExample.java | 75 ++ .../examples/ml/JavaMinMaxScalerExample.java| 51 + .../spark/examples/ml/JavaNGramExample.java | 71 ++ .../examples/ml/JavaNormalizerExample.java | 54 + .../examples/ml/JavaOneHotEncoderExample.java | 78 ++ .../spark/examples/ml/JavaPCAExample.java | 71 ++ .../ml/JavaPolynomialExpansionExample.java | 71 ++ .../spark/examples/ml/JavaRFormulaExample.java | 69 ++ .../examples/ml/JavaStandardScalerExample.java | 54 + .../ml/JavaStopWordsRemoverExample.java | 65 + .../examples/ml/JavaStringIndexerExample.java | 66 ++ .../spark/examples/ml/JavaTokenizerExample.java | 75 ++ .../examples/ml/JavaVectorAssemblerExample.java | 67 ++ .../examples/ml/JavaVectorIndexerExample.java | 61 + .../examples/ml/JavaVectorSlicerExample.java| 73 ++ .../src/main/python/ml/binarizer_example.py | 43 + .../src/main/python/ml/bucketizer_example.py| 43 + .../python/ml/elementwise_product_example.py| 39 + examples/src/main/python/ml/n_gram_example.py | 42 + .../src/main/python/ml/normalizer_example.py| 43 + 
.../main/python/ml/onehot_encoder_example.py| 48 + examples/src/main/python/ml/pca_example.py | 42 + .../python/ml/polynomial_expansion_example.py | 43 + examples/src/main/python/ml/rformula_example.py | 44 + .../main/python/ml/standard_scaler_example.py | 43 + .../main/python/ml/stopwords_remover_example.py | 40 + .../main/python/ml/string_indexer_example.py| 39 + .../src/main/python/ml/tokenizer_example.py | 44 + .../main/python/ml/vector_assembler_example.py | 42 + .../main/python/ml/vector_indexer_example.py| 40 + .../spark/examples/ml/BinarizerExample.scala| 48 + .../spark/examples/ml/BucketizerExample.scala | 52 + .../apache/spark/examples/ml/DCTExample.scala | 54 + .../examples/ml/ElementWiseProductExample.scala | 52 + .../spark/examples/ml/MinMaxScalerExample.scala | 50 + .../apache/spark/examples/ml/NGramExample.scala | 47 + .../spark/examples/ml/NormalizerExample.scala | 52 + .../examples/ml/OneHotEncoderExample.scala | 58 + .../apache/spark/examples/ml/PCAExample.scala | 53 + .../ml/PolynomialExpansionExample.scala | 51 + .../spark/examples/ml/RFormulaExample.scala | 49 + .../examples/ml/StandardScalerExample.scala | 52 + .../examples/ml/StopWordsRemoverExample.scala | 48 + .../examples/ml/StringIndexerExample.scala | 48 + .../spark/examples/ml/TokenizerExample.scala| 54 + .../examples/ml/VectorAssemblerExample.scala| 49 + .../examples/ml/VectorIndexerExample.scala | 54 + .../spark/examples/ml/VectorSlicerExample.scala | 58 + 52 files changed, 2820 insertions(+), 1061 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 55e4012..7ad7c4e 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) for more details on the API. 
-{% highlight scala %} -import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} - -val sentenceDataFrame = sqlContext.createDataFrame(Seq( - (0, "Hi I heard about Spark"), - (1, "I wish Java could use case classes"), - (2, "Logistic,regression,models,are,neat") -)).toDF("label", "sentence") -val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
[1/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
Repository: spark Updated Branches: refs/heads/master 1eb7c22ce -> 051c6a066 http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala new file mode 100644 index 000..9fa494c --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.StringIndexer +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object StringIndexerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("StringIndexerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val df = sqlContext.createDataFrame( + Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) +).toDF("id", "category") + +val indexer = new StringIndexer() + .setInputCol("category") + .setOutputCol("categoryIndex") + +val indexed = indexer.fit(df).transform(df) +indexed.show() +// $example off$ +sc.stop() + } +} +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala new file mode 100644 index 000..01e0d13 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object TokenizerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("TokenizerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val sentenceDataFrame = sqlContext.createDataFrame(Seq( + (0, "Hi I heard about Spark"), + (1, "I wish Java could use case classes"), + (2, "Logistic,regression,models,are,neat") +)).toDF("label", "sentence") + +val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") +val regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select("words", "label").take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select("words", "label").take(3).foreach(println) +// $example off$ +sc.stop() + } +}
[2/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java new file mode 100644 index 000..668f71e --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.PolynomialExpansion; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaPolynomialExpansionExample { + public static void main(String[] args) { +SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); +JavaSparkContext jsc = new JavaSparkContext(conf); +SQLContext jsql = new SQLContext(jsc); + +// $example on$ +PolynomialExpansion polyExpansion = new PolynomialExpansion() + .setInputCol("features") + .setOutputCol("polyFeatures") + .setDegree(3); + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(-2.0, 2.3)), + RowFactory.create(Vectors.dense(0.0, 0.0)), + RowFactory.create(Vectors.dense(0.6, -1.1)) +)); + +StructType schema = new StructType(new StructField[]{ + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); + +DataFrame df = jsql.createDataFrame(data, schema); +DataFrame polyDF = polyExpansion.transform(df); + +Row[] row = polyDF.select("polyFeatures").take(3); +for (Row r : row) { + System.out.println(r.get(0)); +} +// $example off$ +jsc.stop(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java new file mode 100644 index 000..1e1062b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import
[1/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
Repository: spark Updated Branches: refs/heads/branch-1.6 ee0a6e722 -> bfb420139 http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala new file mode 100644 index 000..9fa494c --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.StringIndexer +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object StringIndexerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("StringIndexerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val df = sqlContext.createDataFrame( + Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) +).toDF("id", "category") + +val indexer = new StringIndexer() + .setInputCol("category") + .setOutputCol("categoryIndex") + +val indexed = indexer.fit(df).transform(df) +indexed.show() +// $example off$ +sc.stop() + } +} +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala new file mode 100644 index 000..01e0d13 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object TokenizerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("TokenizerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val sentenceDataFrame = sqlContext.createDataFrame(Seq( + (0, "Hi I heard about Spark"), + (1, "I wish Java could use case classes"), + (2, "Logistic,regression,models,are,neat") +)).toDF("label", "sentence") + +val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") +val regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select("words", "label").take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select("words", "label").take(3).foreach(println) +// $example off$ +sc.stop() + }
[2/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example
http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java new file mode 100644 index 000..668f71e --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.PolynomialExpansion; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaPolynomialExpansionExample { + public static void main(String[] args) { +SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); +JavaSparkContext jsc = new JavaSparkContext(conf); +SQLContext jsql = new SQLContext(jsc); + +// $example on$ +PolynomialExpansion polyExpansion = new PolynomialExpansion() + .setInputCol("features") + .setOutputCol("polyFeatures") + .setDegree(3); + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(-2.0, 2.3)), + RowFactory.create(Vectors.dense(0.0, 0.0)), + RowFactory.create(Vectors.dense(0.6, -1.1)) +)); + +StructType schema = new StructType(new StructField[]{ + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); + +DataFrame df = jsql.createDataFrame(data, schema); +DataFrame polyDF = polyExpansion.transform(df); + +Row[] row = polyDF.select("polyFeatures").take(3); +for (Row r : row) { + System.out.println(r.get(0)); +} +// $example off$ +jsc.stop(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java new file mode 100644 index 000..1e1062b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import
spark git commit: [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code
Repository: spark Updated Branches: refs/heads/branch-1.6 8652fc03c -> 5c8216920 [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code Add ```SQLTransformer``` user guide, example code and make Scala API doc more clear. Author: Yanbo Liang <yblia...@gmail.com> Closes #10006 from yanboliang/spark-11958. (cherry picked from commit 4a39b5a1bee28cec792d509654f6236390cafdcb) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c821692 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c821692 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c821692 Branch: refs/heads/branch-1.6 Commit: 5c8216920b4110d8fc4329e1fe52543ee17c4a54 Parents: 8652fc0 Author: Yanbo Liang <yblia...@gmail.com> Authored: Mon Dec 7 23:50:57 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:51:10 2015 -0800 -- docs/ml-features.md | 59 .../examples/ml/JavaSQLTransformerExample.java | 59 examples/src/main/python/ml/sql_transformer.py | 40 + .../examples/ml/SQLTransformerExample.scala | 45 +++ .../spark/ml/feature/SQLTransformer.scala | 11 +++- 5 files changed, 212 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c821692/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 5105a94..f85e0d5 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -756,6 +756,65 @@ for more details on the API. +## SQLTransformer + +`SQLTransformer` implements the transformations which are defined by SQL statement. +Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."` +where `"__THIS__"` represents the underlying table of the input dataset. +The select clause specifies the fields, constants, and expressions to display in +the output, it can be any select clause that Spark SQL supports. 
Users can also +use Spark SQL built-in function and UDFs to operate on these selected columns. +For example, `SQLTransformer` supports statements like: + +* `SELECT a, a + b AS a_b FROM __THIS__` +* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5` +* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b` + +**Examples** + +Assume that we have the following DataFrame with columns `id`, `v1` and `v2`: + + + id | v1 | v2 +|-|- + 0 | 1.0 | 3.0 + 2 | 2.0 | 5.0 + + +This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`: + + + id | v1 | v2 | v3 | v4 +|-|-|-|- + 0 | 1.0 | 3.0 | 4.0 | 3.0 + 2 | 2.0 | 5.0 | 7.0 |10.0 + + + + + +Refer to the [SQLTransformer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer) +for more details on the API. + +{% include_example scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %} + + + + +Refer to the [SQLTransformer Java docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html) +for more details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %} + + + + +Refer to the [SQLTransformer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more details on the API. 
+ +{% include_example python/ml/sql_transformer.py %} + + + ## VectorAssembler `VectorAssembler` is a transformer that combines a given list of columns into a single vector http://git-wip-us.apache.org/repos/asf/spark/blob/5c821692/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java new file mode 100644 index 000..d55c707 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to
spark git commit: [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code
Repository: spark Updated Branches: refs/heads/master 7d05a6245 -> 4a39b5a1b [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code Add ```SQLTransformer``` user guide, example code and make Scala API doc more clear. Author: Yanbo Liang <yblia...@gmail.com> Closes #10006 from yanboliang/spark-11958. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a39b5a1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a39b5a1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a39b5a1 Branch: refs/heads/master Commit: 4a39b5a1bee28cec792d509654f6236390cafdcb Parents: 7d05a62 Author: Yanbo Liang <yblia...@gmail.com> Authored: Mon Dec 7 23:50:57 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:50:57 2015 -0800 -- docs/ml-features.md | 59 .../examples/ml/JavaSQLTransformerExample.java | 59 examples/src/main/python/ml/sql_transformer.py | 40 + .../examples/ml/SQLTransformerExample.scala | 45 +++ .../spark/ml/feature/SQLTransformer.scala | 11 +++- 5 files changed, 212 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4a39b5a1/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 5105a94..f85e0d5 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -756,6 +756,65 @@ for more details on the API. +## SQLTransformer + +`SQLTransformer` implements the transformations which are defined by SQL statement. +Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."` +where `"__THIS__"` represents the underlying table of the input dataset. +The select clause specifies the fields, constants, and expressions to display in +the output, it can be any select clause that Spark SQL supports. Users can also +use Spark SQL built-in function and UDFs to operate on these selected columns. 
+For example, `SQLTransformer` supports statements like: + +* `SELECT a, a + b AS a_b FROM __THIS__` +* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5` +* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b` + +**Examples** + +Assume that we have the following DataFrame with columns `id`, `v1` and `v2`: + + + id | v1 | v2 +|-|- + 0 | 1.0 | 3.0 + 2 | 2.0 | 5.0 + + +This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`: + + + id | v1 | v2 | v3 | v4 +|-|-|-|- + 0 | 1.0 | 3.0 | 4.0 | 3.0 + 2 | 2.0 | 5.0 | 7.0 |10.0 + + + + + +Refer to the [SQLTransformer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer) +for more details on the API. + +{% include_example scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %} + + + + +Refer to the [SQLTransformer Java docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html) +for more details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %} + + + + +Refer to the [SQLTransformer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more details on the API. + +{% include_example python/ml/sql_transformer.py %} + + + ## VectorAssembler `VectorAssembler` is a transformer that combines a given list of columns into a single vector http://git-wip-us.apache.org/repos/asf/spark/blob/4a39b5a1/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java new file mode 100644 index 000..d55c707 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF
spark git commit: [SPARK-10259][ML] Add @since annotation to ml.classification
Repository: spark Updated Branches: refs/heads/master 73896588d -> 7d05a6245 [SPARK-10259][ML] Add @since annotation to ml.classification Add since annotation to ml.classification Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp> Closes #8534 from taishi-oss/issue10259. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d05a624 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d05a624 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d05a624 Branch: refs/heads/master Commit: 7d05a624510f7299b3dd07f87c203db1ff7caa3e Parents: 7389658 Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp> Authored: Mon Dec 7 23:46:55 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:46:55 2015 -0800 -- .../classification/DecisionTreeClassifier.scala | 30 +++-- .../spark/ml/classification/GBTClassifier.scala | 35 +-- .../ml/classification/LogisticRegression.scala | 64 +++- .../MultilayerPerceptronClassifier.scala| 23 +-- .../spark/ml/classification/NaiveBayes.scala| 19 -- .../spark/ml/classification/OneVsRest.scala | 24 ++-- .../classification/RandomForestClassifier.scala | 34 +-- 7 files changed, 185 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d05a624/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index c478aea..8c4cec1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.classification -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import 
org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams} import org.apache.spark.ml.tree.impl.RandomForest @@ -36,32 +36,44 @@ import org.apache.spark.sql.DataFrame * It supports both binary and multiclass labels, as well as both continuous and categorical * features. */ +@Since("1.4.0") @Experimental -final class DecisionTreeClassifier(override val uid: String) +final class DecisionTreeClassifier @Since("1.4.0") ( +@Since("1.4.0") override val uid: String) extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] with DecisionTreeParams with TreeClassifierParams { + @Since("1.4.0") def this() = this(Identifiable.randomUID("dtc")) // Override parameter setters from parent trait for Java API compatibility. + @Since("1.4.0") override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value) + @Since("1.4.0") override def setMaxBins(value: Int): this.type = super.setMaxBins(value) + @Since("1.4.0") override def setMinInstancesPerNode(value: Int): this.type = super.setMinInstancesPerNode(value) + @Since("1.4.0") override def setMinInfoGain(value: Double): this.type = super.setMinInfoGain(value) + @Since("1.4.0") override def setMaxMemoryInMB(value: Int): this.type = super.setMaxMemoryInMB(value) + @Since("1.4.0") override def setCacheNodeIds(value: Boolean): this.type = super.setCacheNodeIds(value) + @Since("1.4.0") override def setCheckpointInterval(value: Int): this.type = super.setCheckpointInterval(value) + @Since("1.4.0") override def setImpurity(value: String): this.type = super.setImpurity(value) + @Since("1.6.0") override def setSeed(value: Long): this.type = super.setSeed(value) override protected def train(dataset: DataFrame): DecisionTreeClassificationModel = { @@ -89,12 +101,15 @@ final class DecisionTreeClassifier(override val uid: String) subsamplingRate = 1.0) } + @Since("1.4.1") override def copy(extra: ParamMap): 
DecisionTreeClassifier = defaultCopy(extra) } +@Since("1.4.0") @Experimental object DecisionTreeClassifier { /** Accessor for supported impurities: entropy, gini */ + @Since("1.4.0") final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities } @@ -104,12 +119,13 @@ object DecisionTreeClassifier { * It supports both binary and multiclass labels, as well as both continuous
spark git commit: [SPARK-10259][ML] Add @since annotation to ml.classification
Repository: spark Updated Branches: refs/heads/branch-1.6 3c683ed5f -> 8652fc03c [SPARK-10259][ML] Add @since annotation to ml.classification Add since annotation to ml.classification Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp> Closes #8534 from taishi-oss/issue10259. (cherry picked from commit 7d05a624510f7299b3dd07f87c203db1ff7caa3e) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8652fc03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8652fc03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8652fc03 Branch: refs/heads/branch-1.6 Commit: 8652fc03c21f79b41ce13f41991feba11fc7b29c Parents: 3c683ed Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp> Authored: Mon Dec 7 23:46:55 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:47:03 2015 -0800 -- .../classification/DecisionTreeClassifier.scala | 30 +++-- .../spark/ml/classification/GBTClassifier.scala | 35 +-- .../ml/classification/LogisticRegression.scala | 64 +++- .../MultilayerPerceptronClassifier.scala| 23 +-- .../spark/ml/classification/NaiveBayes.scala| 19 -- .../spark/ml/classification/OneVsRest.scala | 24 ++-- .../classification/RandomForestClassifier.scala | 34 +-- 7 files changed, 185 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8652fc03/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index c478aea..8c4cec1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.classification 
-import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams} import org.apache.spark.ml.tree.impl.RandomForest @@ -36,32 +36,44 @@ import org.apache.spark.sql.DataFrame * It supports both binary and multiclass labels, as well as both continuous and categorical * features. */ +@Since("1.4.0") @Experimental -final class DecisionTreeClassifier(override val uid: String) +final class DecisionTreeClassifier @Since("1.4.0") ( +@Since("1.4.0") override val uid: String) extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] with DecisionTreeParams with TreeClassifierParams { + @Since("1.4.0") def this() = this(Identifiable.randomUID("dtc")) // Override parameter setters from parent trait for Java API compatibility. + @Since("1.4.0") override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value) + @Since("1.4.0") override def setMaxBins(value: Int): this.type = super.setMaxBins(value) + @Since("1.4.0") override def setMinInstancesPerNode(value: Int): this.type = super.setMinInstancesPerNode(value) + @Since("1.4.0") override def setMinInfoGain(value: Double): this.type = super.setMinInfoGain(value) + @Since("1.4.0") override def setMaxMemoryInMB(value: Int): this.type = super.setMaxMemoryInMB(value) + @Since("1.4.0") override def setCacheNodeIds(value: Boolean): this.type = super.setCacheNodeIds(value) + @Since("1.4.0") override def setCheckpointInterval(value: Int): this.type = super.setCheckpointInterval(value) + @Since("1.4.0") override def setImpurity(value: String): this.type = super.setImpurity(value) + @Since("1.6.0") override def setSeed(value: Long): this.type = super.setSeed(value) override protected def train(dataset: DataFrame): DecisionTreeClassificationModel = { @@ -89,12 +101,15 @@ final class 
DecisionTreeClassifier(override val uid: String) subsamplingRate = 1.0) } + @Since("1.4.1") override def copy(extra: ParamMap): DecisionTreeClassifier = defaultCopy(extra) } +@Since("1.4.0") @Experimental object DecisionTreeClassifier { /** Accessor for supported impurities: entropy, gini */ + @Since("1.4.0") final val supportedImpurities: Array[String] = TreeClassifierParams.supportedI
[3/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example
[SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example Made new patch containing only markdown examples moved to example/folder. Only three java code were not shifted since they were containing compilation error; these classes are 1)StandardScale 2)NormalizerExample 3)VectorIndexer Author: Xusen Yin <yinxu...@gmail.com> Author: somideshmukh <somi...@us.ibm.com> Closes #10002 from somideshmukh/SomilBranch1.33. (cherry picked from commit 78209b0ccaf3f22b5e2345dfb2b98edfdb746819) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c683ed5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c683ed5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c683ed5 Branch: refs/heads/branch-1.6 Commit: 3c683ed5ffe704a6fec7c6d434eeed784276470d Parents: 115bfbd Author: somideshmukh <somi...@us.ibm.com> Authored: Mon Dec 7 23:26:34 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:26:56 2015 -0800 -- docs/ml-features.md | 1109 +- .../spark/examples/ml/JavaBinarizerExample.java | 68 ++ .../examples/ml/JavaBucketizerExample.java | 70 ++ .../spark/examples/ml/JavaDCTExample.java | 65 + .../ml/JavaElementwiseProductExample.java | 75 ++ .../examples/ml/JavaMinMaxScalerExample.java| 50 + .../spark/examples/ml/JavaNGramExample.java | 71 ++ .../examples/ml/JavaNormalizerExample.java | 52 + .../examples/ml/JavaOneHotEncoderExample.java | 77 ++ .../spark/examples/ml/JavaPCAExample.java | 71 ++ .../ml/JavaPolynomialExpansionExample.java | 71 ++ .../spark/examples/ml/JavaRFormulaExample.java | 69 ++ .../examples/ml/JavaStandardScalerExample.java | 53 + .../ml/JavaStopWordsRemoverExample.java | 65 + .../examples/ml/JavaStringIndexerExample.java | 66 ++ .../spark/examples/ml/JavaTokenizerExample.java | 75 ++ .../examples/ml/JavaVectorAssemblerExample.java | 67 ++ 
.../examples/ml/JavaVectorIndexerExample.java | 60 + .../examples/ml/JavaVectorSlicerExample.java| 73 ++ .../src/main/python/ml/binarizer_example.py | 43 + .../src/main/python/ml/bucketizer_example.py| 42 + .../python/ml/elementwise_product_example.py| 39 + examples/src/main/python/ml/n_gram_example.py | 42 + .../src/main/python/ml/normalizer_example.py| 41 + .../main/python/ml/onehot_encoder_example.py| 47 + examples/src/main/python/ml/pca_example.py | 42 + .../python/ml/polynomial_expansion_example.py | 43 + examples/src/main/python/ml/rformula_example.py | 44 + .../main/python/ml/standard_scaler_example.py | 42 + .../main/python/ml/stopwords_remover_example.py | 40 + .../main/python/ml/string_indexer_example.py| 39 + .../src/main/python/ml/tokenizer_example.py | 44 + .../main/python/ml/vector_assembler_example.py | 42 + .../main/python/ml/vector_indexer_example.py| 39 + .../spark/examples/ml/BinarizerExample.scala| 48 + .../spark/examples/ml/BucketizerExample.scala | 51 + .../apache/spark/examples/ml/DCTExample.scala | 54 + .../examples/ml/ElementWiseProductExample.scala | 53 + .../spark/examples/ml/MinMaxScalerExample.scala | 49 + .../apache/spark/examples/ml/NGramExample.scala | 47 + .../spark/examples/ml/NormalizerExample.scala | 50 + .../examples/ml/OneHotEncoderExample.scala | 58 + .../apache/spark/examples/ml/PCAExample.scala | 54 + .../ml/PolynomialExpansionExample.scala | 53 + .../spark/examples/ml/RFormulaExample.scala | 49 + .../examples/ml/StandardScalerExample.scala | 51 + .../examples/ml/StopWordsRemoverExample.scala | 48 + .../examples/ml/StringIndexerExample.scala | 49 + .../spark/examples/ml/TokenizerExample.scala| 54 + .../examples/ml/VectorAssemblerExample.scala| 49 + .../examples/ml/VectorIndexerExample.scala | 53 + .../spark/examples/ml/VectorSlicerExample.scala | 58 + 52 files changed, 2806 insertions(+), 1058 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/docs/ml-features.md -- diff --git a/docs/ml-features.md 
b/docs/ml-features.md index b499d6e..5105a94 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) for more details on the API. -{% highlight scala %} -import org.apache.spark.ml.feature.{Tokenizer, RegexToken
[2/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example
http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java new file mode 100644 index 000..668f71e --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.PolynomialExpansion; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaPolynomialExpansionExample { + public static void main(String[] args) { +SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); +JavaSparkContext jsc = new JavaSparkContext(conf); +SQLContext jsql = new SQLContext(jsc); + +// $example on$ +PolynomialExpansion polyExpansion = new PolynomialExpansion() + .setInputCol("features") + .setOutputCol("polyFeatures") + .setDegree(3); + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(-2.0, 2.3)), + RowFactory.create(Vectors.dense(0.0, 0.0)), + RowFactory.create(Vectors.dense(0.6, -1.1)) +)); + +StructType schema = new StructType(new StructField[]{ + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); + +DataFrame df = jsql.createDataFrame(data, schema); +DataFrame polyDF = polyExpansion.transform(df); + +Row[] row = polyDF.select("polyFeatures").take(3); +for (Row r : row) { + System.out.println(r.get(0)); +} +// $example off$ +jsc.stop(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java new file mode 100644 index 000..1e1062b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; + +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RFormula; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import
[1/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example
Repository: spark Updated Branches: refs/heads/master 3e7e05f5e -> 78209b0cc http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala new file mode 100644 index 000..1be8a5f --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.StringIndexer +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object StringIndexerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("StringIndexerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val df = sqlContext.createDataFrame( + Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) +).toDF("id", "category") + +val indexer = new StringIndexer() + .setInputCol("category") + .setOutputCol("categoryIndex") + +val indexed = indexer.fit(df).transform(df) +indexed.show() +// $example off$ +sc.stop() + } +} +// scalastyle:on println + http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala new file mode 100644 index 000..01e0d13 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object TokenizerExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("TokenizerExample") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// $example on$ +val sentenceDataFrame = sqlContext.createDataFrame(Seq( + (0, "Hi I heard about Spark"), + (1, "I wish Java could use case classes"), + (2, "Logistic,regression,models,are,neat") +)).toDF("label", "sentence") + +val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") +val regexTokenizer = new RegexTokenizer() + .setInputCol("sentence") + .setOutputCol("words") + .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select("words", "label").take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select("words", "label").take(3).foreach(println) +// $example off$ +sc.stop() + } +}
spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib - 1.5 backport
Repository: spark Updated Branches: refs/heads/branch-1.5 3868ab644 -> 2f30927a5 [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib - 1.5 backport This backports [https://github.com/apache/spark/pull/10161] to Spark 1.5, with the difference that ChiSqSelector does not require modification. Switched from using SQLContext constructor to using getOrCreate, mainly in model save/load methods. This covers all instances in spark.mllib. There were no uses of the constructor in spark.ml. CC: yhuai mengxr Author: Joseph K. Bradley <jos...@databricks.com> Closes #10183 from jkbradley/sqlcontext-backport1.5. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f30927a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f30927a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f30927a Branch: refs/heads/branch-1.5 Commit: 2f30927a5f40f2862e777bfe97282ddcfc0a063a Parents: 3868ab6 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Mon Dec 7 23:37:23 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:37:23 2015 -0800 -- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 6 +++--- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 8 .../mllib/classification/impl/GLMClassificationModel.scala | 4 ++-- .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++-- .../org/apache/spark/mllib/clustering/KMeansModel.scala | 4 ++-- .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++-- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- .../mllib/recommendation/MatrixFactorizationModel.scala | 4 ++-- .../apache/spark/mllib/regression/IsotonicRegression.scala | 4 ++-- .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++-- .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++-- .../apache/spark/mllib/tree/model/treeEnsembleModels.scala | 4 ++-- 12 files changed, 27 insertions(+), 27 
deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2f30927a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index f585aac..06e13b7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1149,7 +1149,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = { // We use DataFrames for serialization of IndexedRows to Python, // so return a DataFrame. -val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext) +val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext) sqlContext.createDataFrame(indexedRowMatrix.rows) } @@ -1159,7 +1159,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = { // We use DataFrames for serialization of MatrixEntry entries to // Python, so return a DataFrame. -val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext) +val sqlContext = SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext) sqlContext.createDataFrame(coordinateMatrix.entries) } @@ -1169,7 +1169,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = { // We use DataFrames for serialization of sub-matrix blocks to // Python, so return a DataFrame. 
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext) +val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext) sqlContext.createDataFrame(blockMatrix.blocks) } } http://git-wip-us.apache.org/repos/asf/spark/blob/2f30927a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index a956084..aef9ef2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { modelType: String) def save(sc: SparkContext, path: String, data: Data): Unit = { - val sqlCon
spark git commit: Closes #10098
Repository: spark Updated Branches: refs/heads/master 78209b0cc -> 73896588d Closes #10098 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73896588 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73896588 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73896588 Branch: refs/heads/master Commit: 73896588dd3af6ba77c9692cd5120ee32448eb22 Parents: 78209b0 Author: Xiangrui Meng <m...@databricks.com> Authored: Mon Dec 7 23:34:16 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 23:34:16 2015 -0800 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib
Repository: spark Updated Branches: refs/heads/branch-1.6 cdeb89b34 -> 115bfbdae [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib Switched from using SQLContext constructor to using getOrCreate, mainly in model save/load methods. This covers all instances in spark.mllib. There were no uses of the constructor in spark.ml. CC: mengxr yhuai Author: Joseph K. Bradley <jos...@databricks.com> Closes #10161 from jkbradley/mllib-sqlcontext-fix. (cherry picked from commit 3e7e05f5ee763925ed60410d7de04cf36b723de1) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/115bfbda Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/115bfbda Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/115bfbda Branch: refs/heads/branch-1.6 Commit: 115bfbdae82b1c2804ea501ffd420d0aa17aac45 Parents: cdeb89b Author: Joseph K. Bradley <jos...@databricks.com> Authored: Mon Dec 7 16:37:09 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 16:37:16 2015 -0800 -- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 6 +++--- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 8 .../mllib/classification/impl/GLMClassificationModel.scala | 4 ++-- .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++-- .../org/apache/spark/mllib/clustering/KMeansModel.scala | 4 ++-- .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++-- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++-- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- .../mllib/recommendation/MatrixFactorizationModel.scala | 4 ++-- .../apache/spark/mllib/regression/IsotonicRegression.scala | 4 ++-- .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++-- .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++-- .../apache/spark/mllib/tree/model/treeEnsembleModels.scala | 4 ++-- 13 files changed, 29 
insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/115bfbda/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 54b03a9..2aa6aec 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1191,7 +1191,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = { // We use DataFrames for serialization of IndexedRows to Python, // so return a DataFrame. -val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext) +val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext) sqlContext.createDataFrame(indexedRowMatrix.rows) } @@ -1201,7 +1201,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = { // We use DataFrames for serialization of MatrixEntry entries to // Python, so return a DataFrame. -val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext) +val sqlContext = SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext) sqlContext.createDataFrame(coordinateMatrix.entries) } @@ -1211,7 +1211,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = { // We use DataFrames for serialization of sub-matrix blocks to // Python, so return a DataFrame. 
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext) +val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext) sqlContext.createDataFrame(blockMatrix.blocks) } } http://git-wip-us.apache.org/repos/asf/spark/blob/115bfbda/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index a956084..aef9ef2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { modelType: String) def save(sc: SparkContext, path
spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib
Repository: spark Updated Branches: refs/heads/master 36282f78b -> 3e7e05f5e [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib Switched from using SQLContext constructor to using getOrCreate, mainly in model save/load methods. This covers all instances in spark.mllib. There were no uses of the constructor in spark.ml. CC: mengxr yhuai Author: Joseph K. Bradley <jos...@databricks.com> Closes #10161 from jkbradley/mllib-sqlcontext-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e7e05f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e7e05f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e7e05f5 Branch: refs/heads/master Commit: 3e7e05f5ee763925ed60410d7de04cf36b723de1 Parents: 36282f7 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Mon Dec 7 16:37:09 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 7 16:37:09 2015 -0800 -- .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 6 +++--- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 8 .../mllib/classification/impl/GLMClassificationModel.scala | 4 ++-- .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++-- .../org/apache/spark/mllib/clustering/KMeansModel.scala | 4 ++-- .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++-- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++-- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- .../mllib/recommendation/MatrixFactorizationModel.scala | 4 ++-- .../apache/spark/mllib/regression/IsotonicRegression.scala | 4 ++-- .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++-- .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++-- .../apache/spark/mllib/tree/model/treeEnsembleModels.scala | 4 ++-- 13 files changed, 29 insertions(+), 29 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/3e7e05f5/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 54b03a9..2aa6aec 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1191,7 +1191,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = { // We use DataFrames for serialization of IndexedRows to Python, // so return a DataFrame. -val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext) +val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext) sqlContext.createDataFrame(indexedRowMatrix.rows) } @@ -1201,7 +1201,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = { // We use DataFrames for serialization of MatrixEntry entries to // Python, so return a DataFrame. -val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext) +val sqlContext = SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext) sqlContext.createDataFrame(coordinateMatrix.entries) } @@ -1211,7 +1211,7 @@ private[python] class PythonMLLibAPI extends Serializable { def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = { // We use DataFrames for serialization of sub-matrix blocks to // Python, so return a DataFrame. 
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext) +val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext) sqlContext.createDataFrame(blockMatrix.blocks) } } http://git-wip-us.apache.org/repos/asf/spark/blob/3e7e05f5/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index a956084..aef9ef2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { modelType: String) def save(sc: SparkContext, path: String, data: Data): Unit = { - val sqlContext = new SQLContext(sc) + val sqlContext = SQLContext.getOrCreate(sc) import
spark git commit: [MINOR][ML] Use coefficients replace weights
Repository: spark Updated Branches: refs/heads/master 688e521c2 -> d576e76bb [MINOR][ML] Use coefficients replace weights Use ```coefficients``` to replace ```weights```; I hope these are the last two. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #10065 from yanboliang/coefficients. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d576e76b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d576e76b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d576e76b Branch: refs/heads/master Commit: d576e76bbaa818480d31d2b8fbbe4b15718307d9 Parents: 688e521 Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Dec 3 11:37:34 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Dec 3 11:37:34 2015 -0800 -- python/pyspark/ml/classification.py | 2 +- python/pyspark/ml/regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d576e76b/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 4a2982e..5599b8f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,7 +49,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF() >>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") >>> model = lr.fit(df) ->>> model.weights +>>> model.coefficients DenseVector([5.5...]) >>> model.intercept -2.68... 
http://git-wip-us.apache.org/repos/asf/spark/blob/d576e76b/python/pyspark/ml/regression.py -- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 944e648..a0bb8ce 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -40,7 +40,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction Linear regression. The learning objective is to minimize the squared error, with regularization. -The specific squared error loss function used is: L = 1/2n ||A weights - y||^2^ +The specific squared error loss function used is: L = 1/2n ||A coefficients - y||^2^ This support multiple types of regularization: - none (a.k.a. ordinary least squares) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][ML] Use coefficients replace weights
Repository: spark Updated Branches: refs/heads/branch-1.6 bf8b95fa4 -> e0577f542 [MINOR][ML] Use coefficients replace weights Use ```coefficients``` to replace ```weights```; I hope these are the last two. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #10065 from yanboliang/coefficients. (cherry picked from commit d576e76bbaa818480d31d2b8fbbe4b15718307d9) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0577f54 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0577f54 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0577f54 Branch: refs/heads/branch-1.6 Commit: e0577f542878d582651aad7c65dc33c47014b4fb Parents: bf8b95f Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Dec 3 11:37:34 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Dec 3 11:37:41 2015 -0800 -- python/pyspark/ml/classification.py | 2 +- python/pyspark/ml/regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0577f54/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 4a2982e..5599b8f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,7 +49,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF() >>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") >>> model = lr.fit(df) ->>> model.weights +>>> model.coefficients DenseVector([5.5...]) >>> model.intercept -2.68... 
http://git-wip-us.apache.org/repos/asf/spark/blob/e0577f54/python/pyspark/ml/regression.py -- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 944e648..a0bb8ce 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -40,7 +40,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction Linear regression. The learning objective is to minimize the squared error, with regularization. -The specific squared error loss function used is: L = 1/2n ||A weights - y||^2^ +The specific squared error loss function used is: L = 1/2n ||A coefficients - y||^2^ This support multiple types of regularization: - none (a.k.a. ordinary least squares) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning
Repository: spark Updated Branches: refs/heads/master 452690ba1 -> de07d06ab [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning cc mengxr noel-smith I worked on this issue based on https://github.com/apache/spark/pull/8729. ehsanmok thank you for your contribution! Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Author: Ehsan M.Kermani <ehsanmo1...@gmail.com> Closes #9338 from yu-iskw/JIRA-10266. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de07d06a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de07d06a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de07d06a Branch: refs/heads/master Commit: de07d06abecf3516c95d099b6c01a86e0c8cfd8c Parents: 452690b Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Authored: Wed Dec 2 14:15:54 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 2 14:15:54 2015 -0800 -- .../apache/spark/ml/tuning/CrossValidator.scala | 34 ++-- .../spark/ml/tuning/ParamGridBuilder.scala | 14 ++-- .../spark/ml/tuning/TrainValidationSplit.scala | 26 --- 3 files changed, 58 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de07d06a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 83a9048..5c09f1a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -19,18 +19,18 @@ package org.apache.spark.ml.tuning import com.github.fommil.netlib.F2jBLAS import org.apache.hadoop.fs.Path -import org.json4s.{JObject, DefaultFormats} import org.json4s.jackson.JsonMethods._ +import org.json4s.{DefaultFormats, JObject} -import org.apache.spark.ml.classification.OneVsRestParams -import org.apache.spark.ml.feature.RFormulaModel 
-import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.{Logging, SparkContext} import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.classification.OneVsRestParams import org.apache.spark.ml.evaluation.Evaluator +import org.apache.spark.ml.feature.RFormulaModel import org.apache.spark.ml.param._ -import org.apache.spark.ml.util._ import org.apache.spark.ml.util.DefaultParamsReader.Metadata +import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -58,26 +58,34 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { * :: Experimental :: * K-fold cross validation. */ +@Since("1.2.0") @Experimental -class CrossValidator(override val uid: String) extends Estimator[CrossValidatorModel] +class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) + extends Estimator[CrossValidatorModel] with CrossValidatorParams with MLWritable with Logging { + @Since("1.2.0") def this() = this(Identifiable.randomUID("cv")) private val f2jBLAS = new F2jBLAS /** @group setParam */ + @Since("1.2.0") def setEstimator(value: Estimator[_]): this.type = set(estimator, value) /** @group setParam */ + @Since("1.2.0") def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) /** @group setParam */ + @Since("1.2.0") def setEvaluator(value: Evaluator): this.type = set(evaluator, value) /** @group setParam */ + @Since("1.2.0") def setNumFolds(value: Int): this.type = set(numFolds, value) + @Since("1.4.0") override def fit(dataset: DataFrame): CrossValidatorModel = { val schema = dataset.schema transformSchema(schema, logging = true) @@ -116,10 +124,12 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this)) } + @Since("1.4.0") override def 
transformSchema(schema: StructType): StructType = { $(estimator).transformSchema(schema) } + @Since("1.4.0") override def validateParams(): Unit = { super.validateParams() val est = $(estimator) @@ -128,6 +138,7 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM } } + @Since("1.4.0") override def copy(extra: ParamMap): CrossValidator = { val copied = defaultCopy(extra).asInstanceOf[CrossValidator]
spark git commit: [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning
Repository: spark Updated Branches: refs/heads/branch-1.6 5d915fed3 -> 911259e9a [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning cc mengxr noel-smith I worked on this issue based on https://github.com/apache/spark/pull/8729. ehsanmok thank you for your contribution! Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Author: Ehsan M.Kermani <ehsanmo1...@gmail.com> Closes #9338 from yu-iskw/JIRA-10266. (cherry picked from commit de07d06abecf3516c95d099b6c01a86e0c8cfd8c) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/911259e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/911259e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/911259e9 Branch: refs/heads/branch-1.6 Commit: 911259e9af6f9a81e775b1aa6d82fa44956bf993 Parents: 5d915fe Author: Yu ISHIKAWA <yuu.ishik...@gmail.com> Authored: Wed Dec 2 14:15:54 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 2 14:16:05 2015 -0800 -- .../apache/spark/ml/tuning/CrossValidator.scala | 34 ++-- .../spark/ml/tuning/ParamGridBuilder.scala | 14 ++-- .../spark/ml/tuning/TrainValidationSplit.scala | 26 --- 3 files changed, 58 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/911259e9/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 83a9048..5c09f1a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -19,18 +19,18 @@ package org.apache.spark.ml.tuning import com.github.fommil.netlib.F2jBLAS import org.apache.hadoop.fs.Path -import org.json4s.{JObject, DefaultFormats} import org.json4s.jackson.JsonMethods._ +import 
org.json4s.{DefaultFormats, JObject} -import org.apache.spark.ml.classification.OneVsRestParams -import org.apache.spark.ml.feature.RFormulaModel -import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.{Logging, SparkContext} import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.classification.OneVsRestParams import org.apache.spark.ml.evaluation.Evaluator +import org.apache.spark.ml.feature.RFormulaModel import org.apache.spark.ml.param._ -import org.apache.spark.ml.util._ import org.apache.spark.ml.util.DefaultParamsReader.Metadata +import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -58,26 +58,34 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { * :: Experimental :: * K-fold cross validation. */ +@Since("1.2.0") @Experimental -class CrossValidator(override val uid: String) extends Estimator[CrossValidatorModel] +class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) + extends Estimator[CrossValidatorModel] with CrossValidatorParams with MLWritable with Logging { + @Since("1.2.0") def this() = this(Identifiable.randomUID("cv")) private val f2jBLAS = new F2jBLAS /** @group setParam */ + @Since("1.2.0") def setEstimator(value: Estimator[_]): this.type = set(estimator, value) /** @group setParam */ + @Since("1.2.0") def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) /** @group setParam */ + @Since("1.2.0") def setEvaluator(value: Evaluator): this.type = set(evaluator, value) /** @group setParam */ + @Since("1.2.0") def setNumFolds(value: Int): this.type = set(numFolds, value) + @Since("1.4.0") override def fit(dataset: DataFrame): CrossValidatorModel = { val schema = dataset.schema transformSchema(schema, logging = true) @@ -116,10 +124,12 @@ class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorM copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this)) } + @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { $(estimator).transformSchema(schema) } + @Since("1.4.0") override def validateParams(): Unit = { super.validateParams() val est = $(estimator) @@ -128,6 +138,7 @@ class CrossValidator(override val uid: String) extends Estimator[CrossValidatorM } } + @Since("
spark git commit: [SPARK-12000] do not specify arg types when reference a method in ScalaDoc
Repository: spark Updated Branches: refs/heads/branch-1.6 cb142fd1e -> 656d44e20 [SPARK-12000] do not specify arg types when reference a method in ScalaDoc This fixes SPARK-12000, verified on my local machine with JDK 7. It seems that `scaladoc` tries to match method names and gets confused by annotations. cc: JoshRosen jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #10114 from mengxr/SPARK-12000.2. (cherry picked from commit 9bb695b7a82d837e2c7a724514ea6b203efb5364) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/656d44e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/656d44e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/656d44e2 Branch: refs/heads/branch-1.6 Commit: 656d44e2021d2f637d724c1d71ecdca1f447a4be Parents: cb142fd Author: Xiangrui Meng <m...@databricks.com> Authored: Wed Dec 2 17:19:31 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 2 17:19:45 2015 -0800 -- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- .../org/apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/656d44e2/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 29a7aa0..82adfa6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -214,7 +214,7 @@ class BisectingKMeans private ( } /** - * Java-friendly version of [[run(RDD[Vector])*]] + * Java-friendly version of [[run()]]. 
*/ def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd) } http://git-wip-us.apache.org/repos/asf/spark/blob/656d44e2/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 5015f15..f942e56 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -64,7 +64,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } /** - * Java-friendly version of [[predict(RDD[Vector])*]] + * Java-friendly version of [[predict()]]. */ @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = @@ -88,7 +88,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } /** - * Java-friendly version of [[computeCost(RDD[Vector])*]]. + * Java-friendly version of [[computeCost()]]. */ @Since("1.6.0") def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12000] do not specify arg types when reference a method in ScalaDoc
Repository: spark Updated Branches: refs/heads/master d0d7ec533 -> 9bb695b7a [SPARK-12000] do not specify arg types when reference a method in ScalaDoc This fixes SPARK-12000, verified on my local machine with JDK 7. It seems that `scaladoc` tries to match method names and gets confused by annotations. cc: JoshRosen jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #10114 from mengxr/SPARK-12000.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bb695b7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bb695b7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bb695b7 Branch: refs/heads/master Commit: 9bb695b7a82d837e2c7a724514ea6b203efb5364 Parents: d0d7ec5 Author: Xiangrui Meng <m...@databricks.com> Authored: Wed Dec 2 17:19:31 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Dec 2 17:19:31 2015 -0800 -- .../org/apache/spark/mllib/clustering/BisectingKMeans.scala | 2 +- .../org/apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9bb695b7/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 29a7aa0..82adfa6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -214,7 +214,7 @@ class BisectingKMeans private ( } /** - * Java-friendly version of [[run(RDD[Vector])*]] + * Java-friendly version of [[run()]]. 
*/ def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd) } http://git-wip-us.apache.org/repos/asf/spark/blob/9bb695b7/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala index 5015f15..f942e56 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala @@ -64,7 +64,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } /** - * Java-friendly version of [[predict(RDD[Vector])*]] + * Java-friendly version of [[predict()]]. */ @Since("1.6.0") def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = @@ -88,7 +88,7 @@ class BisectingKMeansModel @Since("1.6.0") ( } /** - * Java-friendly version of [[computeCost(RDD[Vector])*]]. + * Java-friendly version of [[computeCost()]]. */ @Since("1.6.0") def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][DOCS] fixed list display in ml-ensembles
Repository: spark Updated Branches: refs/heads/branch-1.6 32911de77 -> 0978ec11c [MINOR][DOCS] fixed list display in ml-ensembles The list in ml-ensembles.md wasn't properly formatted and, as a result, was looking like this: ![old](http://i.imgur.com/2ZhELLR.png) This PR aims to make it look like this: ![new](http://i.imgur.com/0Xriwd2.png) Author: BenFradet <benjamin.fra...@gmail.com> Closes #10025 from BenFradet/ml-ensembles-doc. (cherry picked from commit f2fbfa444f6e8d27953ec2d1c0b3abd603c963f9) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0978ec11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0978ec11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0978ec11 Branch: refs/heads/branch-1.6 Commit: 0978ec11c9a080bd493da2e9d11c81c08e8e6962 Parents: 32911de Author: BenFradet <benjamin.fra...@gmail.com> Authored: Mon Nov 30 13:02:08 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 13:02:19 2015 -0800 -- docs/ml-ensembles.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0978ec11/docs/ml-ensembles.md -- diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index f6c3c30..14fef76 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -20,6 +20,7 @@ Both use [MLlib decision trees](ml-decision-tree.html) as their base models. Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html). In this section, we demonstrate the Pipelines API for ensembles. The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are: + * support for ML Pipelines * separation of classification vs. 
regression * use of DataFrame metadata to distinguish continuous and categorical features - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][DOCS] fixed list display in ml-ensembles
Repository: spark Updated Branches: refs/heads/master 8df584b02 -> f2fbfa444 [MINOR][DOCS] fixed list display in ml-ensembles The list in ml-ensembles.md wasn't properly formatted and, as a result, was looking like this: ![old](http://i.imgur.com/2ZhELLR.png) This PR aims to make it look like this: ![new](http://i.imgur.com/0Xriwd2.png) Author: BenFradet <benjamin.fra...@gmail.com> Closes #10025 from BenFradet/ml-ensembles-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2fbfa44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2fbfa44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2fbfa44 Branch: refs/heads/master Commit: f2fbfa444f6e8d27953ec2d1c0b3abd603c963f9 Parents: 8df584b Author: BenFradet <benjamin.fra...@gmail.com> Authored: Mon Nov 30 13:02:08 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 13:02:08 2015 -0800 -- docs/ml-ensembles.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f2fbfa44/docs/ml-ensembles.md -- diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index f6c3c30..14fef76 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -20,6 +20,7 @@ Both use [MLlib decision trees](ml-decision-tree.html) as their base models. Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html). In this section, we demonstrate the Pipelines API for ensembles. The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are: + * support for ML Pipelines * separation of classification vs. regression * use of DataFrame metadata to distinguish continuous and categorical features - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11960][MLLIB][DOC] User guide for streaming tests
Repository: spark Updated Branches: refs/heads/branch-1.6 a387cef3a -> ebf87ebc0 [SPARK-11960][MLLIB][DOC] User guide for streaming tests CC jkbradley mengxr josepablocam Author: Feynman Liang <feynman.li...@gmail.com> Closes #10005 from feynmanliang/streaming-test-user-guide. (cherry picked from commit 55358889309cf2d856b72e72e0f3081dfdf61cfa) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebf87ebc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebf87ebc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebf87ebc Branch: refs/heads/branch-1.6 Commit: ebf87ebc02075497f4682e3ad0f8e63d33f3b86e Parents: a387cef Author: Feynman Liang <feynman.li...@gmail.com> Authored: Mon Nov 30 15:38:44 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 15:38:51 2015 -0800 -- docs/mllib-guide.md | 1 + docs/mllib-statistics.md| 25 .../examples/mllib/StreamingTestExample.scala | 2 ++ 3 files changed, 28 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 54e35fc..43772ad 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -34,6 +34,7 @@ We list major functionality from both below, with links to detailed guides. 
* [correlations](mllib-statistics.html#correlations) * [stratified sampling](mllib-statistics.html#stratified-sampling) * [hypothesis testing](mllib-statistics.html#hypothesis-testing) + * [streaming significance testing](mllib-statistics.html#streaming-significance-testing) * [random data generation](mllib-statistics.html#random-data-generation) * [Classification and regression](mllib-classification-regression.html) * [linear models (SVMs, logistic regression, linear regression)](mllib-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/docs/mllib-statistics.md -- diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index ade5b07..de209f6 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -521,6 +521,31 @@ print(testResult) # summary of the test including the p-value, test statistic, +### Streaming Significance Testing +MLlib provides online implementations of some tests to support use cases +like A/B testing. These tests may be performed on a Spark Streaming +`DStream[(Boolean,Double)]` where the first element of each tuple +indicates control group (`false`) or treatment group (`true`) and the +second element is the value of an observation. + +Streaming significance testing supports the following parameters: + +* `peacePeriod` - The number of initial data points from the stream to +ignore, used to mitigate novelty effects. +* `windowSize` - The number of past batches to perform hypothesis +testing over. Setting to `0` will perform cumulative processing using +all prior batches. + + + + +[`StreamingTest`](api/scala/index.html#org.apache.spark.mllib.stat.test.StreamingTest) +provides streaming hypothesis testing. 
+ +{% include_example scala/org/apache/spark/examples/mllib/StreamingTestExample.scala %} + + + ## Random data generation http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala index ab29f90..b6677c6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala @@ -64,6 +64,7 @@ object StreamingTestExample { dir.toString }) +// $example on$ val data = ssc.textFileStream(dataDir).map(line => line.split(",") match { case Array(label, value) => (label.toBoolean, value.toDouble) }) @@ -75,6 +76,7 @@ object StreamingTestExample { val out = streamingTest.registerStream(data) out.print() +// $example off$ // Stop processing if test becomes significant or we time out var timeoutCounter = numBatchesTimeout - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml
Repository: spark Updated Branches: refs/heads/branch-1.6 a8c6d8acc -> 1562ef10f [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml jira: https://issues.apache.org/jira/browse/SPARK-11689 Add simple user guide for LDA under spark.ml and example code under examples/. Use include_example to include example code in the user guide markdown. Check SPARK-11606 for instructions. The original PR was reverted due to a documentation build error. https://github.com/apache/spark/pull/9722 mengxr feynmanliang yinxusen Sorry for the trouble. Author: Yuhao Yang <hhb...@gmail.com> Closes #9974 from hhbyyh/ldaMLExample. (cherry picked from commit e232720a65dfb9ae6135cbb7674e35eddd88d625) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1562ef10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1562ef10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1562ef10 Branch: refs/heads/branch-1.6 Commit: 1562ef10f5d1722a6c275726083684e6d0463a4f Parents: a8c6d8a Author: Yuhao Yang <hhb...@gmail.com> Authored: Mon Nov 30 14:56:51 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 14:56:58 2015 -0800 -- docs/ml-clustering.md | 31 +++ docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 + .../spark/examples/ml/JavaLDAExample.java | 97 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 208 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md new file mode 100644 index 000..cfefb5d --- /dev/null +++ b/docs/ml-clustering.md @@ -0,0 +1,31 @@ +--- +layout: global +title: Clustering - ML +displayTitle: ML - Clustering +--- + +In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). 
+ +## Latent Dirichlet allocation (LDA) + +`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, +and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by +`EMLDAOptimizer` to a `DistributedLDAModel` if needed. + + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. + +{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} + + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index be18a05..6f35b30 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -950,4 +951,4 @@ model.transform(test) {% endhighlight %} - + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 91e50cc..54e35fc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples
spark git commit: [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)
Repository: spark Updated Branches: refs/heads/master e232720a6 -> de64b65f7 [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python) Remove duplicate mllib example (DT/RF/GBT in Java/Python). Since we have tutorial code for DT/RF/GBT classification/regression in Scala/Java/Python and example applications for DT/RF/GBT in Scala, we mark these as duplicated and remove them. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9954 from yanboliang/SPARK-11975. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de64b65f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de64b65f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de64b65f Branch: refs/heads/master Commit: de64b65f7cf2ac58c1abc310ba547637fdbb8557 Parents: e232720 Author: Yanbo Liang <yblia...@gmail.com> Authored: Mon Nov 30 15:01:08 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 15:01:08 2015 -0800 -- .../spark/examples/mllib/JavaDecisionTree.java | 116 --- .../mllib/JavaGradientBoostedTreesRunner.java | 126 .../examples/mllib/JavaRandomForestExample.java | 139 -- .../main/python/mllib/decision_tree_runner.py | 144 --- .../main/python/mllib/gradient_boosted_trees.py | 77 -- .../main/python/mllib/random_forest_example.py | 90 6 files changed, 692 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de64b65f/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java deleted file mode 100644 index 1f82e3f..000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import java.util.HashMap; - -import scala.Tuple2; - -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.DecisionTree; -import org.apache.spark.mllib.tree.model.DecisionTreeModel; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; - -/** - * Classification and regression using decision trees. - */ -public final class JavaDecisionTree { - - public static void main(String[] args) { -String datapath = "data/mllib/sample_libsvm_data.txt"; -if (args.length == 1) { - datapath = args[0]; -} else if (args.length > 1) { - System.err.println("Usage: JavaDecisionTree "); - System.exit(1); -} -SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree"); -JavaSparkContext sc = new JavaSparkContext(sparkConf); - -JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache(); - -// Compute the number of classes from the data. 
-Integer numClasses = data.map(new Function<LabeledPoint, Double>() { - @Override public Double call(LabeledPoint p) { -return p.label(); - } -}).countByValue().size(); - -// Set parameters. -// Empty categoricalFeaturesInfo indicates all features are continuous. -HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>(); -String impurity = "gini"; -Integer maxDepth = 5; -Integer maxBins = 32; - -// Train a DecisionTree model for classification. -final DecisionTreeModel model = Decis
spark git commit: [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)
Repository: spark Updated Branches: refs/heads/branch-1.6 1562ef10f -> a387cef3a [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python) Remove duplicate mllib example (DT/RF/GBT in Java/Python). Since we have tutorial code for DT/RF/GBT classification/regression in Scala/Java/Python and example applications for DT/RF/GBT in Scala, we mark these as duplicated and remove them. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9954 from yanboliang/SPARK-11975. (cherry picked from commit de64b65f7cf2ac58c1abc310ba547637fdbb8557) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a387cef3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a387cef3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a387cef3 Branch: refs/heads/branch-1.6 Commit: a387cef3a40d47a8ca7fa9c6aa2842318700df49 Parents: 1562ef1 Author: Yanbo Liang <yblia...@gmail.com> Authored: Mon Nov 30 15:01:08 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 15:01:16 2015 -0800 -- .../spark/examples/mllib/JavaDecisionTree.java | 116 --- .../mllib/JavaGradientBoostedTreesRunner.java | 126 .../examples/mllib/JavaRandomForestExample.java | 139 -- .../main/python/mllib/decision_tree_runner.py | 144 --- .../main/python/mllib/gradient_boosted_trees.py | 77 -- .../main/python/mllib/random_forest_example.py | 90 6 files changed, 692 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a387cef3/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java deleted file mode 100644 index 1f82e3f..000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import java.util.HashMap; - -import scala.Tuple2; - -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.DecisionTree; -import org.apache.spark.mllib.tree.model.DecisionTreeModel; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; - -/** - * Classification and regression using decision trees. 
- */ -public final class JavaDecisionTree { - - public static void main(String[] args) { -String datapath = "data/mllib/sample_libsvm_data.txt"; -if (args.length == 1) { - datapath = args[0]; -} else if (args.length > 1) { - System.err.println("Usage: JavaDecisionTree "); - System.exit(1); -} -SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree"); -JavaSparkContext sc = new JavaSparkContext(sparkConf); - -JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache(); - -// Compute the number of classes from the data. -Integer numClasses = data.map(new Function<LabeledPoint, Double>() { - @Override public Double call(LabeledPoint p) { -return p.label(); - } -}).countByValue().size(); - -// Set parameters. -// Empty categoricalFeaturesInfo indicates all features are continuous. -HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>(); -String impurity = "gini"; -Integer maxDepth = 5; -
spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml
Repository: spark Updated Branches: refs/heads/master a8ceec5e8 -> e232720a6 [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml jira: https://issues.apache.org/jira/browse/SPARK-11689 Add simple user guide for LDA under spark.ml and example code under examples/. Use include_example to include example code in the user guide markdown. Check SPARK-11606 for instructions. The original PR was reverted due to a documentation build error. https://github.com/apache/spark/pull/9722 mengxr feynmanliang yinxusen Sorry for the trouble. Author: Yuhao Yang <hhb...@gmail.com> Closes #9974 from hhbyyh/ldaMLExample. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e232720a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e232720a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e232720a Branch: refs/heads/master Commit: e232720a65dfb9ae6135cbb7674e35eddd88d625 Parents: a8ceec5 Author: Yuhao Yang <hhb...@gmail.com> Authored: Mon Nov 30 14:56:51 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Nov 30 14:56:51 2015 -0800 -- docs/ml-clustering.md | 31 +++ docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 + .../spark/examples/ml/JavaLDAExample.java | 97 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 208 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md new file mode 100644 index 000..cfefb5d --- /dev/null +++ b/docs/ml-clustering.md @@ -0,0 +1,31 @@ +--- +layout: global +title: Clustering - ML +displayTitle: ML - Clustering +--- + +In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). + +## Latent Dirichlet allocation (LDA) + +`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, +and generates a `LDAModel` as the base models. 
Expert users may cast a `LDAModel` generated by +`EMLDAOptimizer` to a `DistributedLDAModel` if needed. + + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. + +{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} + + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index be18a05..6f35b30 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -950,4 +951,4 @@ model.transform(test) {% endhighlight %} - + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 91e50cc..54e35fc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java new file mode 100644 index 000..3a5d323 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/m
spark git commit: [SPARK-11952][ML] Remove duplicate ml examples
Repository: spark Updated Branches: refs/heads/master e5aaae6e1 -> 56a0aba0a [SPARK-11952][ML] Remove duplicate ml examples Remove duplicate ml examples (only for ml). mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9933 from yanboliang/SPARK-11685. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56a0aba0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56a0aba0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56a0aba0 Branch: refs/heads/master Commit: 56a0aba0a60326ba026056c9a23f3f6ec7258c19 Parents: e5aaae6 Author: Yanbo Liang <yblia...@gmail.com> Authored: Tue Nov 24 09:52:53 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:52:53 2015 -0800 -- .../main/python/ml/gradient_boosted_trees.py| 82 -- .../src/main/python/ml/logistic_regression.py | 66 --- .../src/main/python/ml/random_forest_example.py | 87 3 files changed, 235 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/56a0aba0/examples/src/main/python/ml/gradient_boosted_trees.py -- diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py deleted file mode 100644 index c3bf8aa..000 --- a/examples/src/main/python/ml/gradient_boosted_trees.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys - -from pyspark import SparkContext -from pyspark.ml.classification import GBTClassifier -from pyspark.ml.feature import StringIndexer -from pyspark.ml.regression import GBTRegressor -from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics -from pyspark.sql import Row, SQLContext - -""" -A simple example demonstrating a Gradient Boosted Trees Classification/Regression Pipeline. -Note: GBTClassifier only supports binary classification currently -Run with: - bin/spark-submit examples/src/main/python/ml/gradient_boosted_trees.py -""" - - -def testClassification(train, test): -# Train a GradientBoostedTrees model. - -rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel") - -model = rf.fit(train) -predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \ -.map(lambda x: (x.prediction, x.indexedLabel)) - -metrics = BinaryClassificationMetrics(predictionAndLabels) -print("AUC %.3f" % metrics.areaUnderROC) - - -def testRegression(train, test): -# Train a GradientBoostedTrees model. 
- -rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel") - -model = rf.fit(train) -predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \ -.map(lambda x: (x.prediction, x.indexedLabel)) - -metrics = RegressionMetrics(predictionAndLabels) -print("rmse %.3f" % metrics.rootMeanSquaredError) -print("r2 %.3f" % metrics.r2) -print("mae %.3f" % metrics.meanAbsoluteError) - - -if __name__ == "__main__": -if len(sys.argv) > 1: -print("Usage: gradient_boosted_trees", file=sys.stderr) -exit(1) -sc = SparkContext(appName="PythonGBTExample") -sqlContext = SQLContext(sc) - -# Load the data stored in LIBSVM format as a DataFrame. -df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -# Map labels into an indexed column of labels in [0, numLabels) -stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") -si_model = stringIndexer.fit(df) -td = si_model.transform(df) -[train, test] = td.randomSplit([0.7, 0.3]) -testClassification(train, test) -testRegression(train,
spark git commit: [SPARK-11952][ML] Remove duplicate ml examples
Repository: spark Updated Branches: refs/heads/branch-1.6 3cb1b6d39 -> 6914b7504 [SPARK-11952][ML] Remove duplicate ml examples Remove duplicate ml examples (only for ml). mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9933 from yanboliang/SPARK-11685. (cherry picked from commit 56a0aba0a60326ba026056c9a23f3f6ec7258c19) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6914b750 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6914b750 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6914b750 Branch: refs/heads/branch-1.6 Commit: 6914b75046dceda47ba3ff904e67f55752e8d49d Parents: 3cb1b6d Author: Yanbo Liang <yblia...@gmail.com> Authored: Tue Nov 24 09:52:53 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:53:01 2015 -0800 -- .../main/python/ml/gradient_boosted_trees.py| 82 -- .../src/main/python/ml/logistic_regression.py | 66 --- .../src/main/python/ml/random_forest_example.py | 87 3 files changed, 235 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6914b750/examples/src/main/python/ml/gradient_boosted_trees.py -- diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py deleted file mode 100644 index c3bf8aa..000 --- a/examples/src/main/python/ml/gradient_boosted_trees.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys - -from pyspark import SparkContext -from pyspark.ml.classification import GBTClassifier -from pyspark.ml.feature import StringIndexer -from pyspark.ml.regression import GBTRegressor -from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics -from pyspark.sql import Row, SQLContext - -""" -A simple example demonstrating a Gradient Boosted Trees Classification/Regression Pipeline. -Note: GBTClassifier only supports binary classification currently -Run with: - bin/spark-submit examples/src/main/python/ml/gradient_boosted_trees.py -""" - - -def testClassification(train, test): -# Train a GradientBoostedTrees model. - -rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel") - -model = rf.fit(train) -predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \ -.map(lambda x: (x.prediction, x.indexedLabel)) - -metrics = BinaryClassificationMetrics(predictionAndLabels) -print("AUC %.3f" % metrics.areaUnderROC) - - -def testRegression(train, test): -# Train a GradientBoostedTrees model. 
- -rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel") - -model = rf.fit(train) -predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \ -.map(lambda x: (x.prediction, x.indexedLabel)) - -metrics = RegressionMetrics(predictionAndLabels) -print("rmse %.3f" % metrics.rootMeanSquaredError) -print("r2 %.3f" % metrics.r2) -print("mae %.3f" % metrics.meanAbsoluteError) - - -if __name__ == "__main__": -if len(sys.argv) > 1: -print("Usage: gradient_boosted_trees", file=sys.stderr) -exit(1) -sc = SparkContext(appName="PythonGBTExample") -sqlContext = SQLContext(sc) - -# Load the data stored in LIBSVM format as a DataFrame. -df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -# Map labels into an indexed column of labels in [0, numLabels) -stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") -si_model = stringIndexer.fit(df) -td = si_model.transfo
spark git commit: [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col
Repository: spark Updated Branches: refs/heads/master 56a0aba0a -> 9e24ba667 [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col Doc for 1.6 that the summaries mostly ignore the weight column. To be corrected for 1.7 CC: mengxr thunterdb Author: Joseph K. Bradley <jos...@databricks.com> Closes #9927 from jkbradley/linregsummary-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e24ba66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e24ba66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e24ba66 Branch: refs/heads/master Commit: 9e24ba667e43290fbaa3cacb93cf5d9be790f1fd Parents: 56a0aba Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Nov 24 09:54:55 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:54:55 2015 -0800 -- .../ml/classification/LogisticRegression.scala| 18 ++ .../spark/ml/regression/LinearRegression.scala| 15 +++ 2 files changed, 33 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e24ba66/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 418bbdc..d320d64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -755,23 +755,35 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns the receiver operating characteristic (ROC) curve, * which is an Dataframe having two fields (FPR, TPR) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. 
* @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") /** * Computes the area under the receiver operating characteristic (ROC) curve. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() /** * Returns the precision-recall curve, which is an Dataframe containing * two fields recall, precision with (0.0, 1.0) prepended to it. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") /** * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val fMeasureByThreshold: DataFrame = { binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure") @@ -781,6 +793,9 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns a dataframe with two fields (threshold, precision) curve. * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the precision. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val precisionByThreshold: DataFrame = { binaryMetrics.precisionByThreshold().toDF("threshold", "precision") @@ -790,6 +805,9 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns a dataframe with two fields (threshold, recall) curve. 
* Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the recall. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val recallByThreshold: DataFrame = { binaryMetrics.recallByThreshold().toDF("threshold", "recall") http://git-wip-us.apache.org/repos/asf/spark/blob/9e24ba66/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -
spark git commit: [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col
Repository: spark Updated Branches: refs/heads/branch-1.6 6914b7504 -> 70febe224 [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col Doc for 1.6 that the summaries mostly ignore the weight column. To be corrected for 1.7 CC: mengxr thunterdb Author: Joseph K. Bradley <jos...@databricks.com> Closes #9927 from jkbradley/linregsummary-doc. (cherry picked from commit 9e24ba667e43290fbaa3cacb93cf5d9be790f1fd) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70febe22 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70febe22 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70febe22 Branch: refs/heads/branch-1.6 Commit: 70febe224f64cb6468c14d4788a63b35d0475d41 Parents: 6914b75 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Nov 24 09:54:55 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:55:02 2015 -0800 -- .../ml/classification/LogisticRegression.scala| 18 ++ .../spark/ml/regression/LinearRegression.scala| 15 +++ 2 files changed, 33 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70febe22/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 418bbdc..d320d64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -755,23 +755,35 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns the receiver operating characteristic (ROC) curve, * which is an Dataframe having two fields (FPR, TPR) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. 
+ * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") /** * Computes the area under the receiver operating characteristic (ROC) curve. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() /** * Returns the precision-recall curve, which is an Dataframe containing * two fields recall, precision with (0.0, 1.0) prepended to it. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") /** * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val fMeasureByThreshold: DataFrame = { binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure") @@ -781,6 +793,9 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns a dataframe with two fields (threshold, precision) curve. * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the precision. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. 
*/ @transient lazy val precisionByThreshold: DataFrame = { binaryMetrics.precisionByThreshold().toDF("threshold", "precision") @@ -790,6 +805,9 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns a dataframe with two fields (threshold, recall) curve. * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the recall. + * + * Note: This ignores instance weights (setting all to 1.0) from [[LogisticRegression.weightCol]]. + * This will change in later Spark versions. */ @transient lazy val recallByThreshold: DataFrame = { binaryMetrics.recallByThreshold().toDF("threshold", &quo
spark git commit: [SPARK-11847][ML] Model export/import for spark.ml: LDA
Repository: spark Updated Branches: refs/heads/branch-1.6 70febe224 -> af86c38db [SPARK-11847][ML] Model export/import for spark.ml: LDA Add read/write support to LDA, similar to ALS. save/load for ml.LocalLDAModel is done. For DistributedLDAModel, I'm not sure if we can invoke save on the mllib.DistributedLDAModel directly. I'll send update after some test. Author: Yuhao Yang <hhb...@gmail.com> Closes #9894 from hhbyyh/ldaMLsave. (cherry picked from commit 52bc25c8e26d4be250d8ff7864067528f4f98592) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af86c38d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af86c38d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af86c38d Branch: refs/heads/branch-1.6 Commit: af86c38db7676c4dfc2724d5f86f0f5f3a22e349 Parents: 70febe2 Author: Yuhao Yang <hhb...@gmail.com> Authored: Tue Nov 24 09:56:17 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:56:24 2015 -0800 -- .../org/apache/spark/ml/clustering/LDA.scala| 110 ++- .../spark/mllib/clustering/LDAModel.scala | 4 +- .../apache/spark/ml/clustering/LDASuite.scala | 44 +++- 3 files changed, 150 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af86c38d/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 92e0581..830510b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -17,12 +17,13 @@ package org.apache.spark.ml.clustering +import org.apache.hadoop.fs.Path import org.apache.spark.Logging import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} import 
org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param.shared.{HasCheckpointInterval, HasFeaturesCol, HasSeed, HasMaxIter} import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ import org.apache.spark.mllib.clustering.{DistributedLDAModel => OldDistributedLDAModel, EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => OldLDAModel, LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel, @@ -322,7 +323,7 @@ sealed abstract class LDAModel private[ml] ( @Since("1.6.0") override val uid: String, @Since("1.6.0") val vocabSize: Int, @Since("1.6.0") @transient protected val sqlContext: SQLContext) - extends Model[LDAModel] with LDAParams with Logging { + extends Model[LDAModel] with LDAParams with Logging with MLWritable { // NOTE to developers: // This abstraction should contain all important functionality for basic LDA usage. @@ -486,6 +487,64 @@ class LocalLDAModel private[ml] ( @Since("1.6.0") override def isDistributed: Boolean = false + + @Since("1.6.0") + override def write: MLWriter = new LocalLDAModel.LocalLDAModelWriter(this) +} + + +@Since("1.6.0") +object LocalLDAModel extends MLReadable[LocalLDAModel] { + + private[LocalLDAModel] + class LocalLDAModelWriter(instance: LocalLDAModel) extends MLWriter { + +private case class Data( +vocabSize: Int, +topicsMatrix: Matrix, +docConcentration: Vector, +topicConcentration: Double, +gammaShape: Double) + +override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val oldModel = instance.oldLocalModel + val data = Data(instance.vocabSize, oldModel.topicsMatrix, oldModel.docConcentration, +oldModel.topicConcentration, oldModel.gammaShape) + val dataPath = new Path(path, "data").toString + sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) +} + } + + private class LocalLDAModelReader extends MLReader[LocalLDAModel] { + +private val className = classOf[LocalLDAModel].getName + +override def 
load(path: String): LocalLDAModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val dataPath = new Path(path, "data").toString + val data = sqlContext.read.parquet(dataPath) +.select("vocabSize", "topicsMatrix", "docConcentration", "topicConcentration", + "gammaShape") +.head() + val vocabSize = data.getAs[Int](0) + val topicsMatrix = data.getA
spark git commit: [SPARK-11847][ML] Model export/import for spark.ml: LDA
Repository: spark Updated Branches: refs/heads/master 9e24ba667 -> 52bc25c8e [SPARK-11847][ML] Model export/import for spark.ml: LDA Add read/write support to LDA, similar to ALS. save/load for ml.LocalLDAModel is done. For DistributedLDAModel, I'm not sure if we can invoke save on the mllib.DistributedLDAModel directly. I'll send update after some test. Author: Yuhao Yang <hhb...@gmail.com> Closes #9894 from hhbyyh/ldaMLsave. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52bc25c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52bc25c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52bc25c8 Branch: refs/heads/master Commit: 52bc25c8e26d4be250d8ff7864067528f4f98592 Parents: 9e24ba6 Author: Yuhao Yang <hhb...@gmail.com> Authored: Tue Nov 24 09:56:17 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 24 09:56:17 2015 -0800 -- .../org/apache/spark/ml/clustering/LDA.scala| 110 ++- .../spark/mllib/clustering/LDAModel.scala | 4 +- .../apache/spark/ml/clustering/LDASuite.scala | 44 +++- 3 files changed, 150 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/52bc25c8/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 92e0581..830510b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -17,12 +17,13 @@ package org.apache.spark.ml.clustering +import org.apache.hadoop.fs.Path import org.apache.spark.Logging import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param.shared.{HasCheckpointInterval, HasFeaturesCol, HasSeed, HasMaxIter} import 
org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ import org.apache.spark.mllib.clustering.{DistributedLDAModel => OldDistributedLDAModel, EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => OldLDAModel, LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel, @@ -322,7 +323,7 @@ sealed abstract class LDAModel private[ml] ( @Since("1.6.0") override val uid: String, @Since("1.6.0") val vocabSize: Int, @Since("1.6.0") @transient protected val sqlContext: SQLContext) - extends Model[LDAModel] with LDAParams with Logging { + extends Model[LDAModel] with LDAParams with Logging with MLWritable { // NOTE to developers: // This abstraction should contain all important functionality for basic LDA usage. @@ -486,6 +487,64 @@ class LocalLDAModel private[ml] ( @Since("1.6.0") override def isDistributed: Boolean = false + + @Since("1.6.0") + override def write: MLWriter = new LocalLDAModel.LocalLDAModelWriter(this) +} + + +@Since("1.6.0") +object LocalLDAModel extends MLReadable[LocalLDAModel] { + + private[LocalLDAModel] + class LocalLDAModelWriter(instance: LocalLDAModel) extends MLWriter { + +private case class Data( +vocabSize: Int, +topicsMatrix: Matrix, +docConcentration: Vector, +topicConcentration: Double, +gammaShape: Double) + +override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val oldModel = instance.oldLocalModel + val data = Data(instance.vocabSize, oldModel.topicsMatrix, oldModel.docConcentration, +oldModel.topicConcentration, oldModel.gammaShape) + val dataPath = new Path(path, "data").toString + sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) +} + } + + private class LocalLDAModelReader extends MLReader[LocalLDAModel] { + +private val className = classOf[LocalLDAModel].getName + +override def load(path: String): LocalLDAModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val dataPath = new Path(path, 
"data").toString + val data = sqlContext.read.parquet(dataPath) +.select("vocabSize", "topicsMatrix", "docConcentration", "topicConcentration", + "gammaShape") +.head() + val vocabSize = data.getAs[Int](0) + val topicsMatrix = data.getAs[Matrix](1) + val docConcentration = data.getAs[Vector](2) + val topicConcentration = data.getAs[Double](3) + val gamma
spark git commit: [SPARK-11895][ML] rename and refactor DatasetExample under mllib/examples
Repository: spark Updated Branches: refs/heads/branch-1.6 fc4b88f3b -> a36d9bc75 [SPARK-11895][ML] rename and refactor DatasetExample under mllib/examples We used the name `Dataset` to refer to `SchemaRDD` in 1.2 in ML pipelines and created this example file. Since `Dataset` has a new meaning in Spark 1.6, we should rename it to avoid confusion. This PR also removes support for dense format to simplify the example code. cc: yinxusen Author: Xiangrui Meng <m...@databricks.com> Closes #9873 from mengxr/SPARK-11895. (cherry picked from commit fe89c1817d668e46adf70d0896c42c22a547c76a) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a36d9bc7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a36d9bc7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a36d9bc7 Branch: refs/heads/branch-1.6 Commit: a36d9bc7528ab8e6fe5e002f9b9b0a51a5b93568 Parents: fc4b88f Author: Xiangrui Meng <m...@databricks.com> Authored: Sun Nov 22 21:45:46 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:45:53 2015 -0800 -- .../spark/examples/ml/DataFrameExample.scala| 104 .../spark/examples/mllib/DatasetExample.scala | 123 --- 2 files changed, 104 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a36d9bc7/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala new file mode 100644 index 000..424f001 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +import java.io.File + +import com.google.common.io.Files +import scopt.OptionParser + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.examples.mllib.AbstractParams +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +/** + * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with + * {{{ + * ./bin/run-example ml.DataFrameExample [options] + * }}} + * If you use it as a template to create your own app, please use `spark-submit` to submit your app. 
+ */ +object DataFrameExample { + + case class Params(input: String = "data/mllib/sample_libsvm_data.txt") +extends AbstractParams[Params] + + def main(args: Array[String]) { +val defaultParams = Params() + +val parser = new OptionParser[Params]("DatasetExample") { + head("Dataset: an example app using DataFrame as a Dataset for ML.") + opt[String]("input") +.text(s"input path to dataset") +.action((x, c) => c.copy(input = x)) + checkConfig { params => +success + } +} + +parser.parse(args, defaultParams).map { params => + run(params) +}.getOrElse { + sys.exit(1) +} + } + + def run(params: Params) { + +val conf = new SparkConf().setAppName(s"DataFrameExample with $params") +val sc = new SparkContext(conf) +val sqlContext = new SQLContext(sc) + +// Load input data +println(s"Loading LIBSVM file with UDT from ${params.input}.") +val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache() +println("Schema from LIBSVM:") +df.printSchema() +println(s"Loaded training data as a DataFrame with ${df.count()} records.") + +// Show statistical summary of labels. +val labelSummary = df.describe("label"
spark git commit: [SPARK-11902][ML] Unhandled case in VectorAssembler#transform
Repository: spark Updated Branches: refs/heads/master d9cf9c21f -> 4be360d4e [SPARK-11902][ML] Unhandled case in VectorAssembler#transform There is an unhandled case in the transform method of VectorAssembler if one of the input columns doesn't have one of the supported types: DoubleType, NumericType, BooleanType or VectorUDT. So, if you try to transform a column of StringType you get a cryptic "scala.MatchError: StringType". This PR aims to fix this, throwing a SparkException when dealing with an unknown column type. Author: BenFradet <benjamin.fra...@gmail.com> Closes #9885 from BenFradet/SPARK-11902. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4be360d4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4be360d4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4be360d4 Branch: refs/heads/master Commit: 4be360d4ee6cdb4d06306feca38ddef5212608cf Parents: d9cf9c2 Author: BenFradet <benjamin.fra...@gmail.com> Authored: Sun Nov 22 22:05:01 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 22:05:01 2015 -0800 -- .../org/apache/spark/ml/feature/VectorAssembler.scala| 2 ++ .../apache/spark/ml/feature/VectorAssemblerSuite.scala | 11 +++ 2 files changed, 13 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4be360d4/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 0feec05..801096f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -84,6 +84,8 @@ class VectorAssembler(override val uid: String) val numAttrs = group.numAttributes.getOrElse(first.getAs[Vector](index).size) Array.fill(numAttrs)(NumericAttribute.defaultAttr) } +case otherType => + throw
new SparkException(s"VectorAssembler does not support the $otherType type") } } val metadata = new AttributeGroup($(outputCol), attrs).toMetadata() http://git-wip-us.apache.org/repos/asf/spark/blob/4be360d4/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index fb21ab6..9c1c00f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -69,6 +69,17 @@ class VectorAssemblerSuite } } + test("transform should throw an exception in case of unsupported type") { +val df = sqlContext.createDataFrame(Seq(("a", "b", "c"))).toDF("a", "b", "c") +val assembler = new VectorAssembler() + .setInputCols(Array("a", "b", "c")) + .setOutputCol("features") +val thrown = intercept[SparkException] { + assembler.transform(df) +} +assert(thrown.getMessage contains "VectorAssembler does not support the StringType type") + } + test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11835] Adds a sidebar menu to MLlib's documentation
Repository: spark Updated Branches: refs/heads/branch-1.6 835b5488f -> 7f9d3358a [SPARK-11835] Adds a sidebar menu to MLlib's documentation This PR adds a sidebar menu when browsing the user guide of MLlib. It uses a YAML file to describe the structure of the documentation. It should be trivial to adapt this to the other projects. ![screen shot 2015-11-18 at 4 46 12 pm](https://cloud.githubusercontent.com/assets/7594753/11259591/a55173f4-8e17-11e5-9340-0aed79d66262.png) Author: Timothy Hunter <timhun...@databricks.com> Closes #9826 from thunterdb/spark-11835. (cherry picked from commit fc4b792d287095d70379a51f117c225d8d857078) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f9d3358 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f9d3358 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f9d3358 Branch: refs/heads/branch-1.6 Commit: 7f9d3358afd7e266c79e9989e4d874cd1183f474 Parents: 835b548 Author: Timothy Hunter <timhun...@databricks.com> Authored: Sun Nov 22 21:51:42 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:51:51 2015 -0800 -- docs/_data/menu-ml.yaml | 10 docs/_data/menu-mllib.yaml | 75 docs/_includes/nav-left-wrapper-ml.html | 8 +++ docs/_includes/nav-left.html| 17 +++ docs/_layouts/global.html | 24 ++--- docs/css/main.css | 37 ++ 6 files changed, 163 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f9d3358/docs/_data/menu-ml.yaml -- diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml new file mode 100644 index 000..dff3d33 --- /dev/null +++ b/docs/_data/menu-ml.yaml @@ -0,0 +1,10 @@ +- text: Feature extraction, transformation, and selection + url: ml-features.html +- text: Decision trees for classification and regression + url: ml-decision-tree.html +- text: Ensembles + url: ml-ensembles.html +- text: Linear methods with elastic-net 
regularization + url: ml-linear-methods.html +- text: Multilayer perceptron classifier + url: ml-ann.html http://git-wip-us.apache.org/repos/asf/spark/blob/7f9d3358/docs/_data/menu-mllib.yaml -- diff --git a/docs/_data/menu-mllib.yaml b/docs/_data/menu-mllib.yaml new file mode 100644 index 000..12d22ab --- /dev/null +++ b/docs/_data/menu-mllib.yaml @@ -0,0 +1,75 @@ +- text: Data types + url: mllib-data-types.html +- text: Basic statistics + url: mllib-statistics.html + subitems: +- text: Summary statistics + url: mllib-statistics.html#summary-statistics +- text: Correlations + url: mllib-statistics.html#correlations +- text: Stratified sampling + url: mllib-statistics.html#stratified-sampling +- text: Hypothesis testing + url: mllib-statistics.html#hypothesis-testing +- text: Random data generation + url: mllib-statistics.html#random-data-generation +- text: Classification and regression + url: mllib-classification-regression.html + subitems: +- text: Linear models (SVMs, logistic regression, linear regression) + url: mllib-linear-methods.html +- text: Naive Bayes + url: mllib-naive-bayes.html +- text: decision trees + url: mllib-decision-tree.html +- text: ensembles of trees (Random Forests and Gradient-Boosted Trees) + url: mllib-ensembles.html +- text: isotonic regression + url: mllib-isotonic-regression.html +- text: Collaborative filtering + url: mllib-collaborative-filtering.html + subitems: +- text: alternating least squares (ALS) + url: mllib-collaborative-filtering.html#collaborative-filtering +- text: Clustering + url: mllib-clustering.html + subitems: +- text: k-means + url: mllib-clustering.html#k-means +- text: Gaussian mixture + url: mllib-clustering.html#gaussian-mixture +- text: power iteration clustering (PIC) + url: mllib-clustering.html#power-iteration-clustering-pic +- text: latent Dirichlet allocation (LDA) + url: mllib-clustering.html#latent-dirichlet-allocation-lda +- text: streaming k-means + url: mllib-clustering.html#streaming-k-means +- 
text: Dimensionality reduction + url: mllib-dimensionality-reduction.html + subitems: +- text: singular value decomposition (SVD) + url: mllib-dimensionality-reduction.html#singular-value-decomposition-svd +- text: principal component analysis (PCA) + url: mllib-dimensionality-reduction.html#principal-component-analysis-pca +- text: Feature extraction and t
spark git commit: [SPARK-11835] Adds a sidebar menu to MLlib's documentation
Repository: spark Updated Branches: refs/heads/master a6fda0bfc -> fc4b792d2 [SPARK-11835] Adds a sidebar menu to MLlib's documentation This PR adds a sidebar menu when browsing the user guide of MLlib. It uses a YAML file to describe the structure of the documentation. It should be trivial to adapt this to the other projects. ![screen shot 2015-11-18 at 4 46 12 pm](https://cloud.githubusercontent.com/assets/7594753/11259591/a55173f4-8e17-11e5-9340-0aed79d66262.png) Author: Timothy Hunter <timhun...@databricks.com> Closes #9826 from thunterdb/spark-11835. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc4b792d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc4b792d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc4b792d Branch: refs/heads/master Commit: fc4b792d287095d70379a51f117c225d8d857078 Parents: a6fda0b Author: Timothy Hunter <timhun...@databricks.com> Authored: Sun Nov 22 21:51:42 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:51:42 2015 -0800 -- docs/_data/menu-ml.yaml | 10 docs/_data/menu-mllib.yaml | 75 docs/_includes/nav-left-wrapper-ml.html | 8 +++ docs/_includes/nav-left.html| 17 +++ docs/_layouts/global.html | 24 ++--- docs/css/main.css | 37 ++ 6 files changed, 163 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc4b792d/docs/_data/menu-ml.yaml -- diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml new file mode 100644 index 000..dff3d33 --- /dev/null +++ b/docs/_data/menu-ml.yaml @@ -0,0 +1,10 @@ +- text: Feature extraction, transformation, and selection + url: ml-features.html +- text: Decision trees for classification and regression + url: ml-decision-tree.html +- text: Ensembles + url: ml-ensembles.html +- text: Linear methods with elastic-net regularization + url: ml-linear-methods.html +- text: Multilayer perceptron classifier + url: ml-ann.html 
http://git-wip-us.apache.org/repos/asf/spark/blob/fc4b792d/docs/_data/menu-mllib.yaml -- diff --git a/docs/_data/menu-mllib.yaml b/docs/_data/menu-mllib.yaml new file mode 100644 index 000..12d22ab --- /dev/null +++ b/docs/_data/menu-mllib.yaml @@ -0,0 +1,75 @@ +- text: Data types + url: mllib-data-types.html +- text: Basic statistics + url: mllib-statistics.html + subitems: +- text: Summary statistics + url: mllib-statistics.html#summary-statistics +- text: Correlations + url: mllib-statistics.html#correlations +- text: Stratified sampling + url: mllib-statistics.html#stratified-sampling +- text: Hypothesis testing + url: mllib-statistics.html#hypothesis-testing +- text: Random data generation + url: mllib-statistics.html#random-data-generation +- text: Classification and regression + url: mllib-classification-regression.html + subitems: +- text: Linear models (SVMs, logistic regression, linear regression) + url: mllib-linear-methods.html +- text: Naive Bayes + url: mllib-naive-bayes.html +- text: decision trees + url: mllib-decision-tree.html +- text: ensembles of trees (Random Forests and Gradient-Boosted Trees) + url: mllib-ensembles.html +- text: isotonic regression + url: mllib-isotonic-regression.html +- text: Collaborative filtering + url: mllib-collaborative-filtering.html + subitems: +- text: alternating least squares (ALS) + url: mllib-collaborative-filtering.html#collaborative-filtering +- text: Clustering + url: mllib-clustering.html + subitems: +- text: k-means + url: mllib-clustering.html#k-means +- text: Gaussian mixture + url: mllib-clustering.html#gaussian-mixture +- text: power iteration clustering (PIC) + url: mllib-clustering.html#power-iteration-clustering-pic +- text: latent Dirichlet allocation (LDA) + url: mllib-clustering.html#latent-dirichlet-allocation-lda +- text: streaming k-means + url: mllib-clustering.html#streaming-k-means +- text: Dimensionality reduction + url: mllib-dimensionality-reduction.html + subitems: +- text: singular 
value decomposition (SVD) + url: mllib-dimensionality-reduction.html#singular-value-decomposition-svd +- text: principal component analysis (PCA) + url: mllib-dimensionality-reduction.html#principal-component-analysis-pca +- text: Feature extraction and transformation + url: mllib-feature-extraction.html +- text: Frequent pattern mining + url: mllib-frequent-pattern-mining.html + subitems: +
spark git commit: [SPARK-11912][ML] ml.feature.PCA minor refactor
Repository: spark Updated Branches: refs/heads/master fc4b792d2 -> d9cf9c21f [SPARK-11912][ML] ml.feature.PCA minor refactor Like [SPARK-11852](https://issues.apache.org/jira/browse/SPARK-11852), ```k``` is a param and we should save it under ```metadata/``` rather than under both ```data/``` and ```metadata/```. Refactor the constructor of ```ml.feature.PCAModel``` to take only ```pc``` but construct ```mllib.feature.PCAModel``` inside ```transform```. Author: Yanbo Liang <yblia...@gmail.com> Closes #9897 from yanboliang/spark-11912. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9cf9c21 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9cf9c21 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9cf9c21 Branch: refs/heads/master Commit: d9cf9c21fc6b1aa22e68d66760afd42c4e1c18b8 Parents: fc4b792 Author: Yanbo Liang <yblia...@gmail.com> Authored: Sun Nov 22 21:56:07 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:56:07 2015 -0800 -- .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++ .../org/apache/spark/ml/feature/PCASuite.scala | 31 2 files changed, 24 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9cf9c21/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 32d7afe..aa88cb0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -73,7 +73,7 @@ class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v} val pca = new feature.PCA(k = $(k)) val pcaModel = pca.fit(input) -copyValues(new PCAModel(uid, pcaModel).setParent(this)) +copyValues(new PCAModel(uid, pcaModel.pc).setParent(this)) }
override def transformSchema(schema: StructType): StructType = { @@ -99,18 +99,17 @@ object PCA extends DefaultParamsReadable[PCA] { /** * :: Experimental :: * Model fitted by [[PCA]]. + * + * @param pc A principal components Matrix. Each column is one principal component. */ @Experimental class PCAModel private[ml] ( override val uid: String, -pcaModel: feature.PCAModel) +val pc: DenseMatrix) extends Model[PCAModel] with PCAParams with MLWritable { import PCAModel._ - /** a principal components Matrix. Each column is one principal component. */ - val pc: DenseMatrix = pcaModel.pc - /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -124,6 +123,7 @@ class PCAModel private[ml] ( */ override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) +val pcaModel = new feature.PCAModel($(k), pc) val pcaOp = udf { pcaModel.transform _ } dataset.withColumn($(outputCol), pcaOp(col($(inputCol)))) } @@ -139,7 +139,7 @@ class PCAModel private[ml] ( } override def copy(extra: ParamMap): PCAModel = { -val copied = new PCAModel(uid, pcaModel) +val copied = new PCAModel(uid, pc) copyValues(copied, extra).setParent(parent) } @@ -152,11 +152,11 @@ object PCAModel extends MLReadable[PCAModel] { private[PCAModel] class PCAModelWriter(instance: PCAModel) extends MLWriter { -private case class Data(k: Int, pc: DenseMatrix) +private case class Data(pc: DenseMatrix) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = Data(instance.getK, instance.pc) + val data = Data(instance.pc) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -169,11 +169,10 @@ object PCAModel extends MLReadable[PCAModel] { override def load(path: String): PCAModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(k: 
Int, pc: DenseMatrix) = sqlContext.read.parquet(dataPath) -.select("k", "pc") + val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath) +.select("pc") .head() - val oldModel = new feature.PCAModel(k, pc) - val model = new PCAModel(metadata.uid, oldModel) + val model = new PCAModel(metadata.uid, pc) DefaultParamsReader.getAndSetParams(model, metadata) model } http://git-wip-u
spark git commit: [SPARK-11912][ML] ml.feature.PCA minor refactor
Repository: spark Updated Branches: refs/heads/branch-1.6 7f9d3358a -> d482dced3 [SPARK-11912][ML] ml.feature.PCA minor refactor Like [SPARK-11852](https://issues.apache.org/jira/browse/SPARK-11852), `k` is a param, so we should save it only under `metadata/` rather than under both `data/` and `metadata/`. Refactor the constructor of `ml.feature.PCAModel` to take only `pc`, and construct `mllib.feature.PCAModel` inside `transform`. Author: Yanbo Liang <yblia...@gmail.com> Closes #9897 from yanboliang/spark-11912. (cherry picked from commit d9cf9c21fc6b1aa22e68d66760afd42c4e1c18b8) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d482dced Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d482dced Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d482dced Branch: refs/heads/branch-1.6 Commit: d482dced313d1d837508d3f449261419c8543c1d Parents: 7f9d335 Author: Yanbo Liang <yblia...@gmail.com> Authored: Sun Nov 22 21:56:07 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:56:17 2015 -0800 -- .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++ .../org/apache/spark/ml/feature/PCASuite.scala | 31 2 files changed, 24 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d482dced/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 32d7afe..aa88cb0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -73,7 +73,7 @@ class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v} val pca = new feature.PCA(k = $(k)) val pcaModel = 
pca.fit(input) -copyValues(new PCAModel(uid, pcaModel).setParent(this)) +copyValues(new PCAModel(uid, pcaModel.pc).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -99,18 +99,17 @@ object PCA extends DefaultParamsReadable[PCA] { /** * :: Experimental :: * Model fitted by [[PCA]]. + * + * @param pc A principal components Matrix. Each column is one principal component. */ @Experimental class PCAModel private[ml] ( override val uid: String, -pcaModel: feature.PCAModel) +val pc: DenseMatrix) extends Model[PCAModel] with PCAParams with MLWritable { import PCAModel._ - /** a principal components Matrix. Each column is one principal component. */ - val pc: DenseMatrix = pcaModel.pc - /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -124,6 +123,7 @@ class PCAModel private[ml] ( */ override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) +val pcaModel = new feature.PCAModel($(k), pc) val pcaOp = udf { pcaModel.transform _ } dataset.withColumn($(outputCol), pcaOp(col($(inputCol)))) } @@ -139,7 +139,7 @@ class PCAModel private[ml] ( } override def copy(extra: ParamMap): PCAModel = { -val copied = new PCAModel(uid, pcaModel) +val copied = new PCAModel(uid, pc) copyValues(copied, extra).setParent(parent) } @@ -152,11 +152,11 @@ object PCAModel extends MLReadable[PCAModel] { private[PCAModel] class PCAModelWriter(instance: PCAModel) extends MLWriter { -private case class Data(k: Int, pc: DenseMatrix) +private case class Data(pc: DenseMatrix) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = Data(instance.getK, instance.pc) + val data = Data(instance.pc) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -169,11 +169,10 @@ object PCAModel extends MLReadable[PCAModel] { override def load(path: String): PCAModel 
= { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(k: Int, pc: DenseMatrix) = sqlContext.read.parquet(dataPath) -.select("k", "pc") + val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath) +.select("pc") .head() - val oldModel = new feature.PCAModel(k, pc) - val model = new PCAModel(metadata.uid, oldModel) + val m
spark git commit: [SPARK-6791][ML] Add read/write for CrossValidator and Evaluators
Repository: spark Updated Branches: refs/heads/master fe89c1817 -> a6fda0bfc [SPARK-6791][ML] Add read/write for CrossValidator and Evaluators I believe this works for general estimators within CrossValidator, including compound estimators. (See the complex unit test.) Added read/write for all 3 Evaluators as well. CC: mengxr yanboliang Author: Joseph K. Bradley <jos...@databricks.com> Closes #9848 from jkbradley/cv-io. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6fda0bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6fda0bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6fda0bf Branch: refs/heads/master Commit: a6fda0bfc16a13b28b1cecc96f1ff91363089144 Parents: fe89c18 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Sun Nov 22 21:48:48 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Nov 22 21:48:48 2015 -0800 -- .../scala/org/apache/spark/ml/Pipeline.scala| 38 +-- .../BinaryClassificationEvaluator.scala | 11 +- .../MulticlassClassificationEvaluator.scala | 12 +- .../ml/evaluation/RegressionEvaluator.scala | 11 +- .../apache/spark/ml/recommendation/ALS.scala| 14 +- .../apache/spark/ml/tuning/CrossValidator.scala | 229 ++- .../org/apache/spark/ml/util/ReadWrite.scala| 48 ++-- .../org/apache/spark/ml/PipelineSuite.scala | 4 +- .../BinaryClassificationEvaluatorSuite.scala| 13 +- ...MulticlassClassificationEvaluatorSuite.scala | 13 +- .../evaluation/RegressionEvaluatorSuite.scala | 12 +- .../spark/ml/tuning/CrossValidatorSuite.scala | 202 +++- 12 files changed, 522 insertions(+), 85 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a6fda0bf/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index 6f15b37..4b2b3f8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -34,7 +34,6 @@ import org.apache.spark.ml.util.MLWriter import org.apache.spark.ml.util._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import org.apache.spark.util.Utils /** * :: DeveloperApi :: @@ -232,20 +231,9 @@ object Pipeline extends MLReadable[Pipeline] { stages: Array[PipelineStage], sc: SparkContext, path: String): Unit = { - // Copied and edited from DefaultParamsWriter.saveMetadata - // TODO: modify DefaultParamsWriter.saveMetadata to avoid duplication - val uid = instance.uid - val cls = instance.getClass.getName val stageUids = stages.map(_.uid) val jsonParams = List("stageUids" -> parse(compact(render(stageUids.toSeq)))) - val metadata = ("class" -> cls) ~ -("timestamp" -> System.currentTimeMillis()) ~ -("sparkVersion" -> sc.version) ~ -("uid" -> uid) ~ -("paramMap" -> jsonParams) - val metadataPath = new Path(path, "metadata").toString - val metadataJson = compact(render(metadata)) - sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath) + DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap = Some(jsonParams)) // Save stages val stagesDir = new Path(path, "stages").toString @@ -266,30 +254,10 @@ object Pipeline extends MLReadable[Pipeline] { implicit val format = DefaultFormats val stagesDir = new Path(path, "stages").toString - val stageUids: Array[String] = metadata.params match { -case JObject(pairs) => - if (pairs.length != 1) { -// Should not happen unless file is corrupted or we have a bug. -throw new RuntimeException( - s"Pipeline read expected 1 Param (stageUids), but found ${pairs.length}.") - } - pairs.head match { -case ("stageUids", jsonValue) => - jsonValue.extract[Seq[String]].toArray -case (paramName, jsonValue) => - // Should not happen unless file is corrupted or we have a bug. 
- throw new RuntimeException(s"Pipeline read encountered unexpected Param $paramName" + -s" in metadata: ${metadata.metadataStr}") - } -case _ => - throw new IllegalArgumentException( -s"Cannot recognize JSON metadata: ${metadata.metadataStr}.") - } + val stageUids: Array
spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml
Repository: spark Updated Branches: refs/heads/master 9ace2e5c8 -> e359d5dcf [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml jira: https://issues.apache.org/jira/browse/SPARK-11689 Add simple user guide for LDA under spark.ml and example code under examples/. Use include_example to include example code in the user guide markdown. Check SPARK-11606 for instructions. Author: Yuhao Yang <hhb...@gmail.com> Closes #9722 from hhbyyh/ldaMLExample. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e359d5dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e359d5dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e359d5dc Branch: refs/heads/master Commit: e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7 Parents: 9ace2e5 Author: Yuhao Yang <hhb...@gmail.com> Authored: Fri Nov 20 09:57:09 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 09:57:09 2015 -0800 -- docs/ml-clustering.md | 30 +++ docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 + .../spark/examples/ml/JavaLDAExample.java | 94 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 204 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md new file mode 100644 index 000..1743ef4 --- /dev/null +++ b/docs/ml-clustering.md @@ -0,0 +1,30 @@ +--- +layout: global +title: Clustering - ML +displayTitle: ML - Clustering +--- + +In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). + +## Latent Dirichlet allocation (LDA) + +`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, +and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by +`EMLDAOptimizer` to a `DistributedLDAModel` if needed. 
+ + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. + + +{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} + + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index be18a05..6f35b30 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -950,4 +951,4 @@ model.transform(test) {% endhighlight %} - + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 91e50cc..54e35fc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java new file mode 100644 index 000..b3a7d2e --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
spark git commit: [SPARK-11852][ML] StandardScaler minor refactor
Repository: spark Updated Branches: refs/heads/branch-1.6 eab90d3f3 -> b11aa1797 [SPARK-11852][ML] StandardScaler minor refactor ```withStd``` and ```withMean``` should be params of ```StandardScaler``` and ```StandardScalerModel```. Author: Yanbo Liang <yblia...@gmail.com> Closes #9839 from yanboliang/standardScaler-refactor. (cherry picked from commit 9ace2e5c8d7fbd360a93bc5fc4eace64a697b44f) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b11aa179 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b11aa179 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b11aa179 Branch: refs/heads/branch-1.6 Commit: b11aa1797c928f2cfaf1d8821eff4be4109ac41d Parents: eab90d3 Author: Yanbo Liang <yblia...@gmail.com> Authored: Fri Nov 20 09:55:53 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 09:56:02 2015 -0800 -- .../spark/ml/feature/StandardScaler.scala | 60 +--- .../spark/ml/feature/StandardScalerSuite.scala | 11 ++-- 2 files changed, 32 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b11aa179/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 6d54521..d76a9c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -36,20 +36,30 @@ import org.apache.spark.sql.types.{StructField, StructType} private[feature] trait StandardScalerParams extends Params with HasInputCol with HasOutputCol { /** - * Centers the data with mean before scaling. + * Whether to center the data with mean before scaling. * It will build a dense output, so this does not work on sparse input * and will raise an exception. 
* Default: false * @group param */ - val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean") + val withMean: BooleanParam = new BooleanParam(this, "withMean", +"Whether to center data with mean") + + /** @group getParam */ + def getWithMean: Boolean = $(withMean) /** - * Scales the data to unit standard deviation. + * Whether to scale the data to unit standard deviation. * Default: true * @group param */ - val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit standard deviation") + val withStd: BooleanParam = new BooleanParam(this, "withStd", +"Whether to scale the data to unit standard deviation") + + /** @group getParam */ + def getWithStd: Boolean = $(withStd) + + setDefault(withMean -> false, withStd -> true) } /** @@ -63,8 +73,6 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM def this() = this(Identifiable.randomUID("stdScal")) - setDefault(withMean -> false, withStd -> true) - /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -82,7 +90,7 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v } val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) -copyValues(new StandardScalerModel(uid, scalerModel).setParent(this)) +copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -108,29 +116,19 @@ object StandardScaler extends DefaultParamsReadable[StandardScaler] { /** * :: Experimental :: * Model fitted by [[StandardScaler]]. 
+ * + * @param std Standard deviation of the StandardScalerModel + * @param mean Mean of the StandardScalerModel */ @Experimental class StandardScalerModel private[ml] ( override val uid: String, -scaler: feature.StandardScalerModel) +val std: Vector, +val mean: Vector) extends Model[StandardScalerModel] with StandardScalerParams with MLWritable { import StandardScalerModel._ - /** Standard deviation of the StandardScalerModel */ - val std: Vector = scaler.std - - /** Mean of the StandardScalerModel */ - val mean: Vector = scaler.mean - - /** Whether to scale to unit standard deviation. */ - @Since("1.6.0") - def getWithStd: Boolean = scaler.withStd - - /** Whether to center da
spark git commit: [SPARK-11852][ML] StandardScaler minor refactor
Repository: spark Updated Branches: refs/heads/master a66142dec -> 9ace2e5c8 [SPARK-11852][ML] StandardScaler minor refactor ```withStd``` and ```withMean``` should be params of ```StandardScaler``` and ```StandardScalerModel```. Author: Yanbo Liang <yblia...@gmail.com> Closes #9839 from yanboliang/standardScaler-refactor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ace2e5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ace2e5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ace2e5c Branch: refs/heads/master Commit: 9ace2e5c8d7fbd360a93bc5fc4eace64a697b44f Parents: a66142d Author: Yanbo Liang <yblia...@gmail.com> Authored: Fri Nov 20 09:55:53 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 09:55:53 2015 -0800 -- .../spark/ml/feature/StandardScaler.scala | 60 +--- .../spark/ml/feature/StandardScalerSuite.scala | 11 ++-- 2 files changed, 32 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9ace2e5c/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 6d54521..d76a9c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -36,20 +36,30 @@ import org.apache.spark.sql.types.{StructField, StructType} private[feature] trait StandardScalerParams extends Params with HasInputCol with HasOutputCol { /** - * Centers the data with mean before scaling. + * Whether to center the data with mean before scaling. * It will build a dense output, so this does not work on sparse input * and will raise an exception. 
* Default: false * @group param */ - val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean") + val withMean: BooleanParam = new BooleanParam(this, "withMean", +"Whether to center data with mean") + + /** @group getParam */ + def getWithMean: Boolean = $(withMean) /** - * Scales the data to unit standard deviation. + * Whether to scale the data to unit standard deviation. * Default: true * @group param */ - val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit standard deviation") + val withStd: BooleanParam = new BooleanParam(this, "withStd", +"Whether to scale the data to unit standard deviation") + + /** @group getParam */ + def getWithStd: Boolean = $(withStd) + + setDefault(withMean -> false, withStd -> true) } /** @@ -63,8 +73,6 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM def this() = this(Identifiable.randomUID("stdScal")) - setDefault(withMean -> false, withStd -> true) - /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -82,7 +90,7 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v } val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) -copyValues(new StandardScalerModel(uid, scalerModel).setParent(this)) +copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -108,29 +116,19 @@ object StandardScaler extends DefaultParamsReadable[StandardScaler] { /** * :: Experimental :: * Model fitted by [[StandardScaler]]. 
+ * + * @param std Standard deviation of the StandardScalerModel + * @param mean Mean of the StandardScalerModel */ @Experimental class StandardScalerModel private[ml] ( override val uid: String, -scaler: feature.StandardScalerModel) +val std: Vector, +val mean: Vector) extends Model[StandardScalerModel] with StandardScalerParams with MLWritable { import StandardScalerModel._ - /** Standard deviation of the StandardScalerModel */ - val std: Vector = scaler.std - - /** Mean of the StandardScalerModel */ - val mean: Vector = scaler.mean - - /** Whether to scale to unit standard deviation. */ - @Since("1.6.0") - def getWithStd: Boolean = scaler.withStd - - /** Whether to center data with mean. */ - @Since("1.6.0") - def getWithMean: Boolean = scaler.withMean - /** @group setParam */ def setI
spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml
Repository: spark Updated Branches: refs/heads/branch-1.6 b11aa1797 -> 92d3563fd [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml jira: https://issues.apache.org/jira/browse/SPARK-11689 Add simple user guide for LDA under spark.ml and example code under examples/. Use include_example to include example code in the user guide markdown. Check SPARK-11606 for instructions. Author: Yuhao Yang <hhb...@gmail.com> Closes #9722 from hhbyyh/ldaMLExample. (cherry picked from commit e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92d3563f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92d3563f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92d3563f Branch: refs/heads/branch-1.6 Commit: 92d3563fd0cf0c3f4fe037b404d172125b24cf2f Parents: b11aa17 Author: Yuhao Yang <hhb...@gmail.com> Authored: Fri Nov 20 09:57:09 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 09:57:24 2015 -0800 -- docs/ml-clustering.md | 30 +++ docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 + .../spark/examples/ml/JavaLDAExample.java | 94 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 204 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md new file mode 100644 index 000..1743ef4 --- /dev/null +++ b/docs/ml-clustering.md @@ -0,0 +1,30 @@ +--- +layout: global +title: Clustering - ML +displayTitle: ML - Clustering +--- + +In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). + +## Latent Dirichlet allocation (LDA) + +`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, +and generates a `LDAModel` as the base models. 
Expert users may cast a `LDAModel` generated by +`EMLDAOptimizer` to a `DistributedLDAModel` if needed. + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. + + +{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} + + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} + + + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index be18a05..6f35b30 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -950,4 +951,4 @@ model.transform(test) {% endhighlight %} - + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 91e50cc..54e35fc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) +* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java new file mode 100644 index 000..b3a7d2e --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLD
spark git commit: Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"
Repository: spark Updated Branches: refs/heads/master 47815878a -> a2dce22e0 Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml" This reverts commit e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2dce22e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2dce22e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2dce22e Branch: refs/heads/master Commit: a2dce22e0a25922e2052318d32f32877b7c27ec2 Parents: 4781587 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Nov 20 16:51:47 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 16:51:47 2015 -0800 -- docs/ml-clustering.md | 30 --- docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 - .../spark/examples/ml/JavaLDAExample.java | 94 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 1 insertion(+), 204 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md deleted file mode 100644 index 1743ef4..000 --- a/docs/ml-clustering.md +++ /dev/null @@ -1,30 +0,0 @@ -layout: global -title: Clustering - ML -displayTitle: ML - Clustering - -In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). - -## Latent Dirichlet allocation (LDA) - -`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, -and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by -`EMLDAOptimizer` to a `DistributedLDAModel` if needed. - - - -Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. - - -{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} - - - - -Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. 
- -{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} - - - \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 6f35b30..be18a05 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,7 +40,6 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) -* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -951,4 +950,4 @@ model.transform(test) {% endhighlight %} - \ No newline at end of file + http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 54e35fc..91e50cc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,7 +69,6 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) -* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java deleted file mode 100644 index b3a7d2e..000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not
spark git commit: Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"
Repository: spark Updated Branches: refs/heads/branch-1.6 285e4017a -> 33d856df5 Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml" This reverts commit 92d3563fd0cf0c3f4fe037b404d172125b24cf2f. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33d856df Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33d856df Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33d856df Branch: refs/heads/branch-1.6 Commit: 33d856df53689d7fd515a21ec4f34d1d5c74a958 Parents: 285e401 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Nov 20 16:52:20 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 16:52:20 2015 -0800 -- docs/ml-clustering.md | 30 --- docs/ml-guide.md| 3 +- docs/mllib-guide.md | 1 - .../spark/examples/ml/JavaLDAExample.java | 94 .../apache/spark/examples/ml/LDAExample.scala | 77 5 files changed, 1 insertion(+), 204 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md deleted file mode 100644 index 1743ef4..000 --- a/docs/ml-clustering.md +++ /dev/null @@ -1,30 +0,0 @@ -layout: global -title: Clustering - ML -displayTitle: ML - Clustering - -In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html). - -## Latent Dirichlet allocation (LDA) - -`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`, -and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by -`EMLDAOptimizer` to a `DistributedLDAModel` if needed. - - - -Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details. - - -{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %} - - - - -Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details. 
- -{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} - - - \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 6f35b30..be18a05 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -40,7 +40,6 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., provide class probabilities, and linear models provide model summaries. * [Feature extraction, transformation, and selection](ml-features.html) -* [Clustering](ml-clustering.html) * [Decision Trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) @@ -951,4 +950,4 @@ model.transform(test) {% endhighlight %} - \ No newline at end of file + http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/mllib-guide.md -- diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 54e35fc..91e50cc 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -69,7 +69,6 @@ We list major functionality from both below, with links to detailed guides. concepts. 
It also contains sections on using algorithms within the Pipelines API, for example: * [Feature extraction, transformation, and selection](ml-features.html) -* [Clustering](ml-clustering.html) * [Decision trees for classification and regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java deleted file mode 100644 index b3a7d2e..000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not
[2/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example
[SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example Author: Vikas Nelamangala <vikasnelamangala@Vikass-MacBook-Pro.local> Closes #9689 from vikasnp/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ed47b1e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ed47b1e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ed47b1e6 Branch: refs/heads/master Commit: ed47b1e660b830e2d4fac8d6df93f634b260393c Parents: 4b84c72 Author: Vikas Nelamangala <vikasnelamangala@Vikass-MacBook-Pro.local> Authored: Fri Nov 20 15:18:41 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Nov 20 15:18:41 2015 -0800 -- docs/mllib-evaluation-metrics.md| 940 +-- .../JavaBinaryClassificationMetricsExample.java | 113 +++ ...aMultiLabelClassificationMetricsExample.java | 80 ++ ...aMulticlassClassificationMetricsExample.java | 97 ++ .../mllib/JavaRankingMetricsExample.java| 176 .../mllib/JavaRegressionMetricsExample.java | 91 ++ .../binary_classification_metrics_example.py| 55 ++ .../python/mllib/multi_class_metrics_example.py | 69 ++ .../python/mllib/multi_label_metrics_example.py | 61 ++ .../python/mllib/ranking_metrics_example.py | 55 ++ .../python/mllib/regression_metrics_example.py | 59 ++ .../BinaryClassificationMetricsExample.scala| 103 ++ .../mllib/MultiLabelMetricsExample.scala| 69 ++ .../mllib/MulticlassMetricsExample.scala| 99 ++ .../examples/mllib/RankingMetricsExample.scala | 110 +++ .../mllib/RegressionMetricsExample.scala| 67 ++ 16 files changed, 1319 insertions(+), 925 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/docs/mllib-evaluation-metrics.md -- diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index f73eff6..6924037 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -104,214 +104,21 @@ data, and evaluate the performance of the 
algorithm by several binary evaluation Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") - -// Split data into training (60%) and test (40%) -val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) -training.cache() - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training) - -// Clear the prediction threshold so the model will return probabilities -model.clearThreshold - -// Compute raw scores on the test set -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Instantiate metrics object -val metrics = new BinaryClassificationMetrics(predictionAndLabels) - -// Precision by threshold -val precision = metrics.precisionByThreshold -precision.foreach { case (t, p) => -println(s"Threshold: $t, Precision: $p") -} - -// Recall by threshold -val recall = metrics.recallByThreshold -recall.foreach { case (t, r) => -println(s"Threshold: $t, Recall: $r") -} - -// Precision-Recall Curve -val PRC = metrics.pr - -// F-measure -val f1Score = metrics.fMeasureByThreshold -f1Score.foreach { case (t, f) => -println(s"Threshold: $t, F-score: $f, Beta = 1") -} - -val beta = 0.5 -val fScore = metrics.fMeasureByThreshold(beta) -f1Score.foreach { case (t, f) => 
-println(s"Threshold: $t, F-score: $f, Beta = 0.5") -} - -// AUPRC -val auPRC = metrics.areaUnderPR -println("Area under precision-recall curve = " + auPRC) - -// Compute thresholds used in ROC and PR curves -val thresholds = precision.map(_._1) - -// ROC Curve -val roc = metrics.roc - -// AUROC -val auROC = metrics.areaUnderROC -println("Area under ROC = " + auROC) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala %} Refer to the [`LogisticR
[1/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example
Repository: spark Updated Branches: refs/heads/master 4b84c72df -> ed47b1e66 http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala new file mode 100644 index 000..4503c15 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.evaluation.MultilabelMetrics +import org.apache.spark.rdd.RDD +// $example off$ +import org.apache.spark.{SparkContext, SparkConf} + +object MultiLabelMetricsExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("MultiLabelMetricsExample") +val sc = new SparkContext(conf) +// $example on$ +val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( + Seq((Array(0.0, 1.0), Array(0.0, 2.0)), +(Array(0.0, 2.0), Array(0.0, 1.0)), +(Array.empty[Double], Array(0.0)), +(Array(2.0), Array(2.0)), +(Array(2.0, 0.0), Array(2.0, 0.0)), +(Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), +(Array(1.0), Array(1.0, 2.0))), 2) + +// Instantiate metrics object +val metrics = new MultilabelMetrics(scoreAndLabels) + +// Summary stats +println(s"Recall = ${metrics.recall}") +println(s"Precision = ${metrics.precision}") +println(s"F1 measure = ${metrics.f1Measure}") +println(s"Accuracy = ${metrics.accuracy}") + +// Individual label stats +metrics.labels.foreach(label => + println(s"Class $label precision = ${metrics.precision(label)}")) +metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) +metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) + +// Micro stats +println(s"Micro recall = ${metrics.microRecall}") +println(s"Micro precision = ${metrics.microPrecision}") +println(s"Micro F1 measure = ${metrics.microF1Measure}") + +// Hamming loss +println(s"Hamming loss = ${metrics.hammingLoss}") + +// Subset accuracy +println(s"Subset accuracy = ${metrics.subsetAccuracy}") +// $example off$ + } +} +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala -- diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala new file mode 100644 index 000..0904449 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import
[1/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example
Repository: spark Updated Branches: refs/heads/branch-1.6 0665fb5ea -> 1dde97176 http://git-wip-us.apache.org/repos/asf/spark/blob/1dde9717/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala new file mode 100644 index 000..4503c15 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.evaluation.MultilabelMetrics +import org.apache.spark.rdd.RDD +// $example off$ +import org.apache.spark.{SparkContext, SparkConf} + +object MultiLabelMetricsExample { + def main(args: Array[String]): Unit = { +val conf = new SparkConf().setAppName("MultiLabelMetricsExample") +val sc = new SparkContext(conf) +// $example on$ +val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( + Seq((Array(0.0, 1.0), Array(0.0, 2.0)), +(Array(0.0, 2.0), Array(0.0, 1.0)), +(Array.empty[Double], Array(0.0)), +(Array(2.0), Array(2.0)), +(Array(2.0, 0.0), Array(2.0, 0.0)), +(Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), +(Array(1.0), Array(1.0, 2.0))), 2) + +// Instantiate metrics object +val metrics = new MultilabelMetrics(scoreAndLabels) + +// Summary stats +println(s"Recall = ${metrics.recall}") +println(s"Precision = ${metrics.precision}") +println(s"F1 measure = ${metrics.f1Measure}") +println(s"Accuracy = ${metrics.accuracy}") + +// Individual label stats +metrics.labels.foreach(label => + println(s"Class $label precision = ${metrics.precision(label)}")) +metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) +metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) + +// Micro stats +println(s"Micro recall = ${metrics.microRecall}") +println(s"Micro precision = ${metrics.microPrecision}") +println(s"Micro F1 measure = ${metrics.microF1Measure}") + +// Hamming loss +println(s"Hamming loss = ${metrics.hammingLoss}") + +// Subset accuracy +println(s"Subset accuracy = ${metrics.subsetAccuracy}") +// $example off$ + } +} +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/1dde9717/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala -- diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala new file mode 100644 index 000..0904449 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import
spark git commit: [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression
Repository: spark Updated Branches: refs/heads/master 7ee7d5a3c -> 4114ce20f [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression https://issues.apache.org/jira/browse/SPARK-11846 mengxr Author: Xusen Yin <yinxu...@gmail.com> Closes #9836 from yinxusen/SPARK-11846. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4114ce20 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4114ce20 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4114ce20 Branch: refs/heads/master Commit: 4114ce20fbe820f111e55e891ae3889b0e6e0006 Parents: 7ee7d5a Author: Xusen Yin <yinxu...@gmail.com> Authored: Thu Nov 19 22:01:02 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:01:02 2015 -0800 -- .../ml/regression/AFTSurvivalRegression.scala | 78 +++--- .../ml/regression/IsotonicRegression.scala | 83 ++-- .../regression/AFTSurvivalRegressionSuite.scala | 37 - .../ml/regression/IsotonicRegressionSuite.scala | 34 +++- 4 files changed, 210 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4114ce20/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index b7d0958..aedfb48 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -21,20 +21,20 @@ import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS} +import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkException, Logging} -import org.apache.spark.annotation.{Since, Experimental} -import org.apache.spark.ml.{Model, Estimator} +import 
org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.mllib.linalg.BLAS +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel +import org.apache.spark.{Logging, SparkException} /** * Params for accelerated failure time (AFT) regression. @@ -120,7 +120,8 @@ private[regression] trait AFTSurvivalRegressionParams extends Params @Experimental @Since("1.6.0") class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: String) - extends Estimator[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams with Logging { + extends Estimator[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams + with DefaultParamsWritable with Logging { @Since("1.6.0") def this() = this(Identifiable.randomUID("aftSurvReg")) @@ -243,6 +244,13 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S override def copy(extra: ParamMap): AFTSurvivalRegression = defaultCopy(extra) } +@Since("1.6.0") +object AFTSurvivalRegression extends DefaultParamsReadable[AFTSurvivalRegression] { + + @Since("1.6.0") + override def load(path: String): AFTSurvivalRegression = super.load(path) +} + /** * :: Experimental :: * Model produced by [[AFTSurvivalRegression]]. 
@@ -254,7 +262,7 @@ class AFTSurvivalRegressionModel private[ml] ( @Since("1.6.0") val coefficients: Vector, @Since("1.6.0") val intercept: Double, @Since("1.6.0") val scale: Double) - extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams { + extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams with MLWritable { /** @group setParam */ @Since("1.6.0") @@ -312,6 +320,58 @@ class AFTSurvivalRegressionModel private[ml] ( copyValues(new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale), extra) .setParent(parent) } + + @Since("1.6.0") + override def write: MLWriter = +new AFTSurvivalRegressionModel.AFTSurvivalRegressio
spark git commit: [SPARK-11829][ML] Add read/write to estimators under ml.feature (II)
Repository: spark Updated Branches: refs/heads/master 4114ce20f -> 3b7f056da [SPARK-11829][ML] Add read/write to estimators under ml.feature (II) Add read/write support to the following estimators under spark.ml: * ChiSqSelector * PCA * VectorIndexer * Word2Vec Author: Yanbo Liang <yblia...@gmail.com> Closes #9838 from yanboliang/spark-11829. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b7f056d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b7f056d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b7f056d Branch: refs/heads/master Commit: 3b7f056da87a23f3a96f0311b3a947a9b698f38b Parents: 4114ce2 Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Nov 19 22:02:17 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:02:17 2015 -0800 -- .../apache/spark/ml/feature/ChiSqSelector.scala | 65 +-- .../scala/org/apache/spark/ml/feature/PCA.scala | 67 ++-- .../apache/spark/ml/feature/VectorIndexer.scala | 66 +-- .../org/apache/spark/ml/feature/Word2Vec.scala | 67 ++-- .../apache/spark/mllib/feature/Word2Vec.scala | 6 +- .../spark/ml/feature/ChiSqSelectorSuite.scala | 22 ++- .../org/apache/spark/ml/feature/PCASuite.scala | 26 +++- .../spark/ml/feature/VectorIndexerSuite.scala | 22 ++- .../apache/spark/ml/feature/Word2VecSuite.scala | 30 - 9 files changed, 338 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b7f056d/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 5e4061f..dfec038 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -17,13 +17,14 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental 
+import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ import org.apache.spark.ml.attribute.{AttributeGroup, _} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint @@ -60,7 +61,7 @@ private[feature] trait ChiSqSelectorParams extends Params */ @Experimental final class ChiSqSelector(override val uid: String) - extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams { + extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams with DefaultParamsWritable { def this() = this(Identifiable.randomUID("chiSqSelector")) @@ -95,6 +96,13 @@ final class ChiSqSelector(override val uid: String) override def copy(extra: ParamMap): ChiSqSelector = defaultCopy(extra) } +@Since("1.6.0") +object ChiSqSelector extends DefaultParamsReadable[ChiSqSelector] { + + @Since("1.6.0") + override def load(path: String): ChiSqSelector = super.load(path) +} + /** * :: Experimental :: * Model fitted by [[ChiSqSelector]]. @@ -103,7 +111,12 @@ final class ChiSqSelector(override val uid: String) final class ChiSqSelectorModel private[ml] ( override val uid: String, private val chiSqSelector: feature.ChiSqSelectorModel) - extends Model[ChiSqSelectorModel] with ChiSqSelectorParams { + extends Model[ChiSqSelectorModel] with ChiSqSelectorParams with MLWritable { + + import ChiSqSelectorModel._ + + /** list of indices to select (filter). 
Must be ordered asc */ + val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures /** @group setParam */ def setFeaturesCol(value: String): this.type = set(featuresCol, value) @@ -147,4 +160,46 @@ final class ChiSqSelectorModel private[ml] ( val copied = new ChiSqSelectorModel(uid, chiSqSelector) copyValues(copied, extra).setParent(parent) } + + @Since("1.6.0") + override def write: MLWriter = new ChiSqSelectorModelWriter(this) +} + +@Since("1.6.0") +object ChiSqSelectorModel extends MLReadable[ChiSqSelectorModel] { + + private[ChiSqSelectorModel] + class ChiSqSelectorModelWriter(instance: ChiSqSelectorModel) extends MLWriter { + +private case class Data(selectedFeatures: Seq[Int]) + +override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path,
spark git commit: [SPARK-11829][ML] Add read/write to estimators under ml.feature (II)
Repository: spark Updated Branches: refs/heads/branch-1.6 4774897f9 -> d7b3d5785 [SPARK-11829][ML] Add read/write to estimators under ml.feature (II) Add read/write support to the following estimators under spark.ml: * ChiSqSelector * PCA * VectorIndexer * Word2Vec Author: Yanbo Liang <yblia...@gmail.com> Closes #9838 from yanboliang/spark-11829. (cherry picked from commit 3b7f056da87a23f3a96f0311b3a947a9b698f38b) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7b3d578 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7b3d578 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7b3d578 Branch: refs/heads/branch-1.6 Commit: d7b3d578555d6fabfacd80da97b88aae56f81f1b Parents: 4774897 Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Nov 19 22:02:17 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:02:25 2015 -0800 -- .../apache/spark/ml/feature/ChiSqSelector.scala | 65 +-- .../scala/org/apache/spark/ml/feature/PCA.scala | 67 ++-- .../apache/spark/ml/feature/VectorIndexer.scala | 66 +-- .../org/apache/spark/ml/feature/Word2Vec.scala | 67 ++-- .../apache/spark/mllib/feature/Word2Vec.scala | 6 +- .../spark/ml/feature/ChiSqSelectorSuite.scala | 22 ++- .../org/apache/spark/ml/feature/PCASuite.scala | 26 +++- .../spark/ml/feature/VectorIndexerSuite.scala | 22 ++- .../apache/spark/ml/feature/Word2VecSuite.scala | 30 - 9 files changed, 338 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7b3d578/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 5e4061f..dfec038 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -17,13 +17,14 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ import org.apache.spark.ml.attribute.{AttributeGroup, _} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint @@ -60,7 +61,7 @@ private[feature] trait ChiSqSelectorParams extends Params */ @Experimental final class ChiSqSelector(override val uid: String) - extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams { + extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams with DefaultParamsWritable { def this() = this(Identifiable.randomUID("chiSqSelector")) @@ -95,6 +96,13 @@ final class ChiSqSelector(override val uid: String) override def copy(extra: ParamMap): ChiSqSelector = defaultCopy(extra) } +@Since("1.6.0") +object ChiSqSelector extends DefaultParamsReadable[ChiSqSelector] { + + @Since("1.6.0") + override def load(path: String): ChiSqSelector = super.load(path) +} + /** * :: Experimental :: * Model fitted by [[ChiSqSelector]]. @@ -103,7 +111,12 @@ final class ChiSqSelector(override val uid: String) final class ChiSqSelectorModel private[ml] ( override val uid: String, private val chiSqSelector: feature.ChiSqSelectorModel) - extends Model[ChiSqSelectorModel] with ChiSqSelectorParams { + extends Model[ChiSqSelectorModel] with ChiSqSelectorParams with MLWritable { + + import ChiSqSelectorModel._ + + /** list of indices to select (filter). 
Must be ordered asc */ + val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures /** @group setParam */ def setFeaturesCol(value: String): this.type = set(featuresCol, value) @@ -147,4 +160,46 @@ final class ChiSqSelectorModel private[ml] ( val copied = new ChiSqSelectorModel(uid, chiSqSelector) copyValues(copied, extra).setParent(parent) } + + @Since("1.6.0") + override def write: MLWriter = new ChiSqSelectorModelWriter(this) +} + +@Since("1.6.0") +object ChiSqSelectorModel extends MLReadable[ChiSqSelectorModel] { + + private[ChiSqSelectorModel] + class ChiSqSelectorModelWriter(instance: ChiSqSelectorModel) extends MLWriter { + +private case class Data(selectedF
spark git commit: [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression
Repository: spark Updated Branches: refs/heads/branch-1.6 a936fa5c5 -> 4774897f9 [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression https://issues.apache.org/jira/browse/SPARK-11846 mengxr Author: Xusen Yin <yinxu...@gmail.com> Closes #9836 from yinxusen/SPARK-11846. (cherry picked from commit 4114ce20fbe820f111e55e891ae3889b0e6e0006) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4774897f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4774897f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4774897f Branch: refs/heads/branch-1.6 Commit: 4774897f9166c5674029ad97cdd6ea5902bcf17c Parents: a936fa5 Author: Xusen Yin <yinxu...@gmail.com> Authored: Thu Nov 19 22:01:02 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:01:15 2015 -0800 -- .../ml/regression/AFTSurvivalRegression.scala | 78 +++--- .../ml/regression/IsotonicRegression.scala | 83 ++-- .../regression/AFTSurvivalRegressionSuite.scala | 37 - .../ml/regression/IsotonicRegressionSuite.scala | 34 +++- 4 files changed, 210 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4774897f/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index b7d0958..aedfb48 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -21,20 +21,20 @@ import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS} +import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkException, Logging} 
-import org.apache.spark.annotation.{Since, Experimental} -import org.apache.spark.ml.{Model, Estimator} +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.mllib.linalg.BLAS +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel +import org.apache.spark.{Logging, SparkException} /** * Params for accelerated failure time (AFT) regression. @@ -120,7 +120,8 @@ private[regression] trait AFTSurvivalRegressionParams extends Params @Experimental @Since("1.6.0") class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: String) - extends Estimator[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams with Logging { + extends Estimator[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams + with DefaultParamsWritable with Logging { @Since("1.6.0") def this() = this(Identifiable.randomUID("aftSurvReg")) @@ -243,6 +244,13 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S override def copy(extra: ParamMap): AFTSurvivalRegression = defaultCopy(extra) } +@Since("1.6.0") +object AFTSurvivalRegression extends DefaultParamsReadable[AFTSurvivalRegression] { + + @Since("1.6.0") + override def load(path: String): AFTSurvivalRegression = super.load(path) +} + /** * :: Experimental :: * Model produced by [[AFTSurvivalRegression]]. 
@@ -254,7 +262,7 @@ class AFTSurvivalRegressionModel private[ml] ( @Since("1.6.0") val coefficients: Vector, @Since("1.6.0") val intercept: Double, @Since("1.6.0") val scale: Double) - extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams { + extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams with MLWritable { /** @group setParam */ @Since("1.6.0") @@ -312,6 +320,58 @@ class AFTSurvivalRegressionModel private[ml] ( copyValues(new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale), extra) .setParent
spark git commit: [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval
Repository: spark Updated Branches: refs/heads/branch-1.6 d7b3d5785 -> 0a878ad0e [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval * Update doc for PySpark ```HasCheckpointInterval``` that users can understand how to disable checkpoint. * Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to notify the relationship between ```cacheNodeIds``` and ```checkpointInterval```. Author: Yanbo Liang <yblia...@gmail.com> Closes #9856 from yanboliang/spark-11875. (cherry picked from commit 7216f405454f6f3557b5b1f72df8f393605faf60) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a878ad0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a878ad0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a878ad0 Branch: refs/heads/branch-1.6 Commit: 0a878ad0e422cdf00c4beedb5bea01ebba135347 Parents: d7b3d57 Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Nov 19 22:14:01 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:14:26 2015 -0800 -- python/pyspark/ml/param/_shared_params_code_gen.py | 6 -- python/pyspark/ml/param/shared.py | 14 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/_shared_params_code_gen.py -- diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 070c5db..0528dc1 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -118,7 +118,8 @@ if __name__ == "__main__": ("inputCols", "input column names.", None), ("outputCol", "output column name.", "self.uid + '__output'"), ("numFeatures", "number of features.", None), -("checkpointInterval", "checkpoint interval (>= 1).", None), +("checkpointInterval", "set checkpoint interval (>= 1) or 
disable checkpoint (-1). " + + "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None), ("seed", "random seed.", "hash(type(self).__name__)"), ("tol", "the convergence tolerance for iterative algorithms.", None), ("stepSize", "Step size to be used for each iteration of optimization.", None), @@ -157,7 +158,8 @@ if __name__ == "__main__": ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."), ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " + "instances with nodes. If true, the algorithm will cache node IDs for each instance. " + - "Caching can speed up training of deeper trees.")] + "Caching can speed up training of deeper trees. Users can set how often should the " + + "cache be checkpointed or disable it by setting checkpointInterval.")] decisionTreeCode = '''class DecisionTreeParams(Params): """ http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/shared.py -- diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 4bdf2a8..4d96080 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -325,16 +325,16 @@ class HasNumFeatures(Params): class HasCheckpointInterval(Params): """ -Mixin for param checkpointInterval: checkpoint interval (>= 1). +Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. """ # a placeholder to make it appear in the generated doc -checkpointInterval = Param(Params._dummy(), "checkpointInterval", "checkpoint interval (>= 1).") +checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.") def __init__(self): super(HasCheckpointInterval, self).__init__() -#: param for checkpoint interval (>= 1). 
-self.checkpointInterval = Param(self,
spark git commit: [SPARK-11867] Add save/load for kmeans and naive bayes
Repository: spark Updated Branches: refs/heads/master 0fff8eb3e -> 3e1d120ce [SPARK-11867] Add save/load for kmeans and naive bayes https://issues.apache.org/jira/browse/SPARK-11867 Author: Xusen Yin <yinxu...@gmail.com> Closes #9849 from yinxusen/SPARK-11867. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e1d120c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e1d120c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e1d120c Branch: refs/heads/master Commit: 3e1d120cedb4bd9e1595e95d4d531cf61da6684d Parents: 0fff8eb Author: Xusen Yin <yinxu...@gmail.com> Authored: Thu Nov 19 23:43:18 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 23:43:18 2015 -0800 -- .../spark/ml/classification/NaiveBayes.scala| 68 ++-- .../org/apache/spark/ml/clustering/KMeans.scala | 67 +-- .../ml/classification/NaiveBayesSuite.scala | 47 -- .../spark/ml/clustering/KMeansSuite.scala | 41 +--- 4 files changed, 195 insertions(+), 28 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e1d120c/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a14dcec..c512a2c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -17,12 +17,15 @@ package org.apache.spark.ml.classification +import org.apache.hadoop.fs.Path + import org.apache.spark.SparkException -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators} -import org.apache.spark.ml.util.Identifiable -import 
org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, NaiveBayesModel => OldNaiveBayesModel} +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes} +import org.apache.spark.mllib.classification.{NaiveBayesModel => OldNaiveBayesModel} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -72,7 +75,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { @Experimental class NaiveBayes(override val uid: String) extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] - with NaiveBayesParams { + with NaiveBayesParams with DefaultParamsWritable { def this() = this(Identifiable.randomUID("nb")) @@ -102,6 +105,13 @@ class NaiveBayes(override val uid: String) override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra) } +@Since("1.6.0") +object NaiveBayes extends DefaultParamsReadable[NaiveBayes] { + + @Since("1.6.0") + override def load(path: String): NaiveBayes = super.load(path) +} + /** * :: Experimental :: * Model produced by [[NaiveBayes]] @@ -114,7 +124,8 @@ class NaiveBayesModel private[ml] ( override val uid: String, val pi: Vector, val theta: Matrix) - extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams { + extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] + with NaiveBayesParams with MLWritable { import OldNaiveBayes.{Bernoulli, Multinomial} @@ -203,12 +214,15 @@ class NaiveBayesModel private[ml] ( s"NaiveBayesModel (uid=$uid) with ${pi.size} classes" } + @Since("1.6.0") + override def write: MLWriter = new NaiveBayesModel.NaiveBayesModelWriter(this) } -private[ml] object NaiveBayesModel { +@Since("1.6.0") +object NaiveBayesModel extends MLReadable[NaiveBayesModel] { /** Convert a model from the old API */ - def fromOld( + private[ml] def fromOld( oldModel: OldNaiveBayesModel, parent: NaiveBayes): NaiveBayesModel = { val uid = if (parent != 
null) parent.uid else Identifiable.randomUID("nb") @@ -218,4 +232,44 @@ private[ml] object NaiveBayesModel { oldModel.theta.flatten, true) new NaiveBayesModel(uid, pi, theta) } + + @Since("1.6.0") + override def read: MLReader[NaiveBayesModel] = new NaiveBayesModelReader + + @Since("1.6.0") + override def load(path: String): NaiveBayesModel = super.load(path) + + /** [[MLWriter]] instance for [[NaiveBayesModel]] */ + private[NaiveBayesModel] class NaiveBayesModelWriter(instance: NaiveBaye
spark git commit: [SPARK-11867] Add save/load for kmeans and naive bayes
Repository: spark Updated Branches: refs/heads/branch-1.6 60d937529 -> 1ce6394e3 [SPARK-11867] Add save/load for kmeans and naive bayes https://issues.apache.org/jira/browse/SPARK-11867 Author: Xusen Yin <yinxu...@gmail.com> Closes #9849 from yinxusen/SPARK-11867. (cherry picked from commit 3e1d120cedb4bd9e1595e95d4d531cf61da6684d) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ce6394e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ce6394e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ce6394e Branch: refs/heads/branch-1.6 Commit: 1ce6394e3c86f8d0b80e990d8a35889ded94b6ea Parents: 60d9375 Author: Xusen Yin <yinxu...@gmail.com> Authored: Thu Nov 19 23:43:18 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 23:43:25 2015 -0800 -- .../spark/ml/classification/NaiveBayes.scala| 68 ++-- .../org/apache/spark/ml/clustering/KMeans.scala | 67 +-- .../ml/classification/NaiveBayesSuite.scala | 47 -- .../spark/ml/clustering/KMeansSuite.scala | 41 +--- 4 files changed, 195 insertions(+), 28 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1ce6394e/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a14dcec..c512a2c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -17,12 +17,15 @@ package org.apache.spark.ml.classification +import org.apache.hadoop.fs.Path + import org.apache.spark.SparkException -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.{DoubleParam, 
Param, ParamMap, ParamValidators} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, NaiveBayesModel => OldNaiveBayesModel} +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes} +import org.apache.spark.mllib.classification.{NaiveBayesModel => OldNaiveBayesModel} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -72,7 +75,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { @Experimental class NaiveBayes(override val uid: String) extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] - with NaiveBayesParams { + with NaiveBayesParams with DefaultParamsWritable { def this() = this(Identifiable.randomUID("nb")) @@ -102,6 +105,13 @@ class NaiveBayes(override val uid: String) override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra) } +@Since("1.6.0") +object NaiveBayes extends DefaultParamsReadable[NaiveBayes] { + + @Since("1.6.0") + override def load(path: String): NaiveBayes = super.load(path) +} + /** * :: Experimental :: * Model produced by [[NaiveBayes]] @@ -114,7 +124,8 @@ class NaiveBayesModel private[ml] ( override val uid: String, val pi: Vector, val theta: Matrix) - extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams { + extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] + with NaiveBayesParams with MLWritable { import OldNaiveBayes.{Bernoulli, Multinomial} @@ -203,12 +214,15 @@ class NaiveBayesModel private[ml] ( s"NaiveBayesModel (uid=$uid) with ${pi.size} classes" } + @Since("1.6.0") + override def write: MLWriter = new NaiveBayesModel.NaiveBayesModelWriter(this) } -private[ml] object NaiveBayesModel { +@Since("1.6.0") +object NaiveBayesModel extends MLReadable[NaiveBayesModel] { /** Convert a model from the old API */ - def fromOld( + private[ml] def fromOld( 
oldModel: OldNaiveBayesModel, parent: NaiveBayes): NaiveBayesModel = { val uid = if (parent != null) parent.uid else Identifiable.randomUID("nb") @@ -218,4 +232,44 @@ private[ml] object NaiveBayesModel { oldModel.theta.flatten, true) new NaiveBayesModel(uid, pi, theta) } + + @Since("1.6.0") + override def read: MLReader[NaiveBayesModel] = new NaiveBayesModelReader + + @Since("1.6.0") + override def load(path: String): NaiveBayesModel = super.load
spark git commit: [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval
Repository: spark Updated Branches: refs/heads/master 3b7f056da -> 7216f4054 [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval * Update doc for PySpark ```HasCheckpointInterval``` that users can understand how to disable checkpoint. * Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to notify the relationship between ```cacheNodeIds``` and ```checkpointInterval```. Author: Yanbo Liang <yblia...@gmail.com> Closes #9856 from yanboliang/spark-11875. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7216f405 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7216f405 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7216f405 Branch: refs/heads/master Commit: 7216f405454f6f3557b5b1f72df8f393605faf60 Parents: 3b7f056 Author: Yanbo Liang <yblia...@gmail.com> Authored: Thu Nov 19 22:14:01 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 22:14:01 2015 -0800 -- python/pyspark/ml/param/_shared_params_code_gen.py | 6 -- python/pyspark/ml/param/shared.py | 14 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7216f405/python/pyspark/ml/param/_shared_params_code_gen.py -- diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 070c5db..0528dc1 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -118,7 +118,8 @@ if __name__ == "__main__": ("inputCols", "input column names.", None), ("outputCol", "output column name.", "self.uid + '__output'"), ("numFeatures", "number of features.", None), -("checkpointInterval", "checkpoint interval (>= 1).", None), +("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + + "E.g. 
10 means that the cache will get checkpointed every 10 iterations.", None), ("seed", "random seed.", "hash(type(self).__name__)"), ("tol", "the convergence tolerance for iterative algorithms.", None), ("stepSize", "Step size to be used for each iteration of optimization.", None), @@ -157,7 +158,8 @@ if __name__ == "__main__": ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."), ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " + "instances with nodes. If true, the algorithm will cache node IDs for each instance. " + - "Caching can speed up training of deeper trees.")] + "Caching can speed up training of deeper trees. Users can set how often should the " + + "cache be checkpointed or disable it by setting checkpointInterval.")] decisionTreeCode = '''class DecisionTreeParams(Params): """ http://git-wip-us.apache.org/repos/asf/spark/blob/7216f405/python/pyspark/ml/param/shared.py -- diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 4bdf2a8..4d96080 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -325,16 +325,16 @@ class HasNumFeatures(Params): class HasCheckpointInterval(Params): """ -Mixin for param checkpointInterval: checkpoint interval (>= 1). +Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. """ # a placeholder to make it appear in the generated doc -checkpointInterval = Param(Params._dummy(), "checkpointInterval", "checkpoint interval (>= 1).") +checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.") def __init__(self): super(HasCheckpointInterval, self).__init__() -#: param for checkpoint interval (>= 1). 
-self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1).") +#: param for set checkpoint interval (>= 1) or disable check
spark git commit: [SPARK-11869][ML] Clean up TempDirectory properly in ML tests
Repository: spark Updated Branches: refs/heads/branch-1.6 0a878ad0e -> 60d937529 [SPARK-11869][ML] Clean up TempDirectory properly in ML tests Need to remove parent directory (```className```) rather than just tempDir (```className/random_name```) I tested this with IDFSuite, which has 2 read/write tests, and it fixes the problem. CC: mengxr Can you confirm this is fine? I believe it is since the same ```random_name``` is used for all tests in a suite; we basically have an extra unneeded level of nesting. Author: Joseph K. Bradley <jos...@databricks.com> Closes #9851 from jkbradley/tempdir-cleanup. (cherry picked from commit 0fff8eb3e476165461658d4e16682ec64269fdfe) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60d93752 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60d93752 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60d93752 Branch: refs/heads/branch-1.6 Commit: 60d937529f6b885486e0a9b687883fb4bf66b780 Parents: 0a878ad Author: Joseph K. 
Bradley <jos...@databricks.com> Authored: Thu Nov 19 23:42:24 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 23:42:31 2015 -0800 -- mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60d93752/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala index 2742026..c8a0bb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala @@ -35,7 +35,7 @@ trait TempDirectory extends BeforeAndAfterAll { self: Suite => override def beforeAll(): Unit = { super.beforeAll() -_tempDir = Utils.createTempDir(this.getClass.getName) +_tempDir = Utils.createTempDir(namePrefix = this.getClass.getName) } override def afterAll(): Unit = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11869][ML] Clean up TempDirectory properly in ML tests
Repository: spark Updated Branches: refs/heads/master 7216f4054 -> 0fff8eb3e [SPARK-11869][ML] Clean up TempDirectory properly in ML tests Need to remove parent directory (```className```) rather than just tempDir (```className/random_name```) I tested this with IDFSuite, which has 2 read/write tests, and it fixes the problem. CC: mengxr Can you confirm this is fine? I believe it is since the same ```random_name``` is used for all tests in a suite; we basically have an extra unneeded level of nesting. Author: Joseph K. Bradley <jos...@databricks.com> Closes #9851 from jkbradley/tempdir-cleanup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0fff8eb3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0fff8eb3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0fff8eb3 Branch: refs/heads/master Commit: 0fff8eb3e476165461658d4e16682ec64269fdfe Parents: 7216f40 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Thu Nov 19 23:42:24 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Nov 19 23:42:24 2015 -0800 -- mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0fff8eb3/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala index 2742026..c8a0bb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala @@ -35,7 +35,7 @@ trait TempDirectory extends BeforeAndAfterAll { self: Suite => override def beforeAll(): Unit = { super.beforeAll() -_tempDir = Utils.createTempDir(this.getClass.getName) +_tempDir = Utils.createTempDir(namePrefix = this.getClass.getName) } override def afterAll(): Unit = { 
- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11816][ML] fix some style issue in ML/MLlib examples
Repository: spark Updated Branches: refs/heads/branch-1.6 19f4f26f3 -> 4b4a6bf5c [SPARK-11816][ML] fix some style issue in ML/MLlib examples jira: https://issues.apache.org/jira/browse/SPARK-11816 Currently I only fixed some obvious comments issue like // scalastyle:off println on the bottom. Yet the style in examples is not quite consistent, like only half of the examples are with // Example usage: ./bin/run-example mllib.FPGrowthExample \, Author: Yuhao Yang <hhb...@gmail.com> Closes #9808 from hhbyyh/exampleStyle. (cherry picked from commit 67c75828ff4df2e305bdf5d6be5a11201d1da3f3) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b4a6bf5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b4a6bf5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b4a6bf5 Branch: refs/heads/branch-1.6 Commit: 4b4a6bf5c0a8dd96897d7dd48c7beadd2c950824 Parents: 19f4f26 Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 18:49:46 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 18:50:10 2015 -0800 -- .../main/java/org/apache/spark/examples/ml/JavaKMeansExample.java | 2 +- .../apache/spark/examples/ml/AFTSurvivalRegressionExample.scala| 2 +- .../spark/examples/ml/DecisionTreeClassificationExample.scala | 1 + .../apache/spark/examples/ml/DecisionTreeRegressionExample.scala | 1 + .../spark/examples/ml/MultilayerPerceptronClassifierExample.scala | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index be2bf0c..47665ff 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -41,7 +41,7 @@ import org.apache.spark.sql.types.StructType; * An example demonstrating a k-means clustering. * Run with * - * bin/run-example ml.JavaSimpleParamsExample + * bin/run-example ml.JavaKMeansExample * */ public class JavaKMeansExample { http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala index 5da285e..f4b3613 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala @@ -59,4 +59,4 @@ object AFTSurvivalRegressionExample { sc.stop() } } -// scalastyle:off println +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index ff8a0a9..db024b5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -90,3 +90,4 @@ object DecisionTreeClassificationExample { // $example off$ } } +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index fc40272..ad01f55 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -78,3 +78,4 @@ object DecisionTreeRegressionExample { // $example off$ } } +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/o
spark git commit: [SPARK-11816][ML] fix some style issue in ML/MLlib examples
Repository: spark Updated Branches: refs/heads/master 9c0654d36 -> 67c75828f [SPARK-11816][ML] fix some style issue in ML/MLlib examples jira: https://issues.apache.org/jira/browse/SPARK-11816 Currently I only fixed some obvious comments issue like // scalastyle:off println on the bottom. Yet the style in examples is not quite consistent, like only half of the examples are with // Example usage: ./bin/run-example mllib.FPGrowthExample \, Author: Yuhao Yang <hhb...@gmail.com> Closes #9808 from hhbyyh/exampleStyle. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/67c75828 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/67c75828 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/67c75828 Branch: refs/heads/master Commit: 67c75828ff4df2e305bdf5d6be5a11201d1da3f3 Parents: 9c0654d Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 18:49:46 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 18:49:46 2015 -0800 -- .../main/java/org/apache/spark/examples/ml/JavaKMeansExample.java | 2 +- .../apache/spark/examples/ml/AFTSurvivalRegressionExample.scala| 2 +- .../spark/examples/ml/DecisionTreeClassificationExample.scala | 1 + .../apache/spark/examples/ml/DecisionTreeRegressionExample.scala | 1 + .../spark/examples/ml/MultilayerPerceptronClassifierExample.scala | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index be2bf0c..47665ff 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -41,7 +41,7 @@ import 
org.apache.spark.sql.types.StructType; * An example demonstrating a k-means clustering. * Run with * - * bin/run-example ml.JavaSimpleParamsExample + * bin/run-example ml.JavaKMeansExample * */ public class JavaKMeansExample { http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala index 5da285e..f4b3613 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala @@ -59,4 +59,4 @@ object AFTSurvivalRegressionExample { sc.stop() } } -// scalastyle:off println +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index ff8a0a9..db024b5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -90,3 +90,4 @@ object DecisionTreeClassificationExample { // $example off$ } } +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index fc40272..ad01f55 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -78,3 +78,4 @@ object DecisionTreeRegressionExample { // $example off$ } } +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala -- diff
spark git commit: [SPARK-6787][ML] add read/write to estimators under ml.feature (1)
Repository: spark Updated Branches: refs/heads/branch-1.6 d9945bc46 -> dc1e23744 [SPARK-6787][ML] add read/write to estimators under ml.feature (1) Add read/write support to the following estimators under spark.ml: * CountVectorizer * IDF * MinMaxScaler * StandardScaler (a little awkward because we store some params in the spark.mllib model) * StringIndexer Added some necessary methods for read/write. Maybe we should add `private[ml] trait DefaultParamsReadable` and `DefaultParamsWritable` to save some boilerplate code, though we still need to override `load` for Java compatibility. jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #9798 from mengxr/SPARK-6787. (cherry picked from commit 7e987de1770f4ab3d54bc05db8de0a1ef035941d) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc1e2374 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc1e2374 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc1e2374 Branch: refs/heads/branch-1.6 Commit: dc1e23744b7fc1b8ee5fac07cf56d5760d66503e Parents: d9945bc Author: Xiangrui Meng <m...@databricks.com> Authored: Wed Nov 18 15:47:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 15:47:57 2015 -0800 -- .../spark/ml/feature/CountVectorizer.scala | 72 -- .../scala/org/apache/spark/ml/feature/IDF.scala | 71 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 72 -- .../spark/ml/feature/StandardScaler.scala | 78 +++- .../apache/spark/ml/feature/StringIndexer.scala | 70 -- .../spark/ml/feature/CountVectorizerSuite.scala | 24 +- .../org/apache/spark/ml/feature/IDFSuite.scala | 19 - .../spark/ml/feature/MinMaxScalerSuite.scala| 25 ++- .../spark/ml/feature/StandardScalerSuite.scala | 64 +++- .../spark/ml/feature/StringIndexerSuite.scala | 19 - 10 files changed, 467 insertions(+), 47 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/dc1e2374/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 49028e4..5ff9bfb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -16,17 +16,19 @@ */ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} -import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.DataFrame import org.apache.spark.util.collection.OpenHashMap /** @@ -105,7 +107,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit */ @Experimental class CountVectorizer(override val uid: String) - extends Estimator[CountVectorizerModel] with CountVectorizerParams { + extends Estimator[CountVectorizerModel] with CountVectorizerParams with Writable { def this() = this(Identifiable.randomUID("cntVec")) @@ -169,6 +171,19 @@ class CountVectorizer(override val uid: String) } override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra) + + @Since("1.6.0") + override def write: Writer = new DefaultParamsWriter(this) +} + +@Since("1.6.0") +object CountVectorizer extends Readable[CountVectorizer] { + + 
@Since("1.6.0") + override def read: Reader[CountVectorizer] = new DefaultParamsReader + + @Since("1.6.0") + override def load(path: String): CountVectorizer = super.load(path) } /** @@ -178,7 +193,9 @@ class CountVectorizer(override val uid: String) */ @Experimental class CountVectorizerModel(override val uid: String, val vocabulary: Array[String]) - extends Model[CountVectorizerModel] with CountVectorizerParams { + extends Model[CountVectorizerModel] with CountVectorizerParams with Writable { + + import CountVectorizerModel._ def this(vocabulary: Array[String
spark git commit: [SPARK-6787][ML] add read/write to estimators under ml.feature (1)
Repository: spark Updated Branches: refs/heads/master 5df08949f -> 7e987de17 [SPARK-6787][ML] add read/write to estimators under ml.feature (1) Add read/write support to the following estimators under spark.ml: * CountVectorizer * IDF * MinMaxScaler * StandardScaler (a little awkward because we store some params in the spark.mllib model) * StringIndexer Added some necessary methods for read/write. Maybe we should add `private[ml] trait DefaultParamsReadable` and `DefaultParamsWritable` to save some boilerplate code, though we still need to override `load` for Java compatibility. jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #9798 from mengxr/SPARK-6787. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e987de1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e987de1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e987de1 Branch: refs/heads/master Commit: 7e987de1770f4ab3d54bc05db8de0a1ef035941d Parents: 5df0894 Author: Xiangrui Meng <m...@databricks.com> Authored: Wed Nov 18 15:47:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 15:47:49 2015 -0800 -- .../spark/ml/feature/CountVectorizer.scala | 72 -- .../scala/org/apache/spark/ml/feature/IDF.scala | 71 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 72 -- .../spark/ml/feature/StandardScaler.scala | 78 +++- .../apache/spark/ml/feature/StringIndexer.scala | 70 -- .../spark/ml/feature/CountVectorizerSuite.scala | 24 +- .../org/apache/spark/ml/feature/IDFSuite.scala | 19 - .../spark/ml/feature/MinMaxScalerSuite.scala| 25 ++- .../spark/ml/feature/StandardScalerSuite.scala | 64 +++- .../spark/ml/feature/StringIndexerSuite.scala | 19 - 10 files changed, 467 insertions(+), 47 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e987de1/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala -- diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 49028e4..5ff9bfb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -16,17 +16,19 @@ */ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} -import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.DataFrame import org.apache.spark.util.collection.OpenHashMap /** @@ -105,7 +107,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit */ @Experimental class CountVectorizer(override val uid: String) - extends Estimator[CountVectorizerModel] with CountVectorizerParams { + extends Estimator[CountVectorizerModel] with CountVectorizerParams with Writable { def this() = this(Identifiable.randomUID("cntVec")) @@ -169,6 +171,19 @@ class CountVectorizer(override val uid: String) } override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra) + + @Since("1.6.0") + override def write: Writer = new DefaultParamsWriter(this) +} + +@Since("1.6.0") +object CountVectorizer extends Readable[CountVectorizer] { + + @Since("1.6.0") + override def read: Reader[CountVectorizer] = new DefaultParamsReader + + @Since("1.6.0") + override def load(path: String): 
CountVectorizer = super.load(path) } /** @@ -178,7 +193,9 @@ class CountVectorizer(override val uid: String) */ @Experimental class CountVectorizerModel(override val uid: String, val vocabulary: Array[String]) - extends Model[CountVectorizerModel] with CountVectorizerParams { + extends Model[CountVectorizerModel] with CountVectorizerParams with Writable { + + import CountVectorizerModel._ def this(vocabulary: Array[String]) = { this(Identifiable.randomUID("cntVecModel"), vocabulary) @@ -232,4 +249,47 @@ class CountVectorizerM
spark git commit: [SPARK-11720][SQL][ML] Handle edge cases when count = 0 or 1 for Stats function
Repository: spark Updated Branches: refs/heads/master 7c5b64180 -> 09ad9533d [SPARK-11720][SQL][ML] Handle edge cases when count = 0 or 1 for Stats function Return Double.NaN for mean/average when count == 0 for all numeric types that are converted to Double; the Decimal type continues to return null. Author: JihongMa <linlin200...@gmail.com> Closes #9705 from JihongMA/SPARK-11720. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/09ad9533 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/09ad9533 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/09ad9533 Branch: refs/heads/master Commit: 09ad9533d5760652de59fa4830c24cb8667958ac Parents: 7c5b641 Author: JihongMa <linlin200...@gmail.com> Authored: Wed Nov 18 13:03:37 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:03:37 2015 -0800 -- python/pyspark/sql/dataframe.py | 2 +- .../expressions/aggregate/CentralMomentAgg.scala | 2 +- .../catalyst/expressions/aggregate/Kurtosis.scala | 9 + .../catalyst/expressions/aggregate/Skewness.scala | 9 + .../catalyst/expressions/aggregate/Stddev.scala | 18 ++ .../catalyst/expressions/aggregate/Variance.scala | 18 ++ .../spark/sql/DataFrameAggregateSuite.scala | 18 -- .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- 8 files changed, 53 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index ad6ad02..0dd75ba 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -761,7 +761,7 @@ class DataFrame(object): +---+--+-+ | count| 2|2| | mean| 3.5| null| -| stddev|2.1213203435596424| NaN| +| stddev|2.1213203435596424| null| |min| 2|Alice| |max| 5| Bob| +---+--+-+ 
http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala index de5872a..d07d4c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala @@ -206,7 +206,7 @@ abstract class CentralMomentAgg(child: Expression) extends ImperativeAggregate w * @param centralMoments Length `momentOrder + 1` array of central moments (un-normalized) * needed to compute the aggregate stat. */ - def getStatistic(n: Double, mean: Double, centralMoments: Array[Double]): Double + def getStatistic(n: Double, mean: Double, centralMoments: Array[Double]): Any override final def eval(buffer: InternalRow): Any = { val n = buffer.getDouble(nOffset) http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala index 8fa3aac..c2bf2cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala @@ -37,16 +37,17 @@ case class Kurtosis(child: Expression, override protected val momentOrder = 4 // NOTE: this is the formula for excess kurtosis, which is default for R and SciPy - override def getStatistic(n: Double, mean: Double, moments: Array[Double]): Double = { + override def 
getStatistic(n: Double, mean: Double, moments: Array[Double]): Any = { require(moments.length == momentOrder + 1, s"$prettyName requires ${momentOrder + 1} central moments, received: ${moments.length}") val m2 = moments(2) val m4 = moments(4) -if (n == 0.0 || m2 == 0.0) { +if (n == 0.0) { + null +} else if (m2 == 0.0) {
spark git commit: [SPARK-6790][ML] Add spark.ml LinearRegression import/export
Repository: spark Updated Branches: refs/heads/branch-1.6 39c8a995d -> bcc6813dd [SPARK-6790][ML] Add spark.ml LinearRegression import/export This replaces [https://github.com/apache/spark/pull/9656] with updates. fayeshine should be the main author when this PR is committed. CC: mengxr fayeshine Author: Wenjian Huang <nextr...@163.com> Author: Joseph K. Bradley <jos...@databricks.com> Closes #9814 from jkbradley/fayeshine-patch-6790. (cherry picked from commit 045a4f045821dcf60442f0600c2df1b79bddb536) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcc6813d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcc6813d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcc6813d Branch: refs/heads/branch-1.6 Commit: bcc6813dd8b050fd4bf9dbd2708e413b43b3e80d Parents: 39c8a99 Author: Wenjian Huang <nextr...@163.com> Authored: Wed Nov 18 13:06:25 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:06:32 2015 -0800 -- .../spark/ml/regression/LinearRegression.scala | 77 +++- .../ml/regression/LinearRegressionSuite.scala | 34 - 2 files changed, 106 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bcc6813d/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 913140e..ca55d59 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -22,6 +22,7 @@ import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import breeze.stats.distributions.StudentsT +import 
org.apache.hadoop.fs.Path import org.apache.spark.{Logging, SparkException} import org.apache.spark.ml.feature.Instance @@ -30,7 +31,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util._ import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ @@ -65,7 +66,7 @@ private[regression] trait LinearRegressionParams extends PredictorParams @Experimental class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String) extends Regressor[Vector, LinearRegression, LinearRegressionModel] - with LinearRegressionParams with Logging { + with LinearRegressionParams with Writable with Logging { @Since("1.4.0") def this() = this(Identifiable.randomUID("linReg")) @@ -341,6 +342,19 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String @Since("1.4.0") override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra) + + @Since("1.6.0") + override def write: Writer = new DefaultParamsWriter(this) +} + +@Since("1.6.0") +object LinearRegression extends Readable[LinearRegression] { + + @Since("1.6.0") + override def read: Reader[LinearRegression] = new DefaultParamsReader[LinearRegression] + + @Since("1.6.0") + override def load(path: String): LinearRegression = read.load(path) } /** @@ -354,7 +368,7 @@ class LinearRegressionModel private[ml] ( val coefficients: Vector, val intercept: Double) extends RegressionModel[Vector, LinearRegressionModel] - with LinearRegressionParams { + with LinearRegressionParams with Writable { private var trainingSummary: Option[LinearRegressionTrainingSummary] = None @@ -422,6 +436,63 @@ class LinearRegressionModel private[ml] ( if (trainingSummary.isDefined) 
newModel.setSummary(trainingSummary.get) newModel.setParent(parent) } + + /** + * Returns a [[Writer]] instance for this ML instance. + * + * For [[LinearRegressionModel]], this does NOT currently save the training [[summary]]. + * An option to save [[summary]] may be added in the future. + * + * This also does not save the [[parent]] currently. + */ + @Since("1.6.0") + override def write: Writer = new LinearRegressionModel.LinearRegressionModelWriter(this) +} + +@Since("1.6.0") +object LinearRegressionMo
spark git commit: [SPARK-6790][ML] Add spark.ml LinearRegression import/export
Repository: spark Updated Branches: refs/heads/master 09ad9533d -> 045a4f045 [SPARK-6790][ML] Add spark.ml LinearRegression import/export This replaces [https://github.com/apache/spark/pull/9656] with updates. fayeshine should be the main author when this PR is committed. CC: mengxr fayeshine Author: Wenjian Huang <nextr...@163.com> Author: Joseph K. Bradley <jos...@databricks.com> Closes #9814 from jkbradley/fayeshine-patch-6790. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/045a4f04 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/045a4f04 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/045a4f04 Branch: refs/heads/master Commit: 045a4f045821dcf60442f0600c2df1b79bddb536 Parents: 09ad953 Author: Wenjian Huang <nextr...@163.com> Authored: Wed Nov 18 13:06:25 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:06:25 2015 -0800 -- .../spark/ml/regression/LinearRegression.scala | 77 +++- .../ml/regression/LinearRegressionSuite.scala | 34 - 2 files changed, 106 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/045a4f04/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 913140e..ca55d59 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -22,6 +22,7 @@ import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import breeze.stats.distributions.StudentsT +import org.apache.hadoop.fs.Path import org.apache.spark.{Logging, SparkException} import org.apache.spark.ml.feature.Instance @@ -30,7 +31,7 
@@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util._ import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ @@ -65,7 +66,7 @@ private[regression] trait LinearRegressionParams extends PredictorParams @Experimental class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String) extends Regressor[Vector, LinearRegression, LinearRegressionModel] - with LinearRegressionParams with Logging { + with LinearRegressionParams with Writable with Logging { @Since("1.4.0") def this() = this(Identifiable.randomUID("linReg")) @@ -341,6 +342,19 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String @Since("1.4.0") override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra) + + @Since("1.6.0") + override def write: Writer = new DefaultParamsWriter(this) +} + +@Since("1.6.0") +object LinearRegression extends Readable[LinearRegression] { + + @Since("1.6.0") + override def read: Reader[LinearRegression] = new DefaultParamsReader[LinearRegression] + + @Since("1.6.0") + override def load(path: String): LinearRegression = read.load(path) } /** @@ -354,7 +368,7 @@ class LinearRegressionModel private[ml] ( val coefficients: Vector, val intercept: Double) extends RegressionModel[Vector, LinearRegressionModel] - with LinearRegressionParams { + with LinearRegressionParams with Writable { private var trainingSummary: Option[LinearRegressionTrainingSummary] = None @@ -422,6 +436,63 @@ class LinearRegressionModel private[ml] ( if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) newModel.setParent(parent) } + + /** + * Returns a [[Writer]] instance for this ML instance. 
+ * + * For [[LinearRegressionModel]], this does NOT currently save the training [[summary]]. + * An option to save [[summary]] may be added in the future. + * + * This also does not save the [[parent]] currently. + */ + @Since("1.6.0") + override def write: Writer = new LinearRegressionModel.LinearRegressionModelWriter(this) +} + +@Since("1.6.0") +object LinearRegressionModel extends Readable[LinearRegressionModel] { + + @Since("1.6.0") + override def read: Reader[LinearR
spark git commit: [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel
Repository: spark Updated Branches: refs/heads/master 045a4f045 -> 2acdf10b1 [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel Also modifies DefaultParamsWriter.saveMetadata to take optional extra metadata. CC: mengxr yanboliang Author: Joseph K. Bradley <jos...@databricks.com> Closes #9786 from jkbradley/als-io. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2acdf10b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2acdf10b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2acdf10b Branch: refs/heads/master Commit: 2acdf10b1f3bb1242dba64efa798c672fde9f0d2 Parents: 045a4f0 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Wed Nov 18 13:16:31 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:16:31 2015 -0800 -- .../apache/spark/ml/recommendation/ALS.scala| 75 +-- .../org/apache/spark/ml/util/ReadWrite.scala| 14 +++- .../spark/ml/recommendation/ALSSuite.scala | 78 +--- 3 files changed, 150 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2acdf10b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 535f266..d92514d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -27,13 +27,16 @@ import scala.util.hashing.byteswap64 import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.hadoop.fs.{FileSystem, Path} +import org.json4s.{DefaultFormats, JValue} +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, Partitioner} -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{Since, DeveloperApi, Experimental} 
import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.CholeskyDecomposition import org.apache.spark.mllib.optimization.NNLS import org.apache.spark.rdd.RDD @@ -182,7 +185,7 @@ class ALSModel private[ml] ( val rank: Int, @transient val userFactors: DataFrame, @transient val itemFactors: DataFrame) - extends Model[ALSModel] with ALSModelParams { + extends Model[ALSModel] with ALSModelParams with Writable { /** @group setParam */ def setUserCol(value: String): this.type = set(userCol, value) @@ -220,8 +223,60 @@ class ALSModel private[ml] ( val copied = new ALSModel(uid, rank, userFactors, itemFactors) copyValues(copied, extra).setParent(parent) } + + @Since("1.6.0") + override def write: Writer = new ALSModel.ALSModelWriter(this) } +@Since("1.6.0") +object ALSModel extends Readable[ALSModel] { + + @Since("1.6.0") + override def read: Reader[ALSModel] = new ALSModelReader + + @Since("1.6.0") + override def load(path: String): ALSModel = read.load(path) + + private[recommendation] class ALSModelWriter(instance: ALSModel) extends Writer { + +override protected def saveImpl(path: String): Unit = { + val extraMetadata = render("rank" -> instance.rank) + DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) + val userPath = new Path(path, "userFactors").toString + instance.userFactors.write.format("parquet").save(userPath) + val itemPath = new Path(path, "itemFactors").toString + instance.itemFactors.write.format("parquet").save(itemPath) +} + } + + private[recommendation] class ALSModelReader extends Reader[ALSModel] { + +/** Checked against metadata when loading model */ +private val className = "org.apache.spark.ml.recommendation.ALSModel" + +override def load(path: String): ALSModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, 
className) + implicit val format = DefaultFormats + val rank: Int = metadata.extraMetadata match { +case Some(m: JValue) => + (m \ "rank").extract[Int] +case None => + throw new RuntimeException(s"ALSModel loader could not read rank from JSON metadata:" + +s" ${metadata.metadataStr}") + } + + val userPath = new Path(path, "userFactors").toString + val userFactors =
spark git commit: [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel
Repository: spark Updated Branches: refs/heads/branch-1.6 bcc6813dd -> 23b8c2256 [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel Also modifies DefaultParamsWriter.saveMetadata to take optional extra metadata. CC: mengxr yanboliang Author: Joseph K. Bradley <jos...@databricks.com> Closes #9786 from jkbradley/als-io. (cherry picked from commit 2acdf10b1f3bb1242dba64efa798c672fde9f0d2) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/23b8c225 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/23b8c225 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/23b8c225 Branch: refs/heads/branch-1.6 Commit: 23b8c2256d55d76ebe22977c03c0b893e5b6c408 Parents: bcc6813 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Wed Nov 18 13:16:31 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:16:39 2015 -0800 -- .../apache/spark/ml/recommendation/ALS.scala| 75 +-- .../org/apache/spark/ml/util/ReadWrite.scala| 14 +++- .../spark/ml/recommendation/ALSSuite.scala | 78 +--- 3 files changed, 150 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/23b8c225/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 535f266..d92514d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -27,13 +27,16 @@ import scala.util.hashing.byteswap64 import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.hadoop.fs.{FileSystem, Path} +import org.json4s.{DefaultFormats, JValue} +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, Partitioner} -import 
org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{Since, DeveloperApi, Experimental} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.CholeskyDecomposition import org.apache.spark.mllib.optimization.NNLS import org.apache.spark.rdd.RDD @@ -182,7 +185,7 @@ class ALSModel private[ml] ( val rank: Int, @transient val userFactors: DataFrame, @transient val itemFactors: DataFrame) - extends Model[ALSModel] with ALSModelParams { + extends Model[ALSModel] with ALSModelParams with Writable { /** @group setParam */ def setUserCol(value: String): this.type = set(userCol, value) @@ -220,8 +223,60 @@ class ALSModel private[ml] ( val copied = new ALSModel(uid, rank, userFactors, itemFactors) copyValues(copied, extra).setParent(parent) } + + @Since("1.6.0") + override def write: Writer = new ALSModel.ALSModelWriter(this) } +@Since("1.6.0") +object ALSModel extends Readable[ALSModel] { + + @Since("1.6.0") + override def read: Reader[ALSModel] = new ALSModelReader + + @Since("1.6.0") + override def load(path: String): ALSModel = read.load(path) + + private[recommendation] class ALSModelWriter(instance: ALSModel) extends Writer { + +override protected def saveImpl(path: String): Unit = { + val extraMetadata = render("rank" -> instance.rank) + DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) + val userPath = new Path(path, "userFactors").toString + instance.userFactors.write.format("parquet").save(userPath) + val itemPath = new Path(path, "itemFactors").toString + instance.itemFactors.write.format("parquet").save(itemPath) +} + } + + private[recommendation] class ALSModelReader extends Reader[ALSModel] { + +/** Checked against metadata when loading model */ +private val className = 
"org.apache.spark.ml.recommendation.ALSModel" + +override def load(path: String): ALSModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + implicit val format = DefaultFormats + val rank: Int = metadata.extraMetadata match { +case Some(m: JValue) => + (m \ "rank").extract[Int] +case None => + throw new RuntimeException(s"ALSModel loader could not read rank from JSON metadata:" + +s" ${meta
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/master 2acdf10b1 -> e391abdf2 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e391abdf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e391abdf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e391abdf Branch: refs/heads/master Commit: e391abdf2cb6098a35347bd123b815ee9ac5b689 Parents: 2acdf10 Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:25:15 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e391abdf/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f3e4d34..7ab0d89 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -145,8 +145,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.4 e12fbd80c -> eda1ff4ee [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eda1ff4e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eda1ff4e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eda1ff4e Branch: refs/heads/branch-1.4 Commit: eda1ff4eede3968c24a0d1338432eae5682e8432 Parents: e12fbd8 Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:25:54 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eda1ff4e/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 74e7dbf..3493186 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -141,8 +141,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.6 23b8c2256 -> 18e308b84 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18e308b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18e308b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18e308b8 Branch: refs/heads/branch-1.6 Commit: 18e308b84fe7ffeca730397152582b31a4b88a82 Parents: 23b8c22 Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:25:22 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18e308b8/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f3e4d34..7ab0d89 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -145,8 +145,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.5 f802b07ab -> 0439e32e2 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0439e32e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0439e32e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0439e32e Branch: refs/heads/branch-1.5 Commit: 0439e32e2e88cc3a3364a37509fa25aebf2c299f Parents: f802b07 Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:25:37 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0439e32e/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 36b124c..c226e3c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -148,8 +148,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.3 1bfa00d54 -> 5278ef0f1 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5278ef0f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5278ef0f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5278ef0f Branch: refs/heads/branch-1.3 Commit: 5278ef0f1aead5de7e32da8bb40ba15fabe7473d Parents: 1bfa00d Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:26:05 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5278ef0f/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 59a79e5..dea35e3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -134,8 +134,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.2 4b6e24e25 -> 307f27e24 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/307f27e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/307f27e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/307f27e2 Branch: refs/heads/branch-1.2 Commit: 307f27e24e17afd92030194a3e6fec312fc19f4f Parents: 4b6e24e Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:26:18 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/307f27e2/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 7960f3c..d983dd3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -127,8 +127,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.1 19835ec1f -> 11ee9d191 [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec jira: https://issues.apache.org/jira/browse/SPARK-11813 I found the problem while training on a large corpus. Avoiding serialization of vocab in Word2Vec has 2 benefits. 1. Performance improvement from less serialization. 2. A large increase in the capacity of Word2Vec. Currently, in the fit of Word2Vec, the closure mainly includes the serialization of Word2Vec and 2 global tables. The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab. The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab. Their sum cannot exceed Int.max due to the restriction of ByteArrayOutputStream. In any case, avoiding serialization of vocab helps decrease the size of the closure serialization, especially when vectorSize is small, and thus allows a larger vocabulary. Actually, there's another possible fix: make local copies of the fields to avoid including Word2Vec in the closure. Let me know if that's preferred. Author: Yuhao Yang <hhb...@gmail.com> Closes #9803 from hhbyyh/w2vVocab. 
(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/11ee9d19 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/11ee9d19 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/11ee9d19 Branch: refs/heads/branch-1.1 Commit: 11ee9d191e26a41a44ff0ca8730a129934942ee7 Parents: 19835ec Author: Yuhao Yang <hhb...@gmail.com> Authored: Wed Nov 18 13:25:15 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:26:39 2015 -0800 -- .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/11ee9d19/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index fc14447..a3e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -127,8 +127,8 @@ class Word2Vec extends Serializable with Logging { private var trainWordsCount = 0 private var vocabSize = 0 - private var vocab: Array[VocabWord] = null - private var vocabHash = mutable.HashMap.empty[String, Int] + @transient private var vocab: Array[VocabWord] = null + @transient private var vocabHash = mutable.HashMap.empty[String, Int] private def learnVocab(words: RDD[String]): Unit = { vocab = words.map(w => (w, 1)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes
Repository: spark Updated Branches: refs/heads/master e391abdf2 -> e222d7584 [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes This PR includes: * Update SparkR:::glm, SparkR:::summary API docs. * Update SparkR machine learning user guide and example codes to show: * supporting feature interaction in R formula. * summary for gaussian GLM model. * coefficients for binomial GLM model. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9727 from yanboliang/spark-11684. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e222d758 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e222d758 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e222d758 Branch: refs/heads/master Commit: e222d758499ad2609046cc1a2cc8afb45c5bccbb Parents: e391abd Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Nov 18 13:30:29 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:30:29 2015 -0800 -- R/pkg/R/mllib.R | 18 +-- docs/sparkr.md | 50 .../spark/ml/regression/LinearRegression.scala | 3 ++ 3 files changed, 60 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e222d758/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index f23e1c7..8d3b438 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -32,6 +32,12 @@ setClass("PipelineModel", representation(model = "jobj")) #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg. #' @param lambda Regularization parameter #' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details) +#' @param standardize Whether to standardize features before training +#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and +#' "auto". "l-bfgs" denotes Limited-memory BFGS which is a limited-memory +#' quasi-Newton optimization method. 
"normal" denotes using Normal Equation as an +#' analytical solution to the linear regression problem. The default value is "auto" +#' which means that the solver algorithm is selected automatically. #' @return a fitted MLlib model #' @rdname glm #' @export @@ -79,9 +85,15 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param x A fitted MLlib model -#' @return a list with a 'coefficient' component, which is the matrix of coefficients. See -#' summary.glm for more information. +#' @param object A fitted MLlib model +#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family +#' or a list with 'coefficients' component for binomial family. \cr +#' For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals +#' of the estimation, the 'coefficients' gives the estimated coefficients and their +#' estimated standard errors, t values and p-values. (It only available when model +#' fitted by normal solver.) \cr +#' For binomial family: the 'coefficients' gives the estimated coefficients. +#' See summary.glm for more information. \cr #' @rdname summary #' @export #' @examples http://git-wip-us.apache.org/repos/asf/spark/blob/e222d758/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 437bd47..a744b76 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -286,24 +286,37 @@ head(teenagers) # Machine Learning -SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows the use of building a gaussian GLM model using SparkR. 
+SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. + +The [summar
spark git commit: [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes
Repository: spark Updated Branches: refs/heads/branch-1.6 18e308b84 -> 03c2d20dc [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes This PR includes: * Update SparkR:::glm, SparkR:::summary API docs. * Update SparkR machine learning user guide and example codes to show: * supporting feature interaction in R formula. * summary for gaussian GLM model. * coefficients for binomial GLM model. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9727 from yanboliang/spark-11684. (cherry picked from commit e222d758499ad2609046cc1a2cc8afb45c5bccbb) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03c2d20d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03c2d20d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03c2d20d Branch: refs/heads/branch-1.6 Commit: 03c2d20dcfdecd910b8b2b036d581720e1a370e8 Parents: 18e308b Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Nov 18 13:30:29 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:30:36 2015 -0800 -- R/pkg/R/mllib.R | 18 +-- docs/sparkr.md | 50 .../spark/ml/regression/LinearRegression.scala | 3 ++ 3 files changed, 60 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/03c2d20d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index f23e1c7..8d3b438 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -32,6 +32,12 @@ setClass("PipelineModel", representation(model = "jobj")) #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg. #' @param lambda Regularization parameter #' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details) +#' @param standardize Whether to standardize features before training +#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and +#' "auto". 
"l-bfgs" denotes Limited-memory BFGS which is a limited-memory +#' quasi-Newton optimization method. "normal" denotes using Normal Equation as an +#' analytical solution to the linear regression problem. The default value is "auto" +#' which means that the solver algorithm is selected automatically. #' @return a fitted MLlib model #' @rdname glm #' @export @@ -79,9 +85,15 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param x A fitted MLlib model -#' @return a list with a 'coefficient' component, which is the matrix of coefficients. See -#' summary.glm for more information. +#' @param object A fitted MLlib model +#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family +#' or a list with 'coefficients' component for binomial family. \cr +#' For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals +#' of the estimation, the 'coefficients' gives the estimated coefficients and their +#' estimated standard errors, t values and p-values. (It only available when model +#' fitted by normal solver.) \cr +#' For binomial family: the 'coefficients' gives the estimated coefficients. +#' See summary.glm for more information. \cr #' @rdname summary #' @export #' @examples http://git-wip-us.apache.org/repos/asf/spark/blob/03c2d20d/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 437bd47..a744b76 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -286,24 +286,37 @@ head(teenagers) # Machine Learning -SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. 
The example below shows the use of building a gaussian GLM model using SparkR. +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support
spark git commit: [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol
Repository: spark Updated Branches: refs/heads/master e222d7584 -> 603a721c2 [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol [SPARK-7685](https://issues.apache.org/jira/browse/SPARK-7685) and [SPARK-9642](https://issues.apache.org/jira/browse/SPARK-9642) already added support for setting a weight column for ```LogisticRegression``` and ```LinearRegression```. It's a very important feature that PySpark should also support. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9811 from yanboliang/spark-11820. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/603a721c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/603a721c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/603a721c Branch: refs/heads/master Commit: 603a721c21488e17c15c45ce1de893e6b3d02274 Parents: e222d75 Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Nov 18 13:32:06 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:32:06 2015 -0800 -- python/pyspark/ml/classification.py | 17 + python/pyspark/ml/regression.py | 16 2 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/603a721c/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 603f2c7..4a2982e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -36,7 +36,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif @inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, - HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds): + HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, + HasWeightCol): """ Logistic regression. 
Currently, this class only supports binary classification. @@ -44,9 +45,9 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors >>> df = sc.parallelize([ -... Row(label=1.0, features=Vectors.dense(1.0)), -... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF() ->>> lr = LogisticRegression(maxIter=5, regParam=0.01) +... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), +... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF() +>>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") >>> model = lr.fit(df) >>> model.weights DenseVector([5.5...]) @@ -80,12 +81,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", - rawPredictionCol="rawPrediction", standardization=True): + rawPredictionCol="rawPrediction", standardization=True, weightCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction", standardization=True) + rawPredictionCol="rawPrediction", standardization=True, weightCol=None) If the threshold and thresholds Params are both set, they must be equivalent. 
""" super(LogisticRegression, self).__init__() @@ -105,12 +106,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", - rawPredictionCol="rawPredict
spark git commit: [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol
Repository: spark Updated Branches: refs/heads/branch-1.6 03c2d20dc -> 464b2d421 [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol [SPARK-7685](https://issues.apache.org/jira/browse/SPARK-7685) and [SPARK-9642](https://issues.apache.org/jira/browse/SPARK-9642) already added support for setting a weight column for ```LogisticRegression``` and ```LinearRegression```. It's a very important feature that PySpark should also support. mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #9811 from yanboliang/spark-11820. (cherry picked from commit 603a721c21488e17c15c45ce1de893e6b3d02274) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/464b2d42 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/464b2d42 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/464b2d42 Branch: refs/heads/branch-1.6 Commit: 464b2d4215534761c1a3fc84abc4007d47e391fd Parents: 03c2d20 Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Nov 18 13:32:06 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Nov 18 13:32:12 2015 -0800 -- python/pyspark/ml/classification.py | 17 + python/pyspark/ml/regression.py | 16 2 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/464b2d42/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 603f2c7..4a2982e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -36,7 +36,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif @inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, - HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds): + HasElasticNetParam, HasFitIntercept, 
HasStandardization, HasThresholds, + HasWeightCol): """ Logistic regression. Currently, this class only supports binary classification. @@ -44,9 +45,9 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors >>> df = sc.parallelize([ -... Row(label=1.0, features=Vectors.dense(1.0)), -... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF() ->>> lr = LogisticRegression(maxIter=5, regParam=0.01) +... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), +... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF() +>>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight") >>> model = lr.fit(df) >>> model.weights DenseVector([5.5...]) @@ -80,12 +81,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", - rawPredictionCol="rawPrediction", standardization=True): + rawPredictionCol="rawPrediction", standardization=True, weightCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction", standardization=True) + rawPredictionCol="rawPrediction", standardization=True, weightCol=None) If the threshold and thresholds Params are both set, they must be equivalent. 
""" super(LogisticRegression, self).__init__() @@ -105,12 +106,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
spark git commit: [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector
Repository: spark Updated Branches: refs/heads/branch-1.6 fa9d56f9e -> 78dc07cdf [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector This PR makes the default read/write work with simple transformers/estimators that have params of type `Param[Vector]`. jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #9776 from mengxr/SPARK-11764. (cherry picked from commit 3e9e6380236985ec5b51b459f8c61f964a76ff8b) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78dc07cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78dc07cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78dc07cd Branch: refs/heads/branch-1.6 Commit: 78dc07cdf900cb0837e5a21e35e80af0ca6a7f26 Parents: fa9d56f Author: Xiangrui Meng <m...@databricks.com> Authored: Tue Nov 17 14:04:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 15:22:51 2015 -0800 -- .../org/apache/spark/ml/param/params.scala | 12 +-- .../org/apache/spark/ml/param/ParamsSuite.scala | 22 2 files changed, 28 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/78dc07cd/mllib/src/main/scala/org/apache/spark/ml/param/params.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index c932570..d182b0a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -29,6 +29,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.ml.util.Identifiable +import org.apache.spark.mllib.linalg.{Vector, Vectors} /** * :: DeveloperApi :: @@ -88,9 +89,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali value match { case x: String => compact(render(JString(x))) + 
case v: Vector => +v.toJson case _ => throw new NotImplementedError( - "The default jsonEncode only supports string. " + + "The default jsonEncode only supports string and vector. " + s"${this.getClass.getName} must override jsonEncode for ${value.getClass.getName}.") } } @@ -100,9 +103,14 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali parse(json) match { case JString(x) => x.asInstanceOf[T] + case JObject(v) => +val keys = v.map(_._1) +assert(keys.contains("type") && keys.contains("values"), + s"Expect a JSON serialized vector but cannot find fields 'type' and 'values' in $json.") +Vectors.fromJson(json).asInstanceOf[T] case _ => throw new NotImplementedError( - "The default jsonDecode only supports string. " + + "The default jsonDecode only supports string and vector. " + s"${this.getClass.getName} must override jsonDecode to support its value type.") } } http://git-wip-us.apache.org/repos/asf/spark/blob/78dc07cd/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index eeb03db..a1878be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.param import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} class ParamsSuite extends SparkFunSuite { @@ -80,7 +81,7 @@ class ParamsSuite extends SparkFunSuite { } } -{ // StringParam +{ // Param[String] val param = new Param[String](dummy, "name", "doc") // Currently we do not support null. 
for (value <- Seq("", "1", "abc", "quote\"", "newline\n")) { @@ -89,6 +90,19 @@ class ParamsSuite extends SparkFunSuite { } } +{ // Param[Vector] + val param = new Param[Vector](dummy, "name", "doc") + val values = Seq( +Vectors.dense(Array.empty[Double]), +Vectors.dense(0.0, 2.0), +Vectors.sparse(0, Array.empty, Array.empty), +Vectors.sparse(2, Array(1), Array(2.0))) +
spark git commit: [SPARK-11769][ML] Add save, load to all basic Transformers
Repository: spark Updated Branches: refs/heads/master d92514966 -> d98d1cb00 [SPARK-11769][ML] Add save, load to all basic Transformers This excludes Estimators and ones which include Vector and other non-basic types for Params or data. This adds: * Bucketizer * DCT * HashingTF * Interaction * NGram * Normalizer * OneHotEncoder * PolynomialExpansion * QuantileDiscretizer * RFormula * SQLTransformer * StopWordsRemover * StringIndexer * Tokenizer * VectorAssembler * VectorSlicer CC: mengxr Author: Joseph K. Bradley <jos...@databricks.com> Closes #9755 from jkbradley/transformer-io. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d98d1cb0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d98d1cb0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d98d1cb0 Branch: refs/heads/master Commit: d98d1cb000c8c4e391d73ae86efd09f15e5d165c Parents: d925149 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Nov 17 12:43:56 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 12:43:56 2015 -0800 -- .../org/apache/spark/ml/feature/Binarizer.scala | 8 - .../apache/spark/ml/feature/Bucketizer.scala| 22 .../scala/org/apache/spark/ml/feature/DCT.scala | 19 +-- .../org/apache/spark/ml/feature/HashingTF.scala | 20 +-- .../apache/spark/ml/feature/Interaction.scala | 29 +--- .../org/apache/spark/ml/feature/NGram.scala | 19 +-- .../apache/spark/ml/feature/Normalizer.scala| 20 +-- .../apache/spark/ml/feature/OneHotEncoder.scala | 19 +-- .../spark/ml/feature/PolynomialExpansion.scala | 20 --- .../spark/ml/feature/QuantileDiscretizer.scala | 22 .../spark/ml/feature/SQLTransformer.scala | 27 +-- .../spark/ml/feature/StopWordsRemover.scala | 19 +-- .../apache/spark/ml/feature/StringIndexer.scala | 22 +--- .../org/apache/spark/ml/feature/Tokenizer.scala | 35 +--- .../spark/ml/feature/VectorAssembler.scala | 18 +++--- .../apache/spark/ml/feature/VectorSlicer.scala | 22 
.../spark/ml/feature/BinarizerSuite.scala | 8 ++--- .../spark/ml/feature/BucketizerSuite.scala | 12 +-- .../org/apache/spark/ml/feature/DCTSuite.scala | 11 +- .../spark/ml/feature/HashingTFSuite.scala | 11 +- .../spark/ml/feature/InteractionSuite.scala | 10 +- .../apache/spark/ml/feature/NGramSuite.scala| 11 +- .../spark/ml/feature/NormalizerSuite.scala | 11 +- .../spark/ml/feature/OneHotEncoderSuite.scala | 12 ++- .../ml/feature/PolynomialExpansionSuite.scala | 12 ++- .../ml/feature/QuantileDiscretizerSuite.scala | 13 +++- .../spark/ml/feature/SQLTransformerSuite.scala | 10 +- .../ml/feature/StopWordsRemoverSuite.scala | 14 +++- .../spark/ml/feature/StringIndexerSuite.scala | 13 ++-- .../spark/ml/feature/TokenizerSuite.scala | 25 -- .../spark/ml/feature/VectorAssemblerSuite.scala | 11 +- .../spark/ml/feature/VectorSlicerSuite.scala| 12 ++- 32 files changed, 453 insertions(+), 84 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d98d1cb0/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index e5c2557..e2be654 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ @@ -87,10 +87,16 @@ final class Binarizer(override val uid: String) override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) + @Since("1.6.0") override def write: Writer = new DefaultParamsWriter(this) } +@Since("1.6.0") object Binarizer extends Readable[Binarizer] { + @Since("1.6.0") override def read: Reader[Binarizer] = new 
DefaultParamsReader[Binarizer] + + @Since("1.6.0") + override def load(path: String): Binarizer = read.load(path) } http://git-wip-us.apache.org/repos/asf/spark/blob/d98d1cb0/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feat
spark git commit: [SPARK-11769][ML] Add save, load to all basic Transformers
Repository: spark Updated Branches: refs/heads/branch-1.6 88431fb3e -> e7f901647 [SPARK-11769][ML] Add save, load to all basic Transformers This excludes Estimators and ones which include Vector and other non-basic types for Params or data. This adds: * Bucketizer * DCT * HashingTF * Interaction * NGram * Normalizer * OneHotEncoder * PolynomialExpansion * QuantileDiscretizer * RFormula * SQLTransformer * StopWordsRemover * StringIndexer * Tokenizer * VectorAssembler * VectorSlicer CC: mengxr Author: Joseph K. Bradley <jos...@databricks.com> Closes #9755 from jkbradley/transformer-io. (cherry picked from commit d98d1cb000c8c4e391d73ae86efd09f15e5d165c) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7f90164 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7f90164 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7f90164 Branch: refs/heads/branch-1.6 Commit: e7f90164740d920bbdca06e90098b8bd8b775715 Parents: 88431fb Author: Joseph K. 
Bradley <jos...@databricks.com> Authored: Tue Nov 17 12:43:56 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 12:44:04 2015 -0800 -- .../org/apache/spark/ml/feature/Binarizer.scala | 8 - .../apache/spark/ml/feature/Bucketizer.scala| 22 .../scala/org/apache/spark/ml/feature/DCT.scala | 19 +-- .../org/apache/spark/ml/feature/HashingTF.scala | 20 +-- .../apache/spark/ml/feature/Interaction.scala | 29 +--- .../org/apache/spark/ml/feature/NGram.scala | 19 +-- .../apache/spark/ml/feature/Normalizer.scala| 20 +-- .../apache/spark/ml/feature/OneHotEncoder.scala | 19 +-- .../spark/ml/feature/PolynomialExpansion.scala | 20 --- .../spark/ml/feature/QuantileDiscretizer.scala | 22 .../spark/ml/feature/SQLTransformer.scala | 27 +-- .../spark/ml/feature/StopWordsRemover.scala | 19 +-- .../apache/spark/ml/feature/StringIndexer.scala | 22 +--- .../org/apache/spark/ml/feature/Tokenizer.scala | 35 +--- .../spark/ml/feature/VectorAssembler.scala | 18 +++--- .../apache/spark/ml/feature/VectorSlicer.scala | 22 .../spark/ml/feature/BinarizerSuite.scala | 8 ++--- .../spark/ml/feature/BucketizerSuite.scala | 12 +-- .../org/apache/spark/ml/feature/DCTSuite.scala | 11 +- .../spark/ml/feature/HashingTFSuite.scala | 11 +- .../spark/ml/feature/InteractionSuite.scala | 10 +- .../apache/spark/ml/feature/NGramSuite.scala| 11 +- .../spark/ml/feature/NormalizerSuite.scala | 11 +- .../spark/ml/feature/OneHotEncoderSuite.scala | 12 ++- .../ml/feature/PolynomialExpansionSuite.scala | 12 ++- .../ml/feature/QuantileDiscretizerSuite.scala | 13 +++- .../spark/ml/feature/SQLTransformerSuite.scala | 10 +- .../ml/feature/StopWordsRemoverSuite.scala | 14 +++- .../spark/ml/feature/StringIndexerSuite.scala | 13 ++-- .../spark/ml/feature/TokenizerSuite.scala | 25 -- .../spark/ml/feature/VectorAssemblerSuite.scala | 11 +- .../spark/ml/feature/VectorSlicerSuite.scala| 12 ++- 32 files changed, 453 insertions(+), 84 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/e7f90164/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index e5c2557..e2be654 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ @@ -87,10 +87,16 @@ final class Binarizer(override val uid: String) override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) + @Since("1.6.0") override def write: Writer = new DefaultParamsWriter(this) } +@Since("1.6.0") object Binarizer extends Readable[Binarizer] { + @Since("1.6.0") override def read: Reader[Binarizer] = new DefaultParamsReader[Binarizer] + + @Since("1.6.0") + override def load(path: String): Binarizer = read.load(path) } http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9016
spark git commit: [SPARK-11728] Replace example code in ml-ensembles.md using include_example
Repository: spark Updated Branches: refs/heads/branch-1.6 dbb2ea06d -> 47e58322c [SPARK-11728] Replace example code in ml-ensembles.md using include_example JIRA issue https://issues.apache.org/jira/browse/SPARK-11728. The ml-ensembles.md file contains `OneVsRestExample`. Instead of writing new code files for the two `OneVsRestExample`s, I use two existing files in the examples directory: `OneVsRestExample.scala` and `JavaOneVsRestExample.scala`. Author: Xusen Yin <yinxu...@gmail.com> Closes #9716 from yinxusen/SPARK-11728. (cherry picked from commit 9154f89befb7a33d4853cea95efd7dc6b25d033b) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47e58322 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47e58322 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47e58322 Branch: refs/heads/branch-1.6 Commit: 47e58322cac2f319dec07a97f4eaf38f9fce2407 Parents: dbb2ea0 Author: Xusen Yin <yinxu...@gmail.com> Authored: Tue Nov 17 23:44:06 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 23:44:14 2015 -0800 -- docs/ml-ensembles.md| 754 +-- ...avaGradientBoostedTreeClassifierExample.java | 102 +++ ...JavaGradientBoostedTreeRegressorExample.java | 90 +++ .../spark/examples/ml/JavaOneVsRestExample.java | 4 + .../ml/JavaRandomForestClassifierExample.java | 101 +++ .../ml/JavaRandomForestRegressorExample.java| 90 +++ .../gradient_boosted_tree_classifier_example.py | 77 ++ .../gradient_boosted_tree_regressor_example.py | 74 ++ .../ml/random_forest_classifier_example.py | 77 ++ .../ml/random_forest_regressor_example.py | 74 ++ .../GradientBoostedTreeClassifierExample.scala | 97 +++ .../GradientBoostedTreeRegressorExample.scala | 85 +++ .../spark/examples/ml/OneVsRestExample.scala| 4 + .../ml/RandomForestClassifierExample.scala | 97 +++ .../ml/RandomForestRegressorExample.scala | 84 +++ 15 files changed, 1070 
insertions(+), 740 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/47e58322/docs/ml-ensembles.md -- diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index ce15f5e..f6c3c30 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -115,194 +115,21 @@ We use two feature transformers to prepare the data; these help index categories Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details. -{% highlight scala %} -import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.classification.RandomForestClassifier -import org.apache.spark.ml.classification.RandomForestClassificationModel -import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator - -// Load and parse the data file, converting it to a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -// Index labels, adding metadata to the label column. -// Fit on whole dataset to include all labels in index. -val labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data) -// Automatically identify categorical features, and index them. -// Set maxCategories so features with > 4 distinct values are treated as continuous. -val featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data) - -// Split the data into training and test sets (30% held out for testing) -val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) - -// Train a RandomForest model. -val rf = new RandomForestClassifier() - .setLabelCol("indexedLabel") - .setFeaturesCol("indexedFeatures") - .setNumTrees(10) - -// Convert indexed labels back to original labels. 
-val labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels) - -// Chain indexers and forest in a Pipeline -val pipeline = new Pipeline() - .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) - -// Train model. This also runs the indexers. -val model = pipeline.fit(trainingData) - -// Make predictions. -val predictions = model.transform(testData) - -// Select example rows to display. -predictions.select("predictedLabel", "label", "features").show(5) - -// Select (prediction, true label) and compute test error -val eva
spark git commit: [SPARK-11728] Replace example code in ml-ensembles.md using include_example
Repository: spark Updated Branches: refs/heads/master 2f191c66b -> 9154f89be [SPARK-11728] Replace example code in ml-ensembles.md using include_example JIRA issue https://issues.apache.org/jira/browse/SPARK-11728. The ml-ensembles.md file contains `OneVsRestExample`. Instead of writing new code files for the two `OneVsRestExample`s, I use two existing files in the examples directory: `OneVsRestExample.scala` and `JavaOneVsRestExample.scala`. Author: Xusen Yin <yinxu...@gmail.com> Closes #9716 from yinxusen/SPARK-11728. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9154f89b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9154f89b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9154f89b Branch: refs/heads/master Commit: 9154f89befb7a33d4853cea95efd7dc6b25d033b Parents: 2f191c6 Author: Xusen Yin <yinxu...@gmail.com> Authored: Tue Nov 17 23:44:06 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 23:44:06 2015 -0800 -- docs/ml-ensembles.md| 754 +-- ...avaGradientBoostedTreeClassifierExample.java | 102 +++ ...JavaGradientBoostedTreeRegressorExample.java | 90 +++ .../spark/examples/ml/JavaOneVsRestExample.java | 4 + .../ml/JavaRandomForestClassifierExample.java | 101 +++ .../ml/JavaRandomForestRegressorExample.java| 90 +++ .../gradient_boosted_tree_classifier_example.py | 77 ++ .../gradient_boosted_tree_regressor_example.py | 74 ++ .../ml/random_forest_classifier_example.py | 77 ++ .../ml/random_forest_regressor_example.py | 74 ++ .../GradientBoostedTreeClassifierExample.scala | 97 +++ .../GradientBoostedTreeRegressorExample.scala | 85 +++ .../spark/examples/ml/OneVsRestExample.scala| 4 + .../ml/RandomForestClassifierExample.scala | 97 +++ .../ml/RandomForestRegressorExample.scala | 84 +++ 15 files changed, 1070 insertions(+), 740 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9154f89b/docs/ml-ensembles.md -- diff --git 
a/docs/ml-ensembles.md b/docs/ml-ensembles.md index ce15f5e..f6c3c30 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -115,194 +115,21 @@ We use two feature transformers to prepare the data; these help index categories Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details. -{% highlight scala %} -import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.classification.RandomForestClassifier -import org.apache.spark.ml.classification.RandomForestClassificationModel -import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer} -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator - -// Load and parse the data file, converting it to a DataFrame. -val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -// Index labels, adding metadata to the label column. -// Fit on whole dataset to include all labels in index. -val labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data) -// Automatically identify categorical features, and index them. -// Set maxCategories so features with > 4 distinct values are treated as continuous. -val featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data) - -// Split the data into training and test sets (30% held out for testing) -val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) - -// Train a RandomForest model. -val rf = new RandomForestClassifier() - .setLabelCol("indexedLabel") - .setFeaturesCol("indexedFeatures") - .setNumTrees(10) - -// Convert indexed labels back to original labels. 
-val labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels) - -// Chain indexers and forest in a Pipeline -val pipeline = new Pipeline() - .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) - -// Train model. This also runs the indexers. -val model = pipeline.fit(trainingData) - -// Make predictions. -val predictions = model.transform(testData) - -// Select example rows to display. -predictions.select("predictedLabel", "label", "features").show(5) - -// Select (prediction, true label) and compute test error -val evaluator = new MulticlassClassificationEvaluator() - .setLabelCol("indexedLabel") - .setPredictionCol("prediction&qu
spark git commit: [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector
Repository: spark Updated Branches: refs/heads/master 6eb7008b7 -> 3e9e63802 [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector This PR makes the default read/write work with simple transformers/estimators that have params of type `Param[Vector]`. jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #9776 from mengxr/SPARK-11764. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e9e6380 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e9e6380 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e9e6380 Branch: refs/heads/master Commit: 3e9e6380236985ec5b51b459f8c61f964a76ff8b Parents: 6eb7008 Author: Xiangrui Meng <m...@databricks.com> Authored: Tue Nov 17 14:04:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 14:04:49 2015 -0800 -- .../org/apache/spark/ml/param/params.scala | 12 +-- .../org/apache/spark/ml/param/ParamsSuite.scala | 22 2 files changed, 28 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e9e6380/mllib/src/main/scala/org/apache/spark/ml/param/params.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index c932570..d182b0a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -29,6 +29,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.ml.util.Identifiable +import org.apache.spark.mllib.linalg.{Vector, Vectors} /** * :: DeveloperApi :: @@ -88,9 +89,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali value match { case x: String => compact(render(JString(x))) + case v: Vector => +v.toJson case _ => throw new NotImplementedError( - "The default jsonEncode only supports string. 
" + + "The default jsonEncode only supports string and vector. " + s"${this.getClass.getName} must override jsonEncode for ${value.getClass.getName}.") } } @@ -100,9 +103,14 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali parse(json) match { case JString(x) => x.asInstanceOf[T] + case JObject(v) => +val keys = v.map(_._1) +assert(keys.contains("type") && keys.contains("values"), + s"Expect a JSON serialized vector but cannot find fields 'type' and 'values' in $json.") +Vectors.fromJson(json).asInstanceOf[T] case _ => throw new NotImplementedError( - "The default jsonDecode only supports string. " + + "The default jsonDecode only supports string and vector. " + s"${this.getClass.getName} must override jsonDecode to support its value type.") } } http://git-wip-us.apache.org/repos/asf/spark/blob/3e9e6380/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index eeb03db..a1878be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.param import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.{Vector, Vectors} class ParamsSuite extends SparkFunSuite { @@ -80,7 +81,7 @@ class ParamsSuite extends SparkFunSuite { } } -{ // StringParam +{ // Param[String] val param = new Param[String](dummy, "name", "doc") // Currently we do not support null. 
for (value <- Seq("", "1", "abc", "quote\"", "newline\n")) { @@ -89,6 +90,19 @@ class ParamsSuite extends SparkFunSuite { } } +{ // Param[Vector] + val param = new Param[Vector](dummy, "name", "doc") + val values = Seq( +Vectors.dense(Array.empty[Double]), +Vectors.dense(0.0, 2.0), +Vectors.sparse(0, Array.empty, Array.empty), +Vectors.sparse(2, Array(1), Array(2.0))) + for (value <- values) { +val json = param.jsonEncode(value) +assert(param.jsonDecode(json) === value) +
spark git commit: [SPARK-11763][ML] Add save, load to LogisticRegression Estimator
Repository: spark Updated Branches: refs/heads/master 328eb49e6 -> 6eb7008b7 [SPARK-11763][ML] Add save,load to LogisticRegression Estimator Add save/load to LogisticRegression Estimator, and refactor tests a little to make it easier to add similar support to other Estimator, Model pairs. Moved LogisticRegressionReader/Writer to within LogisticRegressionModel CC: mengxr Author: Joseph K. Bradley <jos...@databricks.com> Closes #9749 from jkbradley/lr-io-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6eb7008b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6eb7008b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6eb7008b Branch: refs/heads/master Commit: 6eb7008b7f33a36b06d0615b68cc21ed90ad1d8a Parents: 328eb49 Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Nov 17 14:03:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 14:03:49 2015 -0800 -- .../ml/classification/LogisticRegression.scala | 91 +++- .../org/apache/spark/ml/util/ReadWrite.scala| 1 + .../org/apache/spark/ml/PipelineSuite.scala | 7 -- .../ml/classification/ClassifierSuite.scala | 32 +++ .../LogisticRegressionSuite.scala | 37 ++-- .../ProbabilisticClassifierSuite.scala | 14 +++ .../spark/ml/util/DefaultReadWriteTest.scala| 50 ++- 7 files changed, 173 insertions(+), 59 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6eb7008b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a88f526..71c2533 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -157,7 +157,7 @@ private[classification] trait 
LogisticRegressionParams extends ProbabilisticClas @Experimental class LogisticRegression(override val uid: String) extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] - with LogisticRegressionParams with Logging { + with LogisticRegressionParams with Writable with Logging { def this() = this(Identifiable.randomUID("logreg")) @@ -385,6 +385,12 @@ class LogisticRegression(override val uid: String) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) + + override def write: Writer = new DefaultParamsWriter(this) +} + +object LogisticRegression extends Readable[LogisticRegression] { + override def read: Reader[LogisticRegression] = new DefaultParamsReader[LogisticRegression] } /** @@ -517,61 +523,62 @@ class LogisticRegressionModel private[ml] ( * * For [[LogisticRegressionModel]], this does NOT currently save the training [[summary]]. * An option to save [[summary]] may be added in the future. + * + * This also does not save the [[parent]] currently. 
*/ - override def write: Writer = new LogisticRegressionWriter(this) -} - - -/** [[Writer]] instance for [[LogisticRegressionModel]] */ -private[classification] class LogisticRegressionWriter(instance: LogisticRegressionModel) - extends Writer with Logging { - - private case class Data( - numClasses: Int, - numFeatures: Int, - intercept: Double, - coefficients: Vector) - - override protected def saveImpl(path: String): Unit = { -// Save metadata and Params -DefaultParamsWriter.saveMetadata(instance, path, sc) -// Save model data: numClasses, numFeatures, intercept, coefficients -val data = Data(instance.numClasses, instance.numFeatures, instance.intercept, - instance.coefficients) -val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(Seq(data)).write.format("parquet").save(dataPath) - } + override def write: Writer = new LogisticRegressionModel.LogisticRegressionModelWriter(this) } object LogisticRegressionModel extends Readable[LogisticRegressionModel] { - override def read: Reader[LogisticRegressionModel] = new LogisticRegressionReader + override def read: Reader[LogisticRegressionModel] = new LogisticRegressionModelReader override def load(path: String): LogisticRegressionModel = read.load(path) -} + /** [[Writer]] instance for [[LogisticRegressionModel]] */ + private[classification] class LogisticRegressionModelWriter(instance: LogisticRegressionModel) +extends Writer with Logging { + +private case class Data( +numClasses: Int, +
spark git commit: [SPARK-11763][ML] Add save, load to LogisticRegression Estimator
Repository: spark Updated Branches: refs/heads/branch-1.6 c0ada5b93 -> a529427a1 [SPARK-11763][ML] Add save,load to LogisticRegression Estimator Add save/load to LogisticRegression Estimator, and refactor tests a little to make it easier to add similar support to other Estimator, Model pairs. Moved LogisticRegressionReader/Writer to within LogisticRegressionModel CC: mengxr Author: Joseph K. Bradley <jos...@databricks.com> Closes #9749 from jkbradley/lr-io-2. (cherry picked from commit 6eb7008b7f33a36b06d0615b68cc21ed90ad1d8a) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a529427a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a529427a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a529427a Branch: refs/heads/branch-1.6 Commit: a529427a1d819ceb68fb2d7ab2c61f12bdaf0273 Parents: c0ada5b Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Nov 17 14:03:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 14:03:56 2015 -0800 -- .../ml/classification/LogisticRegression.scala | 91 +++- .../org/apache/spark/ml/util/ReadWrite.scala| 1 + .../org/apache/spark/ml/PipelineSuite.scala | 7 -- .../ml/classification/ClassifierSuite.scala | 32 +++ .../LogisticRegressionSuite.scala | 37 ++-- .../ProbabilisticClassifierSuite.scala | 14 +++ .../spark/ml/util/DefaultReadWriteTest.scala| 50 ++- 7 files changed, 173 insertions(+), 59 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a529427a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a88f526..71c2533 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -157,7 +157,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas @Experimental class LogisticRegression(override val uid: String) extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] - with LogisticRegressionParams with Logging { + with LogisticRegressionParams with Writable with Logging { def this() = this(Identifiable.randomUID("logreg")) @@ -385,6 +385,12 @@ class LogisticRegression(override val uid: String) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) + + override def write: Writer = new DefaultParamsWriter(this) +} + +object LogisticRegression extends Readable[LogisticRegression] { + override def read: Reader[LogisticRegression] = new DefaultParamsReader[LogisticRegression] } /** @@ -517,61 +523,62 @@ class LogisticRegressionModel private[ml] ( * * For [[LogisticRegressionModel]], this does NOT currently save the training [[summary]]. * An option to save [[summary]] may be added in the future. + * + * This also does not save the [[parent]] currently. 
*/ - override def write: Writer = new LogisticRegressionWriter(this) -} - - -/** [[Writer]] instance for [[LogisticRegressionModel]] */ -private[classification] class LogisticRegressionWriter(instance: LogisticRegressionModel) - extends Writer with Logging { - - private case class Data( - numClasses: Int, - numFeatures: Int, - intercept: Double, - coefficients: Vector) - - override protected def saveImpl(path: String): Unit = { -// Save metadata and Params -DefaultParamsWriter.saveMetadata(instance, path, sc) -// Save model data: numClasses, numFeatures, intercept, coefficients -val data = Data(instance.numClasses, instance.numFeatures, instance.intercept, - instance.coefficients) -val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(Seq(data)).write.format("parquet").save(dataPath) - } + override def write: Writer = new LogisticRegressionModel.LogisticRegressionModelWriter(this) } object LogisticRegressionModel extends Readable[LogisticRegressionModel] { - override def read: Reader[LogisticRegressionModel] = new LogisticRegressionReader + override def read: Reader[LogisticRegressionModel] = new LogisticRegressionModelReader override def load(path: String): LogisticRegressionModel = read.load(path) -} + /** [[Writer]] instance for [[LogisticRegressionModel]] */ + private[classification] class LogisticRegressionModelWrite
spark git commit: [SPARK-11729] Replace example code in ml-linear-methods.md using include_example
Repository: spark Updated Branches: refs/heads/branch-1.6 b468f8171 -> c0ada5b93 [SPARK-11729] Replace example code in ml-linear-methods.md using include_example JIRA link: https://issues.apache.org/jira/browse/SPARK-11729 Author: Xusen Yin <yinxu...@gmail.com> Closes #9713 from yinxusen/SPARK-11729. (cherry picked from commit 328eb49e671337e09188853b29c8f32fb157) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0ada5b9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0ada5b9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0ada5b9 Branch: refs/heads/branch-1.6 Commit: c0ada5b932c180c0ba72d30463bd4a8a622c8c65 Parents: b468f81 Author: Xusen Yin <yinxu...@gmail.com> Authored: Tue Nov 17 13:59:59 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 14:00:08 2015 -0800 -- docs/ml-linear-methods.md | 218 +-- ...vaLinearRegressionWithElasticNetExample.java | 65 ++ .../JavaLogisticRegressionSummaryExample.java | 84 +++ ...LogisticRegressionWithElasticNetExample.java | 55 + .../ml/linear_regression_with_elastic_net.py| 44 .../ml/logistic_regression_with_elastic_net.py | 44 .../LinearRegressionWithElasticNetExample.scala | 61 ++ .../ml/LogisticRegressionSummaryExample.scala | 77 +++ ...ogisticRegressionWithElasticNetExample.scala | 53 + 9 files changed, 491 insertions(+), 210 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0ada5b9/docs/ml-linear-methods.md -- diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 85edfd3..0c13d7d 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -57,77 +57,15 @@ $\alpha$ and `regParam` corresponds to $\lambda$. 
-{% highlight scala %} -import org.apache.spark.ml.classification.LogisticRegression - -// Load training data -val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -val lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8) - -// Fit the model -val lrModel = lr.fit(training) - -// Print the coefficients and intercept for logistic regression -println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala %} -{% highlight java %} -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; - -public class LogisticRegressionWithElasticNetExample { - public static void main(String[] args) { -SparkConf conf = new SparkConf() - .setAppName("Logistic Regression with Elastic Net Example"); - -SparkContext sc = new SparkContext(conf); -SQLContext sql = new SQLContext(sc); -String path = "data/mllib/sample_libsvm_data.txt"; - -// Load training data -DataFrame training = sqlContext.read().format("libsvm").load(path); - -LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - -// Fit the model -LogisticRegressionModel lrModel = lr.fit(training); - -// Print the coefficients and intercept for logistic regression -System.out.println("Coefficients: " + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); - } -} -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java %} -{% highlight python %} -from pyspark.ml.classification import LogisticRegression - -# Load training data -training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) - -# Fit the model -lrModel = lr.fit(training) - -# Print the coefficients and intercept for logistic regression -print("Coefficients: " + str(lrModel.coefficients)) -print("Intercept: " + str(lrModel.intercept)) -{% endhighlight %} +{% include_example python/ml/logistic_regression_with_elastic_net.py %} @@ -152,33 +90,7 @@ This will likely change when multiclass classification is supported. Continuing the earlier example: -{% highlight scala %} -import org.apache.spark.ml.classification.BinaryLogistic
spark git commit: [SPARK-11729] Replace example code in ml-linear-methods.md using include_example
Repository: spark Updated Branches: refs/heads/master fa603e08d -> 328eb49e6 [SPARK-11729] Replace example code in ml-linear-methods.md using include_example JIRA link: https://issues.apache.org/jira/browse/SPARK-11729 Author: Xusen Yin <yinxu...@gmail.com> Closes #9713 from yinxusen/SPARK-11729. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/328eb49e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/328eb49e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/328eb49e Branch: refs/heads/master Commit: 328eb49e671337e09188853b29c8f32fb157 Parents: fa603e0 Author: Xusen Yin <yinxu...@gmail.com> Authored: Tue Nov 17 13:59:59 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 13:59:59 2015 -0800 -- docs/ml-linear-methods.md | 218 +-- ...vaLinearRegressionWithElasticNetExample.java | 65 ++ .../JavaLogisticRegressionSummaryExample.java | 84 +++ ...LogisticRegressionWithElasticNetExample.java | 55 + .../ml/linear_regression_with_elastic_net.py| 44 .../ml/logistic_regression_with_elastic_net.py | 44 .../LinearRegressionWithElasticNetExample.scala | 61 ++ .../ml/LogisticRegressionSummaryExample.scala | 77 +++ ...ogisticRegressionWithElasticNetExample.scala | 53 + 9 files changed, 491 insertions(+), 210 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/328eb49e/docs/ml-linear-methods.md -- diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 85edfd3..0c13d7d 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -57,77 +57,15 @@ $\alpha$ and `regParam` corresponds to $\lambda$. 
-{% highlight scala %} -import org.apache.spark.ml.classification.LogisticRegression - -// Load training data -val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -val lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8) - -// Fit the model -val lrModel = lr.fit(training) - -// Print the coefficients and intercept for logistic regression -println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala %} -{% highlight java %} -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; - -public class LogisticRegressionWithElasticNetExample { - public static void main(String[] args) { -SparkConf conf = new SparkConf() - .setAppName("Logistic Regression with Elastic Net Example"); - -SparkContext sc = new SparkContext(conf); -SQLContext sql = new SQLContext(sc); -String path = "data/mllib/sample_libsvm_data.txt"; - -// Load training data -DataFrame training = sqlContext.read().format("libsvm").load(path); - -LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - -// Fit the model -LogisticRegressionModel lrModel = lr.fit(training); - -// Print the coefficients and intercept for logistic regression -System.out.println("Coefficients: " + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); - } -} -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java %} -{% highlight python %} -from pyspark.ml.classification import LogisticRegression - -# Load training data -training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") - -lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) - -# Fit the model -lrModel = lr.fit(training) - -# Print the coefficients and intercept for logistic regression -print("Coefficients: " + str(lrModel.coefficients)) -print("Intercept: " + str(lrModel.intercept)) -{% endhighlight %} +{% include_example python/ml/logistic_regression_with_elastic_net.py %} @@ -152,33 +90,7 @@ This will likely change when multiclass classification is supported. Continuing the earlier example: -{% highlight scala %} -import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary - -// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier exam
spark git commit: [SPARK-7013][ML][TEST] Add unit test for spark.ml StandardScaler
Repository: spark Updated Branches: refs/heads/branch-1.6 737f07172 -> 3f63f08f9 [SPARK-7013][ML][TEST] Add unit test for spark.ml StandardScaler I have added a unit test for ML's StandardScaler that compares its results with R's output. Please review. Thanks. Author: RoyGaoVLIS <roy...@zju.edu.cn> Closes #6665 from RoyGao/7013. (cherry picked from commit 67a5132c21bc8338adbae80b33b85e8fa0ddda34) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f63f08f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f63f08f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f63f08f Branch: refs/heads/branch-1.6 Commit: 3f63f08f9db6073ef9b6318ba20ebfbd1bbd263a Parents: 737f071 Author: RoyGaoVLIS <roy...@zju.edu.cn> Authored: Tue Nov 17 23:00:49 2015 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Nov 17 23:01:03 2015 -0800 -- .../spark/ml/feature/StandardScalerSuite.scala | 108 +++ 1 file changed, 108 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3f63f08f/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala new file mode 100644 index 000..879a3ae --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + +class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext{ + + @transient var data: Array[Vector] = _ + @transient var resWithStd: Array[Vector] = _ + @transient var resWithMean: Array[Vector] = _ + @transient var resWithBoth: Array[Vector] = _ + + override def beforeAll(): Unit = { +super.beforeAll() + +data = Array( + Vectors.dense(-2.0, 2.3, 0.0), + Vectors.dense(0.0, -5.1, 1.0), + Vectors.dense(1.7, -0.6, 3.3) +) +resWithMean = Array( + Vectors.dense(-1.9, 3.4333, -1.4333), + Vectors.dense(0.1, -3.9667, -0.4333), + Vectors.dense(1.8, 0.5333, 1.8667) +) +resWithStd = Array( + Vectors.dense(-1.079898494312, 0.616834091415, 0.0), + Vectors.dense(0.0, -1.367762550529, 0.590968109266), + Vectors.dense(0.917913720165, -0.160913241239, 1.950194760579) +) +resWithBoth = Array( + Vectors.dense(-1.0259035695965, 0.920781324866, -0.8470542899497), + Vectors.dense(0.0539949247156, -1.063815317078, -0.256086180682), + Vectors.dense(0.9719086448809, 0.143033992212, 1.103140470631) +) + } + + def assertResult(dataframe: DataFrame): Unit = { +dataframe.select("standarded_features", "expected").collect().foreach { + case Row(vector1: Vector, vector2: Vector) => +assert(vector1 ~== vector2 absTol 1E-5, + "The vector 
value is not correct after standardization.") +} + } + + test("Standardization with default parameter") { +val df0 = sqlContext.createDataFrame(data.zip(resWithStd)).toDF("features", "expected") + +val standardscaler0 = new StandardScaler() + .setInputCol("features") + .setOutputCol("standarded_features") + .fit(df0) + +assertResult(standardscaler0.transform(df0)) + } + + test("Standardization with setter") { +val df1 = sqlContext.createDataFrame(data.zip(resWithBot