from:"meng"

spark git commit: [SPARK-6519][ML] Add spark.ml API for bisecting k-means

2016-01-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 8e4f894e9 -> 9376ae723


[SPARK-6519][ML] Add spark.ml API for bisecting k-means

Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>

Closes #9604 from yu-iskw/SPARK-6519.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9376ae72
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9376ae72
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9376ae72

Branch: refs/heads/master
Commit: 9376ae723e4ec0515120c488541617a0538f8879
Parents: 8e4f894
Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Authored: Wed Jan 20 10:48:10 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jan 20 10:48:10 2016 -0800

--
 .../spark/ml/clustering/BisectingKMeans.scala   | 196 +++
 .../ml/clustering/BisectingKMeansSuite.scala|  85 
 2 files changed, 281 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9376ae72/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
new file mode 100644
index 000..0b47cbb
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params}
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.clustering.
+  {BisectingKMeans => MLlibBisectingKMeans, BisectingKMeansModel => 
MLlibBisectingKMeansModel}
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.{IntegerType, StructType}
+
+
+/**
+ * Common params for BisectingKMeans and BisectingKMeansModel
+ */
+private[clustering] trait BisectingKMeansParams extends Params
+  with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol {
+
+  /**
+   * Set the number of clusters to create (k). Must be > 1. Default: 2.
+   * @group param
+   */
+  @Since("2.0.0")
+  final val k = new IntParam(this, "k", "number of clusters to create", (x: 
Int) => x > 1)
+
+  /** @group getParam */
+  @Since("2.0.0")
+  def getK: Int = $(k)
+
+  /** @group expertParam */
+  @Since("2.0.0")
+  final val minDivisibleClusterSize = new Param[Double](
+this,
+"minDivisibleClusterSize",
+"the minimum number of points (if >= 1.0) or the minimum proportion",
+(value: Double) => value > 0)
+
+  /** @group expertGetParam */
+  @Since("2.0.0")
+  def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize)
+
+  /**
+   * Validates and transforms the input schema.
+   * @param schema input schema
+   * @return output schema
+   */
+  protected def validateAndTransformSchema(schema: StructType): StructType = {
+SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT)
+SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Model fitted by BisectingKMeans.
+ *
+ * @param parentModel a model trained by 
spark.mllib.clustering.BisectingKMeans.
+ */
+@Since("2.0.0")
+@Experimental
+class BisectingKMeansModel private[ml] (
+@Since("2.0.0") override val uid: String,
+private val parentModel: MLlibBisectingKMeansModel
+  ) extends Model[BisectingKMeansModel] with BisectingKMeansParams {
+
+  @Since("2.0.0")
+  override def copy(extra: ParamMap): BisectingKMe

spark git commit: [SPARK-12230][ML] WeightedLeastSquares.fit() should handle division by zero properly if standard deviation of target variable is zero.

2016-01-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9bb35c5b5 -> 9753835cf


[SPARK-12230][ML] WeightedLeastSquares.fit() should handle division by zero 
properly if standard deviation of target variable is zero.

This fixes the behavior of WeightedLeastSquares.fit() when the standard 
deviation of the target variable is zero. If the fitIntercept is true, there is 
no need to train.

Author: Imran Younus <iyou...@us.ibm.com>

Closes #10274 from iyounus/SPARK-12230_bug_fix_in_weighted_least_squares.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9753835c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9753835c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9753835c

Branch: refs/heads/master
Commit: 9753835cf3acc135e61bf668223046e29306c80d
Parents: 9bb35c5
Author: Imran Younus <iyou...@us.ibm.com>
Authored: Wed Jan 20 11:16:59 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jan 20 11:16:59 2016 -0800

--
 .../spark/ml/optim/WeightedLeastSquares.scala   | 21 +-
 .../ml/optim/WeightedLeastSquaresSuite.scala| 69 ++--
 2 files changed, 83 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9753835c/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala 
b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
index 8617722..797870e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
@@ -86,6 +86,24 @@ private[ml] class WeightedLeastSquares(
 val aaBar = summary.aaBar
 val aaValues = aaBar.values
 
+if (bStd == 0) {
+  if (fitIntercept) {
+logWarning(s"The standard deviation of the label is zero, so the 
coefficients will be " +
+  s"zeros and the intercept will be the mean of the label; as a 
result, " +
+  s"training is not needed.")
+val coefficients = new DenseVector(Array.ofDim(k-1))
+val intercept = bBar
+val diagInvAtWA = new DenseVector(Array(0D))
+return new WeightedLeastSquaresModel(coefficients, intercept, 
diagInvAtWA)
+  } else {
+require(!(regParam > 0.0 && standardizeLabel),
+  "The standard deviation of the label is zero. " +
+"Model cannot be regularized with standardization=true")
+logWarning(s"The standard deviation of the label is zero. " +
+  "Consider setting fitIntercept=true.")
+  }
+}
+
 // add regularization to diagonals
 var i = 0
 var j = 2
@@ -94,8 +112,7 @@ private[ml] class WeightedLeastSquares(
   if (standardizeFeatures) {
 lambda *= aVar(j - 2)
   }
-  if (standardizeLabel) {
-// TODO: handle the case when bStd = 0
+  if (standardizeLabel && bStd != 0) {
 lambda /= bStd
   }
   aaValues(i) += lambda

http://git-wip-us.apache.org/repos/asf/spark/blob/9753835c/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
index b542ba3..0b58a98 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
@@ -27,6 +27,7 @@ import org.apache.spark.rdd.RDD
 class WeightedLeastSquaresSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 
   private var instances: RDD[Instance] = _
+  private var instancesConstLabel: RDD[Instance] = _
 
   override def beforeAll(): Unit = {
 super.beforeAll()
@@ -43,6 +44,20 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with 
MLlibTestSparkContext
   Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)),
   Instance(29.0, 4.0, Vectors.dense(3.0, 13.0))
 ), 2)
+
+/*
+   R code:
+
+   A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2)
+   b.const <- c(17, 17, 17, 17)
+   w <- c(1, 2, 3, 4)
+ */
+instancesConstLabel = sc.parallelize(Seq(
+  Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
+  Instance(17.0, 2.0, Vectors.dense(1.0, 7.0)),
+  Instance(17.0, 3.0, Vectors.dense(2.0, 11.0)),
+  Instance(17.0, 4.0, Vectors.dense(3.0, 13.0))
+), 2)
   }
 
   test("WLS against lm&

spark git commit: [SPARK-11295][PYSPARK] Add packages to JUnit output for Python tests

2016-01-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9376ae723 -> 9bb35c5b5


[SPARK-11295][PYSPARK] Add packages to JUnit output for Python tests

This is #9263 from gliptak (improving grouping/display of test case results) 
with a small fix of bisecting k-means unit test.

Author: Gábor Lipták <glip...@gmail.com>
Author: Xiangrui Meng <m...@databricks.com>

Closes #10850 from mengxr/SPARK-11295.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bb35c5b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bb35c5b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bb35c5b

Branch: refs/heads/master
Commit: 9bb35c5b59e58dbebbdc6856d611bff73dd35a91
Parents: 9376ae7
Author: Gábor Lipták <glip...@gmail.com>
Authored: Wed Jan 20 11:11:10 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jan 20 11:11:10 2016 -0800

--
 python/pyspark/ml/tests.py|  1 +
 python/pyspark/mllib/tests.py | 26 +++---
 python/pyspark/sql/tests.py   |  1 +
 python/pyspark/streaming/tests.py |  1 +
 python/pyspark/tests.py   |  1 +
 5 files changed, 19 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9bb35c5b/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 4eb17bf..9ea639d 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -394,6 +394,7 @@ class CrossValidatorTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
+from pyspark.ml.tests import *
 if xmlrunner:
 
unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
 else:

http://git-wip-us.apache.org/repos/asf/spark/blob/9bb35c5b/python/pyspark/mllib/tests.py
--
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 32ed48e..79ce495 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -77,21 +77,24 @@ except:
 pass
 
 ser = PickleSerializer()
-sc = SparkContext('local[4]', "MLlib tests")
 
 
 class MLlibTestCase(unittest.TestCase):
 def setUp(self):
-self.sc = sc
+self.sc = SparkContext('local[4]', "MLlib tests")
+
+def tearDown(self):
+self.sc.stop()
 
 
 class MLLibStreamingTestCase(unittest.TestCase):
 def setUp(self):
-self.sc = sc
+self.sc = SparkContext('local[4]', "MLlib tests")
 self.ssc = StreamingContext(self.sc, 1.0)
 
 def tearDown(self):
 self.ssc.stop(False)
+self.sc.stop()
 
 @staticmethod
 def _eventually(condition, timeout=30.0, catch_assertions=False):
@@ -423,7 +426,7 @@ class ListTests(MLlibTestCase):
 from pyspark.mllib.clustering import BisectingKMeans
 data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
 bskm = BisectingKMeans()
-model = bskm.train(sc.parallelize(data, 2), k=4)
+model = bskm.train(self.sc.parallelize(data, 2), k=4)
 p = array([0.0, 0.0])
 rdd_p = self.sc.parallelize([p])
 self.assertEqual(model.predict(p), model.predict(rdd_p).first())
@@ -1166,7 +1169,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 clusterWeights=[1.0, 1.0, 1.0, 1.0])
 
 predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, 
-1.5]]]
-predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
+predict_data = [self.sc.parallelize(batch, 1) for batch in 
predict_data]
 predict_stream = self.ssc.queueStream(predict_data)
 predict_val = stkm.predictOn(predict_stream)
 
@@ -1197,7 +1200,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 # classification based in the initial model would have been 0
 # proving that the model is updated.
 batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
-batches = [sc.parallelize(batch) for batch in batches]
+batches = [self.sc.parallelize(batch) for batch in batches]
 input_stream = self.ssc.queueStream(batches)
 predict_results = []
 
@@ -1230,7 +1233,7 @@ class LinearDataGeneratorTests(MLlibTestCase):
 self.assertEqual(len(point.features), 3)
 
 linear_data = LinearDataGenerator.generateLinearRDD(
-sc=sc, nexamples=6, nfeatures=2, eps=0.1,
+sc=self.sc, nexamples=6, nfeatures=2, eps=0.1,
 nParts=2, intercept=0.0).collect()
 self.assertEqual(len(linear_data), 6)
 for point in linear_data:
@@ -1406,7 +1409,7 @@ class 
StreamingLinearRegressionWithTests(MLLib

spark git commit: Revert "[SPARK-11295] Add packages to JUnit output for Python tests"

2016-01-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3ac648289 -> beda90142


Revert "[SPARK-11295] Add packages to JUnit output for Python tests"

This reverts commit c6f971b4aeca7265ab374fa46c5c452461d9b6a7.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/beda9014
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/beda9014
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/beda9014

Branch: refs/heads/master
Commit: beda9014220be77dd735e6af1903e7d93dceb110
Parents: 3ac6482
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Jan 19 16:51:17 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jan 19 16:51:17 2016 -0800

--
 python/pyspark/ml/tests.py|  1 -
 python/pyspark/mllib/tests.py | 24 ++--
 python/pyspark/sql/tests.py   |  1 -
 python/pyspark/streaming/tests.py |  1 -
 python/pyspark/tests.py   |  1 -
 5 files changed, 10 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/beda9014/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 9ea639d..4eb17bf 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -394,7 +394,6 @@ class CrossValidatorTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
-from pyspark.ml.tests import *
 if xmlrunner:
 
unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
 else:

http://git-wip-us.apache.org/repos/asf/spark/blob/beda9014/python/pyspark/mllib/tests.py
--
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index ea7d297..32ed48e 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -77,24 +77,21 @@ except:
 pass
 
 ser = PickleSerializer()
+sc = SparkContext('local[4]', "MLlib tests")
 
 
 class MLlibTestCase(unittest.TestCase):
 def setUp(self):
-self.sc = SparkContext('local[4]', "MLlib tests")
-
-def tearDown(self):
-self.sc.stop()
+self.sc = sc
 
 
 class MLLibStreamingTestCase(unittest.TestCase):
 def setUp(self):
-self.sc = SparkContext('local[4]', "MLlib tests")
+self.sc = sc
 self.ssc = StreamingContext(self.sc, 1.0)
 
 def tearDown(self):
 self.ssc.stop(False)
-self.sc.stop()
 
 @staticmethod
 def _eventually(condition, timeout=30.0, catch_assertions=False):
@@ -1169,7 +1166,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 clusterWeights=[1.0, 1.0, 1.0, 1.0])
 
 predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, 
-1.5]]]
-predict_data = [self.sc.parallelize(batch, 1) for batch in 
predict_data]
+predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
 predict_stream = self.ssc.queueStream(predict_data)
 predict_val = stkm.predictOn(predict_stream)
 
@@ -1200,7 +1197,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 # classification based in the initial model would have been 0
 # proving that the model is updated.
 batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
-batches = [self.sc.parallelize(batch) for batch in batches]
+batches = [sc.parallelize(batch) for batch in batches]
 input_stream = self.ssc.queueStream(batches)
 predict_results = []
 
@@ -1233,7 +1230,7 @@ class LinearDataGeneratorTests(MLlibTestCase):
 self.assertEqual(len(point.features), 3)
 
 linear_data = LinearDataGenerator.generateLinearRDD(
-sc=self.sc, nexamples=6, nfeatures=2, eps=0.1,
+sc=sc, nexamples=6, nfeatures=2, eps=0.1,
 nParts=2, intercept=0.0).collect()
 self.assertEqual(len(linear_data), 6)
 for point in linear_data:
@@ -1409,7 +1406,7 @@ class 
StreamingLinearRegressionWithTests(MLLibStreamingTestCase):
 for i in range(10):
 batch = LinearDataGenerator.generateLinearInput(
 0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1)
-batches.append(self.sc.parallelize(batch))
+batches.append(sc.parallelize(batch))
 
 input_stream = self.ssc.queueStream(batches)
 slr.trainOn(input_stream)
@@ -1433,7 +1430,7 @@ class 
StreamingLinearRegressionWithTests(MLLibStreamingTestCase):
 for i in range(10):
 batch = LinearDataGenerator.generateLinearInput(
 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
-batches.append(self.sc.par

spark git commit: [SPARK-12346][ML] Missing attribute names in GLM for vector-type features

2016-01-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 53184ce77 -> 8c2b67f55


[SPARK-12346][ML] Missing attribute names in GLM for vector-type features

Currently `summary()` fails on a GLM model fitted over a vector feature missing 
ML attrs, since the output feature attrs will also have no name. We can avoid 
this situation by forcing `VectorAssembler` to make up suitable names when 
inputs are missing names.

cc mengxr

Author: Eric Liang <e...@databricks.com>

Closes #10323 from ericl/spark-12346.

(cherry picked from commit 5e492e9d5bc0992cbcffe64a9aaf3b334b173d2c)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c2b67f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c2b67f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c2b67f5

Branch: refs/heads/branch-1.6
Commit: 8c2b67f55416562a0f1fafeefb073f79701c9cc9
Parents: 53184ce
Author: Eric Liang <e...@databricks.com>
Authored: Mon Jan 18 12:50:58 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jan 18 12:51:06 2016 -0800

--
 .../spark/ml/feature/VectorAssembler.scala  |  6 ++--
 .../apache/spark/ml/feature/RFormulaSuite.scala | 38 
 .../spark/ml/feature/VectorAssemblerSuite.scala |  4 +--
 3 files changed, 43 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8c2b67f5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 801096f..ec7ead5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -70,19 +70,19 @@ class VectorAssembler(override val uid: String)
   val group = AttributeGroup.fromStructField(field)
   if (group.attributes.isDefined) {
 // If attributes are defined, copy them with updated names.
-group.attributes.get.map { attr =>
+group.attributes.get.zipWithIndex.map { case (attr, i) =>
   if (attr.name.isDefined) {
 // TODO: Define a rigorous naming scheme.
 attr.withName(c + "_" + attr.name.get)
   } else {
-attr
+attr.withName(c + "_" + i)
   }
 }
   } else {
 // Otherwise, treat all attributes as numeric. If we cannot get 
the number of attributes
 // from metadata, check the first row.
 val numAttrs = 
group.numAttributes.getOrElse(first.getAs[Vector](index).size)
-Array.fill(numAttrs)(NumericAttribute.defaultAttr)
+Array.tabulate(numAttrs)(i => 
NumericAttribute.defaultAttr.withName(c + "_" + i))
   }
 case otherType =>
   throw new SparkException(s"VectorAssembler does not support the 
$otherType type")

http://git-wip-us.apache.org/repos/asf/spark/blob/8c2b67f5/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index dc20a5e..16e565d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -143,6 +143,44 @@ class RFormulaSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 assert(attrs === expectedAttrs)
   }
 
+  test("vector attribute generation") {
+val formula = new RFormula().setFormula("id ~ vec")
+val original = sqlContext.createDataFrame(
+  Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0)))
+).toDF("id", "vec")
+val model = formula.fit(original)
+val result = model.transform(original)
+val attrs = AttributeGroup.fromStructField(result.schema("features"))
+val expectedAttrs = new AttributeGroup(
+  "features",
+  Array[Attribute](
+new NumericAttribute(Some("vec_0"), Some(1)),
+new NumericAttribute(Some("vec_1"), Some(2
+assert(attrs === expectedAttrs)
+  }
+
+  test("vector attribute generation with unnamed input attrs") {
+val formula = new RFormula().setFormula("id ~ vec2")
+val base = sqlContext.createDataFrame(
+  Seq((1, Vectors.dense(0.0,

spark git commit: [SPARK-12346][ML] Missing attribute names in GLM for vector-type features

2016-01-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 44fcf992a -> 5e492e9d5


[SPARK-12346][ML] Missing attribute names in GLM for vector-type features

Currently `summary()` fails on a GLM model fitted over a vector feature missing 
ML attrs, since the output feature attrs will also have no name. We can avoid 
this situation by forcing `VectorAssembler` to make up suitable names when 
inputs are missing names.

cc mengxr

Author: Eric Liang <e...@databricks.com>

Closes #10323 from ericl/spark-12346.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e492e9d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e492e9d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e492e9d

Branch: refs/heads/master
Commit: 5e492e9d5bc0992cbcffe64a9aaf3b334b173d2c
Parents: 44fcf99
Author: Eric Liang <e...@databricks.com>
Authored: Mon Jan 18 12:50:58 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jan 18 12:50:58 2016 -0800

--
 .../spark/ml/feature/VectorAssembler.scala  |  6 ++--
 .../apache/spark/ml/feature/RFormulaSuite.scala | 38 
 .../spark/ml/feature/VectorAssemblerSuite.scala |  4 +--
 3 files changed, 43 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5e492e9d/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 716bc63..7ff5ad1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -70,19 +70,19 @@ class VectorAssembler(override val uid: String)
   val group = AttributeGroup.fromStructField(field)
   if (group.attributes.isDefined) {
 // If attributes are defined, copy them with updated names.
-group.attributes.get.map { attr =>
+group.attributes.get.zipWithIndex.map { case (attr, i) =>
   if (attr.name.isDefined) {
 // TODO: Define a rigorous naming scheme.
 attr.withName(c + "_" + attr.name.get)
   } else {
-attr
+attr.withName(c + "_" + i)
   }
 }
   } else {
 // Otherwise, treat all attributes as numeric. If we cannot get 
the number of attributes
 // from metadata, check the first row.
 val numAttrs = 
group.numAttributes.getOrElse(first.getAs[Vector](index).size)
-Array.fill(numAttrs)(NumericAttribute.defaultAttr)
+Array.tabulate(numAttrs)(i => 
NumericAttribute.defaultAttr.withName(c + "_" + i))
   }
 case otherType =>
   throw new SparkException(s"VectorAssembler does not support the 
$otherType type")

http://git-wip-us.apache.org/repos/asf/spark/blob/5e492e9d/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index dc20a5e..16e565d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -143,6 +143,44 @@ class RFormulaSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 assert(attrs === expectedAttrs)
   }
 
+  test("vector attribute generation") {
+val formula = new RFormula().setFormula("id ~ vec")
+val original = sqlContext.createDataFrame(
+  Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0)))
+).toDF("id", "vec")
+val model = formula.fit(original)
+val result = model.transform(original)
+val attrs = AttributeGroup.fromStructField(result.schema("features"))
+val expectedAttrs = new AttributeGroup(
+  "features",
+  Array[Attribute](
+new NumericAttribute(Some("vec_0"), Some(1)),
+new NumericAttribute(Some("vec_1"), Some(2
+assert(attrs === expectedAttrs)
+  }
+
+  test("vector attribute generation with unnamed input attrs") {
+val formula = new RFormula().setFormula("id ~ vec2")
+val base = sqlContext.createDataFrame(
+  Seq((1, Vectors.dense(0.0, 1.0)), (2, Vectors.dense(1.0, 2.0)))
+).toDF("id", "vec")
+val metadata = new AttributeGroup(
+  &q

svn commit: r1723237 - in /spark: mllib/index.md site/mllib/index.html site/news/index.html

2016-01-05 Thread meng

Author: meng
Date: Wed Jan  6 06:27:47 2016
New Revision: 1723237

URL: http://svn.apache.org/viewvc?rev=1723237&view=rev
Log:
list bisecting k-means and AFT regression on mllib page

Modified:
spark/mllib/index.md
spark/site/mllib/index.html
spark/site/news/index.html

Modified: spark/mllib/index.md
URL: 
http://svn.apache.org/viewvc/spark/mllib/index.md?rev=1723237&r1=1723236&r2=1723237&view=diff
==
--- spark/mllib/index.md (original)
+++ spark/mllib/index.md Wed Jan  6 06:27:47 2016
@@ -90,8 +90,9 @@ subproject: MLlib
   classification and regression tree
   random forest and gradient-boosted trees
   recommendation via alternating least squares (ALS)
-  clustering via k-means, Gaussian mixtures (GMM), and power iteration 
clustering
+  clustering via k-means, bisecting k-means, Gaussian mixtures (GMM), 
and power iteration clustering
   topic modeling via latent Dirichlet allocation (LDA)
+  survival analysis via accelerated failure time model
   singular value decomposition (SVD) and QR decomposition
   principal component analysis (PCA)
   linear regression with L1, L2, and elastic-net 
regularization

Modified: spark/site/mllib/index.html
URL: 
http://svn.apache.org/viewvc/spark/site/mllib/index.html?rev=1723237&r1=1723236&r2=1723237&view=diff
==
--- spark/site/mllib/index.html (original)
+++ spark/site/mllib/index.html Wed Jan  6 06:27:47 2016
@@ -257,8 +257,9 @@
   classification and regression tree
   random forest and gradient-boosted trees
   recommendation via alternating least squares (ALS)
-  clustering via k-means, Gaussian mixtures (GMM), and power iteration 
clustering
+  clustering via k-means, bisecting k-means, Gaussian mixtures (GMM), 
and power iteration clustering
   topic modeling via latent Dirichlet allocation (LDA)
+  survival analysis via accelerated failure time model
   singular value decomposition (SVD) and QR decomposition
   principal component analysis (PCA)
   linear regression with L1, L2, and elastic-net 
regularization

Modified: spark/site/news/index.html
URL: 
http://svn.apache.org/viewvc/spark/site/news/index.html?rev=1723237&r1=1723236&r2=1723237&view=diff
==
--- spark/site/news/index.html (original)
+++ spark/site/news/index.html Wed Jan  6 06:27:47 2016
@@ -275,7 +275,7 @@ With this release the Spark community co
 
 
   One month to Spark Summit 2015 
in San Francisco
-  May 15, 2015
+  May 16, 2015
 
 There is one month left until https://spark-summit.org/2015/;>Spark Summit 2015, which
 will be held in San Francisco on June 15th to 17th.
@@ -287,7 +287,7 @@ The Summit will contain 
 
   Announcing Spark Summit Europe
-  May 15, 2015
+  May 16, 2015
 
 Abstract submissions are now open for the 
first ever https://www.prevalentdesignevents.com/sparksummit2015/europe/speaker/;>Spark
 Summit Europe. The event will take place on October 27th to 29th in 
Amsterdam. Submissions are welcome across a variety of Spark related topics, 
including use cases and ongoing development.
 



-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[3/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

[SPARK-11551][DOC] Replace example code in ml-features.md using include_example

PR on behalf of somideshmukh, thanks!

Author: Xusen Yin <yinxu...@gmail.com>
Author: somideshmukh <somi...@us.ibm.com>

Closes #10219 from yinxusen/SPARK-11551.

(cherry picked from commit 051c6a066f7b5fcc7472412144c15b50a5319bd5)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfb42013
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfb42013
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfb42013

Branch: refs/heads/branch-1.6
Commit: bfb4201395c6a1905c6eb46de4ea3eefe8d17309
Parents: ee0a6e7
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Wed Dec 9 12:00:48 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 9 12:01:00 2015 -0800

--
 docs/ml-features.md | 1112 +-
 .../spark/examples/ml/JavaBinarizerExample.java |   68 ++
 .../examples/ml/JavaBucketizerExample.java  |   71 ++
 .../spark/examples/ml/JavaDCTExample.java   |   65 +
 .../ml/JavaElementwiseProductExample.java   |   75 ++
 .../examples/ml/JavaMinMaxScalerExample.java|   51 +
 .../spark/examples/ml/JavaNGramExample.java |   71 ++
 .../examples/ml/JavaNormalizerExample.java  |   54 +
 .../examples/ml/JavaOneHotEncoderExample.java   |   78 ++
 .../spark/examples/ml/JavaPCAExample.java   |   71 ++
 .../ml/JavaPolynomialExpansionExample.java  |   71 ++
 .../spark/examples/ml/JavaRFormulaExample.java  |   69 ++
 .../examples/ml/JavaStandardScalerExample.java  |   54 +
 .../ml/JavaStopWordsRemoverExample.java |   65 +
 .../examples/ml/JavaStringIndexerExample.java   |   66 ++
 .../spark/examples/ml/JavaTokenizerExample.java |   75 ++
 .../examples/ml/JavaVectorAssemblerExample.java |   67 ++
 .../examples/ml/JavaVectorIndexerExample.java   |   61 +
 .../examples/ml/JavaVectorSlicerExample.java|   73 ++
 .../src/main/python/ml/binarizer_example.py |   43 +
 .../src/main/python/ml/bucketizer_example.py|   43 +
 .../python/ml/elementwise_product_example.py|   39 +
 examples/src/main/python/ml/n_gram_example.py   |   42 +
 .../src/main/python/ml/normalizer_example.py|   43 +
 .../main/python/ml/onehot_encoder_example.py|   48 +
 examples/src/main/python/ml/pca_example.py  |   42 +
 .../python/ml/polynomial_expansion_example.py   |   43 +
 examples/src/main/python/ml/rformula_example.py |   44 +
 .../main/python/ml/standard_scaler_example.py   |   43 +
 .../main/python/ml/stopwords_remover_example.py |   40 +
 .../main/python/ml/string_indexer_example.py|   39 +
 .../src/main/python/ml/tokenizer_example.py |   44 +
 .../main/python/ml/vector_assembler_example.py  |   42 +
 .../main/python/ml/vector_indexer_example.py|   40 +
 .../spark/examples/ml/BinarizerExample.scala|   48 +
 .../spark/examples/ml/BucketizerExample.scala   |   52 +
 .../apache/spark/examples/ml/DCTExample.scala   |   54 +
 .../examples/ml/ElementWiseProductExample.scala |   52 +
 .../spark/examples/ml/MinMaxScalerExample.scala |   50 +
 .../apache/spark/examples/ml/NGramExample.scala |   47 +
 .../spark/examples/ml/NormalizerExample.scala   |   52 +
 .../examples/ml/OneHotEncoderExample.scala  |   58 +
 .../apache/spark/examples/ml/PCAExample.scala   |   53 +
 .../ml/PolynomialExpansionExample.scala |   51 +
 .../spark/examples/ml/RFormulaExample.scala |   49 +
 .../examples/ml/StandardScalerExample.scala |   52 +
 .../examples/ml/StopWordsRemoverExample.scala   |   48 +
 .../examples/ml/StringIndexerExample.scala  |   48 +
 .../spark/examples/ml/TokenizerExample.scala|   54 +
 .../examples/ml/VectorAssemblerExample.scala|   49 +
 .../examples/ml/VectorIndexerExample.scala  |   54 +
 .../spark/examples/ml/VectorSlicerExample.scala |   58 +
 52 files changed, 2820 insertions(+), 1061 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 55e4012..7ad7c4e 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.fea
 and the [RegexTokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
-
-val sentenceDataFrame = sqlContext.createDataFrame(Seq(
-  (0, "Hi I heard about Spark"),
-  (1, "I wish Java could use case classes"),
-  (2, "Logistic,regression,models,are,neat")

[3/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

[SPARK-11551][DOC] Replace example code in ml-features.md using include_example

PR on behalf of somideshmukh, thanks!

Author: Xusen Yin <yinxu...@gmail.com>
Author: somideshmukh <somi...@us.ibm.com>

Closes #10219 from yinxusen/SPARK-11551.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/051c6a06
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/051c6a06
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/051c6a06

Branch: refs/heads/master
Commit: 051c6a066f7b5fcc7472412144c15b50a5319bd5
Parents: 1eb7c22
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Wed Dec 9 12:00:48 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 9 12:00:48 2015 -0800

--
 docs/ml-features.md | 1112 +-
 .../spark/examples/ml/JavaBinarizerExample.java |   68 ++
 .../examples/ml/JavaBucketizerExample.java  |   71 ++
 .../spark/examples/ml/JavaDCTExample.java   |   65 +
 .../ml/JavaElementwiseProductExample.java   |   75 ++
 .../examples/ml/JavaMinMaxScalerExample.java|   51 +
 .../spark/examples/ml/JavaNGramExample.java |   71 ++
 .../examples/ml/JavaNormalizerExample.java  |   54 +
 .../examples/ml/JavaOneHotEncoderExample.java   |   78 ++
 .../spark/examples/ml/JavaPCAExample.java   |   71 ++
 .../ml/JavaPolynomialExpansionExample.java  |   71 ++
 .../spark/examples/ml/JavaRFormulaExample.java  |   69 ++
 .../examples/ml/JavaStandardScalerExample.java  |   54 +
 .../ml/JavaStopWordsRemoverExample.java |   65 +
 .../examples/ml/JavaStringIndexerExample.java   |   66 ++
 .../spark/examples/ml/JavaTokenizerExample.java |   75 ++
 .../examples/ml/JavaVectorAssemblerExample.java |   67 ++
 .../examples/ml/JavaVectorIndexerExample.java   |   61 +
 .../examples/ml/JavaVectorSlicerExample.java|   73 ++
 .../src/main/python/ml/binarizer_example.py |   43 +
 .../src/main/python/ml/bucketizer_example.py|   43 +
 .../python/ml/elementwise_product_example.py|   39 +
 examples/src/main/python/ml/n_gram_example.py   |   42 +
 .../src/main/python/ml/normalizer_example.py|   43 +
 .../main/python/ml/onehot_encoder_example.py|   48 +
 examples/src/main/python/ml/pca_example.py  |   42 +
 .../python/ml/polynomial_expansion_example.py   |   43 +
 examples/src/main/python/ml/rformula_example.py |   44 +
 .../main/python/ml/standard_scaler_example.py   |   43 +
 .../main/python/ml/stopwords_remover_example.py |   40 +
 .../main/python/ml/string_indexer_example.py|   39 +
 .../src/main/python/ml/tokenizer_example.py |   44 +
 .../main/python/ml/vector_assembler_example.py  |   42 +
 .../main/python/ml/vector_indexer_example.py|   40 +
 .../spark/examples/ml/BinarizerExample.scala|   48 +
 .../spark/examples/ml/BucketizerExample.scala   |   52 +
 .../apache/spark/examples/ml/DCTExample.scala   |   54 +
 .../examples/ml/ElementWiseProductExample.scala |   52 +
 .../spark/examples/ml/MinMaxScalerExample.scala |   50 +
 .../apache/spark/examples/ml/NGramExample.scala |   47 +
 .../spark/examples/ml/NormalizerExample.scala   |   52 +
 .../examples/ml/OneHotEncoderExample.scala  |   58 +
 .../apache/spark/examples/ml/PCAExample.scala   |   53 +
 .../ml/PolynomialExpansionExample.scala |   51 +
 .../spark/examples/ml/RFormulaExample.scala |   49 +
 .../examples/ml/StandardScalerExample.scala |   52 +
 .../examples/ml/StopWordsRemoverExample.scala   |   48 +
 .../examples/ml/StringIndexerExample.scala  |   48 +
 .../spark/examples/ml/TokenizerExample.scala|   54 +
 .../examples/ml/VectorAssemblerExample.scala|   49 +
 .../examples/ml/VectorIndexerExample.scala  |   54 +
 .../spark/examples/ml/VectorSlicerExample.scala |   58 +
 52 files changed, 2820 insertions(+), 1061 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 55e4012..7ad7c4e 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.fea
 and the [RegexTokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
-
-val sentenceDataFrame = sqlContext.createDataFrame(Seq(
-  (0, "Hi I heard about Spark"),
-  (1, "I wish Java could use case classes"),
-  (2, "Logistic,regression,models,are,neat")
-)).toDF("label", "sentence")
-val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

[1/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 1eb7c22ce -> 051c6a066


http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 000..9fa494c
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StringIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StringIndexerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("StringIndexerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val df = sqlContext.createDataFrame(
+  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+
+val indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+// $example off$
+sc.stop()
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 000..01e0d13
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TokenizerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("TokenizerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+  (0, "Hi I heard about Spark"),
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
+)).toDF("label", "sentence")
+
+val tokenizer = new 
Tokenizer().setInputCol("sentence").setOutputCol("words")
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
+// $example off$
+sc.stop()
+  }
+}

[2/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 000..668f71e
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPolynomialExpansionExample {
+  public static void main(String[] args) {
+SparkConf conf = new 
SparkConf().setAppName("JavaPolynomialExpansionExample");
+JavaSparkContext jsc = new JavaSparkContext(conf);
+SQLContext jsql = new SQLContext(jsc);
+
+// $example on$
+PolynomialExpansion polyExpansion = new PolynomialExpansion()
+  .setInputCol("features")
+  .setOutputCol("polyFeatures")
+  .setDegree(3);
+
+JavaRDD data = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.dense(-2.0, 2.3)),
+  RowFactory.create(Vectors.dense(0.0, 0.0)),
+  RowFactory.create(Vectors.dense(0.6, -1.1))
+));
+
+StructType schema = new StructType(new StructField[]{
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+
+DataFrame df = jsql.createDataFrame(data, schema);
+DataFrame polyDF = polyExpansion.transform(df);
+
+Row[] row = polyDF.select("polyFeatures").take(3);
+for (Row r : row) {
+  System.out.println(r.get(0));
+}
+// $example off$
+jsc.stop();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/051c6a06/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 000..1e1062b
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import

[1/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 ee0a6e722 -> bfb420139


http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 000..9fa494c
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StringIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StringIndexerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("StringIndexerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val df = sqlContext.createDataFrame(
+  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+
+val indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+// $example off$
+sc.stop()
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 000..01e0d13
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TokenizerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("TokenizerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+  (0, "Hi I heard about Spark"),
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
+)).toDF("label", "sentence")
+
+val tokenizer = new 
Tokenizer().setInputCol("sentence").setOutputCol("words")
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
+// $example off$
+sc.stop()
+  }

[2/3] spark git commit: [SPARK-11551][DOC] Replace example code in ml-features.md using include_example

2015-12-09 Thread meng

http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 000..668f71e
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPolynomialExpansionExample {
+  public static void main(String[] args) {
+SparkConf conf = new 
SparkConf().setAppName("JavaPolynomialExpansionExample");
+JavaSparkContext jsc = new JavaSparkContext(conf);
+SQLContext jsql = new SQLContext(jsc);
+
+// $example on$
+PolynomialExpansion polyExpansion = new PolynomialExpansion()
+  .setInputCol("features")
+  .setOutputCol("polyFeatures")
+  .setDegree(3);
+
+JavaRDD data = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.dense(-2.0, 2.3)),
+  RowFactory.create(Vectors.dense(0.0, 0.0)),
+  RowFactory.create(Vectors.dense(0.6, -1.1))
+));
+
+StructType schema = new StructType(new StructField[]{
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+
+DataFrame df = jsql.createDataFrame(data, schema);
+DataFrame polyDF = polyExpansion.transform(df);
+
+Row[] row = polyDF.select("polyFeatures").take(3);
+for (Row r : row) {
+  System.out.println(r.get(0));
+}
+// $example off$
+jsc.stop();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/bfb42013/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 000..1e1062b
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import

spark git commit: [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 8652fc03c -> 5c8216920


[SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code

Add ```SQLTransformer``` user guide, example code and make Scala API doc more 
clear.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #10006 from yanboliang/spark-11958.

(cherry picked from commit 4a39b5a1bee28cec792d509654f6236390cafdcb)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c821692
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c821692
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c821692

Branch: refs/heads/branch-1.6
Commit: 5c8216920b4110d8fc4329e1fe52543ee17c4a54
Parents: 8652fc0
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Mon Dec 7 23:50:57 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:51:10 2015 -0800

--
 docs/ml-features.md | 59 
 .../examples/ml/JavaSQLTransformerExample.java  | 59 
 examples/src/main/python/ml/sql_transformer.py  | 40 +
 .../examples/ml/SQLTransformerExample.scala | 45 +++
 .../spark/ml/feature/SQLTransformer.scala   | 11 +++-
 5 files changed, 212 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5c821692/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 5105a94..f85e0d5 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -756,6 +756,65 @@ for more details on the API.
 
 
 
+## SQLTransformer
+
+`SQLTransformer` implements the transformations which are defined by SQL 
statement.
+Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
+where `"__THIS__"` represents the underlying table of the input dataset.
+The select clause specifies the fields, constants, and expressions to display 
in
+the output, it can be any select clause that Spark SQL supports. Users can also
+use Spark SQL built-in function and UDFs to operate on these selected columns.
+For example, `SQLTransformer` supports statements like:
+
+* `SELECT a, a + b AS a_b FROM __THIS__`
+* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
+* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
+
+**Examples**
+
+Assume that we have the following DataFrame with columns `id`, `v1` and `v2`:
+
+~~~~
+ id |  v1 |  v2
+----|-----|-----
+ 0  | 1.0 | 3.0  
+ 2  | 2.0 | 5.0
+~~~~
+
+This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + 
v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`:
+
+~~~~
+ id |  v1 |  v2 |  v3 |  v4
+----|-----|-----|-----|-----
+ 0  | 1.0 | 3.0 | 4.0 | 3.0
+ 2  | 2.0 | 5.0 | 7.0 |10.0
+~~~~
+
+
+
+
+Refer to the [SQLTransformer Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer)
+for more details on the API.
+
+{% include_example 
scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %}
+
+
+
+
+Refer to the [SQLTransformer Java 
docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html)
+for more details on the API.
+
+{% include_example 
java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %}
+
+
+
+
+Refer to the [SQLTransformer Python 
docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more 
details on the API.
+
+{% include_example python/ml/sql_transformer.py %}
+
+
+
 ## VectorAssembler
 
 `VectorAssembler` is a transformer that combines a given list of columns into 
a single vector

http://git-wip-us.apache.org/repos/asf/spark/blob/5c821692/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
new file mode 100644
index 000..d55c707
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to

spark git commit: [SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 7d05a6245 -> 4a39b5a1b


[SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code

Add ```SQLTransformer``` user guide, example code and make Scala API doc more 
clear.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #10006 from yanboliang/spark-11958.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a39b5a1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a39b5a1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a39b5a1

Branch: refs/heads/master
Commit: 4a39b5a1bee28cec792d509654f6236390cafdcb
Parents: 7d05a62
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Mon Dec 7 23:50:57 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:50:57 2015 -0800

--
 docs/ml-features.md | 59 
 .../examples/ml/JavaSQLTransformerExample.java  | 59 
 examples/src/main/python/ml/sql_transformer.py  | 40 +
 .../examples/ml/SQLTransformerExample.scala | 45 +++
 .../spark/ml/feature/SQLTransformer.scala   | 11 +++-
 5 files changed, 212 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4a39b5a1/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 5105a94..f85e0d5 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -756,6 +756,65 @@ for more details on the API.
 
 
 
+## SQLTransformer
+
+`SQLTransformer` implements the transformations which are defined by SQL 
statement.
+Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
+where `"__THIS__"` represents the underlying table of the input dataset.
+The select clause specifies the fields, constants, and expressions to display 
in
+the output, it can be any select clause that Spark SQL supports. Users can also
+use Spark SQL built-in function and UDFs to operate on these selected columns.
+For example, `SQLTransformer` supports statements like:
+
+* `SELECT a, a + b AS a_b FROM __THIS__`
+* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
+* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
+
+**Examples**
+
+Assume that we have the following DataFrame with columns `id`, `v1` and `v2`:
+
+~~~~
+ id |  v1 |  v2
+----|-----|-----
+ 0  | 1.0 | 3.0  
+ 2  | 2.0 | 5.0
+~~~~
+
+This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + 
v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`:
+
+~~~~
+ id |  v1 |  v2 |  v3 |  v4
+----|-----|-----|-----|-----
+ 0  | 1.0 | 3.0 | 4.0 | 3.0
+ 2  | 2.0 | 5.0 | 7.0 |10.0
+~~~~
+
+
+
+
+Refer to the [SQLTransformer Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer)
+for more details on the API.
+
+{% include_example 
scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %}
+
+
+
+
+Refer to the [SQLTransformer Java 
docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html)
+for more details on the API.
+
+{% include_example 
java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %}
+
+
+
+
+Refer to the [SQLTransformer Python 
docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more 
details on the API.
+
+{% include_example python/ml/sql_transformer.py %}
+
+
+
 ## VectorAssembler
 
 `VectorAssembler` is a transformer that combines a given list of columns into 
a single vector

http://git-wip-us.apache.org/repos/asf/spark/blob/4a39b5a1/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
new file mode 100644
index 000..d55c707
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF

spark git commit: [SPARK-10259][ML] Add @since annotation to ml.classification

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 73896588d -> 7d05a6245


[SPARK-10259][ML] Add @since annotation to ml.classification

Add since annotation to ml.classification

Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp>

Closes #8534 from taishi-oss/issue10259.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d05a624
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d05a624
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d05a624

Branch: refs/heads/master
Commit: 7d05a624510f7299b3dd07f87c203db1ff7caa3e
Parents: 7389658
Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp>
Authored: Mon Dec 7 23:46:55 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:46:55 2015 -0800

--
 .../classification/DecisionTreeClassifier.scala | 30 +++--
 .../spark/ml/classification/GBTClassifier.scala | 35 +--
 .../ml/classification/LogisticRegression.scala  | 64 +++-
 .../MultilayerPerceptronClassifier.scala| 23 +--
 .../spark/ml/classification/NaiveBayes.scala| 19 --
 .../spark/ml/classification/OneVsRest.scala | 24 ++--
 .../classification/RandomForestClassifier.scala | 34 +--
 7 files changed, 185 insertions(+), 44 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7d05a624/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index c478aea..8c4cec1 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, 
TreeClassifierParams}
 import org.apache.spark.ml.tree.impl.RandomForest
@@ -36,32 +36,44 @@ import org.apache.spark.sql.DataFrame
  * It supports both binary and multiclass labels, as well as both continuous 
and categorical
  * features.
  */
+@Since("1.4.0")
 @Experimental
-final class DecisionTreeClassifier(override val uid: String)
+final class DecisionTreeClassifier @Since("1.4.0") (
+@Since("1.4.0") override val uid: String)
   extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, 
DecisionTreeClassificationModel]
   with DecisionTreeParams with TreeClassifierParams {
 
+  @Since("1.4.0")
   def this() = this(Identifiable.randomUID("dtc"))
 
   // Override parameter setters from parent trait for Java API compatibility.
 
+  @Since("1.4.0")
   override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value)
 
+  @Since("1.4.0")
   override def setMaxBins(value: Int): this.type = super.setMaxBins(value)
 
+  @Since("1.4.0")
   override def setMinInstancesPerNode(value: Int): this.type =
 super.setMinInstancesPerNode(value)
 
+  @Since("1.4.0")
   override def setMinInfoGain(value: Double): this.type = 
super.setMinInfoGain(value)
 
+  @Since("1.4.0")
   override def setMaxMemoryInMB(value: Int): this.type = 
super.setMaxMemoryInMB(value)
 
+  @Since("1.4.0")
   override def setCacheNodeIds(value: Boolean): this.type = 
super.setCacheNodeIds(value)
 
+  @Since("1.4.0")
   override def setCheckpointInterval(value: Int): this.type = 
super.setCheckpointInterval(value)
 
+  @Since("1.4.0")
   override def setImpurity(value: String): this.type = super.setImpurity(value)
 
+  @Since("1.6.0")
   override def setSeed(value: Long): this.type = super.setSeed(value)
 
   override protected def train(dataset: DataFrame): 
DecisionTreeClassificationModel = {
@@ -89,12 +101,15 @@ final class DecisionTreeClassifier(override val uid: 
String)
   subsamplingRate = 1.0)
   }
 
+  @Since("1.4.1")
   override def copy(extra: ParamMap): DecisionTreeClassifier = 
defaultCopy(extra)
 }
 
+@Since("1.4.0")
 @Experimental
 object DecisionTreeClassifier {
   /** Accessor for supported impurities: entropy, gini */
+  @Since("1.4.0")
   final val supportedImpurities: Array[String] = 
TreeClassifierParams.supportedImpurities
 }
 
@@ -104,12 +119,13 @@ object DecisionTreeClassifier {
  * It supports both binary and multiclass labels, as well as both continuous

spark git commit: [SPARK-10259][ML] Add @since annotation to ml.classification

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 3c683ed5f -> 8652fc03c


[SPARK-10259][ML] Add @since annotation to ml.classification

Add since annotation to ml.classification

Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp>

Closes #8534 from taishi-oss/issue10259.

(cherry picked from commit 7d05a624510f7299b3dd07f87c203db1ff7caa3e)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8652fc03
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8652fc03
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8652fc03

Branch: refs/heads/branch-1.6
Commit: 8652fc03c21f79b41ce13f41991feba11fc7b29c
Parents: 3c683ed
Author: Takahashi Hiroshi <takahashi.hiro...@lab.ntt.co.jp>
Authored: Mon Dec 7 23:46:55 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:47:03 2015 -0800

--
 .../classification/DecisionTreeClassifier.scala | 30 +++--
 .../spark/ml/classification/GBTClassifier.scala | 35 +--
 .../ml/classification/LogisticRegression.scala  | 64 +++-
 .../MultilayerPerceptronClassifier.scala| 23 +--
 .../spark/ml/classification/NaiveBayes.scala| 19 --
 .../spark/ml/classification/OneVsRest.scala | 24 ++--
 .../classification/RandomForestClassifier.scala | 34 +--
 7 files changed, 185 insertions(+), 44 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8652fc03/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index c478aea..8c4cec1 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.classification
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, 
TreeClassifierParams}
 import org.apache.spark.ml.tree.impl.RandomForest
@@ -36,32 +36,44 @@ import org.apache.spark.sql.DataFrame
  * It supports both binary and multiclass labels, as well as both continuous 
and categorical
  * features.
  */
+@Since("1.4.0")
 @Experimental
-final class DecisionTreeClassifier(override val uid: String)
+final class DecisionTreeClassifier @Since("1.4.0") (
+@Since("1.4.0") override val uid: String)
   extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, 
DecisionTreeClassificationModel]
   with DecisionTreeParams with TreeClassifierParams {
 
+  @Since("1.4.0")
   def this() = this(Identifiable.randomUID("dtc"))
 
   // Override parameter setters from parent trait for Java API compatibility.
 
+  @Since("1.4.0")
   override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value)
 
+  @Since("1.4.0")
   override def setMaxBins(value: Int): this.type = super.setMaxBins(value)
 
+  @Since("1.4.0")
   override def setMinInstancesPerNode(value: Int): this.type =
 super.setMinInstancesPerNode(value)
 
+  @Since("1.4.0")
   override def setMinInfoGain(value: Double): this.type = 
super.setMinInfoGain(value)
 
+  @Since("1.4.0")
   override def setMaxMemoryInMB(value: Int): this.type = 
super.setMaxMemoryInMB(value)
 
+  @Since("1.4.0")
   override def setCacheNodeIds(value: Boolean): this.type = 
super.setCacheNodeIds(value)
 
+  @Since("1.4.0")
   override def setCheckpointInterval(value: Int): this.type = 
super.setCheckpointInterval(value)
 
+  @Since("1.4.0")
   override def setImpurity(value: String): this.type = super.setImpurity(value)
 
+  @Since("1.6.0")
   override def setSeed(value: Long): this.type = super.setSeed(value)
 
   override protected def train(dataset: DataFrame): 
DecisionTreeClassificationModel = {
@@ -89,12 +101,15 @@ final class DecisionTreeClassifier(override val uid: 
String)
   subsamplingRate = 1.0)
   }
 
+  @Since("1.4.1")
   override def copy(extra: ParamMap): DecisionTreeClassifier = 
defaultCopy(extra)
 }
 
+@Since("1.4.0")
 @Experimental
 object DecisionTreeClassifier {
   /** Accessor for supported impurities: entropy, gini */
+  @Since("1.4.0")
   final val supportedImpurities: Array[String] = 
TreeClassifierParams.supportedI

[3/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example

2015-12-07 Thread meng

[SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using 
include_example

Made a new patch containing only the markdown examples moved to the examples folder.
Only three Java examples were not shifted since they contained compilation 
errors; these classes are
1) StandardScaler 2) NormalizerExample 3) VectorIndexer

Author: Xusen Yin <yinxu...@gmail.com>
Author: somideshmukh <somi...@us.ibm.com>

Closes #10002 from somideshmukh/SomilBranch1.33.

(cherry picked from commit 78209b0ccaf3f22b5e2345dfb2b98edfdb746819)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c683ed5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c683ed5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c683ed5

Branch: refs/heads/branch-1.6
Commit: 3c683ed5ffe704a6fec7c6d434eeed784276470d
Parents: 115bfbd
Author: somideshmukh <somi...@us.ibm.com>
Authored: Mon Dec 7 23:26:34 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:26:56 2015 -0800

--
 docs/ml-features.md | 1109 +-
 .../spark/examples/ml/JavaBinarizerExample.java |   68 ++
 .../examples/ml/JavaBucketizerExample.java  |   70 ++
 .../spark/examples/ml/JavaDCTExample.java   |   65 +
 .../ml/JavaElementwiseProductExample.java   |   75 ++
 .../examples/ml/JavaMinMaxScalerExample.java|   50 +
 .../spark/examples/ml/JavaNGramExample.java |   71 ++
 .../examples/ml/JavaNormalizerExample.java  |   52 +
 .../examples/ml/JavaOneHotEncoderExample.java   |   77 ++
 .../spark/examples/ml/JavaPCAExample.java   |   71 ++
 .../ml/JavaPolynomialExpansionExample.java  |   71 ++
 .../spark/examples/ml/JavaRFormulaExample.java  |   69 ++
 .../examples/ml/JavaStandardScalerExample.java  |   53 +
 .../ml/JavaStopWordsRemoverExample.java |   65 +
 .../examples/ml/JavaStringIndexerExample.java   |   66 ++
 .../spark/examples/ml/JavaTokenizerExample.java |   75 ++
 .../examples/ml/JavaVectorAssemblerExample.java |   67 ++
 .../examples/ml/JavaVectorIndexerExample.java   |   60 +
 .../examples/ml/JavaVectorSlicerExample.java|   73 ++
 .../src/main/python/ml/binarizer_example.py |   43 +
 .../src/main/python/ml/bucketizer_example.py|   42 +
 .../python/ml/elementwise_product_example.py|   39 +
 examples/src/main/python/ml/n_gram_example.py   |   42 +
 .../src/main/python/ml/normalizer_example.py|   41 +
 .../main/python/ml/onehot_encoder_example.py|   47 +
 examples/src/main/python/ml/pca_example.py  |   42 +
 .../python/ml/polynomial_expansion_example.py   |   43 +
 examples/src/main/python/ml/rformula_example.py |   44 +
 .../main/python/ml/standard_scaler_example.py   |   42 +
 .../main/python/ml/stopwords_remover_example.py |   40 +
 .../main/python/ml/string_indexer_example.py|   39 +
 .../src/main/python/ml/tokenizer_example.py |   44 +
 .../main/python/ml/vector_assembler_example.py  |   42 +
 .../main/python/ml/vector_indexer_example.py|   39 +
 .../spark/examples/ml/BinarizerExample.scala|   48 +
 .../spark/examples/ml/BucketizerExample.scala   |   51 +
 .../apache/spark/examples/ml/DCTExample.scala   |   54 +
 .../examples/ml/ElementWiseProductExample.scala |   53 +
 .../spark/examples/ml/MinMaxScalerExample.scala |   49 +
 .../apache/spark/examples/ml/NGramExample.scala |   47 +
 .../spark/examples/ml/NormalizerExample.scala   |   50 +
 .../examples/ml/OneHotEncoderExample.scala  |   58 +
 .../apache/spark/examples/ml/PCAExample.scala   |   54 +
 .../ml/PolynomialExpansionExample.scala |   53 +
 .../spark/examples/ml/RFormulaExample.scala |   49 +
 .../examples/ml/StandardScalerExample.scala |   51 +
 .../examples/ml/StopWordsRemoverExample.scala   |   48 +
 .../examples/ml/StringIndexerExample.scala  |   49 +
 .../spark/examples/ml/TokenizerExample.scala|   54 +
 .../examples/ml/VectorAssemblerExample.scala|   49 +
 .../examples/ml/VectorIndexerExample.scala  |   53 +
 .../spark/examples/ml/VectorSlicerExample.scala |   58 +
 52 files changed, 2806 insertions(+), 1058 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index b499d6e..5105a94 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.fea
 and the [RegexTokenizer Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.{Tokenizer, RegexToken

[2/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example

2015-12-07 Thread meng

http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 000..668f71e
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPolynomialExpansionExample {
+  public static void main(String[] args) {
+SparkConf conf = new 
SparkConf().setAppName("JavaPolynomialExpansionExample");
+JavaSparkContext jsc = new JavaSparkContext(conf);
+SQLContext jsql = new SQLContext(jsc);
+
+// $example on$
+PolynomialExpansion polyExpansion = new PolynomialExpansion()
+  .setInputCol("features")
+  .setOutputCol("polyFeatures")
+  .setDegree(3);
+
+JavaRDD data = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.dense(-2.0, 2.3)),
+  RowFactory.create(Vectors.dense(0.0, 0.0)),
+  RowFactory.create(Vectors.dense(0.6, -1.1))
+));
+
+StructType schema = new StructType(new StructField[]{
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+
+DataFrame df = jsql.createDataFrame(data, schema);
+DataFrame polyDF = polyExpansion.transform(df);
+
+Row[] row = polyDF.select("polyFeatures").take(3);
+for (Row r : row) {
+  System.out.println(r.get(0));
+}
+// $example off$
+jsc.stop();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/3c683ed5/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 000..1e1062b
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import

[1/3] spark git commit: [SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3e7e05f5e -> 78209b0cc


http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 000..1be8a5f
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StringIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StringIndexerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("StringIndexerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val df = sqlContext.createDataFrame(
+  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+
+val indexer = new StringIndexer()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+// $example off$
+sc.stop()
+  }
+}
+// scalastyle:on println
+

http://git-wip-us.apache.org/repos/asf/spark/blob/78209b0c/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 000..01e0d13
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TokenizerExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("TokenizerExample")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// $example on$
+val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+  (0, "Hi I heard about Spark"),
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
+)).toDF("label", "sentence")
+
+val tokenizer = new 
Tokenizer().setInputCol("sentence").setOutputCol("words")
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
+// $example off$
+sc.stop()
+  }
+}

spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib - 1.5 backport

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.5 3868ab644 -> 2f30927a5


[SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib - 1.5 backport

This backports [https://github.com/apache/spark/pull/10161] to Spark 1.5, with 
the difference that ChiSqSelector does not require modification.

Switched from using SQLContext constructor to using getOrCreate, mainly in 
model save/load methods.

This covers all instances in spark.mllib. There were no uses of the constructor 
in spark.ml.

CC: yhuai mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #10183 from jkbradley/sqlcontext-backport1.5.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f30927a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f30927a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f30927a

Branch: refs/heads/branch-1.5
Commit: 2f30927a5f40f2862e777bfe97282ddcfc0a063a
Parents: 3868ab6
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Mon Dec 7 23:37:23 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:37:23 2015 -0800

--
 .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala   | 6 +++---
 .../org/apache/spark/mllib/classification/NaiveBayes.scala   | 8 
 .../mllib/classification/impl/GLMClassificationModel.scala   | 4 ++--
 .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++--
 .../org/apache/spark/mllib/clustering/KMeansModel.scala  | 4 ++--
 .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++--
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 .../mllib/recommendation/MatrixFactorizationModel.scala  | 4 ++--
 .../apache/spark/mllib/regression/IsotonicRegression.scala   | 4 ++--
 .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++--
 .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++--
 .../apache/spark/mllib/tree/model/treeEnsembleModels.scala   | 4 ++--
 12 files changed, 27 insertions(+), 27 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2f30927a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index f585aac..06e13b7 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1149,7 +1149,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = {
 // We use DataFrames for serialization of IndexedRows to Python,
 // so return a DataFrame.
-val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext)
+val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext)
 sqlContext.createDataFrame(indexedRowMatrix.rows)
   }
 
@@ -1159,7 +1159,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = {
 // We use DataFrames for serialization of MatrixEntry entries to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext)
+val sqlContext = 
SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext)
 sqlContext.createDataFrame(coordinateMatrix.entries)
   }
 
@@ -1169,7 +1169,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = {
 // We use DataFrames for serialization of sub-matrix blocks to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext)
+val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext)
 sqlContext.createDataFrame(blockMatrix.blocks)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/2f30927a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index a956084..aef9ef2 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
 modelType: String)
 
 def save(sc: SparkContext, path: String, data: Data): Unit = {
-  val sqlCon

spark git commit: Closes #10098

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 78209b0cc -> 73896588d


Closes #10098


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73896588
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73896588
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73896588

Branch: refs/heads/master
Commit: 73896588dd3af6ba77c9692cd5120ee32448eb22
Parents: 78209b0
Author: Xiangrui Meng <m...@databricks.com>
Authored: Mon Dec 7 23:34:16 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 23:34:16 2015 -0800

--

--



-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 cdeb89b34 -> 115bfbdae


[SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib

Switched from using SQLContext constructor to using getOrCreate, mainly in 
model save/load methods.

This covers all instances in spark.mllib.  There were no uses of the 
constructor in spark.ml.

CC: mengxr yhuai

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #10161 from jkbradley/mllib-sqlcontext-fix.

(cherry picked from commit 3e7e05f5ee763925ed60410d7de04cf36b723de1)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/115bfbda
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/115bfbda
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/115bfbda

Branch: refs/heads/branch-1.6
Commit: 115bfbdae82b1c2804ea501ffd420d0aa17aac45
Parents: cdeb89b
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Mon Dec 7 16:37:09 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 16:37:16 2015 -0800

--
 .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala   | 6 +++---
 .../org/apache/spark/mllib/classification/NaiveBayes.scala   | 8 
 .../mllib/classification/impl/GLMClassificationModel.scala   | 4 ++--
 .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++--
 .../org/apache/spark/mllib/clustering/KMeansModel.scala  | 4 ++--
 .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++--
 .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++--
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 .../mllib/recommendation/MatrixFactorizationModel.scala  | 4 ++--
 .../apache/spark/mllib/regression/IsotonicRegression.scala   | 4 ++--
 .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++--
 .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++--
 .../apache/spark/mllib/tree/model/treeEnsembleModels.scala   | 4 ++--
 13 files changed, 29 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/115bfbda/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 54b03a9..2aa6aec 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1191,7 +1191,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = {
 // We use DataFrames for serialization of IndexedRows to Python,
 // so return a DataFrame.
-val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext)
+val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext)
 sqlContext.createDataFrame(indexedRowMatrix.rows)
   }
 
@@ -1201,7 +1201,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = {
 // We use DataFrames for serialization of MatrixEntry entries to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext)
+val sqlContext = 
SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext)
 sqlContext.createDataFrame(coordinateMatrix.entries)
   }
 
@@ -1211,7 +1211,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = {
 // We use DataFrames for serialization of sub-matrix blocks to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext)
+val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext)
 sqlContext.createDataFrame(blockMatrix.blocks)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/115bfbda/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index a956084..aef9ef2 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
 modelType: String)
 
 def save(sc: SparkContext, path

spark git commit: [SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib

2015-12-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 36282f78b -> 3e7e05f5e


[SPARK-12160][MLLIB] Use SQLContext.getOrCreate in MLlib

Switched from using SQLContext constructor to using getOrCreate, mainly in 
model save/load methods.

This covers all instances in spark.mllib.  There were no uses of the 
constructor in spark.ml.

CC: mengxr yhuai

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #10161 from jkbradley/mllib-sqlcontext-fix.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e7e05f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e7e05f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e7e05f5

Branch: refs/heads/master
Commit: 3e7e05f5ee763925ed60410d7de04cf36b723de1
Parents: 36282f7
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Mon Dec 7 16:37:09 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 7 16:37:09 2015 -0800

--
 .../org/apache/spark/mllib/api/python/PythonMLLibAPI.scala   | 6 +++---
 .../org/apache/spark/mllib/classification/NaiveBayes.scala   | 8 
 .../mllib/classification/impl/GLMClassificationModel.scala   | 4 ++--
 .../apache/spark/mllib/clustering/GaussianMixtureModel.scala | 4 ++--
 .../org/apache/spark/mllib/clustering/KMeansModel.scala  | 4 ++--
 .../spark/mllib/clustering/PowerIterationClustering.scala| 4 ++--
 .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++--
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 .../mllib/recommendation/MatrixFactorizationModel.scala  | 4 ++--
 .../apache/spark/mllib/regression/IsotonicRegression.scala   | 4 ++--
 .../spark/mllib/regression/impl/GLMRegressionModel.scala | 4 ++--
 .../apache/spark/mllib/tree/model/DecisionTreeModel.scala| 4 ++--
 .../apache/spark/mllib/tree/model/treeEnsembleModels.scala   | 4 ++--
 13 files changed, 29 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3e7e05f5/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 54b03a9..2aa6aec 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1191,7 +1191,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getIndexedRows(indexedRowMatrix: IndexedRowMatrix): DataFrame = {
 // We use DataFrames for serialization of IndexedRows to Python,
 // so return a DataFrame.
-val sqlContext = new SQLContext(indexedRowMatrix.rows.sparkContext)
+val sqlContext = SQLContext.getOrCreate(indexedRowMatrix.rows.sparkContext)
 sqlContext.createDataFrame(indexedRowMatrix.rows)
   }
 
@@ -1201,7 +1201,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixEntries(coordinateMatrix: CoordinateMatrix): DataFrame = {
 // We use DataFrames for serialization of MatrixEntry entries to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(coordinateMatrix.entries.sparkContext)
+val sqlContext = 
SQLContext.getOrCreate(coordinateMatrix.entries.sparkContext)
 sqlContext.createDataFrame(coordinateMatrix.entries)
   }
 
@@ -1211,7 +1211,7 @@ private[python] class PythonMLLibAPI extends Serializable 
{
   def getMatrixBlocks(blockMatrix: BlockMatrix): DataFrame = {
 // We use DataFrames for serialization of sub-matrix blocks to
 // Python, so return a DataFrame.
-val sqlContext = new SQLContext(blockMatrix.blocks.sparkContext)
+val sqlContext = SQLContext.getOrCreate(blockMatrix.blocks.sparkContext)
 sqlContext.createDataFrame(blockMatrix.blocks)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/3e7e05f5/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index a956084..aef9ef2 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -192,7 +192,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
 modelType: String)
 
 def save(sc: SparkContext, path: String, data: Data): Unit = {
-  val sqlContext = new SQLContext(sc)
+  val sqlContext = SQLContext.getOrCreate(sc)
   import

spark git commit: [MINOR][ML] Use coefficients replace weights

2015-12-03 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 688e521c2 -> d576e76bb


[MINOR][ML] Use coefficients replace weights

Use ```coefficients``` to replace ```weights```; hopefully these are the last two occurrences.
mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #10065 from yanboliang/coefficients.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d576e76b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d576e76b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d576e76b

Branch: refs/heads/master
Commit: d576e76bbaa818480d31d2b8fbbe4b15718307d9
Parents: 688e521
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Dec 3 11:37:34 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Dec 3 11:37:34 2015 -0800

--
 python/pyspark/ml/classification.py | 2 +-
 python/pyspark/ml/regression.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d576e76b/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 4a2982e..5599b8f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -49,7 +49,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], 
[]))]).toDF()
 >>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
 >>> model = lr.fit(df)
->>> model.weights
+>>> model.coefficients
 DenseVector([5.5...])
 >>> model.intercept
 -2.68...

http://git-wip-us.apache.org/repos/asf/spark/blob/d576e76b/python/pyspark/ml/regression.py
--
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 944e648..a0bb8ce 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -40,7 +40,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
 Linear regression.
 
 The learning objective is to minimize the squared error, with 
regularization.
-The specific squared error loss function used is: L = 1/2n ||A weights - 
y||^2^
+The specific squared error loss function used is: L = 1/2n ||A 
coefficients - y||^2^
 
 This support multiple types of regularization:
  - none (a.k.a. ordinary least squares)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [MINOR][ML] Use coefficients replace weights

2015-12-03 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 bf8b95fa4 -> e0577f542


[MINOR][ML] Use coefficients replace weights

Use ```coefficients``` to replace ```weights```; hopefully these are the last two occurrences.
mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #10065 from yanboliang/coefficients.

(cherry picked from commit d576e76bbaa818480d31d2b8fbbe4b15718307d9)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0577f54
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0577f54
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0577f54

Branch: refs/heads/branch-1.6
Commit: e0577f542878d582651aad7c65dc33c47014b4fb
Parents: bf8b95f
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Dec 3 11:37:34 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Dec 3 11:37:41 2015 -0800

--
 python/pyspark/ml/classification.py | 2 +-
 python/pyspark/ml/regression.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e0577f54/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 4a2982e..5599b8f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -49,7 +49,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 ... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], 
[]))]).toDF()
 >>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
 >>> model = lr.fit(df)
->>> model.weights
+>>> model.coefficients
 DenseVector([5.5...])
 >>> model.intercept
 -2.68...

http://git-wip-us.apache.org/repos/asf/spark/blob/e0577f54/python/pyspark/ml/regression.py
--
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 944e648..a0bb8ce 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -40,7 +40,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
 Linear regression.
 
 The learning objective is to minimize the squared error, with 
regularization.
-The specific squared error loss function used is: L = 1/2n ||A weights - 
y||^2^
+The specific squared error loss function used is: L = 1/2n ||A 
coefficients - y||^2^
 
 This support multiple types of regularization:
  - none (a.k.a. ordinary least squares)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning

2015-12-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 452690ba1 -> de07d06ab


[SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning

cc mengxr noel-smith

I worked on this issue based on https://github.com/apache/spark/pull/8729.
ehsanmok, thank you for your contribution!

Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Author: Ehsan M.Kermani <ehsanmo1...@gmail.com>

Closes #9338 from yu-iskw/JIRA-10266.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de07d06a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de07d06a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de07d06a

Branch: refs/heads/master
Commit: de07d06abecf3516c95d099b6c01a86e0c8cfd8c
Parents: 452690b
Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Authored: Wed Dec 2 14:15:54 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 2 14:15:54 2015 -0800

--
 .../apache/spark/ml/tuning/CrossValidator.scala | 34 ++--
 .../spark/ml/tuning/ParamGridBuilder.scala  | 14 ++--
 .../spark/ml/tuning/TrainValidationSplit.scala  | 26 ---
 3 files changed, 58 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/de07d06a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 83a9048..5c09f1a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -19,18 +19,18 @@ package org.apache.spark.ml.tuning
 
 import com.github.fommil.netlib.F2jBLAS
 import org.apache.hadoop.fs.Path
-import org.json4s.{JObject, DefaultFormats}
 import org.json4s.jackson.JsonMethods._
+import org.json4s.{DefaultFormats, JObject}
 
-import org.apache.spark.ml.classification.OneVsRestParams
-import org.apache.spark.ml.feature.RFormulaModel
-import org.apache.spark.{SparkContext, Logging}
+import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml._
+import org.apache.spark.ml.classification.OneVsRestParams
 import org.apache.spark.ml.evaluation.Evaluator
+import org.apache.spark.ml.feature.RFormulaModel
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.util._
 import org.apache.spark.ml.util.DefaultParamsReader.Metadata
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.types.StructType
@@ -58,26 +58,34 @@ private[ml] trait CrossValidatorParams extends 
ValidatorParams {
  * :: Experimental ::
  * K-fold cross validation.
  */
+@Since("1.2.0")
 @Experimental
-class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorModel]
+class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
+  extends Estimator[CrossValidatorModel]
   with CrossValidatorParams with MLWritable with Logging {
 
+  @Since("1.2.0")
   def this() = this(Identifiable.randomUID("cv"))
 
   private val f2jBLAS = new F2jBLAS
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEstimator(value: Estimator[_]): this.type = set(estimator, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEstimatorParamMaps(value: Array[ParamMap]): this.type = 
set(estimatorParamMaps, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEvaluator(value: Evaluator): this.type = set(evaluator, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setNumFolds(value: Int): this.type = set(numFolds, value)
 
+  @Since("1.4.0")
   override def fit(dataset: DataFrame): CrossValidatorModel = {
 val schema = dataset.schema
 transformSchema(schema, logging = true)
@@ -116,10 +124,12 @@ class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorM
 copyValues(new CrossValidatorModel(uid, bestModel, 
metrics).setParent(this))
   }
 
+  @Since("1.4.0")
   override def transformSchema(schema: StructType): StructType = {
 $(estimator).transformSchema(schema)
   }
 
+  @Since("1.4.0")
   override def validateParams(): Unit = {
 super.validateParams()
 val est = $(estimator)
@@ -128,6 +138,7 @@ class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorM
 }
   }
 
+  @Since("1.4.0")
   override def copy(extra: ParamMap): CrossValidator = {
 val copied = defaultCopy(extra).asInstanceOf[CrossValidator]

spark git commit: [SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning

2015-12-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 5d915fed3 -> 911259e9a


[SPARK-10266][DOCUMENTATION, ML] Fixed @Since annotation for ml.tunning

cc mengxr noel-smith

I worked on this issue based on https://github.com/apache/spark/pull/8729.
ehsanmok, thank you for your contribution!

Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Author: Ehsan M.Kermani <ehsanmo1...@gmail.com>

Closes #9338 from yu-iskw/JIRA-10266.

(cherry picked from commit de07d06abecf3516c95d099b6c01a86e0c8cfd8c)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/911259e9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/911259e9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/911259e9

Branch: refs/heads/branch-1.6
Commit: 911259e9af6f9a81e775b1aa6d82fa44956bf993
Parents: 5d915fe
Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Authored: Wed Dec 2 14:15:54 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 2 14:16:05 2015 -0800

--
 .../apache/spark/ml/tuning/CrossValidator.scala | 34 ++--
 .../spark/ml/tuning/ParamGridBuilder.scala  | 14 ++--
 .../spark/ml/tuning/TrainValidationSplit.scala  | 26 ---
 3 files changed, 58 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/911259e9/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 83a9048..5c09f1a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -19,18 +19,18 @@ package org.apache.spark.ml.tuning
 
 import com.github.fommil.netlib.F2jBLAS
 import org.apache.hadoop.fs.Path
-import org.json4s.{JObject, DefaultFormats}
 import org.json4s.jackson.JsonMethods._
+import org.json4s.{DefaultFormats, JObject}
 
-import org.apache.spark.ml.classification.OneVsRestParams
-import org.apache.spark.ml.feature.RFormulaModel
-import org.apache.spark.{SparkContext, Logging}
+import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml._
+import org.apache.spark.ml.classification.OneVsRestParams
 import org.apache.spark.ml.evaluation.Evaluator
+import org.apache.spark.ml.feature.RFormulaModel
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.util._
 import org.apache.spark.ml.util.DefaultParamsReader.Metadata
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.types.StructType
@@ -58,26 +58,34 @@ private[ml] trait CrossValidatorParams extends 
ValidatorParams {
  * :: Experimental ::
  * K-fold cross validation.
  */
+@Since("1.2.0")
 @Experimental
-class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorModel]
+class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
+  extends Estimator[CrossValidatorModel]
   with CrossValidatorParams with MLWritable with Logging {
 
+  @Since("1.2.0")
   def this() = this(Identifiable.randomUID("cv"))
 
   private val f2jBLAS = new F2jBLAS
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEstimator(value: Estimator[_]): this.type = set(estimator, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEstimatorParamMaps(value: Array[ParamMap]): this.type = 
set(estimatorParamMaps, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setEvaluator(value: Evaluator): this.type = set(evaluator, value)
 
   /** @group setParam */
+  @Since("1.2.0")
   def setNumFolds(value: Int): this.type = set(numFolds, value)
 
+  @Since("1.4.0")
   override def fit(dataset: DataFrame): CrossValidatorModel = {
 val schema = dataset.schema
 transformSchema(schema, logging = true)
@@ -116,10 +124,12 @@ class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorM
 copyValues(new CrossValidatorModel(uid, bestModel, 
metrics).setParent(this))
   }
 
+  @Since("1.4.0")
   override def transformSchema(schema: StructType): StructType = {
 $(estimator).transformSchema(schema)
   }
 
+  @Since("1.4.0")
   override def validateParams(): Unit = {
 super.validateParams()
 val est = $(estimator)
@@ -128,6 +138,7 @@ class CrossValidator(override val uid: String) extends 
Estimator[CrossValidatorM
 }
   }
 
+  @Since("

spark git commit: [SPARK-12000] do not specify arg types when reference a method in ScalaDoc

2015-12-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 cb142fd1e -> 656d44e20


[SPARK-12000] do not specify arg types when reference a method in ScalaDoc

This fixes SPARK-12000, verified locally with JDK 7. It seems that 
`scaladoc` tries to match method names and gets confused by annotations.

cc: JoshRosen jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #10114 from mengxr/SPARK-12000.2.

(cherry picked from commit 9bb695b7a82d837e2c7a724514ea6b203efb5364)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/656d44e2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/656d44e2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/656d44e2

Branch: refs/heads/branch-1.6
Commit: 656d44e2021d2f637d724c1d71ecdca1f447a4be
Parents: cb142fd
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Dec 2 17:19:31 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 2 17:19:45 2015 -0800

--
 .../org/apache/spark/mllib/clustering/BisectingKMeans.scala  | 2 +-
 .../org/apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/656d44e2/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
index 29a7aa0..82adfa6 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
@@ -214,7 +214,7 @@ class BisectingKMeans private (
   }
 
   /**
-   * Java-friendly version of [[run(RDD[Vector])*]]
+   * Java-friendly version of [[run()]].
*/
   def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/656d44e2/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
index 5015f15..f942e56 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
@@ -64,7 +64,7 @@ class BisectingKMeansModel @Since("1.6.0") (
   }
 
   /**
-   * Java-friendly version of [[predict(RDD[Vector])*]]
+   * Java-friendly version of [[predict()]].
*/
   @Since("1.6.0")
   def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
@@ -88,7 +88,7 @@ class BisectingKMeansModel @Since("1.6.0") (
   }
 
   /**
-   * Java-friendly version of [[computeCost(RDD[Vector])*]].
+   * Java-friendly version of [[computeCost()]].
*/
   @Since("1.6.0")
   def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-12000] do not specify arg types when reference a method in ScalaDoc

2015-12-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master d0d7ec533 -> 9bb695b7a


[SPARK-12000] do not specify arg types when reference a method in ScalaDoc

This fixes SPARK-12000, verified locally with JDK 7. It seems that 
`scaladoc` tries to match method names and gets confused by annotations.

cc: JoshRosen jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #10114 from mengxr/SPARK-12000.2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bb695b7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bb695b7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bb695b7

Branch: refs/heads/master
Commit: 9bb695b7a82d837e2c7a724514ea6b203efb5364
Parents: d0d7ec5
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Dec 2 17:19:31 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Dec 2 17:19:31 2015 -0800

--
 .../org/apache/spark/mllib/clustering/BisectingKMeans.scala  | 2 +-
 .../org/apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9bb695b7/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
index 29a7aa0..82adfa6 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
@@ -214,7 +214,7 @@ class BisectingKMeans private (
   }
 
   /**
-   * Java-friendly version of [[run(RDD[Vector])*]]
+   * Java-friendly version of [[run()]].
*/
   def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/9bb695b7/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
index 5015f15..f942e56 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
@@ -64,7 +64,7 @@ class BisectingKMeansModel @Since("1.6.0") (
   }
 
   /**
-   * Java-friendly version of [[predict(RDD[Vector])*]]
+   * Java-friendly version of [[predict()]].
*/
   @Since("1.6.0")
   def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
@@ -88,7 +88,7 @@ class BisectingKMeansModel @Since("1.6.0") (
   }
 
   /**
-   * Java-friendly version of [[computeCost(RDD[Vector])*]].
+   * Java-friendly version of [[computeCost()]].
*/
   @Since("1.6.0")
   def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [MINOR][DOCS] fixed list display in ml-ensembles

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 32911de77 -> 0978ec11c


[MINOR][DOCS] fixed list display in ml-ensembles

The list in ml-ensembles.md wasn't properly formatted and, as a result, was 
looking like this:
![old](http://i.imgur.com/2ZhELLR.png)

This PR aims to make it look like this:
![new](http://i.imgur.com/0Xriwd2.png)

Author: BenFradet <benjamin.fra...@gmail.com>

Closes #10025 from BenFradet/ml-ensembles-doc.

(cherry picked from commit f2fbfa444f6e8d27953ec2d1c0b3abd603c963f9)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0978ec11
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0978ec11
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0978ec11

Branch: refs/heads/branch-1.6
Commit: 0978ec11c9a080bd493da2e9d11c81c08e8e6962
Parents: 32911de
Author: BenFradet <benjamin.fra...@gmail.com>
Authored: Mon Nov 30 13:02:08 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 13:02:19 2015 -0800

--
 docs/ml-ensembles.md | 1 +
 1 file changed, 1 insertion(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0978ec11/docs/ml-ensembles.md
--
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index f6c3c30..14fef76 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -20,6 +20,7 @@ Both use [MLlib decision trees](ml-decision-tree.html) as 
their base models.
 Users can find more information about ensemble algorithms in the [MLlib 
Ensemble guide](mllib-ensembles.html).  In this section, we demonstrate the 
Pipelines API for ensembles.
 
 The main differences between this API and the [original MLlib ensembles 
API](mllib-ensembles.html) are:
+
 * support for ML Pipelines
 * separation of classification vs. regression
 * use of DataFrame metadata to distinguish continuous and categorical features


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [MINOR][DOCS] fixed list display in ml-ensembles

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 8df584b02 -> f2fbfa444


[MINOR][DOCS] fixed list display in ml-ensembles

The list in ml-ensembles.md wasn't properly formatted and, as a result, was 
looking like this:
![old](http://i.imgur.com/2ZhELLR.png)

This PR aims to make it look like this:
![new](http://i.imgur.com/0Xriwd2.png)

Author: BenFradet <benjamin.fra...@gmail.com>

Closes #10025 from BenFradet/ml-ensembles-doc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2fbfa44
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2fbfa44
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2fbfa44

Branch: refs/heads/master
Commit: f2fbfa444f6e8d27953ec2d1c0b3abd603c963f9
Parents: 8df584b
Author: BenFradet <benjamin.fra...@gmail.com>
Authored: Mon Nov 30 13:02:08 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 13:02:08 2015 -0800

--
 docs/ml-ensembles.md | 1 +
 1 file changed, 1 insertion(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f2fbfa44/docs/ml-ensembles.md
--
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index f6c3c30..14fef76 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -20,6 +20,7 @@ Both use [MLlib decision trees](ml-decision-tree.html) as 
their base models.
 Users can find more information about ensemble algorithms in the [MLlib 
Ensemble guide](mllib-ensembles.html).  In this section, we demonstrate the 
Pipelines API for ensembles.
 
 The main differences between this API and the [original MLlib ensembles 
API](mllib-ensembles.html) are:
+
 * support for ML Pipelines
 * separation of classification vs. regression
 * use of DataFrame metadata to distinguish continuous and categorical features


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11960][MLLIB][DOC] User guide for streaming tests

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 a387cef3a -> ebf87ebc0


[SPARK-11960][MLLIB][DOC] User guide for streaming tests

CC jkbradley mengxr josepablocam

Author: Feynman Liang <feynman.li...@gmail.com>

Closes #10005 from feynmanliang/streaming-test-user-guide.

(cherry picked from commit 55358889309cf2d856b72e72e0f3081dfdf61cfa)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebf87ebc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebf87ebc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebf87ebc

Branch: refs/heads/branch-1.6
Commit: ebf87ebc02075497f4682e3ad0f8e63d33f3b86e
Parents: a387cef
Author: Feynman Liang <feynman.li...@gmail.com>
Authored: Mon Nov 30 15:38:44 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 15:38:51 2015 -0800

--
 docs/mllib-guide.md |  1 +
 docs/mllib-statistics.md| 25 
 .../examples/mllib/StreamingTestExample.scala   |  2 ++
 3 files changed, 28 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 54e35fc..43772ad 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -34,6 +34,7 @@ We list major functionality from both below, with links to 
detailed guides.
   * [correlations](mllib-statistics.html#correlations)
   * [stratified sampling](mllib-statistics.html#stratified-sampling)
   * [hypothesis testing](mllib-statistics.html#hypothesis-testing)
+  * [streaming significance 
testing](mllib-statistics.html#streaming-significance-testing)
   * [random data generation](mllib-statistics.html#random-data-generation)
 * [Classification and regression](mllib-classification-regression.html)
   * [linear models (SVMs, logistic regression, linear 
regression)](mllib-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/docs/mllib-statistics.md
--
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index ade5b07..de209f6 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -521,6 +521,31 @@ print(testResult) # summary of the test including the 
p-value, test statistic,
 
 
 
+### Streaming Significance Testing
+MLlib provides online implementations of some tests to support use cases
+like A/B testing. These tests may be performed on a Spark Streaming
+`DStream[(Boolean,Double)]` where the first element of each tuple
+indicates control group (`false`) or treatment group (`true`) and the
+second element is the value of an observation.
+
+Streaming significance testing supports the following parameters:
+
+* `peacePeriod` - The number of initial data points from the stream to
+ignore, used to mitigate novelty effects.
+* `windowSize` - The number of past batches to perform hypothesis
+testing over. Setting to `0` will perform cumulative processing using
+all prior batches.
+
+
+
+
+[`StreamingTest`](api/scala/index.html#org.apache.spark.mllib.stat.test.StreamingTest)
+provides streaming hypothesis testing.
+
+{% include_example 
scala/org/apache/spark/examples/mllib/StreamingTestExample.scala %}
+
+
+
 
 ## Random data generation
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ebf87ebc/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala
index ab29f90..b6677c6 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingTestExample.scala
@@ -64,6 +64,7 @@ object StreamingTestExample {
   dir.toString
 })
 
+// $example on$
 val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
   case Array(label, value) => (label.toBoolean, value.toDouble)
 })
@@ -75,6 +76,7 @@ object StreamingTestExample {
 
 val out = streamingTest.registerStream(data)
 out.print()
+// $example off$
 
 // Stop processing if test becomes significant or we time out
 var timeoutCounter = numBatchesTimeout


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 a8c6d8acc -> 1562ef10f


[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

jira: https://issues.apache.org/jira/browse/SPARK-11689

Add simple user guide for LDA under spark.ml and example code under examples/. 
Use include_example to include example code in the user guide markdown. Check 
SPARK-11606 for instructions.

The original PR was reverted due to a documentation build error: 
https://github.com/apache/spark/pull/9722

mengxr feynmanliang yinxusen  Sorry for the trouble.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9974 from hhbyyh/ldaMLExample.

(cherry picked from commit e232720a65dfb9ae6135cbb7674e35eddd88d625)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1562ef10
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1562ef10
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1562ef10

Branch: refs/heads/branch-1.6
Commit: 1562ef10f5d1722a6c275726083684e6d0463a4f
Parents: a8c6d8a
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Mon Nov 30 14:56:51 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 14:56:58 2015 -0800

--
 docs/ml-clustering.md   | 31 +++
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 +
 .../spark/examples/ml/JavaLDAExample.java   | 97 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 208 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
new file mode 100644
index 000..cfefb5d
--- /dev/null
+++ b/docs/ml-clustering.md
@@ -0,0 +1,31 @@
+---
+layout: global
+title: Clustering - ML
+displayTitle: ML - Clustering
+---
+
+In this section, we introduce the pipeline API for [clustering in 
mllib](mllib-clustering.html).
+
+## Latent Dirichlet allocation (LDA)
+
+`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and 
`OnlineLDAOptimizer`,
+and generates a `LDAModel` as the base models. Expert users may cast a 
`LDAModel` generated by
+`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
+
+
+
+
+
+Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
+
+{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
+
+
+
+
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) 
for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
+
+
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index be18a05..6f35b30 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the 
`spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -950,4 +951,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 91e50cc..54e35fc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,6 +69,7 @@ We list major functionality from both below, with links to 
detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines 
API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/1562ef10/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java 
b/examples/src/main/java/org/apache/spark/examples

spark git commit: [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e232720a6 -> de64b65f7


[SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)

Remove duplicate mllib example (DT/RF/GBT in Java/Python).
Since we have tutorial code for DT/RF/GBT classification/regression in 
Scala/Java/Python and example applications for DT/RF/GBT in Scala, we mark 
these as duplicates and remove them.
mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9954 from yanboliang/SPARK-11975.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de64b65f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de64b65f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de64b65f

Branch: refs/heads/master
Commit: de64b65f7cf2ac58c1abc310ba547637fdbb8557
Parents: e232720
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Mon Nov 30 15:01:08 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 15:01:08 2015 -0800

--
 .../spark/examples/mllib/JavaDecisionTree.java  | 116 ---
 .../mllib/JavaGradientBoostedTreesRunner.java   | 126 
 .../examples/mllib/JavaRandomForestExample.java | 139 --
 .../main/python/mllib/decision_tree_runner.py   | 144 ---
 .../main/python/mllib/gradient_boosted_trees.py |  77 --
 .../main/python/mllib/random_forest_example.py  |  90 
 6 files changed, 692 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/de64b65f/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
deleted file mode 100644
index 1f82e3f..000
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.mllib;
-
-import java.util.HashMap;
-
-import scala.Tuple2;
-
-import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.tree.DecisionTree;
-import org.apache.spark.mllib.tree.model.DecisionTreeModel;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.SparkConf;
-
-/**
- * Classification and regression using decision trees.
- */
-public final class JavaDecisionTree {
-
-  public static void main(String[] args) {
-String datapath = "data/mllib/sample_libsvm_data.txt";
-if (args.length == 1) {
-  datapath = args[0];
-} else if (args.length > 1) {
-  System.err.println("Usage: JavaDecisionTree ");
-  System.exit(1);
-}
-SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-
-JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), 
datapath).toJavaRDD().cache();
-
-// Compute the number of classes from the data.
-Integer numClasses = data.map(new Function<LabeledPoint, Double>() {
-  @Override public Double call(LabeledPoint p) {
-return p.label();
-  }
-}).countByValue().size();
-
-// Set parameters.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
-String impurity = "gini";
-Integer maxDepth = 5;
-Integer maxBins = 32;
-
-// Train a DecisionTree model for classification.
-final DecisionTreeModel model = Decis

spark git commit: [SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 1562ef10f -> a387cef3a


[SPARK-11975][ML] Remove duplicate mllib example (DT/RF/GBT in Java/Python)

Remove duplicate mllib example (DT/RF/GBT in Java/Python).
Since we have tutorial code for DT/RF/GBT classification/regression in 
Scala/Java/Python and example applications for DT/RF/GBT in Scala, we mark 
these as duplicates and remove them.
mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9954 from yanboliang/SPARK-11975.

(cherry picked from commit de64b65f7cf2ac58c1abc310ba547637fdbb8557)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a387cef3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a387cef3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a387cef3

Branch: refs/heads/branch-1.6
Commit: a387cef3a40d47a8ca7fa9c6aa2842318700df49
Parents: 1562ef1
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Mon Nov 30 15:01:08 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 15:01:16 2015 -0800

--
 .../spark/examples/mllib/JavaDecisionTree.java  | 116 ---
 .../mllib/JavaGradientBoostedTreesRunner.java   | 126 
 .../examples/mllib/JavaRandomForestExample.java | 139 --
 .../main/python/mllib/decision_tree_runner.py   | 144 ---
 .../main/python/mllib/gradient_boosted_trees.py |  77 --
 .../main/python/mllib/random_forest_example.py  |  90 
 6 files changed, 692 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a387cef3/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
deleted file mode 100644
index 1f82e3f..000
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTree.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.mllib;
-
-import java.util.HashMap;
-
-import scala.Tuple2;
-
-import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.tree.DecisionTree;
-import org.apache.spark.mllib.tree.model.DecisionTreeModel;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.SparkConf;
-
-/**
- * Classification and regression using decision trees.
- */
-public final class JavaDecisionTree {
-
-  public static void main(String[] args) {
-String datapath = "data/mllib/sample_libsvm_data.txt";
-if (args.length == 1) {
-  datapath = args[0];
-} else if (args.length > 1) {
-  System.err.println("Usage: JavaDecisionTree ");
-  System.exit(1);
-}
-SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-
-JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), 
datapath).toJavaRDD().cache();
-
-// Compute the number of classes from the data.
-Integer numClasses = data.map(new Function<LabeledPoint, Double>() {
-  @Override public Double call(LabeledPoint p) {
-return p.label();
-  }
-}).countByValue().size();
-
-// Set parameters.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
-String impurity = "gini";
-Integer maxDepth = 5;
-

spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

2015-11-30 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master a8ceec5e8 -> e232720a6


[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

jira: https://issues.apache.org/jira/browse/SPARK-11689

Add simple user guide for LDA under spark.ml and example code under examples/. 
Use include_example to include example code in the user guide markdown. Check 
SPARK-11606 for instructions.

The original PR was reverted due to a documentation build error: 
https://github.com/apache/spark/pull/9722

mengxr feynmanliang yinxusen  Sorry for the trouble.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9974 from hhbyyh/ldaMLExample.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e232720a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e232720a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e232720a

Branch: refs/heads/master
Commit: e232720a65dfb9ae6135cbb7674e35eddd88d625
Parents: a8ceec5
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Mon Nov 30 14:56:51 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Nov 30 14:56:51 2015 -0800

--
 docs/ml-clustering.md   | 31 +++
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 +
 .../spark/examples/ml/JavaLDAExample.java   | 97 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 208 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
new file mode 100644
index 000..cfefb5d
--- /dev/null
+++ b/docs/ml-clustering.md
@@ -0,0 +1,31 @@
+---
+layout: global
+title: Clustering - ML
+displayTitle: ML - Clustering
+---
+
+In this section, we introduce the pipeline API for [clustering in 
mllib](mllib-clustering.html).
+
+## Latent Dirichlet allocation (LDA)
+
+`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and 
`OnlineLDAOptimizer`,
+and generates a `LDAModel` as the base models. Expert users may cast a 
`LDAModel` generated by
+`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
+
+
+
+
+
+Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
+
+{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
+
+
+
+
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) 
for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
+
+
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index be18a05..6f35b30 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the 
`spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -950,4 +951,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 91e50cc..54e35fc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,6 +69,7 @@ We list major functionality from both below, with links to 
detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines 
API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/e232720a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
new file mode 100644
index 000..3a5d323
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/m

spark git commit: [SPARK-11952][ML] Remove duplicate ml examples

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e5aaae6e1 -> 56a0aba0a


[SPARK-11952][ML] Remove duplicate ml examples

Remove duplicate ml examples (only for ml).  mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9933 from yanboliang/SPARK-11685.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56a0aba0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56a0aba0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56a0aba0

Branch: refs/heads/master
Commit: 56a0aba0a60326ba026056c9a23f3f6ec7258c19
Parents: e5aaae6
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue Nov 24 09:52:53 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:52:53 2015 -0800

--
 .../main/python/ml/gradient_boosted_trees.py| 82 --
 .../src/main/python/ml/logistic_regression.py   | 66 ---
 .../src/main/python/ml/random_forest_example.py | 87 
 3 files changed, 235 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/56a0aba0/examples/src/main/python/ml/gradient_boosted_trees.py
--
diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py 
b/examples/src/main/python/ml/gradient_boosted_trees.py
deleted file mode 100644
index c3bf8aa..000
--- a/examples/src/main/python/ml/gradient_boosted_trees.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-import sys
-
-from pyspark import SparkContext
-from pyspark.ml.classification import GBTClassifier
-from pyspark.ml.feature import StringIndexer
-from pyspark.ml.regression import GBTRegressor
-from pyspark.mllib.evaluation import BinaryClassificationMetrics, 
RegressionMetrics
-from pyspark.sql import Row, SQLContext
-
-"""
-A simple example demonstrating a Gradient Boosted Trees 
Classification/Regression Pipeline.
-Note: GBTClassifier only supports binary classification currently
-Run with:
-  bin/spark-submit examples/src/main/python/ml/gradient_boosted_trees.py
-"""
-
-
-def testClassification(train, test):
-# Train a GradientBoostedTrees model.
-
-rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")
-
-model = rf.fit(train)
-predictionAndLabels = model.transform(test).select("prediction", 
"indexedLabel") \
-.map(lambda x: (x.prediction, x.indexedLabel))
-
-metrics = BinaryClassificationMetrics(predictionAndLabels)
-print("AUC %.3f" % metrics.areaUnderROC)
-
-
-def testRegression(train, test):
-# Train a GradientBoostedTrees model.
-
-rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
-
-model = rf.fit(train)
-predictionAndLabels = model.transform(test).select("prediction", 
"indexedLabel") \
-.map(lambda x: (x.prediction, x.indexedLabel))
-
-metrics = RegressionMetrics(predictionAndLabels)
-print("rmse %.3f" % metrics.rootMeanSquaredError)
-print("r2 %.3f" % metrics.r2)
-print("mae %.3f" % metrics.meanAbsoluteError)
-
-
-if __name__ == "__main__":
-if len(sys.argv) > 1:
-print("Usage: gradient_boosted_trees", file=sys.stderr)
-exit(1)
-sc = SparkContext(appName="PythonGBTExample")
-sqlContext = SQLContext(sc)
-
-# Load the data stored in LIBSVM format as a DataFrame.
-df = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Map labels into an indexed column of labels in [0, numLabels)
-stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
-si_model = stringIndexer.fit(df)
-td = si_model.transform(df)
-[train, test] = td.randomSplit([0.7, 0.3])
-testClassification(train, test)
-testRegression(train,

spark git commit: [SPARK-11952][ML] Remove duplicate ml examples

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 3cb1b6d39 -> 6914b7504


[SPARK-11952][ML] Remove duplicate ml examples

Remove duplicate ml examples (only for ml).  mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9933 from yanboliang/SPARK-11685.

(cherry picked from commit 56a0aba0a60326ba026056c9a23f3f6ec7258c19)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6914b750
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6914b750
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6914b750

Branch: refs/heads/branch-1.6
Commit: 6914b75046dceda47ba3ff904e67f55752e8d49d
Parents: 3cb1b6d
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue Nov 24 09:52:53 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:53:01 2015 -0800

--
 .../main/python/ml/gradient_boosted_trees.py| 82 --
 .../src/main/python/ml/logistic_regression.py   | 66 ---
 .../src/main/python/ml/random_forest_example.py | 87 
 3 files changed, 235 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6914b750/examples/src/main/python/ml/gradient_boosted_trees.py
--
diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py 
b/examples/src/main/python/ml/gradient_boosted_trees.py
deleted file mode 100644
index c3bf8aa..000
--- a/examples/src/main/python/ml/gradient_boosted_trees.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-import sys
-
-from pyspark import SparkContext
-from pyspark.ml.classification import GBTClassifier
-from pyspark.ml.feature import StringIndexer
-from pyspark.ml.regression import GBTRegressor
-from pyspark.mllib.evaluation import BinaryClassificationMetrics, 
RegressionMetrics
-from pyspark.sql import Row, SQLContext
-
-"""
-A simple example demonstrating a Gradient Boosted Trees 
Classification/Regression Pipeline.
-Note: GBTClassifier only supports binary classification currently
-Run with:
-  bin/spark-submit examples/src/main/python/ml/gradient_boosted_trees.py
-"""
-
-
-def testClassification(train, test):
-# Train a GradientBoostedTrees model.
-
-rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")
-
-model = rf.fit(train)
-predictionAndLabels = model.transform(test).select("prediction", 
"indexedLabel") \
-.map(lambda x: (x.prediction, x.indexedLabel))
-
-metrics = BinaryClassificationMetrics(predictionAndLabels)
-print("AUC %.3f" % metrics.areaUnderROC)
-
-
-def testRegression(train, test):
-# Train a GradientBoostedTrees model.
-
-rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
-
-model = rf.fit(train)
-predictionAndLabels = model.transform(test).select("prediction", 
"indexedLabel") \
-.map(lambda x: (x.prediction, x.indexedLabel))
-
-metrics = RegressionMetrics(predictionAndLabels)
-print("rmse %.3f" % metrics.rootMeanSquaredError)
-print("r2 %.3f" % metrics.r2)
-print("mae %.3f" % metrics.meanAbsoluteError)
-
-
-if __name__ == "__main__":
-if len(sys.argv) > 1:
-print("Usage: gradient_boosted_trees", file=sys.stderr)
-exit(1)
-sc = SparkContext(appName="PythonGBTExample")
-sqlContext = SQLContext(sc)
-
-# Load the data stored in LIBSVM format as a DataFrame.
-df = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Map labels into an indexed column of labels in [0, numLabels)
-stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
-si_model = stringIndexer.fit(df)
-td = si_model.transfo

spark git commit: [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 56a0aba0a -> 9e24ba667


[SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries 
ignore weight col

Doc for 1.6 that the summaries mostly ignore the weight column.
To be corrected for 1.7

CC: mengxr thunterdb

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9927 from jkbradley/linregsummary-doc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e24ba66
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e24ba66
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e24ba66

Branch: refs/heads/master
Commit: 9e24ba667e43290fbaa3cacb93cf5d9be790f1fd
Parents: 56a0aba
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 24 09:54:55 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:54:55 2015 -0800

--
 .../ml/classification/LogisticRegression.scala| 18 ++
 .../spark/ml/regression/LinearRegression.scala| 15 +++
 2 files changed, 33 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9e24ba66/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 418bbdc..d320d64 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -755,23 +755,35 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns the receiver operating characteristic (ROC) curve,
* which is an Dataframe having two fields (FPR, TPR)
* with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
* @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic
*/
   @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR")
 
   /**
* Computes the area under the receiver operating characteristic (ROC) curve.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC()
 
   /**
* Returns the precision-recall curve, which is an Dataframe containing
* two fields recall, precision with (0.0, 1.0) prepended to it.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", 
"precision")
 
   /**
* Returns a dataframe with two fields (threshold, F-Measure) curve with 
beta = 1.0.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val fMeasureByThreshold: DataFrame = {
 binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure")
@@ -781,6 +793,9 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns a dataframe with two fields (threshold, precision) curve.
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the precision.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val precisionByThreshold: DataFrame = {
 binaryMetrics.precisionByThreshold().toDF("threshold", "precision")
@@ -790,6 +805,9 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns a dataframe with two fields (threshold, recall) curve.
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the recall.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val recallByThreshold: DataFrame = {
 binaryMetrics.recallByThreshold().toDF("threshold", "recall")

http://git-wip-us.apache.org/repos/asf/spark/blob/9e24ba66/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
-

spark git commit: [SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries ignore weight col

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 6914b7504 -> 70febe224


[SPARK-11521][ML][DOC] Document that Logistic, Linear Regression summaries 
ignore weight col

Document for 1.6 that the summaries mostly ignore the weight column.
This is to be corrected in 1.7.

CC: mengxr thunterdb

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9927 from jkbradley/linregsummary-doc.

(cherry picked from commit 9e24ba667e43290fbaa3cacb93cf5d9be790f1fd)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70febe22
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70febe22
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70febe22

Branch: refs/heads/branch-1.6
Commit: 70febe224f64cb6468c14d4788a63b35d0475d41
Parents: 6914b75
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 24 09:54:55 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:55:02 2015 -0800

--
 .../ml/classification/LogisticRegression.scala| 18 ++
 .../spark/ml/regression/LinearRegression.scala| 15 +++
 2 files changed, 33 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/70febe22/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 418bbdc..d320d64 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -755,23 +755,35 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns the receiver operating characteristic (ROC) curve,
* which is an Dataframe having two fields (FPR, TPR)
* with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
* @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic
*/
   @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR")
 
   /**
* Computes the area under the receiver operating characteristic (ROC) curve.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC()
 
   /**
* Returns the precision-recall curve, which is an Dataframe containing
* two fields recall, precision with (0.0, 1.0) prepended to it.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", 
"precision")
 
   /**
* Returns a dataframe with two fields (threshold, F-Measure) curve with 
beta = 1.0.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val fMeasureByThreshold: DataFrame = {
 binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure")
@@ -781,6 +793,9 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns a dataframe with two fields (threshold, precision) curve.
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the precision.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val precisionByThreshold: DataFrame = {
 binaryMetrics.precisionByThreshold().toDF("threshold", "precision")
@@ -790,6 +805,9 @@ class BinaryLogisticRegressionSummary 
private[classification] (
* Returns a dataframe with two fields (threshold, recall) curve.
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the recall.
+   *
+   * Note: This ignores instance weights (setting all to 1.0) from 
[[LogisticRegression.weightCol]].
+   *   This will change in later Spark versions.
*/
   @transient lazy val recallByThreshold: DataFrame = {
 binaryMetrics.recallByThreshold().toDF("threshold", &quo

spark git commit: [SPARK-11847][ML] Model export/import for spark.ml: LDA

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 70febe224 -> af86c38db


[SPARK-11847][ML] Model export/import for spark.ml: LDA

Add read/write support to LDA, similar to ALS.

save/load for ml.LocalLDAModel is done.
For DistributedLDAModel, I'm not sure whether we can invoke save on the
mllib.DistributedLDAModel directly. I'll send an update after some testing.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9894 from hhbyyh/ldaMLsave.

(cherry picked from commit 52bc25c8e26d4be250d8ff7864067528f4f98592)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af86c38d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af86c38d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af86c38d

Branch: refs/heads/branch-1.6
Commit: af86c38db7676c4dfc2724d5f86f0f5f3a22e349
Parents: 70febe2
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Tue Nov 24 09:56:17 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:56:24 2015 -0800

--
 .../org/apache/spark/ml/clustering/LDA.scala| 110 ++-
 .../spark/mllib/clustering/LDAModel.scala   |   4 +-
 .../apache/spark/ml/clustering/LDASuite.scala   |  44 +++-
 3 files changed, 150 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/af86c38d/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 92e0581..830510b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -17,12 +17,13 @@
 
 package org.apache.spark.ml.clustering
 
+import org.apache.hadoop.fs.Path
 import org.apache.spark.Logging
 import org.apache.spark.annotation.{Experimental, Since}
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param.shared.{HasCheckpointInterval, 
HasFeaturesCol, HasSeed, HasMaxIter}
 import org.apache.spark.ml.param._
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.clustering.{DistributedLDAModel => 
OldDistributedLDAModel,
 EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => 
OldLDAModel,
 LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel,
@@ -322,7 +323,7 @@ sealed abstract class LDAModel private[ml] (
 @Since("1.6.0") override val uid: String,
 @Since("1.6.0") val vocabSize: Int,
 @Since("1.6.0") @transient protected val sqlContext: SQLContext)
-  extends Model[LDAModel] with LDAParams with Logging {
+  extends Model[LDAModel] with LDAParams with Logging with MLWritable {
 
   // NOTE to developers:
   //  This abstraction should contain all important functionality for basic 
LDA usage.
@@ -486,6 +487,64 @@ class LocalLDAModel private[ml] (
 
   @Since("1.6.0")
   override def isDistributed: Boolean = false
+
+  @Since("1.6.0")
+  override def write: MLWriter = new LocalLDAModel.LocalLDAModelWriter(this)
+}
+
+
+@Since("1.6.0")
+object LocalLDAModel extends MLReadable[LocalLDAModel] {
+
+  private[LocalLDAModel]
+  class LocalLDAModelWriter(instance: LocalLDAModel) extends MLWriter {
+
+private case class Data(
+vocabSize: Int,
+topicsMatrix: Matrix,
+docConcentration: Vector,
+topicConcentration: Double,
+gammaShape: Double)
+
+override protected def saveImpl(path: String): Unit = {
+  DefaultParamsWriter.saveMetadata(instance, path, sc)
+  val oldModel = instance.oldLocalModel
+  val data = Data(instance.vocabSize, oldModel.topicsMatrix, 
oldModel.docConcentration,
+oldModel.topicConcentration, oldModel.gammaShape)
+  val dataPath = new Path(path, "data").toString
+  
sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
+}
+  }
+
+  private class LocalLDAModelReader extends MLReader[LocalLDAModel] {
+
+private val className = classOf[LocalLDAModel].getName
+
+override def load(path: String): LocalLDAModel = {
+  val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+  val dataPath = new Path(path, "data").toString
+  val data = sqlContext.read.parquet(dataPath)
+.select("vocabSize", "topicsMatrix", "docConcentration", 
"topicConcentration",
+  "gammaShape")
+.head()
+  val vocabSize = data.getAs[Int](0)
+  val topicsMatrix = data.getA

spark git commit: [SPARK-11847][ML] Model export/import for spark.ml: LDA

2015-11-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9e24ba667 -> 52bc25c8e


[SPARK-11847][ML] Model export/import for spark.ml: LDA

Add read/write support to LDA, similar to ALS.

save/load for ml.LocalLDAModel is done.
For DistributedLDAModel, I'm not sure whether we can invoke save on the
mllib.DistributedLDAModel directly. I'll send an update after some testing.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9894 from hhbyyh/ldaMLsave.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52bc25c8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52bc25c8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52bc25c8

Branch: refs/heads/master
Commit: 52bc25c8e26d4be250d8ff7864067528f4f98592
Parents: 9e24ba6
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Tue Nov 24 09:56:17 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 24 09:56:17 2015 -0800

--
 .../org/apache/spark/ml/clustering/LDA.scala| 110 ++-
 .../spark/mllib/clustering/LDAModel.scala   |   4 +-
 .../apache/spark/ml/clustering/LDASuite.scala   |  44 +++-
 3 files changed, 150 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/52bc25c8/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 92e0581..830510b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -17,12 +17,13 @@
 
 package org.apache.spark.ml.clustering
 
+import org.apache.hadoop.fs.Path
 import org.apache.spark.Logging
 import org.apache.spark.annotation.{Experimental, Since}
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param.shared.{HasCheckpointInterval, 
HasFeaturesCol, HasSeed, HasMaxIter}
 import org.apache.spark.ml.param._
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.clustering.{DistributedLDAModel => 
OldDistributedLDAModel,
 EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => 
OldLDAModel,
 LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel,
@@ -322,7 +323,7 @@ sealed abstract class LDAModel private[ml] (
 @Since("1.6.0") override val uid: String,
 @Since("1.6.0") val vocabSize: Int,
 @Since("1.6.0") @transient protected val sqlContext: SQLContext)
-  extends Model[LDAModel] with LDAParams with Logging {
+  extends Model[LDAModel] with LDAParams with Logging with MLWritable {
 
   // NOTE to developers:
   //  This abstraction should contain all important functionality for basic 
LDA usage.
@@ -486,6 +487,64 @@ class LocalLDAModel private[ml] (
 
   @Since("1.6.0")
   override def isDistributed: Boolean = false
+
+  @Since("1.6.0")
+  override def write: MLWriter = new LocalLDAModel.LocalLDAModelWriter(this)
+}
+
+
+@Since("1.6.0")
+object LocalLDAModel extends MLReadable[LocalLDAModel] {
+
+  private[LocalLDAModel]
+  class LocalLDAModelWriter(instance: LocalLDAModel) extends MLWriter {
+
+private case class Data(
+vocabSize: Int,
+topicsMatrix: Matrix,
+docConcentration: Vector,
+topicConcentration: Double,
+gammaShape: Double)
+
+override protected def saveImpl(path: String): Unit = {
+  DefaultParamsWriter.saveMetadata(instance, path, sc)
+  val oldModel = instance.oldLocalModel
+  val data = Data(instance.vocabSize, oldModel.topicsMatrix, 
oldModel.docConcentration,
+oldModel.topicConcentration, oldModel.gammaShape)
+  val dataPath = new Path(path, "data").toString
+  
sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
+}
+  }
+
+  private class LocalLDAModelReader extends MLReader[LocalLDAModel] {
+
+private val className = classOf[LocalLDAModel].getName
+
+override def load(path: String): LocalLDAModel = {
+  val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+  val dataPath = new Path(path, "data").toString
+  val data = sqlContext.read.parquet(dataPath)
+.select("vocabSize", "topicsMatrix", "docConcentration", 
"topicConcentration",
+  "gammaShape")
+.head()
+  val vocabSize = data.getAs[Int](0)
+  val topicsMatrix = data.getAs[Matrix](1)
+  val docConcentration = data.getAs[Vector](2)
+  val topicConcentration = data.getAs[Double](3)
+  val gamma

spark git commit: [SPARK-11895][ML] rename and refactor DatasetExample under mllib/examples

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 fc4b88f3b -> a36d9bc75


[SPARK-11895][ML] rename and refactor DatasetExample under mllib/examples

We used the name `Dataset` to refer to `SchemaRDD` in 1.2 in ML pipelines and 
created this example file. Since `Dataset` has a new meaning in Spark 1.6, we 
should rename it to avoid confusion. This PR also removes support for dense 
format to simplify the example code.

cc: yinxusen

Author: Xiangrui Meng <m...@databricks.com>

Closes #9873 from mengxr/SPARK-11895.

(cherry picked from commit fe89c1817d668e46adf70d0896c42c22a547c76a)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a36d9bc7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a36d9bc7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a36d9bc7

Branch: refs/heads/branch-1.6
Commit: a36d9bc7528ab8e6fe5e002f9b9b0a51a5b93568
Parents: fc4b88f
Author: Xiangrui Meng <m...@databricks.com>
Authored: Sun Nov 22 21:45:46 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:45:53 2015 -0800

--
 .../spark/examples/ml/DataFrameExample.scala| 104 
 .../spark/examples/mllib/DatasetExample.scala   | 123 ---
 2 files changed, 104 insertions(+), 123 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a36d9bc7/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
new file mode 100644
index 000..424f001
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+import java.io.File
+
+import com.google.common.io.Files
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.examples.mllib.AbstractParams
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+/**
+ * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
+ * {{{
+ * ./bin/run-example ml.DataFrameExample [options]
+ * }}}
+ * If you use it as a template to create your own app, please use 
`spark-submit` to submit your app.
+ */
+object DataFrameExample {
+
+  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
+extends AbstractParams[Params]
+
+  def main(args: Array[String]) {
+val defaultParams = Params()
+
+val parser = new OptionParser[Params]("DatasetExample") {
+  head("Dataset: an example app using DataFrame as a Dataset for ML.")
+  opt[String]("input")
+.text(s"input path to dataset")
+.action((x, c) => c.copy(input = x))
+  checkConfig { params =>
+success
+  }
+}
+
+parser.parse(args, defaultParams).map { params =>
+  run(params)
+}.getOrElse {
+  sys.exit(1)
+}
+  }
+
+  def run(params: Params) {
+
+val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
+val sc = new SparkContext(conf)
+val sqlContext = new SQLContext(sc)
+
+// Load input data
+println(s"Loading LIBSVM file with UDT from ${params.input}.")
+val df: DataFrame = 
sqlContext.read.format("libsvm").load(params.input).cache()
+println("Schema from LIBSVM:")
+df.printSchema()
+println(s"Loaded training data as a DataFrame with ${df.count()} records.")
+
+// Show statistical summary of labels.
+val labelSummary = df.describe("label"

spark git commit: [SPARK-11902][ML] Unhandled case in VectorAssembler#transform

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master d9cf9c21f -> 4be360d4e


[SPARK-11902][ML] Unhandled case in VectorAssembler#transform

There is an unhandled case in the transform method of VectorAssembler when one
of the input columns does not have one of the supported types: DoubleType,
NumericType, BooleanType or VectorUDT.

So, if you try to transform a column of StringType you get a cryptic 
"scala.MatchError: StringType".

This PR fixes this by throwing a SparkException when an input column has an
unsupported type.

Author: BenFradet <benjamin.fra...@gmail.com>

Closes #9885 from BenFradet/SPARK-11902.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4be360d4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4be360d4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4be360d4

Branch: refs/heads/master
Commit: 4be360d4ee6cdb4d06306feca38ddef5212608cf
Parents: d9cf9c2
Author: BenFradet <benjamin.fra...@gmail.com>
Authored: Sun Nov 22 22:05:01 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 22:05:01 2015 -0800

--
 .../org/apache/spark/ml/feature/VectorAssembler.scala|  2 ++
 .../apache/spark/ml/feature/VectorAssemblerSuite.scala   | 11 +++
 2 files changed, 13 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4be360d4/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 0feec05..801096f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -84,6 +84,8 @@ class VectorAssembler(override val uid: String)
 val numAttrs = 
group.numAttributes.getOrElse(first.getAs[Vector](index).size)
 Array.fill(numAttrs)(NumericAttribute.defaultAttr)
   }
+case otherType =>
+  throw new SparkException(s"VectorAssembler does not support the 
$otherType type")
   }
 }
 val metadata = new AttributeGroup($(outputCol), attrs).toMetadata()

http://git-wip-us.apache.org/repos/asf/spark/blob/4be360d4/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index fb21ab6..9c1c00f 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -69,6 +69,17 @@ class VectorAssemblerSuite
 }
   }
 
+  test("transform should throw an exception in case of unsupported type") {
+val df = sqlContext.createDataFrame(Seq(("a", "b", "c"))).toDF("a", "b", 
"c")
+val assembler = new VectorAssembler()
+  .setInputCols(Array("a", "b", "c"))
+  .setOutputCol("features")
+val thrown = intercept[SparkException] {
+  assembler.transform(df)
+}
+assert(thrown.getMessage contains "VectorAssembler does not support the 
StringType type")
+  }
+
   test("ML attributes") {
 val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", 
"safari")
 val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11835] Adds a sidebar menu to MLlib's documentation

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 835b5488f -> 7f9d3358a


[SPARK-11835] Adds a sidebar menu to MLlib's documentation

This PR adds a sidebar menu when browsing the user guide of MLlib. It uses a 
YAML file to describe the structure of the documentation. It should be trivial 
to adapt this to the other projects.

![screen shot 2015-11-18 at 4 46 12 pm](https://cloud.githubusercontent.com/assets/7594753/11259591/a55173f4-8e17-11e5-9340-0aed79d66262.png)

Author: Timothy Hunter <timhun...@databricks.com>

Closes #9826 from thunterdb/spark-11835.

(cherry picked from commit fc4b792d287095d70379a51f117c225d8d857078)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f9d3358
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f9d3358
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f9d3358

Branch: refs/heads/branch-1.6
Commit: 7f9d3358afd7e266c79e9989e4d874cd1183f474
Parents: 835b548
Author: Timothy Hunter <timhun...@databricks.com>
Authored: Sun Nov 22 21:51:42 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:51:51 2015 -0800

--
 docs/_data/menu-ml.yaml | 10 
 docs/_data/menu-mllib.yaml  | 75 
 docs/_includes/nav-left-wrapper-ml.html |  8 +++
 docs/_includes/nav-left.html| 17 +++
 docs/_layouts/global.html   | 24 ++---
 docs/css/main.css   | 37 ++
 6 files changed, 163 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7f9d3358/docs/_data/menu-ml.yaml
--
diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml
new file mode 100644
index 000..dff3d33
--- /dev/null
+++ b/docs/_data/menu-ml.yaml
@@ -0,0 +1,10 @@
+- text: Feature extraction, transformation, and selection
+  url: ml-features.html
+- text: Decision trees for classification and regression
+  url: ml-decision-tree.html
+- text: Ensembles
+  url: ml-ensembles.html
+- text: Linear methods with elastic-net regularization
+  url: ml-linear-methods.html
+- text: Multilayer perceptron classifier
+  url: ml-ann.html

http://git-wip-us.apache.org/repos/asf/spark/blob/7f9d3358/docs/_data/menu-mllib.yaml
--
diff --git a/docs/_data/menu-mllib.yaml b/docs/_data/menu-mllib.yaml
new file mode 100644
index 000..12d22ab
--- /dev/null
+++ b/docs/_data/menu-mllib.yaml
@@ -0,0 +1,75 @@
+- text: Data types
+  url: mllib-data-types.html
+- text: Basic statistics
+  url: mllib-statistics.html
+  subitems:
+- text: Summary statistics
+  url: mllib-statistics.html#summary-statistics
+- text: Correlations
+  url: mllib-statistics.html#correlations
+- text: Stratified sampling
+  url: mllib-statistics.html#stratified-sampling
+- text: Hypothesis testing
+  url: mllib-statistics.html#hypothesis-testing
+- text: Random data generation
+  url: mllib-statistics.html#random-data-generation
+- text: Classification and regression
+  url: mllib-classification-regression.html
+  subitems:
+- text: Linear models (SVMs, logistic regression, linear regression)
+  url: mllib-linear-methods.html
+- text: Naive Bayes
+  url: mllib-naive-bayes.html
+- text: decision trees
+  url: mllib-decision-tree.html
+- text: ensembles of trees (Random Forests and Gradient-Boosted Trees)
+  url: mllib-ensembles.html
+- text: isotonic regression
+  url: mllib-isotonic-regression.html
+- text: Collaborative filtering
+  url: mllib-collaborative-filtering.html
+  subitems:
+- text: alternating least squares (ALS)
+  url: mllib-collaborative-filtering.html#collaborative-filtering
+- text: Clustering
+  url: mllib-clustering.html
+  subitems:
+- text: k-means
+  url: mllib-clustering.html#k-means
+- text: Gaussian mixture
+  url: mllib-clustering.html#gaussian-mixture
+- text: power iteration clustering (PIC)
+  url: mllib-clustering.html#power-iteration-clustering-pic
+- text: latent Dirichlet allocation (LDA)
+  url: mllib-clustering.html#latent-dirichlet-allocation-lda
+- text: streaming k-means
+  url: mllib-clustering.html#streaming-k-means
+- text: Dimensionality reduction
+  url: mllib-dimensionality-reduction.html
+  subitems:
+- text: singular value decomposition (SVD)
+  url: mllib-dimensionality-reduction.html#singular-value-decomposition-svd
+- text: principal component analysis (PCA)
+  url: mllib-dimensionality-reduction.html#principal-component-analysis-pca
+- text: Feature extraction and t

spark git commit: [SPARK-11835] Adds a sidebar menu to MLlib's documentation

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master a6fda0bfc -> fc4b792d2


[SPARK-11835] Adds a sidebar menu to MLlib's documentation

This PR adds a sidebar menu when browsing the user guide of MLlib. It uses a 
YAML file to describe the structure of the documentation. It should be trivial 
to adapt this to the other projects.

![screen shot 2015-11-18 at 4 46 12 pm](https://cloud.githubusercontent.com/assets/7594753/11259591/a55173f4-8e17-11e5-9340-0aed79d66262.png)

Author: Timothy Hunter <timhun...@databricks.com>

Closes #9826 from thunterdb/spark-11835.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc4b792d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc4b792d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc4b792d

Branch: refs/heads/master
Commit: fc4b792d287095d70379a51f117c225d8d857078
Parents: a6fda0b
Author: Timothy Hunter <timhun...@databricks.com>
Authored: Sun Nov 22 21:51:42 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:51:42 2015 -0800

--
 docs/_data/menu-ml.yaml | 10 
 docs/_data/menu-mllib.yaml  | 75 
 docs/_includes/nav-left-wrapper-ml.html |  8 +++
 docs/_includes/nav-left.html| 17 +++
 docs/_layouts/global.html   | 24 ++---
 docs/css/main.css   | 37 ++
 6 files changed, 163 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fc4b792d/docs/_data/menu-ml.yaml
--
diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml
new file mode 100644
index 000..dff3d33
--- /dev/null
+++ b/docs/_data/menu-ml.yaml
@@ -0,0 +1,10 @@
+- text: Feature extraction, transformation, and selection
+  url: ml-features.html
+- text: Decision trees for classification and regression
+  url: ml-decision-tree.html
+- text: Ensembles
+  url: ml-ensembles.html
+- text: Linear methods with elastic-net regularization
+  url: ml-linear-methods.html
+- text: Multilayer perceptron classifier
+  url: ml-ann.html

http://git-wip-us.apache.org/repos/asf/spark/blob/fc4b792d/docs/_data/menu-mllib.yaml
--
diff --git a/docs/_data/menu-mllib.yaml b/docs/_data/menu-mllib.yaml
new file mode 100644
index 000..12d22ab
--- /dev/null
+++ b/docs/_data/menu-mllib.yaml
@@ -0,0 +1,75 @@
+- text: Data types
+  url: mllib-data-types.html
+- text: Basic statistics
+  url: mllib-statistics.html
+  subitems:
+    - text: Summary statistics
+      url: mllib-statistics.html#summary-statistics
+    - text: Correlations
+      url: mllib-statistics.html#correlations
+    - text: Stratified sampling
+      url: mllib-statistics.html#stratified-sampling
+    - text: Hypothesis testing
+      url: mllib-statistics.html#hypothesis-testing
+    - text: Random data generation
+      url: mllib-statistics.html#random-data-generation
+- text: Classification and regression
+  url: mllib-classification-regression.html
+  subitems:
+    - text: Linear models (SVMs, logistic regression, linear regression)
+      url: mllib-linear-methods.html
+    - text: Naive Bayes
+      url: mllib-naive-bayes.html
+    - text: decision trees
+      url: mllib-decision-tree.html
+    - text: ensembles of trees (Random Forests and Gradient-Boosted Trees)
+      url: mllib-ensembles.html
+    - text: isotonic regression
+      url: mllib-isotonic-regression.html
+- text: Collaborative filtering
+  url: mllib-collaborative-filtering.html
+  subitems:
+    - text: alternating least squares (ALS)
+      url: mllib-collaborative-filtering.html#collaborative-filtering
+- text: Clustering
+  url: mllib-clustering.html
+  subitems:
+    - text: k-means
+      url: mllib-clustering.html#k-means
+    - text: Gaussian mixture
+      url: mllib-clustering.html#gaussian-mixture
+    - text: power iteration clustering (PIC)
+      url: mllib-clustering.html#power-iteration-clustering-pic
+    - text: latent Dirichlet allocation (LDA)
+      url: mllib-clustering.html#latent-dirichlet-allocation-lda
+    - text: streaming k-means
+      url: mllib-clustering.html#streaming-k-means
+- text: Dimensionality reduction
+  url: mllib-dimensionality-reduction.html
+  subitems:
+    - text: singular value decomposition (SVD)
+      url: mllib-dimensionality-reduction.html#singular-value-decomposition-svd
+    - text: principal component analysis (PCA)
+      url: mllib-dimensionality-reduction.html#principal-component-analysis-pca
+- text: Feature extraction and transformation
+  url: mllib-feature-extraction.html
+- text: Frequent pattern mining
+  url: mllib-frequent-pattern-mining.html
+  subitems:
+

spark git commit: [SPARK-11912][ML] ml.feature.PCA minor refactor

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master fc4b792d2 -> d9cf9c21f


[SPARK-11912][ML] ml.feature.PCA minor refactor

Like [SPARK-11852](https://issues.apache.org/jira/browse/SPARK-11852), ```k``` 
is a param, so we should save it only under ```metadata/``` rather than under 
both ```data/``` and ```metadata/```. Refactor the constructor of 
```ml.feature.PCAModel``` to take only ```pc```, and construct 
```mllib.feature.PCAModel``` inside ```transform```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9897 from yanboliang/spark-11912.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9cf9c21
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9cf9c21
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9cf9c21

Branch: refs/heads/master
Commit: d9cf9c21fc6b1aa22e68d66760afd42c4e1c18b8
Parents: fc4b792
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Sun Nov 22 21:56:07 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:56:07 2015 -0800

--
 .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++
 .../org/apache/spark/ml/feature/PCASuite.scala  | 31 
 2 files changed, 24 insertions(+), 30 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d9cf9c21/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 32d7afe..aa88cb0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -73,7 +73,7 @@ class PCA (override val uid: String) extends 
Estimator[PCAModel] with PCAParams
 val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v}
 val pca = new feature.PCA(k = $(k))
 val pcaModel = pca.fit(input)
-copyValues(new PCAModel(uid, pcaModel).setParent(this))
+copyValues(new PCAModel(uid, pcaModel.pc).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -99,18 +99,17 @@ object PCA extends DefaultParamsReadable[PCA] {
 /**
  * :: Experimental ::
  * Model fitted by [[PCA]].
+ *
+ * @param pc A principal components Matrix. Each column is one principal 
component.
  */
 @Experimental
 class PCAModel private[ml] (
 override val uid: String,
-pcaModel: feature.PCAModel)
+val pc: DenseMatrix)
   extends Model[PCAModel] with PCAParams with MLWritable {
 
   import PCAModel._
 
-  /** a principal components Matrix. Each column is one principal component. */
-  val pc: DenseMatrix = pcaModel.pc
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -124,6 +123,7 @@ class PCAModel private[ml] (
*/
   override def transform(dataset: DataFrame): DataFrame = {
 transformSchema(dataset.schema, logging = true)
+val pcaModel = new feature.PCAModel($(k), pc)
 val pcaOp = udf { pcaModel.transform _ }
 dataset.withColumn($(outputCol), pcaOp(col($(inputCol
   }
@@ -139,7 +139,7 @@ class PCAModel private[ml] (
   }
 
   override def copy(extra: ParamMap): PCAModel = {
-val copied = new PCAModel(uid, pcaModel)
+val copied = new PCAModel(uid, pc)
 copyValues(copied, extra).setParent(parent)
   }
 
@@ -152,11 +152,11 @@ object PCAModel extends MLReadable[PCAModel] {
 
   private[PCAModel] class PCAModelWriter(instance: PCAModel) extends MLWriter {
 
-private case class Data(k: Int, pc: DenseMatrix)
+private case class Data(pc: DenseMatrix)
 
 override protected def saveImpl(path: String): Unit = {
   DefaultParamsWriter.saveMetadata(instance, path, sc)
-  val data = Data(instance.getK, instance.pc)
+  val data = Data(instance.pc)
   val dataPath = new Path(path, "data").toString
   
sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
 }
@@ -169,11 +169,10 @@ object PCAModel extends MLReadable[PCAModel] {
 override def load(path: String): PCAModel = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
   val dataPath = new Path(path, "data").toString
-  val Row(k: Int, pc: DenseMatrix) = sqlContext.read.parquet(dataPath)
-.select("k", "pc")
+  val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath)
+.select("pc")
 .head()
-  val oldModel = new feature.PCAModel(k, pc)
-  val model = new PCAModel(metadata.uid, oldModel)
+  val model = new PCAModel(metadata.uid, pc)
   DefaultParamsReader.getAndSetParams(model, metadata)
   model
 }

http://git-wip-u

spark git commit: [SPARK-11912][ML] ml.feature.PCA minor refactor

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 7f9d3358a -> d482dced3


[SPARK-11912][ML] ml.feature.PCA minor refactor

Like [SPARK-11852](https://issues.apache.org/jira/browse/SPARK-11852), ```k``` 
is a param, so we should save it only under ```metadata/``` rather than under 
both ```data/``` and ```metadata/```. Refactor the constructor of 
```ml.feature.PCAModel``` to take only ```pc```, and construct 
```mllib.feature.PCAModel``` inside ```transform```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9897 from yanboliang/spark-11912.

(cherry picked from commit d9cf9c21fc6b1aa22e68d66760afd42c4e1c18b8)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d482dced
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d482dced
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d482dced

Branch: refs/heads/branch-1.6
Commit: d482dced313d1d837508d3f449261419c8543c1d
Parents: 7f9d335
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Sun Nov 22 21:56:07 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:56:17 2015 -0800

--
 .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++
 .../org/apache/spark/ml/feature/PCASuite.scala  | 31 
 2 files changed, 24 insertions(+), 30 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d482dced/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 32d7afe..aa88cb0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -73,7 +73,7 @@ class PCA (override val uid: String) extends 
Estimator[PCAModel] with PCAParams
 val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v}
 val pca = new feature.PCA(k = $(k))
 val pcaModel = pca.fit(input)
-copyValues(new PCAModel(uid, pcaModel).setParent(this))
+copyValues(new PCAModel(uid, pcaModel.pc).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -99,18 +99,17 @@ object PCA extends DefaultParamsReadable[PCA] {
 /**
  * :: Experimental ::
  * Model fitted by [[PCA]].
+ *
+ * @param pc A principal components Matrix. Each column is one principal 
component.
  */
 @Experimental
 class PCAModel private[ml] (
 override val uid: String,
-pcaModel: feature.PCAModel)
+val pc: DenseMatrix)
   extends Model[PCAModel] with PCAParams with MLWritable {
 
   import PCAModel._
 
-  /** a principal components Matrix. Each column is one principal component. */
-  val pc: DenseMatrix = pcaModel.pc
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -124,6 +123,7 @@ class PCAModel private[ml] (
*/
   override def transform(dataset: DataFrame): DataFrame = {
 transformSchema(dataset.schema, logging = true)
+val pcaModel = new feature.PCAModel($(k), pc)
 val pcaOp = udf { pcaModel.transform _ }
 dataset.withColumn($(outputCol), pcaOp(col($(inputCol
   }
@@ -139,7 +139,7 @@ class PCAModel private[ml] (
   }
 
   override def copy(extra: ParamMap): PCAModel = {
-val copied = new PCAModel(uid, pcaModel)
+val copied = new PCAModel(uid, pc)
 copyValues(copied, extra).setParent(parent)
   }
 
@@ -152,11 +152,11 @@ object PCAModel extends MLReadable[PCAModel] {
 
   private[PCAModel] class PCAModelWriter(instance: PCAModel) extends MLWriter {
 
-private case class Data(k: Int, pc: DenseMatrix)
+private case class Data(pc: DenseMatrix)
 
 override protected def saveImpl(path: String): Unit = {
   DefaultParamsWriter.saveMetadata(instance, path, sc)
-  val data = Data(instance.getK, instance.pc)
+  val data = Data(instance.pc)
   val dataPath = new Path(path, "data").toString
   
sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
 }
@@ -169,11 +169,10 @@ object PCAModel extends MLReadable[PCAModel] {
 override def load(path: String): PCAModel = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
   val dataPath = new Path(path, "data").toString
-  val Row(k: Int, pc: DenseMatrix) = sqlContext.read.parquet(dataPath)
-.select("k", "pc")
+  val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath)
+.select("pc")
 .head()
-  val oldModel = new feature.PCAModel(k, pc)
-  val model = new PCAModel(metadata.uid, oldModel)
+  val m

spark git commit: [SPARK-6791][ML] Add read/write for CrossValidator and Evaluators

2015-11-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master fe89c1817 -> a6fda0bfc


[SPARK-6791][ML] Add read/write for CrossValidator and Evaluators

I believe this works for general estimators within CrossValidator, including 
compound estimators.  (See the complex unit test.)

Added read/write for all 3 Evaluators as well.

CC: mengxr yanboliang

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9848 from jkbradley/cv-io.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6fda0bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6fda0bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6fda0bf

Branch: refs/heads/master
Commit: a6fda0bfc16a13b28b1cecc96f1ff91363089144
Parents: fe89c18
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Sun Nov 22 21:48:48 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Nov 22 21:48:48 2015 -0800

--
 .../scala/org/apache/spark/ml/Pipeline.scala|  38 +--
 .../BinaryClassificationEvaluator.scala |  11 +-
 .../MulticlassClassificationEvaluator.scala |  12 +-
 .../ml/evaluation/RegressionEvaluator.scala |  11 +-
 .../apache/spark/ml/recommendation/ALS.scala|  14 +-
 .../apache/spark/ml/tuning/CrossValidator.scala | 229 ++-
 .../org/apache/spark/ml/util/ReadWrite.scala|  48 ++--
 .../org/apache/spark/ml/PipelineSuite.scala |   4 +-
 .../BinaryClassificationEvaluatorSuite.scala|  13 +-
 ...MulticlassClassificationEvaluatorSuite.scala |  13 +-
 .../evaluation/RegressionEvaluatorSuite.scala   |  12 +-
 .../spark/ml/tuning/CrossValidatorSuite.scala   | 202 +++-
 12 files changed, 522 insertions(+), 85 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a6fda0bf/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala 
b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index 6f15b37..4b2b3f8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -34,7 +34,6 @@ import org.apache.spark.ml.util.MLWriter
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.Utils
 
 /**
  * :: DeveloperApi ::
@@ -232,20 +231,9 @@ object Pipeline extends MLReadable[Pipeline] {
 stages: Array[PipelineStage],
 sc: SparkContext,
 path: String): Unit = {
-  // Copied and edited from DefaultParamsWriter.saveMetadata
-  // TODO: modify DefaultParamsWriter.saveMetadata to avoid duplication
-  val uid = instance.uid
-  val cls = instance.getClass.getName
   val stageUids = stages.map(_.uid)
   val jsonParams = List("stageUids" -> 
parse(compact(render(stageUids.toSeq
-  val metadata = ("class" -> cls) ~
-("timestamp" -> System.currentTimeMillis()) ~
-("sparkVersion" -> sc.version) ~
-("uid" -> uid) ~
-("paramMap" -> jsonParams)
-  val metadataPath = new Path(path, "metadata").toString
-  val metadataJson = compact(render(metadata))
-  sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
+  DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap = 
Some(jsonParams))
 
   // Save stages
   val stagesDir = new Path(path, "stages").toString
@@ -266,30 +254,10 @@ object Pipeline extends MLReadable[Pipeline] {
 
   implicit val format = DefaultFormats
   val stagesDir = new Path(path, "stages").toString
-  val stageUids: Array[String] = metadata.params match {
-case JObject(pairs) =>
-  if (pairs.length != 1) {
-// Should not happen unless file is corrupted or we have a bug.
-throw new RuntimeException(
-  s"Pipeline read expected 1 Param (stageUids), but found 
${pairs.length}.")
-  }
-  pairs.head match {
-case ("stageUids", jsonValue) =>
-  jsonValue.extract[Seq[String]].toArray
-case (paramName, jsonValue) =>
-  // Should not happen unless file is corrupted or we have a bug.
-  throw new RuntimeException(s"Pipeline read encountered 
unexpected Param $paramName" +
-s" in metadata: ${metadata.metadataStr}")
-  }
-case _ =>
-  throw new IllegalArgumentException(
-s"Cannot recognize JSON metadata: ${metadata.metadataStr}.")
-  }
+  val stageUids: Array

spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9ace2e5c8 -> e359d5dcf


[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

jira: https://issues.apache.org/jira/browse/SPARK-11689

Add a simple user guide for LDA under spark.ml and example code under examples/. 
Use include_example to include the example code in the user guide markdown. See 
SPARK-11606 for instructions.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9722 from hhbyyh/ldaMLExample.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e359d5dc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e359d5dc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e359d5dc

Branch: refs/heads/master
Commit: e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7
Parents: 9ace2e5
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Fri Nov 20 09:57:09 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 09:57:09 2015 -0800

--
 docs/ml-clustering.md   | 30 +++
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 +
 .../spark/examples/ml/JavaLDAExample.java   | 94 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 204 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
new file mode 100644
index 000..1743ef4
--- /dev/null
+++ b/docs/ml-clustering.md
@@ -0,0 +1,30 @@
+---
+layout: global
+title: Clustering - ML
+displayTitle: ML - Clustering
+---
+
+In this section, we introduce the pipeline API for [clustering in 
mllib](mllib-clustering.html).
+
+## Latent Dirichlet allocation (LDA)
+
+`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and 
`OnlineLDAOptimizer`,
+and generates a `LDAModel` as the base models. Expert users may cast a 
`LDAModel` generated by
+`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
+
+
+
+Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
+
+
+{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
+
+
+
+
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) 
for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
+
+
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index be18a05..6f35b30 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the 
`spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -950,4 +951,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 91e50cc..54e35fc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,6 +69,7 @@ We list major functionality from both below, with links to 
detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines 
API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/e359d5dc/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
new file mode 100644
index 000..b3a7d2e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.

spark git commit: [SPARK-11852][ML] StandardScaler minor refactor

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 eab90d3f3 -> b11aa1797


[SPARK-11852][ML] StandardScaler minor refactor

```withStd``` and ```withMean``` should be params of ```StandardScaler``` and 
```StandardScalerModel```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9839 from yanboliang/standardScaler-refactor.

(cherry picked from commit 9ace2e5c8d7fbd360a93bc5fc4eace64a697b44f)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b11aa179
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b11aa179
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b11aa179

Branch: refs/heads/branch-1.6
Commit: b11aa1797c928f2cfaf1d8821eff4be4109ac41d
Parents: eab90d3
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Fri Nov 20 09:55:53 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 09:56:02 2015 -0800

--
 .../spark/ml/feature/StandardScaler.scala   | 60 +---
 .../spark/ml/feature/StandardScalerSuite.scala  | 11 ++--
 2 files changed, 32 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b11aa179/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 6d54521..d76a9c6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -36,20 +36,30 @@ import org.apache.spark.sql.types.{StructField, StructType}
 private[feature] trait StandardScalerParams extends Params with HasInputCol 
with HasOutputCol {
 
   /**
-   * Centers the data with mean before scaling.
+   * Whether to center the data with mean before scaling.
* It will build a dense output, so this does not work on sparse input
* and will raise an exception.
* Default: false
* @group param
*/
-  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data 
with mean")
+  val withMean: BooleanParam = new BooleanParam(this, "withMean",
+"Whether to center data with mean")
+
+  /** @group getParam */
+  def getWithMean: Boolean = $(withMean)
 
   /**
-   * Scales the data to unit standard deviation.
+   * Whether to scale the data to unit standard deviation.
* Default: true
* @group param
*/
-  val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit 
standard deviation")
+  val withStd: BooleanParam = new BooleanParam(this, "withStd",
+"Whether to scale the data to unit standard deviation")
+
+  /** @group getParam */
+  def getWithStd: Boolean = $(withStd)
+
+  setDefault(withMean -> false, withStd -> true)
 }
 
 /**
@@ -63,8 +73,6 @@ class StandardScaler(override val uid: String) extends 
Estimator[StandardScalerM
 
   def this() = this(Identifiable.randomUID("stdScal"))
 
-  setDefault(withMean -> false, withStd -> true)
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -82,7 +90,7 @@ class StandardScaler(override val uid: String) extends 
Estimator[StandardScalerM
 val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
 val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = 
$(withStd))
 val scalerModel = scaler.fit(input)
-copyValues(new StandardScalerModel(uid, scalerModel).setParent(this))
+copyValues(new StandardScalerModel(uid, scalerModel.std, 
scalerModel.mean).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -108,29 +116,19 @@ object StandardScaler extends 
DefaultParamsReadable[StandardScaler] {
 /**
  * :: Experimental ::
  * Model fitted by [[StandardScaler]].
+ *
+ * @param std Standard deviation of the StandardScalerModel
+ * @param mean Mean of the StandardScalerModel
  */
 @Experimental
 class StandardScalerModel private[ml] (
 override val uid: String,
-scaler: feature.StandardScalerModel)
+val std: Vector,
+val mean: Vector)
   extends Model[StandardScalerModel] with StandardScalerParams with MLWritable 
{
 
   import StandardScalerModel._
 
-  /** Standard deviation of the StandardScalerModel */
-  val std: Vector = scaler.std
-
-  /** Mean of the StandardScalerModel */
-  val mean: Vector = scaler.mean
-
-  /** Whether to scale to unit standard deviation. */
-  @Since("1.6.0")
-  def getWithStd: Boolean = scaler.withStd
-
-  /** Whether to center da

spark git commit: [SPARK-11852][ML] StandardScaler minor refactor

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master a66142dec -> 9ace2e5c8


[SPARK-11852][ML] StandardScaler minor refactor

```withStd``` and ```withMean``` should be params of ```StandardScaler``` and 
```StandardScalerModel```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9839 from yanboliang/standardScaler-refactor.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ace2e5c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ace2e5c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ace2e5c

Branch: refs/heads/master
Commit: 9ace2e5c8d7fbd360a93bc5fc4eace64a697b44f
Parents: a66142d
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Fri Nov 20 09:55:53 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 09:55:53 2015 -0800

--
 .../spark/ml/feature/StandardScaler.scala   | 60 +---
 .../spark/ml/feature/StandardScalerSuite.scala  | 11 ++--
 2 files changed, 32 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9ace2e5c/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 6d54521..d76a9c6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -36,20 +36,30 @@ import org.apache.spark.sql.types.{StructField, StructType}
 private[feature] trait StandardScalerParams extends Params with HasInputCol 
with HasOutputCol {
 
   /**
-   * Centers the data with mean before scaling.
+   * Whether to center the data with mean before scaling.
* It will build a dense output, so this does not work on sparse input
* and will raise an exception.
* Default: false
* @group param
*/
-  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data 
with mean")
+  val withMean: BooleanParam = new BooleanParam(this, "withMean",
+"Whether to center data with mean")
+
+  /** @group getParam */
+  def getWithMean: Boolean = $(withMean)
 
   /**
-   * Scales the data to unit standard deviation.
+   * Whether to scale the data to unit standard deviation.
* Default: true
* @group param
*/
-  val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit 
standard deviation")
+  val withStd: BooleanParam = new BooleanParam(this, "withStd",
+"Whether to scale the data to unit standard deviation")
+
+  /** @group getParam */
+  def getWithStd: Boolean = $(withStd)
+
+  setDefault(withMean -> false, withStd -> true)
 }
 
 /**
@@ -63,8 +73,6 @@ class StandardScaler(override val uid: String) extends 
Estimator[StandardScalerM
 
   def this() = this(Identifiable.randomUID("stdScal"))
 
-  setDefault(withMean -> false, withStd -> true)
-
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -82,7 +90,7 @@ class StandardScaler(override val uid: String) extends 
Estimator[StandardScalerM
 val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
 val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = 
$(withStd))
 val scalerModel = scaler.fit(input)
-copyValues(new StandardScalerModel(uid, scalerModel).setParent(this))
+copyValues(new StandardScalerModel(uid, scalerModel.std, 
scalerModel.mean).setParent(this))
   }
 
   override def transformSchema(schema: StructType): StructType = {
@@ -108,29 +116,19 @@ object StandardScaler extends 
DefaultParamsReadable[StandardScaler] {
 /**
  * :: Experimental ::
  * Model fitted by [[StandardScaler]].
+ *
+ * @param std Standard deviation of the StandardScalerModel
+ * @param mean Mean of the StandardScalerModel
  */
 @Experimental
 class StandardScalerModel private[ml] (
 override val uid: String,
-scaler: feature.StandardScalerModel)
+val std: Vector,
+val mean: Vector)
   extends Model[StandardScalerModel] with StandardScalerParams with MLWritable 
{
 
   import StandardScalerModel._
 
-  /** Standard deviation of the StandardScalerModel */
-  val std: Vector = scaler.std
-
-  /** Mean of the StandardScalerModel */
-  val mean: Vector = scaler.mean
-
-  /** Whether to scale to unit standard deviation. */
-  @Since("1.6.0")
-  def getWithStd: Boolean = scaler.withStd
-
-  /** Whether to center data with mean. */
-  @Since("1.6.0")
-  def getWithMean: Boolean = scaler.withMean
-
   /** @group setParam */
   def setI

spark git commit: [SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 b11aa1797 -> 92d3563fd


[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml

jira: https://issues.apache.org/jira/browse/SPARK-11689

Add a simple user guide for LDA under spark.ml and example code under examples/. 
Use include_example to include the example code in the user guide markdown. See 
SPARK-11606 for instructions.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9722 from hhbyyh/ldaMLExample.

(cherry picked from commit e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92d3563f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92d3563f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92d3563f

Branch: refs/heads/branch-1.6
Commit: 92d3563fd0cf0c3f4fe037b404d172125b24cf2f
Parents: b11aa17
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Fri Nov 20 09:57:09 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 09:57:24 2015 -0800

--
 docs/ml-clustering.md   | 30 +++
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 +
 .../spark/examples/ml/JavaLDAExample.java   | 94 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 204 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
new file mode 100644
index 000..1743ef4
--- /dev/null
+++ b/docs/ml-clustering.md
@@ -0,0 +1,30 @@
+---
+layout: global
+title: Clustering - ML
+displayTitle: ML - Clustering
+---
+
+In this section, we introduce the pipeline API for [clustering in 
mllib](mllib-clustering.html).
+
+## Latent Dirichlet allocation (LDA)
+
+`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and 
`OnlineLDAOptimizer`,
+and generates a `LDAModel` as the base models. Expert users may cast a 
`LDAModel` generated by
+`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
+
+
+
+Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
+
+
+{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
+
+
+
+
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) 
for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
+
+
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index be18a05..6f35b30 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -950,4 +951,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 91e50cc..54e35fc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
+* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/92d3563f/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
new file mode 100644
index 000..b3a7d2e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLD

spark git commit: Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 47815878a -> a2dce22e0


Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"

This reverts commit e359d5dcf5bd300213054ebeae9fe75c4f7eb9e7.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2dce22e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2dce22e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2dce22e

Branch: refs/heads/master
Commit: a2dce22e0a25922e2052318d32f32877b7c27ec2
Parents: 4781587
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Nov 20 16:51:47 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 16:51:47 2015 -0800

--
 docs/ml-clustering.md   | 30 ---
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 -
 .../spark/examples/ml/JavaLDAExample.java   | 94 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 1 insertion(+), 204 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
deleted file mode 100644
index 1743ef4..000
--- a/docs/ml-clustering.md
+++ /dev/null
@@ -1,30 +0,0 @@
----
-layout: global
-title: Clustering - ML
-displayTitle: ML - Clustering
----
-
-In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html).
-
-## Latent Dirichlet allocation (LDA)
-
-`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`,
-and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by
-`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
-
-
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
-
-
-{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
-
-
-
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
-
-
-
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 6f35b30..be18a05 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,7 +40,6 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
-* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -951,4 +950,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
\ No newline at end of file
+

http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 54e35fc..91e50cc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,7 +69,6 @@ We list major functionality from both below, with links to detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
-* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/a2dce22e/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
deleted file mode 100644
index b3a7d2e..000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not

spark git commit: Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 285e4017a -> 33d856df5


Revert "[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml"

This reverts commit 92d3563fd0cf0c3f4fe037b404d172125b24cf2f.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33d856df
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33d856df
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33d856df

Branch: refs/heads/branch-1.6
Commit: 33d856df53689d7fd515a21ec4f34d1d5c74a958
Parents: 285e401
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Nov 20 16:52:20 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 16:52:20 2015 -0800

--
 docs/ml-clustering.md   | 30 ---
 docs/ml-guide.md|  3 +-
 docs/mllib-guide.md |  1 -
 .../spark/examples/ml/JavaLDAExample.java   | 94 
 .../apache/spark/examples/ml/LDAExample.scala   | 77 
 5 files changed, 1 insertion(+), 204 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/ml-clustering.md
--
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
deleted file mode 100644
index 1743ef4..000
--- a/docs/ml-clustering.md
+++ /dev/null
@@ -1,30 +0,0 @@
----
-layout: global
-title: Clustering - ML
-displayTitle: ML - Clustering
----
-
-In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html).
-
-## Latent Dirichlet allocation (LDA)
-
-`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`,
-and generates a `LDAModel` as the base models. Expert users may cast a `LDAModel` generated by
-`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
-
-
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
-
-
-{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
-
-
-
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
-
-
-
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 6f35b30..be18a05 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -40,7 +40,6 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g.,
 provide class probabilities, and linear models provide model summaries.
 
 * [Feature extraction, transformation, and selection](ml-features.html)
-* [Clustering](ml-clustering.html)
 * [Decision Trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)
@@ -951,4 +950,4 @@ model.transform(test)
 {% endhighlight %}
 
 
-
\ No newline at end of file
+

http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/docs/mllib-guide.md
--
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 54e35fc..91e50cc 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -69,7 +69,6 @@ We list major functionality from both below, with links to detailed guides.
 concepts. It also contains sections on using algorithms within the Pipelines API, for example:
 
 * [Feature extraction, transformation, and selection](ml-features.html)
-* [Clustering](ml-clustering.html)
 * [Decision trees for classification and regression](ml-decision-tree.html)
 * [Ensembles](ml-ensembles.html)
 * [Linear methods with elastic net regularization](ml-linear-methods.html)

http://git-wip-us.apache.org/repos/asf/spark/blob/33d856df/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
--
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
deleted file mode 100644
index b3a7d2e..000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not

[2/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example

2015-11-20 Thread meng

[SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example

Author: Vikas Nelamangala <vikasnelamangala@Vikass-MacBook-Pro.local>

Closes #9689 from vikasnp/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ed47b1e6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ed47b1e6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ed47b1e6

Branch: refs/heads/master
Commit: ed47b1e660b830e2d4fac8d6df93f634b260393c
Parents: 4b84c72
Author: Vikas Nelamangala <vikasnelamangala@Vikass-MacBook-Pro.local>
Authored: Fri Nov 20 15:18:41 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Nov 20 15:18:41 2015 -0800

--
 docs/mllib-evaluation-metrics.md| 940 +--
 .../JavaBinaryClassificationMetricsExample.java | 113 +++
 ...aMultiLabelClassificationMetricsExample.java |  80 ++
 ...aMulticlassClassificationMetricsExample.java |  97 ++
 .../mllib/JavaRankingMetricsExample.java| 176 
 .../mllib/JavaRegressionMetricsExample.java |  91 ++
 .../binary_classification_metrics_example.py|  55 ++
 .../python/mllib/multi_class_metrics_example.py |  69 ++
 .../python/mllib/multi_label_metrics_example.py |  61 ++
 .../python/mllib/ranking_metrics_example.py |  55 ++
 .../python/mllib/regression_metrics_example.py  |  59 ++
 .../BinaryClassificationMetricsExample.scala| 103 ++
 .../mllib/MultiLabelMetricsExample.scala|  69 ++
 .../mllib/MulticlassMetricsExample.scala|  99 ++
 .../examples/mllib/RankingMetricsExample.scala  | 110 +++
 .../mllib/RegressionMetricsExample.scala|  67 ++
 16 files changed, 1319 insertions(+), 925 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/docs/mllib-evaluation-metrics.md
--
diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md
index f73eff6..6924037 100644
--- a/docs/mllib-evaluation-metrics.md
+++ b/docs/mllib-evaluation-metrics.md
@@ -104,214 +104,21 @@ data, and evaluate the performance of the algorithm by several binary evaluation
 
 Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
-import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-
-// Load training data in LIBSVM format
-val data = MLUtils.loadLibSVMFile(sc, 
"data/mllib/sample_binary_classification_data.txt")
-
-// Split data into training (60%) and test (40%)
-val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
-training.cache()
-
-// Run training algorithm to build the model
-val model = new LogisticRegressionWithLBFGS()
-  .setNumClasses(2)
-  .run(training)
-
-// Clear the prediction threshold so the model will return probabilities
-model.clearThreshold
-
-// Compute raw scores on the test set
-val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
-  val prediction = model.predict(features)
-  (prediction, label)
-}
-
-// Instantiate metrics object
-val metrics = new BinaryClassificationMetrics(predictionAndLabels)
-
-// Precision by threshold
-val precision = metrics.precisionByThreshold
-precision.foreach { case (t, p) =>
-println(s"Threshold: $t, Precision: $p")
-}
-
-// Recall by threshold
-val recall = metrics.recallByThreshold
-recall.foreach { case (t, r) =>
-println(s"Threshold: $t, Recall: $r")
-}
-
-// Precision-Recall Curve
-val PRC = metrics.pr
-
-// F-measure
-val f1Score = metrics.fMeasureByThreshold
-f1Score.foreach { case (t, f) =>
-println(s"Threshold: $t, F-score: $f, Beta = 1")
-}
-
-val beta = 0.5
-val fScore = metrics.fMeasureByThreshold(beta)
-f1Score.foreach { case (t, f) =>
-println(s"Threshold: $t, F-score: $f, Beta = 0.5")
-}
-
-// AUPRC
-val auPRC = metrics.areaUnderPR
-println("Area under precision-recall curve = " + auPRC)
-
-// Compute thresholds used in ROC and PR curves
-val thresholds = precision.map(_._1)
-
-// ROC Curve
-val roc = metrics.roc
-
-// AUROC
-val auROC = metrics.areaUnderROC
-println("Area under ROC = " + auROC)
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala %}
 
 
 
 
 Refer to the [`LogisticR

[1/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 4b84c72df -> ed47b1e66


http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
new file mode 100644
index 000..4503c15
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.evaluation.MultilabelMetrics
+import org.apache.spark.rdd.RDD
+// $example off$
+import org.apache.spark.{SparkContext, SparkConf}
+
+object MultiLabelMetricsExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("MultiLabelMetricsExample")
+val sc = new SparkContext(conf)
+// $example on$
+val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize(
+  Seq((Array(0.0, 1.0), Array(0.0, 2.0)),
+(Array(0.0, 2.0), Array(0.0, 1.0)),
+(Array.empty[Double], Array(0.0)),
+(Array(2.0), Array(2.0)),
+(Array(2.0, 0.0), Array(2.0, 0.0)),
+(Array(0.0, 1.0, 2.0), Array(0.0, 1.0)),
+(Array(1.0), Array(1.0, 2.0))), 2)
+
+// Instantiate metrics object
+val metrics = new MultilabelMetrics(scoreAndLabels)
+
+// Summary stats
+println(s"Recall = ${metrics.recall}")
+println(s"Precision = ${metrics.precision}")
+println(s"F1 measure = ${metrics.f1Measure}")
+println(s"Accuracy = ${metrics.accuracy}")
+
+// Individual label stats
+metrics.labels.foreach(label =>
+  println(s"Class $label precision = ${metrics.precision(label)}"))
+metrics.labels.foreach(label => println(s"Class $label recall = 
${metrics.recall(label)}"))
+metrics.labels.foreach(label => println(s"Class $label F1-score = 
${metrics.f1Measure(label)}"))
+
+// Micro stats
+println(s"Micro recall = ${metrics.microRecall}")
+println(s"Micro precision = ${metrics.microPrecision}")
+println(s"Micro F1 measure = ${metrics.microF1Measure}")
+
+// Hamming loss
+println(s"Hamming loss = ${metrics.hammingLoss}")
+
+// Subset accuracy
+println(s"Subset accuracy = ${metrics.subsetAccuracy}")
+// $example off$
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/ed47b1e6/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
new file mode 100644
index 000..0904449
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import

[1/2] spark git commit: [SPARK-11549][DOCS] Replace example code in mllib-evaluation-metrics.md using include_example

2015-11-20 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 0665fb5ea -> 1dde97176


http://git-wip-us.apache.org/repos/asf/spark/blob/1dde9717/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
new file mode 100644
index 000..4503c15
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.evaluation.MultilabelMetrics
+import org.apache.spark.rdd.RDD
+// $example off$
+import org.apache.spark.{SparkContext, SparkConf}
+
+object MultiLabelMetricsExample {
+  def main(args: Array[String]): Unit = {
+val conf = new SparkConf().setAppName("MultiLabelMetricsExample")
+val sc = new SparkContext(conf)
+// $example on$
+val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize(
+  Seq((Array(0.0, 1.0), Array(0.0, 2.0)),
+(Array(0.0, 2.0), Array(0.0, 1.0)),
+(Array.empty[Double], Array(0.0)),
+(Array(2.0), Array(2.0)),
+(Array(2.0, 0.0), Array(2.0, 0.0)),
+(Array(0.0, 1.0, 2.0), Array(0.0, 1.0)),
+(Array(1.0), Array(1.0, 2.0))), 2)
+
+// Instantiate metrics object
+val metrics = new MultilabelMetrics(scoreAndLabels)
+
+// Summary stats
+println(s"Recall = ${metrics.recall}")
+println(s"Precision = ${metrics.precision}")
+println(s"F1 measure = ${metrics.f1Measure}")
+println(s"Accuracy = ${metrics.accuracy}")
+
+// Individual label stats
+metrics.labels.foreach(label =>
+  println(s"Class $label precision = ${metrics.precision(label)}"))
+metrics.labels.foreach(label => println(s"Class $label recall = 
${metrics.recall(label)}"))
+metrics.labels.foreach(label => println(s"Class $label F1-score = 
${metrics.f1Measure(label)}"))
+
+// Micro stats
+println(s"Micro recall = ${metrics.microRecall}")
+println(s"Micro precision = ${metrics.microPrecision}")
+println(s"Micro F1 measure = ${metrics.microF1Measure}")
+
+// Hamming loss
+println(s"Hamming loss = ${metrics.hammingLoss}")
+
+// Subset accuracy
+println(s"Subset accuracy = ${metrics.subsetAccuracy}")
+// $example off$
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/1dde9717/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
new file mode 100644
index 000..0904449
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import

spark git commit: [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 7ee7d5a3c -> 4114ce20f


[SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression

https://issues.apache.org/jira/browse/SPARK-11846

mengxr

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9836 from yinxusen/SPARK-11846.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4114ce20
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4114ce20
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4114ce20

Branch: refs/heads/master
Commit: 4114ce20fbe820f111e55e891ae3889b0e6e0006
Parents: 7ee7d5a
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Thu Nov 19 22:01:02 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:01:02 2015 -0800

--
 .../ml/regression/AFTSurvivalRegression.scala   | 78 +++---
 .../ml/regression/IsotonicRegression.scala  | 83 ++--
 .../regression/AFTSurvivalRegressionSuite.scala | 37 -
 .../ml/regression/IsotonicRegressionSuite.scala | 34 +++-
 4 files changed, 210 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4114ce20/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index b7d0958..aedfb48 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -21,20 +21,20 @@ import scala.collection.mutable
 
 import breeze.linalg.{DenseVector => BDV}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS}
+import org.apache.hadoop.fs.Path
 
-import org.apache.spark.{SparkException, Logging}
-import org.apache.spark.annotation.{Since, Experimental}
-import org.apache.spark.ml.{Model, Estimator}
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
-import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT}
-import org.apache.spark.mllib.linalg.BLAS
+import org.apache.spark.ml.util._
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, DataFrame}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel
+import org.apache.spark.{Logging, SparkException}
 
 /**
  * Params for accelerated failure time (AFT) regression.
@@ -120,7 +120,8 @@ private[regression] trait AFTSurvivalRegressionParams 
extends Params
 @Experimental
 @Since("1.6.0")
 class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: 
String)
-  extends Estimator[AFTSurvivalRegressionModel] with 
AFTSurvivalRegressionParams with Logging {
+  extends Estimator[AFTSurvivalRegressionModel] with 
AFTSurvivalRegressionParams
+  with DefaultParamsWritable with Logging {
 
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("aftSurvReg"))
@@ -243,6 +244,13 @@ class AFTSurvivalRegression @Since("1.6.0") 
(@Since("1.6.0") override val uid: S
   override def copy(extra: ParamMap): AFTSurvivalRegression = 
defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object AFTSurvivalRegression extends 
DefaultParamsReadable[AFTSurvivalRegression] {
+
+  @Since("1.6.0")
+  override def load(path: String): AFTSurvivalRegression = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model produced by [[AFTSurvivalRegression]].
@@ -254,7 +262,7 @@ class AFTSurvivalRegressionModel private[ml] (
 @Since("1.6.0") val coefficients: Vector,
 @Since("1.6.0") val intercept: Double,
 @Since("1.6.0") val scale: Double)
-  extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams {
+  extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams 
with MLWritable {
 
   /** @group setParam */
   @Since("1.6.0")
@@ -312,6 +320,58 @@ class AFTSurvivalRegressionModel private[ml] (
 copyValues(new AFTSurvivalRegressionModel(uid, coefficients, intercept, 
scale), extra)
   .setParent(parent)
   }
+
+  @Since("1.6.0")
+  override def write: MLWriter =
+new AFTSurvivalRegressionModel.AFTSurvivalRegressio

spark git commit: [SPARK-11829][ML] Add read/write to estimators under ml.feature (II)

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 4114ce20f -> 3b7f056da


[SPARK-11829][ML] Add read/write to estimators under ml.feature (II)

Add read/write support to the following estimators under spark.ml:
* ChiSqSelector
* PCA
* VectorIndexer
* Word2Vec

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9838 from yanboliang/spark-11829.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b7f056d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b7f056d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b7f056d

Branch: refs/heads/master
Commit: 3b7f056da87a23f3a96f0311b3a947a9b698f38b
Parents: 4114ce2
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Nov 19 22:02:17 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:02:17 2015 -0800

--
 .../apache/spark/ml/feature/ChiSqSelector.scala | 65 +--
 .../scala/org/apache/spark/ml/feature/PCA.scala | 67 ++--
 .../apache/spark/ml/feature/VectorIndexer.scala | 66 +--
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 67 ++--
 .../apache/spark/mllib/feature/Word2Vec.scala   |  6 +-
 .../spark/ml/feature/ChiSqSelectorSuite.scala   | 22 ++-
 .../org/apache/spark/ml/feature/PCASuite.scala  | 26 +++-
 .../spark/ml/feature/VectorIndexerSuite.scala   | 22 ++-
 .../apache/spark/ml/feature/Word2VecSuite.scala | 30 -
 9 files changed, 338 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3b7f056d/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 5e4061f..dfec038 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute.{AttributeGroup, _}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -60,7 +61,7 @@ private[feature] trait ChiSqSelectorParams extends Params
  */
 @Experimental
 final class ChiSqSelector(override val uid: String)
-  extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams {
+  extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams with 
DefaultParamsWritable {
 
   def this() = this(Identifiable.randomUID("chiSqSelector"))
 
@@ -95,6 +96,13 @@ final class ChiSqSelector(override val uid: String)
   override def copy(extra: ParamMap): ChiSqSelector = defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object ChiSqSelector extends DefaultParamsReadable[ChiSqSelector] {
+
+  @Since("1.6.0")
+  override def load(path: String): ChiSqSelector = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model fitted by [[ChiSqSelector]].
@@ -103,7 +111,12 @@ final class ChiSqSelector(override val uid: String)
 final class ChiSqSelectorModel private[ml] (
 override val uid: String,
 private val chiSqSelector: feature.ChiSqSelectorModel)
-  extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
+  extends Model[ChiSqSelectorModel] with ChiSqSelectorParams with MLWritable {
+
+  import ChiSqSelectorModel._
+
+  /** list of indices to select (filter). Must be ordered asc */
+  val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures
 
   /** @group setParam */
   def setFeaturesCol(value: String): this.type = set(featuresCol, value)
@@ -147,4 +160,46 @@ final class ChiSqSelectorModel private[ml] (
 val copied = new ChiSqSelectorModel(uid, chiSqSelector)
 copyValues(copied, extra).setParent(parent)
   }
+
+  @Since("1.6.0")
+  override def write: MLWriter = new ChiSqSelectorModelWriter(this)
+}
+
+@Since("1.6.0")
+object ChiSqSelectorModel extends MLReadable[ChiSqSelectorModel] {
+
+  private[ChiSqSelectorModel]
+  class ChiSqSelectorModelWriter(instance: ChiSqSelectorModel) extends 
MLWriter {
+
+private case class Data(selectedFeatures: Seq[Int])
+
+override protected def saveImpl(path: String): Unit = {
+  DefaultParamsWriter.saveMetadata(instance, path,

spark git commit: [SPARK-11829][ML] Add read/write to estimators under ml.feature (II)

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 4774897f9 -> d7b3d5785


[SPARK-11829][ML] Add read/write to estimators under ml.feature (II)

Add read/write support to the following estimators under spark.ml:
* ChiSqSelector
* PCA
* VectorIndexer
* Word2Vec

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9838 from yanboliang/spark-11829.

(cherry picked from commit 3b7f056da87a23f3a96f0311b3a947a9b698f38b)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7b3d578
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7b3d578
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7b3d578

Branch: refs/heads/branch-1.6
Commit: d7b3d578555d6fabfacd80da97b88aae56f81f1b
Parents: 4774897
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Nov 19 22:02:17 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:02:25 2015 -0800

--
 .../apache/spark/ml/feature/ChiSqSelector.scala | 65 +--
 .../scala/org/apache/spark/ml/feature/PCA.scala | 67 ++--
 .../apache/spark/ml/feature/VectorIndexer.scala | 66 +--
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 67 ++--
 .../apache/spark/mllib/feature/Word2Vec.scala   |  6 +-
 .../spark/ml/feature/ChiSqSelectorSuite.scala   | 22 ++-
 .../org/apache/spark/ml/feature/PCASuite.scala  | 26 +++-
 .../spark/ml/feature/VectorIndexerSuite.scala   | 22 ++-
 .../apache/spark/ml/feature/Word2VecSuite.scala | 30 -
 9 files changed, 338 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d7b3d578/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 5e4061f..dfec038 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute.{AttributeGroup, _}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -60,7 +61,7 @@ private[feature] trait ChiSqSelectorParams extends Params
  */
 @Experimental
 final class ChiSqSelector(override val uid: String)
-  extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams {
+  extends Estimator[ChiSqSelectorModel] with ChiSqSelectorParams with 
DefaultParamsWritable {
 
   def this() = this(Identifiable.randomUID("chiSqSelector"))
 
@@ -95,6 +96,13 @@ final class ChiSqSelector(override val uid: String)
   override def copy(extra: ParamMap): ChiSqSelector = defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object ChiSqSelector extends DefaultParamsReadable[ChiSqSelector] {
+
+  @Since("1.6.0")
+  override def load(path: String): ChiSqSelector = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model fitted by [[ChiSqSelector]].
@@ -103,7 +111,12 @@ final class ChiSqSelector(override val uid: String)
 final class ChiSqSelectorModel private[ml] (
 override val uid: String,
 private val chiSqSelector: feature.ChiSqSelectorModel)
-  extends Model[ChiSqSelectorModel] with ChiSqSelectorParams {
+  extends Model[ChiSqSelectorModel] with ChiSqSelectorParams with MLWritable {
+
+  import ChiSqSelectorModel._
+
+  /** list of indices to select (filter). Must be ordered asc */
+  val selectedFeatures: Array[Int] = chiSqSelector.selectedFeatures
 
   /** @group setParam */
   def setFeaturesCol(value: String): this.type = set(featuresCol, value)
@@ -147,4 +160,46 @@ final class ChiSqSelectorModel private[ml] (
 val copied = new ChiSqSelectorModel(uid, chiSqSelector)
 copyValues(copied, extra).setParent(parent)
   }
+
+  @Since("1.6.0")
+  override def write: MLWriter = new ChiSqSelectorModelWriter(this)
+}
+
+@Since("1.6.0")
+object ChiSqSelectorModel extends MLReadable[ChiSqSelectorModel] {
+
+  private[ChiSqSelectorModel]
+  class ChiSqSelectorModelWriter(instance: ChiSqSelectorModel) extends 
MLWriter {
+
+private case class Data(selectedF

spark git commit: [SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 a936fa5c5 -> 4774897f9


[SPARK-11846] Add save/load for AFTSurvivalRegression and IsotonicRegression

https://issues.apache.org/jira/browse/SPARK-11846

mengxr

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9836 from yinxusen/SPARK-11846.

(cherry picked from commit 4114ce20fbe820f111e55e891ae3889b0e6e0006)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4774897f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4774897f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4774897f

Branch: refs/heads/branch-1.6
Commit: 4774897f9166c5674029ad97cdd6ea5902bcf17c
Parents: a936fa5
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Thu Nov 19 22:01:02 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:01:15 2015 -0800

--
 .../ml/regression/AFTSurvivalRegression.scala   | 78 +++---
 .../ml/regression/IsotonicRegression.scala  | 83 ++--
 .../regression/AFTSurvivalRegressionSuite.scala | 37 -
 .../ml/regression/IsotonicRegressionSuite.scala | 34 +++-
 4 files changed, 210 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4774897f/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index b7d0958..aedfb48 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -21,20 +21,20 @@ import scala.collection.mutable
 
 import breeze.linalg.{DenseVector => BDV}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS}
+import org.apache.hadoop.fs.Path
 
-import org.apache.spark.{SparkException, Logging}
-import org.apache.spark.annotation.{Since, Experimental}
-import org.apache.spark.ml.{Model, Estimator}
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
-import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT}
-import org.apache.spark.mllib.linalg.BLAS
+import org.apache.spark.ml.util._
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, DataFrame}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel
+import org.apache.spark.{Logging, SparkException}
 
 /**
  * Params for accelerated failure time (AFT) regression.
@@ -120,7 +120,8 @@ private[regression] trait AFTSurvivalRegressionParams 
extends Params
 @Experimental
 @Since("1.6.0")
 class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: 
String)
-  extends Estimator[AFTSurvivalRegressionModel] with 
AFTSurvivalRegressionParams with Logging {
+  extends Estimator[AFTSurvivalRegressionModel] with 
AFTSurvivalRegressionParams
+  with DefaultParamsWritable with Logging {
 
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("aftSurvReg"))
@@ -243,6 +244,13 @@ class AFTSurvivalRegression @Since("1.6.0") 
(@Since("1.6.0") override val uid: S
   override def copy(extra: ParamMap): AFTSurvivalRegression = 
defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object AFTSurvivalRegression extends 
DefaultParamsReadable[AFTSurvivalRegression] {
+
+  @Since("1.6.0")
+  override def load(path: String): AFTSurvivalRegression = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model produced by [[AFTSurvivalRegression]].
@@ -254,7 +262,7 @@ class AFTSurvivalRegressionModel private[ml] (
 @Since("1.6.0") val coefficients: Vector,
 @Since("1.6.0") val intercept: Double,
 @Since("1.6.0") val scale: Double)
-  extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams {
+  extends Model[AFTSurvivalRegressionModel] with AFTSurvivalRegressionParams 
with MLWritable {
 
   /** @group setParam */
   @Since("1.6.0")
@@ -312,6 +320,58 @@ class AFTSurvivalRegressionModel private[ml] (
 copyValues(new AFTSurvivalRegressionModel(uid, coefficients, intercept, 
scale), extra)
   .setParent

spark git commit: [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 d7b3d5785 -> 0a878ad0e


[SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval

* Update doc for PySpark ```HasCheckpointInterval``` so that users can understand 
how to disable checkpointing.
* Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to 
note the relationship between ```cacheNodeIds``` and ```checkpointInterval```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9856 from yanboliang/spark-11875.

(cherry picked from commit 7216f405454f6f3557b5b1f72df8f393605faf60)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a878ad0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a878ad0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a878ad0

Branch: refs/heads/branch-1.6
Commit: 0a878ad0e422cdf00c4beedb5bea01ebba135347
Parents: d7b3d57
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Nov 19 22:14:01 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:14:26 2015 -0800

--
 python/pyspark/ml/param/_shared_params_code_gen.py |  6 --
 python/pyspark/ml/param/shared.py  | 14 +++---
 2 files changed, 11 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/_shared_params_code_gen.py
--
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 070c5db..0528dc1 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -118,7 +118,8 @@ if __name__ == "__main__":
 ("inputCols", "input column names.", None),
 ("outputCol", "output column name.", "self.uid + '__output'"),
 ("numFeatures", "number of features.", None),
-("checkpointInterval", "checkpoint interval (>= 1).", None),
+("checkpointInterval", "set checkpoint interval (>= 1) or disable 
checkpoint (-1). " +
+ "E.g. 10 means that the cache will get checkpointed every 10 
iterations.", None),
 ("seed", "random seed.", "hash(type(self).__name__)"),
 ("tol", "the convergence tolerance for iterative algorithms.", None),
 ("stepSize", "Step size to be used for each iteration of 
optimization.", None),
@@ -157,7 +158,8 @@ if __name__ == "__main__":
 ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation."),
 ("cacheNodeIds", "If false, the algorithm will pass trees to executors 
to match " +
  "instances with nodes. If true, the algorithm will cache node IDs for 
each instance. " +
- "Caching can speed up training of deeper trees.")]
+ "Caching can speed up training of deeper trees. Users can set how 
often should the " +
+ "cache be checkpointed or disable it by setting checkpointInterval.")]
 
 decisionTreeCode = '''class DecisionTreeParams(Params):
 """

http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/shared.py
--
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index 4bdf2a8..4d96080 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -325,16 +325,16 @@ class HasNumFeatures(Params):
 
 class HasCheckpointInterval(Params):
 """
-Mixin for param checkpointInterval: checkpoint interval (>= 1).
+Mixin for param checkpointInterval: set checkpoint interval (>= 1) or 
disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed 
every 10 iterations.
 """
 
 # a placeholder to make it appear in the generated doc
-checkpointInterval = Param(Params._dummy(), "checkpointInterval", 
"checkpoint interval (>= 1).")
+checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.")
 
 def __init__(self):
 super(HasCheckpointInterval, self).__init__()
-#: param for checkpoint interval (>= 1).
-self.checkpointInterval = Param(self,

spark git commit: [SPARK-11867] Add save/load for kmeans and naive bayes

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 0fff8eb3e -> 3e1d120ce


[SPARK-11867] Add save/load for kmeans and naive bayes

https://issues.apache.org/jira/browse/SPARK-11867

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9849 from yinxusen/SPARK-11867.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e1d120c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e1d120c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e1d120c

Branch: refs/heads/master
Commit: 3e1d120cedb4bd9e1595e95d4d531cf61da6684d
Parents: 0fff8eb
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Thu Nov 19 23:43:18 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 23:43:18 2015 -0800

--
 .../spark/ml/classification/NaiveBayes.scala| 68 ++--
 .../org/apache/spark/ml/clustering/KMeans.scala | 67 +--
 .../ml/classification/NaiveBayesSuite.scala | 47 --
 .../spark/ml/clustering/KMeansSuite.scala   | 41 +---
 4 files changed, 195 insertions(+), 28 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3e1d120c/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index a14dcec..c512a2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -17,12 +17,15 @@
 
 package org.apache.spark.ml.classification
 
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, 
ParamValidators}
-import org.apache.spark.ml.util.Identifiable
-import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, 
NaiveBayesModel => OldNaiveBayesModel}
+import org.apache.spark.ml.util._
+import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes}
+import org.apache.spark.mllib.classification.{NaiveBayesModel => 
OldNaiveBayesModel}
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
@@ -72,7 +75,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
 @Experimental
 class NaiveBayes(override val uid: String)
   extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel]
-  with NaiveBayesParams {
+  with NaiveBayesParams with DefaultParamsWritable {
 
   def this() = this(Identifiable.randomUID("nb"))
 
@@ -102,6 +105,13 @@ class NaiveBayes(override val uid: String)
   override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object NaiveBayes extends DefaultParamsReadable[NaiveBayes] {
+
+  @Since("1.6.0")
+  override def load(path: String): NaiveBayes = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model produced by [[NaiveBayes]]
@@ -114,7 +124,8 @@ class NaiveBayesModel private[ml] (
 override val uid: String,
 val pi: Vector,
 val theta: Matrix)
-  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with 
NaiveBayesParams {
+  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel]
+  with NaiveBayesParams with MLWritable {
 
   import OldNaiveBayes.{Bernoulli, Multinomial}
 
@@ -203,12 +214,15 @@ class NaiveBayesModel private[ml] (
 s"NaiveBayesModel (uid=$uid) with ${pi.size} classes"
   }
 
+  @Since("1.6.0")
+  override def write: MLWriter = new 
NaiveBayesModel.NaiveBayesModelWriter(this)
 }
 
-private[ml] object NaiveBayesModel {
+@Since("1.6.0")
+object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
 
   /** Convert a model from the old API */
-  def fromOld(
+  private[ml] def fromOld(
   oldModel: OldNaiveBayesModel,
   parent: NaiveBayes): NaiveBayesModel = {
 val uid = if (parent != null) parent.uid else Identifiable.randomUID("nb")
@@ -218,4 +232,44 @@ private[ml] object NaiveBayesModel {
   oldModel.theta.flatten, true)
 new NaiveBayesModel(uid, pi, theta)
   }
+
+  @Since("1.6.0")
+  override def read: MLReader[NaiveBayesModel] = new NaiveBayesModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): NaiveBayesModel = super.load(path)
+
+  /** [[MLWriter]] instance for [[NaiveBayesModel]] */
+  private[NaiveBayesModel] class NaiveBayesModelWriter(instance: 
NaiveBaye

spark git commit: [SPARK-11867] Add save/load for kmeans and naive bayes

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 60d937529 -> 1ce6394e3


[SPARK-11867] Add save/load for kmeans and naive bayes

https://issues.apache.org/jira/browse/SPARK-11867

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9849 from yinxusen/SPARK-11867.

(cherry picked from commit 3e1d120cedb4bd9e1595e95d4d531cf61da6684d)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ce6394e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ce6394e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ce6394e

Branch: refs/heads/branch-1.6
Commit: 1ce6394e3c86f8d0b80e990d8a35889ded94b6ea
Parents: 60d9375
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Thu Nov 19 23:43:18 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 23:43:25 2015 -0800

--
 .../spark/ml/classification/NaiveBayes.scala| 68 ++--
 .../org/apache/spark/ml/clustering/KMeans.scala | 67 +--
 .../ml/classification/NaiveBayesSuite.scala | 47 --
 .../spark/ml/clustering/KMeansSuite.scala   | 41 +---
 4 files changed, 195 insertions(+), 28 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1ce6394e/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index a14dcec..c512a2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -17,12 +17,15 @@
 
 package org.apache.spark.ml.classification
 
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkException
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, 
ParamValidators}
-import org.apache.spark.ml.util.Identifiable
-import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes, 
NaiveBayesModel => OldNaiveBayesModel}
+import org.apache.spark.ml.util._
+import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes}
+import org.apache.spark.mllib.classification.{NaiveBayesModel => 
OldNaiveBayesModel}
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
@@ -72,7 +75,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
 @Experimental
 class NaiveBayes(override val uid: String)
   extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel]
-  with NaiveBayesParams {
+  with NaiveBayesParams with DefaultParamsWritable {
 
   def this() = this(Identifiable.randomUID("nb"))
 
@@ -102,6 +105,13 @@ class NaiveBayes(override val uid: String)
   override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra)
 }
 
+@Since("1.6.0")
+object NaiveBayes extends DefaultParamsReadable[NaiveBayes] {
+
+  @Since("1.6.0")
+  override def load(path: String): NaiveBayes = super.load(path)
+}
+
 /**
  * :: Experimental ::
  * Model produced by [[NaiveBayes]]
@@ -114,7 +124,8 @@ class NaiveBayesModel private[ml] (
 override val uid: String,
 val pi: Vector,
 val theta: Matrix)
-  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with 
NaiveBayesParams {
+  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel]
+  with NaiveBayesParams with MLWritable {
 
   import OldNaiveBayes.{Bernoulli, Multinomial}
 
@@ -203,12 +214,15 @@ class NaiveBayesModel private[ml] (
 s"NaiveBayesModel (uid=$uid) with ${pi.size} classes"
   }
 
+  @Since("1.6.0")
+  override def write: MLWriter = new 
NaiveBayesModel.NaiveBayesModelWriter(this)
 }
 
-private[ml] object NaiveBayesModel {
+@Since("1.6.0")
+object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
 
   /** Convert a model from the old API */
-  def fromOld(
+  private[ml] def fromOld(
   oldModel: OldNaiveBayesModel,
   parent: NaiveBayes): NaiveBayesModel = {
 val uid = if (parent != null) parent.uid else Identifiable.randomUID("nb")
@@ -218,4 +232,44 @@ private[ml] object NaiveBayesModel {
   oldModel.theta.flatten, true)
 new NaiveBayesModel(uid, pi, theta)
   }
+
+  @Since("1.6.0")
+  override def read: MLReader[NaiveBayesModel] = new NaiveBayesModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): NaiveBayesModel = super.load

spark git commit: [SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3b7f056da -> 7216f4054


[SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval

* Update doc for PySpark ```HasCheckpointInterval``` so that users can understand 
how to disable checkpointing.
* Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to 
note the relationship between ```cacheNodeIds``` and ```checkpointInterval```.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9856 from yanboliang/spark-11875.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7216f405
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7216f405
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7216f405

Branch: refs/heads/master
Commit: 7216f405454f6f3557b5b1f72df8f393605faf60
Parents: 3b7f056
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Nov 19 22:14:01 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 22:14:01 2015 -0800

--
 python/pyspark/ml/param/_shared_params_code_gen.py |  6 --
 python/pyspark/ml/param/shared.py  | 14 +++---
 2 files changed, 11 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7216f405/python/pyspark/ml/param/_shared_params_code_gen.py
--
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 070c5db..0528dc1 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -118,7 +118,8 @@ if __name__ == "__main__":
 ("inputCols", "input column names.", None),
 ("outputCol", "output column name.", "self.uid + '__output'"),
 ("numFeatures", "number of features.", None),
-("checkpointInterval", "checkpoint interval (>= 1).", None),
+("checkpointInterval", "set checkpoint interval (>= 1) or disable 
checkpoint (-1). " +
+ "E.g. 10 means that the cache will get checkpointed every 10 
iterations.", None),
 ("seed", "random seed.", "hash(type(self).__name__)"),
 ("tol", "the convergence tolerance for iterative algorithms.", None),
 ("stepSize", "Step size to be used for each iteration of 
optimization.", None),
@@ -157,7 +158,8 @@ if __name__ == "__main__":
 ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation."),
 ("cacheNodeIds", "If false, the algorithm will pass trees to executors 
to match " +
  "instances with nodes. If true, the algorithm will cache node IDs for 
each instance. " +
- "Caching can speed up training of deeper trees.")]
+ "Caching can speed up training of deeper trees. Users can set how 
often should the " +
+ "cache be checkpointed or disable it by setting checkpointInterval.")]
 
 decisionTreeCode = '''class DecisionTreeParams(Params):
 """

http://git-wip-us.apache.org/repos/asf/spark/blob/7216f405/python/pyspark/ml/param/shared.py
--
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index 4bdf2a8..4d96080 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -325,16 +325,16 @@ class HasNumFeatures(Params):
 
 class HasCheckpointInterval(Params):
 """
-Mixin for param checkpointInterval: checkpoint interval (>= 1).
+Mixin for param checkpointInterval: set checkpoint interval (>= 1) or 
disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed 
every 10 iterations.
 """
 
 # a placeholder to make it appear in the generated doc
-checkpointInterval = Param(Params._dummy(), "checkpointInterval", 
"checkpoint interval (>= 1).")
+checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.")
 
 def __init__(self):
 super(HasCheckpointInterval, self).__init__()
-#: param for checkpoint interval (>= 1).
-self.checkpointInterval = Param(self, "checkpointInterval", 
"checkpoint interval (>= 1).")
+#: param for set checkpoint interval (>= 1) or disable check

spark git commit: [SPARK-11869][ML] Clean up TempDirectory properly in ML tests

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 0a878ad0e -> 60d937529


[SPARK-11869][ML] Clean up TempDirectory properly in ML tests

Need to remove parent directory (```className```) rather than just tempDir 
(```className/random_name```)

I tested this with IDFSuite, which has 2 read/write tests, and it fixes the 
problem.

CC: mengxr  Can you confirm this is fine?  I believe it is since the same 
```random_name``` is used for all tests in a suite; we basically have an extra 
unneeded level of nesting.

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9851 from jkbradley/tempdir-cleanup.

(cherry picked from commit 0fff8eb3e476165461658d4e16682ec64269fdfe)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60d93752
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60d93752
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60d93752

Branch: refs/heads/branch-1.6
Commit: 60d937529f6b885486e0a9b687883fb4bf66b780
Parents: 0a878ad
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Thu Nov 19 23:42:24 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 23:42:31 2015 -0800

--
 mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/60d93752/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala 
b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
index 2742026..c8a0bb1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
@@ -35,7 +35,7 @@ trait TempDirectory extends BeforeAndAfterAll { self: Suite =>
 
   override def beforeAll(): Unit = {
 super.beforeAll()
-_tempDir = Utils.createTempDir(this.getClass.getName)
+_tempDir = Utils.createTempDir(namePrefix = this.getClass.getName)
   }
 
   override def afterAll(): Unit = {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11869][ML] Clean up TempDirectory properly in ML tests

2015-11-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 7216f4054 -> 0fff8eb3e


[SPARK-11869][ML] Clean up TempDirectory properly in ML tests

We need to remove the parent directory (```className```) rather than just tempDir 
(```className/random_name```).

I tested this with IDFSuite, which has 2 read/write tests, and it fixes the 
problem.

CC: mengxr  Can you confirm this is fine?  I believe it is since the same 
```random_name``` is used for all tests in a suite; we basically have an extra 
unneeded level of nesting.

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9851 from jkbradley/tempdir-cleanup.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0fff8eb3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0fff8eb3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0fff8eb3

Branch: refs/heads/master
Commit: 0fff8eb3e476165461658d4e16682ec64269fdfe
Parents: 7216f40
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Thu Nov 19 23:42:24 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Nov 19 23:42:24 2015 -0800

--
 mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0fff8eb3/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala 
b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
index 2742026..c8a0bb1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/TempDirectory.scala
@@ -35,7 +35,7 @@ trait TempDirectory extends BeforeAndAfterAll { self: Suite =>
 
   override def beforeAll(): Unit = {
 super.beforeAll()
-_tempDir = Utils.createTempDir(this.getClass.getName)
+_tempDir = Utils.createTempDir(namePrefix = this.getClass.getName)
   }
 
   override def afterAll(): Unit = {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11816][ML] fix some style issue in ML/MLlib examples

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 19f4f26f3 -> 4b4a6bf5c


[SPARK-11816][ML] fix some style issue in ML/MLlib examples

jira: https://issues.apache.org/jira/browse/SPARK-11816
Currently I have only fixed some obvious comment issues, such as
// scalastyle:off println
at the bottom of a file.

Yet the style in the examples is still not quite consistent; for instance, only 
half of the examples include a usage comment such as
// Example usage: ./bin/run-example mllib.FPGrowthExample \

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9808 from hhbyyh/exampleStyle.

(cherry picked from commit 67c75828ff4df2e305bdf5d6be5a11201d1da3f3)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b4a6bf5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b4a6bf5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b4a6bf5

Branch: refs/heads/branch-1.6
Commit: 4b4a6bf5c0a8dd96897d7dd48c7beadd2c950824
Parents: 19f4f26
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 18:49:46 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 18:50:10 2015 -0800

--
 .../main/java/org/apache/spark/examples/ml/JavaKMeansExample.java  | 2 +-
 .../apache/spark/examples/ml/AFTSurvivalRegressionExample.scala| 2 +-
 .../spark/examples/ml/DecisionTreeClassificationExample.scala  | 1 +
 .../apache/spark/examples/ml/DecisionTreeRegressionExample.scala   | 1 +
 .../spark/examples/ml/MultilayerPerceptronClassifierExample.scala  | 2 +-
 5 files changed, 5 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
index be2bf0c..47665ff 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
@@ -41,7 +41,7 @@ import org.apache.spark.sql.types.StructType;
  * An example demonstrating a k-means clustering.
  * Run with
  * 
- * bin/run-example ml.JavaSimpleParamsExample  
+ * bin/run-example ml.JavaKMeansExample  
  * 
  */
 public class JavaKMeansExample {

http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index 5da285e..f4b3613 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -59,4 +59,4 @@ object AFTSurvivalRegressionExample {
 sc.stop()
   }
 }
-// scalastyle:off println
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index ff8a0a9..db024b5 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -90,3 +90,4 @@ object DecisionTreeClassificationExample {
 // $example off$
   }
 }
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index fc40272..ad01f55 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -78,3 +78,4 @@ object DecisionTreeRegressionExample {
 // $example off$
   }
 }
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/4b4a6bf5/examples/src/main/scala/o

spark git commit: [SPARK-11816][ML] fix some style issue in ML/MLlib examples

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9c0654d36 -> 67c75828f


[SPARK-11816][ML] fix some style issue in ML/MLlib examples

jira: https://issues.apache.org/jira/browse/SPARK-11816
Currently I have only fixed some obvious comment issues, such as
// scalastyle:off println
at the bottom of a file.

Yet the style in the examples is still not quite consistent; for instance, only 
half of the examples include a usage comment such as
// Example usage: ./bin/run-example mllib.FPGrowthExample \

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9808 from hhbyyh/exampleStyle.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/67c75828
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/67c75828
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/67c75828

Branch: refs/heads/master
Commit: 67c75828ff4df2e305bdf5d6be5a11201d1da3f3
Parents: 9c0654d
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 18:49:46 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 18:49:46 2015 -0800

--
 .../main/java/org/apache/spark/examples/ml/JavaKMeansExample.java  | 2 +-
 .../apache/spark/examples/ml/AFTSurvivalRegressionExample.scala| 2 +-
 .../spark/examples/ml/DecisionTreeClassificationExample.scala  | 1 +
 .../apache/spark/examples/ml/DecisionTreeRegressionExample.scala   | 1 +
 .../spark/examples/ml/MultilayerPerceptronClassifierExample.scala  | 2 +-
 5 files changed, 5 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
index be2bf0c..47665ff 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
@@ -41,7 +41,7 @@ import org.apache.spark.sql.types.StructType;
  * An example demonstrating a k-means clustering.
  * Run with
  * 
- * bin/run-example ml.JavaSimpleParamsExample  
+ * bin/run-example ml.JavaKMeansExample  
  * 
  */
 public class JavaKMeansExample {

http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index 5da285e..f4b3613 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -59,4 +59,4 @@ object AFTSurvivalRegressionExample {
 sc.stop()
   }
 }
-// scalastyle:off println
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index ff8a0a9..db024b5 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -90,3 +90,4 @@ object DecisionTreeClassificationExample {
 // $example off$
   }
 }
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index fc40272..ad01f55 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -78,3 +78,4 @@ object DecisionTreeRegressionExample {
 // $example off$
   }
 }
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/67c75828/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
--
diff

spark git commit: [SPARK-6787][ML] add read/write to estimators under ml.feature (1)

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 d9945bc46 -> dc1e23744


[SPARK-6787][ML] add read/write to estimators under ml.feature (1)

Add read/write support to the following estimators under spark.ml:

* CountVectorizer
* IDF
* MinMaxScaler
* StandardScaler (a little awkward because we store some params in spark.mllib 
model)
* StringIndexer

Added some necessary methods for read/write. Maybe we should add `private[ml] 
trait DefaultParamsReadable` and `DefaultParamsWritable` to save some 
boilerplate code, though we still need to override `load` for Java 
compatibility.

jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #9798 from mengxr/SPARK-6787.

(cherry picked from commit 7e987de1770f4ab3d54bc05db8de0a1ef035941d)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc1e2374
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc1e2374
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc1e2374

Branch: refs/heads/branch-1.6
Commit: dc1e23744b7fc1b8ee5fac07cf56d5760d66503e
Parents: d9945bc
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Nov 18 15:47:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 15:47:57 2015 -0800

--
 .../spark/ml/feature/CountVectorizer.scala  | 72 --
 .../scala/org/apache/spark/ml/feature/IDF.scala | 71 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  | 72 --
 .../spark/ml/feature/StandardScaler.scala   | 78 +++-
 .../apache/spark/ml/feature/StringIndexer.scala | 70 --
 .../spark/ml/feature/CountVectorizerSuite.scala | 24 +-
 .../org/apache/spark/ml/feature/IDFSuite.scala  | 19 -
 .../spark/ml/feature/MinMaxScalerSuite.scala| 25 ++-
 .../spark/ml/feature/StandardScalerSuite.scala  | 64 +++-
 .../spark/ml/feature/StringIndexerSuite.scala   | 19 -
 10 files changed, 467 insertions(+), 47 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dc1e2374/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 49028e4..5ff9bfb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -16,17 +16,19 @@
  */
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
-import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.DataFrame
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -105,7 +107,7 @@ private[feature] trait CountVectorizerParams extends Params 
with HasInputCol wit
  */
 @Experimental
 class CountVectorizer(override val uid: String)
-  extends Estimator[CountVectorizerModel] with CountVectorizerParams {
+  extends Estimator[CountVectorizerModel] with CountVectorizerParams with 
Writable {
 
   def this() = this(Identifiable.randomUID("cntVec"))
 
@@ -169,6 +171,19 @@ class CountVectorizer(override val uid: String)
   }
 
   override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra)
+
+  @Since("1.6.0")
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+@Since("1.6.0")
+object CountVectorizer extends Readable[CountVectorizer] {
+
+  @Since("1.6.0")
+  override def read: Reader[CountVectorizer] = new DefaultParamsReader
+
+  @Since("1.6.0")
+  override def load(path: String): CountVectorizer = super.load(path)
 }
 
 /**
@@ -178,7 +193,9 @@ class CountVectorizer(override val uid: String)
  */
 @Experimental
 class CountVectorizerModel(override val uid: String, val vocabulary: 
Array[String])
-  extends Model[CountVectorizerModel] with CountVectorizerParams {
+  extends Model[CountVectorizerModel] with CountVectorizerParams with Writable 
{
+
+  import CountVectorizerModel._
 
   def this(vocabulary: Array[String

spark git commit: [SPARK-6787][ML] add read/write to estimators under ml.feature (1)

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 5df08949f -> 7e987de17


[SPARK-6787][ML] add read/write to estimators under ml.feature (1)

Add read/write support to the following estimators under spark.ml:

* CountVectorizer
* IDF
* MinMaxScaler
* StandardScaler (a little awkward because we store some params in spark.mllib 
model)
* StringIndexer

Added some necessary methods for read/write. Maybe we should add `private[ml] 
trait DefaultParamsReadable` and `DefaultParamsWritable` to save some 
boilerplate code, though we still need to override `load` for Java 
compatibility.

jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #9798 from mengxr/SPARK-6787.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e987de1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e987de1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e987de1

Branch: refs/heads/master
Commit: 7e987de1770f4ab3d54bc05db8de0a1ef035941d
Parents: 5df0894
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Nov 18 15:47:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 15:47:49 2015 -0800

--
 .../spark/ml/feature/CountVectorizer.scala  | 72 --
 .../scala/org/apache/spark/ml/feature/IDF.scala | 71 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  | 72 --
 .../spark/ml/feature/StandardScaler.scala   | 78 +++-
 .../apache/spark/ml/feature/StringIndexer.scala | 70 --
 .../spark/ml/feature/CountVectorizerSuite.scala | 24 +-
 .../org/apache/spark/ml/feature/IDFSuite.scala  | 19 -
 .../spark/ml/feature/MinMaxScalerSuite.scala| 25 ++-
 .../spark/ml/feature/StandardScalerSuite.scala  | 64 +++-
 .../spark/ml/feature/StringIndexerSuite.scala   | 19 -
 10 files changed, 467 insertions(+), 47 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7e987de1/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 49028e4..5ff9bfb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -16,17 +16,19 @@
  */
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
-import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.DataFrame
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -105,7 +107,7 @@ private[feature] trait CountVectorizerParams extends Params 
with HasInputCol wit
  */
 @Experimental
 class CountVectorizer(override val uid: String)
-  extends Estimator[CountVectorizerModel] with CountVectorizerParams {
+  extends Estimator[CountVectorizerModel] with CountVectorizerParams with 
Writable {
 
   def this() = this(Identifiable.randomUID("cntVec"))
 
@@ -169,6 +171,19 @@ class CountVectorizer(override val uid: String)
   }
 
   override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra)
+
+  @Since("1.6.0")
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+@Since("1.6.0")
+object CountVectorizer extends Readable[CountVectorizer] {
+
+  @Since("1.6.0")
+  override def read: Reader[CountVectorizer] = new DefaultParamsReader
+
+  @Since("1.6.0")
+  override def load(path: String): CountVectorizer = super.load(path)
 }
 
 /**
@@ -178,7 +193,9 @@ class CountVectorizer(override val uid: String)
  */
 @Experimental
 class CountVectorizerModel(override val uid: String, val vocabulary: 
Array[String])
-  extends Model[CountVectorizerModel] with CountVectorizerParams {
+  extends Model[CountVectorizerModel] with CountVectorizerParams with Writable 
{
+
+  import CountVectorizerModel._
 
   def this(vocabulary: Array[String]) = {
 this(Identifiable.randomUID("cntVecModel"), vocabulary)
@@ -232,4 +249,47 @@ class CountVectorizerM

spark git commit: [SPARK-11720][SQL][ML] Handle edge cases when count = 0 or 1 for Stats function

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 7c5b64180 -> 09ad9533d


[SPARK-11720][SQL][ML] Handle edge cases when count = 0 or 1 for Stats function

Return Double.NaN for mean/average when count == 0 for all numeric types that 
are converted to Double; the Decimal type continues to return null.

Author: JihongMa <linlin200...@gmail.com>

Closes #9705 from JihongMA/SPARK-11720.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/09ad9533
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/09ad9533
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/09ad9533

Branch: refs/heads/master
Commit: 09ad9533d5760652de59fa4830c24cb8667958ac
Parents: 7c5b641
Author: JihongMa <linlin200...@gmail.com>
Authored: Wed Nov 18 13:03:37 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:03:37 2015 -0800

--
 python/pyspark/sql/dataframe.py   |  2 +-
 .../expressions/aggregate/CentralMomentAgg.scala  |  2 +-
 .../catalyst/expressions/aggregate/Kurtosis.scala |  9 +
 .../catalyst/expressions/aggregate/Skewness.scala |  9 +
 .../catalyst/expressions/aggregate/Stddev.scala   | 18 ++
 .../catalyst/expressions/aggregate/Variance.scala | 18 ++
 .../spark/sql/DataFrameAggregateSuite.scala   | 18 --
 .../org/apache/spark/sql/DataFrameSuite.scala |  2 +-
 8 files changed, 53 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/python/pyspark/sql/dataframe.py
--
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ad6ad02..0dd75ba 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -761,7 +761,7 @@ class DataFrame(object):
 +---+--+-+
 |  count| 2|2|
 |   mean|   3.5| null|
-| stddev|2.1213203435596424|  NaN|
+| stddev|2.1213203435596424| null|
 |min| 2|Alice|
 |max| 5|  Bob|
 +---+--+-+

http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
index de5872a..d07d4c3 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
@@ -206,7 +206,7 @@ abstract class CentralMomentAgg(child: Expression) extends 
ImperativeAggregate w
* @param centralMoments Length `momentOrder + 1` array of central moments 
(un-normalized)
*   needed to compute the aggregate stat.
*/
-  def getStatistic(n: Double, mean: Double, centralMoments: Array[Double]): 
Double
+  def getStatistic(n: Double, mean: Double, centralMoments: Array[Double]): Any
 
   override final def eval(buffer: InternalRow): Any = {
 val n = buffer.getDouble(nOffset)

http://git-wip-us.apache.org/repos/asf/spark/blob/09ad9533/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala
index 8fa3aac..c2bf2cb 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Kurtosis.scala
@@ -37,16 +37,17 @@ case class Kurtosis(child: Expression,
   override protected val momentOrder = 4
 
   // NOTE: this is the formula for excess kurtosis, which is default for R and 
SciPy
-  override def getStatistic(n: Double, mean: Double, moments: Array[Double]): 
Double = {
+  override def getStatistic(n: Double, mean: Double, moments: Array[Double]): 
Any = {
 require(moments.length == momentOrder + 1,
   s"$prettyName requires ${momentOrder + 1} central moments, received: 
${moments.length}")
 val m2 = moments(2)
 val m4 = moments(4)
 
-if (n == 0.0 || m2 == 0.0) {
+if (n == 0.0) {
+  null
+} else if (m2 == 0.0) {

spark git commit: [SPARK-6790][ML] Add spark.ml LinearRegression import/export

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 39c8a995d -> bcc6813dd


[SPARK-6790][ML] Add spark.ml LinearRegression import/export

This replaces [https://github.com/apache/spark/pull/9656] with updates.

fayeshine should be the main author when this PR is committed.

CC: mengxr fayeshine

Author: Wenjian Huang <nextr...@163.com>
Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9814 from jkbradley/fayeshine-patch-6790.

(cherry picked from commit 045a4f045821dcf60442f0600c2df1b79bddb536)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcc6813d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcc6813d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcc6813d

Branch: refs/heads/branch-1.6
Commit: bcc6813dd8b050fd4bf9dbd2708e413b43b3e80d
Parents: 39c8a99
Author: Wenjian Huang <nextr...@163.com>
Authored: Wed Nov 18 13:06:25 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:06:32 2015 -0800

--
 .../spark/ml/regression/LinearRegression.scala  | 77 +++-
 .../ml/regression/LinearRegressionSuite.scala   | 34 -
 2 files changed, 106 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bcc6813d/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 913140e..ca55d59 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable
 import breeze.linalg.{DenseVector => BDV}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => 
BreezeLBFGS, OWLQN => BreezeOWLQN}
 import breeze.stats.distributions.StudentsT
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.ml.feature.Instance
@@ -30,7 +31,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.evaluation.RegressionMetrics
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -65,7 +66,7 @@ private[regression] trait LinearRegressionParams extends 
PredictorParams
 @Experimental
 class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: 
String)
   extends Regressor[Vector, LinearRegression, LinearRegressionModel]
-  with LinearRegressionParams with Logging {
+  with LinearRegressionParams with Writable with Logging {
 
   @Since("1.4.0")
   def this() = this(Identifiable.randomUID("linReg"))
@@ -341,6 +342,19 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") 
override val uid: String
 
   @Since("1.4.0")
   override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra)
+
+  @Since("1.6.0")
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+@Since("1.6.0")
+object LinearRegression extends Readable[LinearRegression] {
+
+  @Since("1.6.0")
+  override def read: Reader[LinearRegression] = new 
DefaultParamsReader[LinearRegression]
+
+  @Since("1.6.0")
+  override def load(path: String): LinearRegression = read.load(path)
 }
 
 /**
@@ -354,7 +368,7 @@ class LinearRegressionModel private[ml] (
 val coefficients: Vector,
 val intercept: Double)
   extends RegressionModel[Vector, LinearRegressionModel]
-  with LinearRegressionParams {
+  with LinearRegressionParams with Writable {
 
   private var trainingSummary: Option[LinearRegressionTrainingSummary] = None
 
@@ -422,6 +436,63 @@ class LinearRegressionModel private[ml] (
 if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
 newModel.setParent(parent)
   }
+
+  /**
+   * Returns a [[Writer]] instance for this ML instance.
+   *
+   * For [[LinearRegressionModel]], this does NOT currently save the training 
[[summary]].
+   * An option to save [[summary]] may be added in the future.
+   *
+   * This also does not save the [[parent]] currently.
+   */
+  @Since("1.6.0")
+  override def write: Writer = new 
LinearRegressionModel.LinearRegressionModelWriter(this)
+}
+
+@Since("1.6.0")
+object LinearRegressionMo

spark git commit: [SPARK-6790][ML] Add spark.ml LinearRegression import/export

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 09ad9533d -> 045a4f045


[SPARK-6790][ML] Add spark.ml LinearRegression import/export

This replaces [https://github.com/apache/spark/pull/9656] with updates.

fayeshine should be the main author when this PR is committed.

CC: mengxr fayeshine

Author: Wenjian Huang <nextr...@163.com>
Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9814 from jkbradley/fayeshine-patch-6790.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/045a4f04
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/045a4f04
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/045a4f04

Branch: refs/heads/master
Commit: 045a4f045821dcf60442f0600c2df1b79bddb536
Parents: 09ad953
Author: Wenjian Huang <nextr...@163.com>
Authored: Wed Nov 18 13:06:25 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:06:25 2015 -0800

--
 .../spark/ml/regression/LinearRegression.scala  | 77 +++-
 .../ml/regression/LinearRegressionSuite.scala   | 34 -
 2 files changed, 106 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/045a4f04/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 913140e..ca55d59 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable
 import breeze.linalg.{DenseVector => BDV}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => 
BreezeLBFGS, OWLQN => BreezeOWLQN}
 import breeze.stats.distributions.StudentsT
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.ml.feature.Instance
@@ -30,7 +31,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.evaluation.RegressionMetrics
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -65,7 +66,7 @@ private[regression] trait LinearRegressionParams extends 
PredictorParams
 @Experimental
 class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: 
String)
   extends Regressor[Vector, LinearRegression, LinearRegressionModel]
-  with LinearRegressionParams with Logging {
+  with LinearRegressionParams with Writable with Logging {
 
   @Since("1.4.0")
   def this() = this(Identifiable.randomUID("linReg"))
@@ -341,6 +342,19 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") 
override val uid: String
 
   @Since("1.4.0")
   override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra)
+
+  @Since("1.6.0")
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+@Since("1.6.0")
+object LinearRegression extends Readable[LinearRegression] {
+
+  @Since("1.6.0")
+  override def read: Reader[LinearRegression] = new 
DefaultParamsReader[LinearRegression]
+
+  @Since("1.6.0")
+  override def load(path: String): LinearRegression = read.load(path)
 }
 
 /**
@@ -354,7 +368,7 @@ class LinearRegressionModel private[ml] (
 val coefficients: Vector,
 val intercept: Double)
   extends RegressionModel[Vector, LinearRegressionModel]
-  with LinearRegressionParams {
+  with LinearRegressionParams with Writable {
 
   private var trainingSummary: Option[LinearRegressionTrainingSummary] = None
 
@@ -422,6 +436,63 @@ class LinearRegressionModel private[ml] (
 if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
 newModel.setParent(parent)
   }
+
+  /**
+   * Returns a [[Writer]] instance for this ML instance.
+   *
+   * For [[LinearRegressionModel]], this does NOT currently save the training 
[[summary]].
+   * An option to save [[summary]] may be added in the future.
+   *
+   * This also does not save the [[parent]] currently.
+   */
+  @Since("1.6.0")
+  override def write: Writer = new 
LinearRegressionModel.LinearRegressionModelWriter(this)
+}
+
+@Since("1.6.0")
+object LinearRegressionModel extends Readable[LinearRegressionModel] {
+
+  @Since("1.6.0")
+  override def read: Reader[LinearR

spark git commit: [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 045a4f045 -> 2acdf10b1


[SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel

Also modifies DefaultParamsWriter.saveMetadata to take optional extra metadata.

CC: mengxr yanboliang

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9786 from jkbradley/als-io.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2acdf10b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2acdf10b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2acdf10b

Branch: refs/heads/master
Commit: 2acdf10b1f3bb1242dba64efa798c672fde9f0d2
Parents: 045a4f0
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Wed Nov 18 13:16:31 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:16:31 2015 -0800

--
 .../apache/spark/ml/recommendation/ALS.scala| 75 +--
 .../org/apache/spark/ml/util/ReadWrite.scala| 14 +++-
 .../spark/ml/recommendation/ALSSuite.scala  | 78 +---
 3 files changed, 150 insertions(+), 17 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2acdf10b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala 
b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 535f266..d92514d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -27,13 +27,16 @@ import scala.util.hashing.byteswap64
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.json4s.{DefaultFormats, JValue}
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.{Logging, Partitioner}
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{Since, DeveloperApi, Experimental}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.CholeskyDecomposition
 import org.apache.spark.mllib.optimization.NNLS
 import org.apache.spark.rdd.RDD
@@ -182,7 +185,7 @@ class ALSModel private[ml] (
 val rank: Int,
 @transient val userFactors: DataFrame,
 @transient val itemFactors: DataFrame)
-  extends Model[ALSModel] with ALSModelParams {
+  extends Model[ALSModel] with ALSModelParams with Writable {
 
   /** @group setParam */
   def setUserCol(value: String): this.type = set(userCol, value)
@@ -220,8 +223,60 @@ class ALSModel private[ml] (
 val copied = new ALSModel(uid, rank, userFactors, itemFactors)
 copyValues(copied, extra).setParent(parent)
   }
+
+  @Since("1.6.0")
+  override def write: Writer = new ALSModel.ALSModelWriter(this)
 }
 
+@Since("1.6.0")
+object ALSModel extends Readable[ALSModel] {
+
+  @Since("1.6.0")
+  override def read: Reader[ALSModel] = new ALSModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): ALSModel = read.load(path)
+
+  private[recommendation] class ALSModelWriter(instance: ALSModel) extends 
Writer {
+
+override protected def saveImpl(path: String): Unit = {
+  val extraMetadata = render("rank" -> instance.rank)
+  DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
+  val userPath = new Path(path, "userFactors").toString
+  instance.userFactors.write.format("parquet").save(userPath)
+  val itemPath = new Path(path, "itemFactors").toString
+  instance.itemFactors.write.format("parquet").save(itemPath)
+}
+  }
+
+  private[recommendation] class ALSModelReader extends Reader[ALSModel] {
+
+/** Checked against metadata when loading model */
+private val className = "org.apache.spark.ml.recommendation.ALSModel"
+
+override def load(path: String): ALSModel = {
+  val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+  implicit val format = DefaultFormats
+  val rank: Int = metadata.extraMetadata match {
+case Some(m: JValue) =>
+  (m \ "rank").extract[Int]
+case None =>
+  throw new RuntimeException(s"ALSModel loader could not read rank 
from JSON metadata:" +
+s" ${metadata.metadataStr}")
+  }
+
+  val userPath = new Path(path, "userFactors").toString
+  val userFactors =

spark git commit: [SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 bcc6813dd -> 23b8c2256


[SPARK-6789][ML] Add Readable, Writable support for spark.ml ALS, ALSModel

Also modifies DefaultParamsWriter.saveMetadata to take optional extra metadata.

CC: mengxr yanboliang

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9786 from jkbradley/als-io.

(cherry picked from commit 2acdf10b1f3bb1242dba64efa798c672fde9f0d2)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/23b8c225
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/23b8c225
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/23b8c225

Branch: refs/heads/branch-1.6
Commit: 23b8c2256d55d76ebe22977c03c0b893e5b6c408
Parents: bcc6813
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Wed Nov 18 13:16:31 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:16:39 2015 -0800

--
 .../apache/spark/ml/recommendation/ALS.scala| 75 +--
 .../org/apache/spark/ml/util/ReadWrite.scala| 14 +++-
 .../spark/ml/recommendation/ALSSuite.scala  | 78 +---
 3 files changed, 150 insertions(+), 17 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/23b8c225/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala 
b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index 535f266..d92514d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -27,13 +27,16 @@ import scala.util.hashing.byteswap64
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.json4s.{DefaultFormats, JValue}
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.{Logging, Partitioner}
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{Since, DeveloperApi, Experimental}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.CholeskyDecomposition
 import org.apache.spark.mllib.optimization.NNLS
 import org.apache.spark.rdd.RDD
@@ -182,7 +185,7 @@ class ALSModel private[ml] (
 val rank: Int,
 @transient val userFactors: DataFrame,
 @transient val itemFactors: DataFrame)
-  extends Model[ALSModel] with ALSModelParams {
+  extends Model[ALSModel] with ALSModelParams with Writable {
 
   /** @group setParam */
   def setUserCol(value: String): this.type = set(userCol, value)
@@ -220,8 +223,60 @@ class ALSModel private[ml] (
 val copied = new ALSModel(uid, rank, userFactors, itemFactors)
 copyValues(copied, extra).setParent(parent)
   }
+
+  @Since("1.6.0")
+  override def write: Writer = new ALSModel.ALSModelWriter(this)
 }
 
+@Since("1.6.0")
+object ALSModel extends Readable[ALSModel] {
+
+  @Since("1.6.0")
+  override def read: Reader[ALSModel] = new ALSModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): ALSModel = read.load(path)
+
+  private[recommendation] class ALSModelWriter(instance: ALSModel) extends 
Writer {
+
+override protected def saveImpl(path: String): Unit = {
+  val extraMetadata = render("rank" -> instance.rank)
+  DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
+  val userPath = new Path(path, "userFactors").toString
+  instance.userFactors.write.format("parquet").save(userPath)
+  val itemPath = new Path(path, "itemFactors").toString
+  instance.itemFactors.write.format("parquet").save(itemPath)
+}
+  }
+
+  private[recommendation] class ALSModelReader extends Reader[ALSModel] {
+
+/** Checked against metadata when loading model */
+private val className = "org.apache.spark.ml.recommendation.ALSModel"
+
+override def load(path: String): ALSModel = {
+  val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+  implicit val format = DefaultFormats
+  val rank: Int = metadata.extraMetadata match {
+case Some(m: JValue) =>
+  (m \ "rank").extract[Int]
+case None =>
+  throw new RuntimeException(s"ALSModel loader could not read rank 
from JSON metadata:" +
+s" ${meta

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 2acdf10b1 -> e391abdf2


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps
decrease the size of the serialized closure, especially when vectorSize is
small, thus allowing a larger vocabulary.

Actually, there's another possible fix: make local copies of the fields to avoid
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e391abdf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e391abdf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e391abdf

Branch: refs/heads/master
Commit: e391abdf2cb6098a35347bd123b815ee9ac5b689
Parents: 2acdf10
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:25:15 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e391abdf/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f3e4d34..7ab0d89 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -145,8 +145,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.4 e12fbd80c -> eda1ff4ee


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eda1ff4e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eda1ff4e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eda1ff4e

Branch: refs/heads/branch-1.4
Commit: eda1ff4eede3968c24a0d1338432eae5682e8432
Parents: e12fbd8
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:25:54 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/eda1ff4e/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 74e7dbf..3493186 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -141,8 +141,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 23b8c2256 -> 18e308b84


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18e308b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18e308b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18e308b8

Branch: refs/heads/branch-1.6
Commit: 18e308b84fe7ffeca730397152582b31a4b88a82
Parents: 23b8c22
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:25:22 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/18e308b8/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f3e4d34..7ab0d89 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -145,8 +145,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.5 f802b07ab -> 0439e32e2


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0439e32e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0439e32e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0439e32e

Branch: refs/heads/branch-1.5
Commit: 0439e32e2e88cc3a3364a37509fa25aebf2c299f
Parents: f802b07
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:25:37 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0439e32e/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 36b124c..c226e3c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -148,8 +148,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.3 1bfa00d54 -> 5278ef0f1


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5278ef0f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5278ef0f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5278ef0f

Branch: refs/heads/branch-1.3
Commit: 5278ef0f1aead5de7e32da8bb40ba15fabe7473d
Parents: 1bfa00d
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:26:05 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5278ef0f/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 59a79e5..dea35e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -134,8 +134,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.2 4b6e24e25 -> 307f27e24


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/307f27e2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/307f27e2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/307f27e2

Branch: refs/heads/branch-1.2
Commit: 307f27e24e17afd92030194a3e6fec312fc19f4f
Parents: 4b6e24e
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:26:18 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/307f27e2/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 7960f3c..d983dd3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -127,8 +127,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.1 19835ec1f -> 11ee9d191


[SPARK-11813][MLLIB] Avoid serialization of vocab in Word2Vec

jira: https://issues.apache.org/jira/browse/SPARK-11813

I found the problem while training on a large corpus. Avoiding serialization of
vocab in Word2Vec has 2 benefits.
1. Improved performance, because less data is serialized.
2. A much larger capacity for Word2Vec.
Currently, in the fit of Word2Vec, the closure mainly includes the serialization
of Word2Vec and 2 global tables.
The main part of Word2Vec is the vocab, of size: vocab * 40 * 2 * 4 = 320 vocab.
The 2 global tables: vocab * vectorSize * 8. If vectorSize = 20, that's 160 vocab.

Their sum cannot exceed Int.max due to the restriction of 
ByteArrayOutputStream. In any case, avoiding serialization of vocab helps 
decrease the size of the closure serialization, especially when vectorSize is 
small, thus to allow larger vocabulary.

Actually there's another possible fix, make local copy of fields to avoid 
including Word2Vec in the closure. Let me know if that's preferred.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #9803 from hhbyyh/w2vVocab.

(cherry picked from commit e391abdf2cb6098a35347bd123b815ee9ac5b689)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/11ee9d19
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/11ee9d19
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/11ee9d19

Branch: refs/heads/branch-1.1
Commit: 11ee9d191e26a41a44ff0ca8730a129934942ee7
Parents: 19835ec
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Wed Nov 18 13:25:15 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:26:39 2015 -0800

--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/11ee9d19/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index fc14447..a3e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -127,8 +127,8 @@ class Word2Vec extends Serializable with Logging {
 
   private var trainWordsCount = 0
   private var vocabSize = 0
-  private var vocab: Array[VocabWord] = null
-  private var vocabHash = mutable.HashMap.empty[String, Int]
+  @transient private var vocab: Array[VocabWord] = null
+  @transient private var vocabHash = mutable.HashMap.empty[String, Int]
 
   private def learnVocab(words: RDD[String]): Unit = {
 vocab = words.map(w => (w, 1))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e391abdf2 -> e222d7584


[SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example 
codes

This PR includes:
* Update SparkR:::glm, SparkR:::summary API docs.
* Update SparkR machine learning user guide and example codes to show:
  * supporting feature interaction in R formula.
  * summary for gaussian GLM model.
  * coefficients for binomial GLM model.

mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9727 from yanboliang/spark-11684.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e222d758
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e222d758
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e222d758

Branch: refs/heads/master
Commit: e222d758499ad2609046cc1a2cc8afb45c5bccbb
Parents: e391abd
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Nov 18 13:30:29 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:30:29 2015 -0800

--
 R/pkg/R/mllib.R | 18 +--
 docs/sparkr.md  | 50 
 .../spark/ml/regression/LinearRegression.scala  |  3 ++
 3 files changed, 60 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e222d758/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index f23e1c7..8d3b438 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -32,6 +32,12 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' @param family Error distribution. "gaussian" -> linear regression, 
"binomial" -> logistic reg.
 #' @param lambda Regularization parameter
 #' @param alpha Elastic-net mixing parameter (see glmnet's documentation for 
details)
+#' @param standardize Whether to standardize features before training
+#' @param solver The solver algorithm used for optimization, this can be 
"l-bfgs", "normal" and
+#'   "auto". "l-bfgs" denotes Limited-memory BFGS which is a 
limited-memory
+#'   quasi-Newton optimization method. "normal" denotes using 
Normal Equation as an
+#'   analytical solution to the linear regression problem. The 
default value is "auto"
+#'   which means that the solver algorithm is selected 
automatically.
 #' @return a fitted MLlib model
 #' @rdname glm
 #' @export
@@ -79,9 +85,15 @@ setMethod("predict", signature(object = "PipelineModel"),
 #'
 #' Returns the summary of a model produced by glm(), similarly to R's 
summary().
 #'
-#' @param x A fitted MLlib model
-#' @return a list with a 'coefficient' component, which is the matrix of 
coefficients. See
-#' summary.glm for more information.
+#' @param object A fitted MLlib model
+#' @return a list with 'devianceResiduals' and 'coefficients' components for 
gaussian family
+#' or a list with 'coefficients' component for binomial family. \cr
+#' For gaussian family: the 'devianceResiduals' gives the min/max 
deviance residuals
+#' of the estimation, the 'coefficients' gives the estimated 
coefficients and their
+#' estimated standard errors, t values and p-values. (It only 
available when model
+#' fitted by normal solver.) \cr
+#' For binomial family: the 'coefficients' gives the estimated 
coefficients.
+#' See summary.glm for more information. \cr
 #' @rdname summary
 #' @export
 #' @examples

http://git-wip-us.apache.org/repos/asf/spark/blob/e222d758/docs/sparkr.md
--
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 437bd47..a744b76 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -286,24 +286,37 @@ head(teenagers)
 
 # Machine Learning
 
-SparkR allows the fitting of generalized linear models over DataFrames using 
the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to 
train a model of the specified family. Currently the gaussian and binomial 
families are supported. We support a subset of the available R formula 
operators for model fitting, including '~', '.', '+', and '-'. The example 
below shows the use of building a gaussian GLM model using SparkR.
+SparkR allows the fitting of generalized linear models over DataFrames using 
the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to 
train a model of the specified family. Currently the gaussian and binomial 
families are supported. We support a subset of the available R formula 
operators for model fitting, including '~', '.', ':', '+', and '-'. 
+
+The [summar

spark git commit: [SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example codes

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 18e308b84 -> 03c2d20dc


[SPARK-11684][R][ML][DOC] Update SparkR glm API doc, user guide and example 
codes

This PR includes:
* Update SparkR:::glm, SparkR:::summary API docs.
* Update SparkR machine learning user guide and example codes to show:
  * supporting feature interaction in R formula.
  * summary for gaussian GLM model.
  * coefficients for binomial GLM model.

mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9727 from yanboliang/spark-11684.

(cherry picked from commit e222d758499ad2609046cc1a2cc8afb45c5bccbb)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03c2d20d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03c2d20d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03c2d20d

Branch: refs/heads/branch-1.6
Commit: 03c2d20dcfdecd910b8b2b036d581720e1a370e8
Parents: 18e308b
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Nov 18 13:30:29 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:30:36 2015 -0800

--
 R/pkg/R/mllib.R | 18 +--
 docs/sparkr.md  | 50 
 .../spark/ml/regression/LinearRegression.scala  |  3 ++
 3 files changed, 60 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/03c2d20d/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index f23e1c7..8d3b438 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -32,6 +32,12 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' @param family Error distribution. "gaussian" -> linear regression, 
"binomial" -> logistic reg.
 #' @param lambda Regularization parameter
 #' @param alpha Elastic-net mixing parameter (see glmnet's documentation for 
details)
+#' @param standardize Whether to standardize features before training
+#' @param solver The solver algorithm used for optimization, this can be 
"l-bfgs", "normal" and
+#'   "auto". "l-bfgs" denotes Limited-memory BFGS which is a 
limited-memory
+#'   quasi-Newton optimization method. "normal" denotes using 
Normal Equation as an
+#'   analytical solution to the linear regression problem. The 
default value is "auto"
+#'   which means that the solver algorithm is selected 
automatically.
 #' @return a fitted MLlib model
 #' @rdname glm
 #' @export
@@ -79,9 +85,15 @@ setMethod("predict", signature(object = "PipelineModel"),
 #'
 #' Returns the summary of a model produced by glm(), similarly to R's 
summary().
 #'
-#' @param x A fitted MLlib model
-#' @return a list with a 'coefficient' component, which is the matrix of 
coefficients. See
-#' summary.glm for more information.
+#' @param object A fitted MLlib model
+#' @return a list with 'devianceResiduals' and 'coefficients' components for 
gaussian family
+#' or a list with 'coefficients' component for binomial family. \cr
+#' For gaussian family: the 'devianceResiduals' gives the min/max 
deviance residuals
+#' of the estimation, the 'coefficients' gives the estimated 
coefficients and their
+#' estimated standard errors, t values and p-values. (It only 
available when model
+#' fitted by normal solver.) \cr
+#' For binomial family: the 'coefficients' gives the estimated 
coefficients.
+#' See summary.glm for more information. \cr
 #' @rdname summary
 #' @export
 #' @examples

http://git-wip-us.apache.org/repos/asf/spark/blob/03c2d20d/docs/sparkr.md
--
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 437bd47..a744b76 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -286,24 +286,37 @@ head(teenagers)
 
 # Machine Learning
 
-SparkR allows the fitting of generalized linear models over DataFrames using 
the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to 
train a model of the specified family. Currently the gaussian and binomial 
families are supported. We support a subset of the available R formula 
operators for model fitting, including '~', '.', '+', and '-'. The example 
below shows the use of building a gaussian GLM model using SparkR.
+SparkR allows the fitting of generalized linear models over DataFrames using 
the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to 
train a model of the specified family. Currently the gaussian and binomial 
families are supported. We support

spark git commit: [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e222d7584 -> 603a721c2


[SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol

[SPARK-7685](https://issues.apache.org/jira/browse/SPARK-7685) and 
[SPARK-9642](https://issues.apache.org/jira/browse/SPARK-9642) already added 
support for setting a weight column for ```LogisticRegression``` and 
```LinearRegression```. It's a very important feature that PySpark should also 
support. mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9811 from yanboliang/spark-11820.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/603a721c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/603a721c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/603a721c

Branch: refs/heads/master
Commit: 603a721c21488e17c15c45ce1de893e6b3d02274
Parents: e222d75
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Nov 18 13:32:06 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:32:06 2015 -0800

--
 python/pyspark/ml/classification.py | 17 +
 python/pyspark/ml/regression.py | 16 
 2 files changed, 17 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/603a721c/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 603f2c7..4a2982e 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -36,7 +36,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 
'DecisionTreeClassif
 @inherit_doc
 class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasMaxIter,
  HasRegParam, HasTol, HasProbabilityCol, 
HasRawPredictionCol,
- HasElasticNetParam, HasFitIntercept, 
HasStandardization, HasThresholds):
+ HasElasticNetParam, HasFitIntercept, 
HasStandardization, HasThresholds,
+ HasWeightCol):
 """
 Logistic regression.
 Currently, this class only supports binary classification.
@@ -44,9 +45,9 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors
 >>> df = sc.parallelize([
-... Row(label=1.0, features=Vectors.dense(1.0)),
-... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF()
->>> lr = LogisticRegression(maxIter=5, regParam=0.01)
+... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
+... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], 
[]))]).toDF()
+>>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
 >>> model = lr.fit(df)
 >>> model.weights
 DenseVector([5.5...])
@@ -80,12 +81,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,
  threshold=0.5, thresholds=None, probabilityCol="probability",
- rawPredictionCol="rawPrediction", standardization=True):
+ rawPredictionCol="rawPrediction", standardization=True, 
weightCol=None):
 """
 __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True, \
  threshold=0.5, thresholds=None, probabilityCol="probability", 
\
- rawPredictionCol="rawPrediction", standardization=True)
+ rawPredictionCol="rawPrediction", standardization=True, 
weightCol=None)
 If the threshold and thresholds Params are both set, they must be 
equivalent.
 """
 super(LogisticRegression, self).__init__()
@@ -105,12 +106,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
   maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,
   threshold=0.5, thresholds=None, probabilityCol="probability",
-  rawPredictionCol="rawPredict

spark git commit: [SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol

2015-11-18 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 03c2d20dc -> 464b2d421


[SPARK-11820][ML][PYSPARK] PySpark LiR & LoR should support weightCol

[SPARK-7685](https://issues.apache.org/jira/browse/SPARK-7685) and 
[SPARK-9642](https://issues.apache.org/jira/browse/SPARK-9642) already added 
support for setting a weight column for ```LogisticRegression``` and 
```LinearRegression```. It's a very important feature that PySpark should also 
support. mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9811 from yanboliang/spark-11820.

(cherry picked from commit 603a721c21488e17c15c45ce1de893e6b3d02274)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/464b2d42
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/464b2d42
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/464b2d42

Branch: refs/heads/branch-1.6
Commit: 464b2d4215534761c1a3fc84abc4007d47e391fd
Parents: 03c2d20
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Nov 18 13:32:06 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Nov 18 13:32:12 2015 -0800

--
 python/pyspark/ml/classification.py | 17 +
 python/pyspark/ml/regression.py | 16 
 2 files changed, 17 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/464b2d42/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 603f2c7..4a2982e 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -36,7 +36,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 
'DecisionTreeClassif
 @inherit_doc
 class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasMaxIter,
  HasRegParam, HasTol, HasProbabilityCol, 
HasRawPredictionCol,
- HasElasticNetParam, HasFitIntercept, 
HasStandardization, HasThresholds):
+ HasElasticNetParam, HasFitIntercept, 
HasStandardization, HasThresholds,
+ HasWeightCol):
 """
 Logistic regression.
 Currently, this class only supports binary classification.
@@ -44,9 +45,9 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors
 >>> df = sc.parallelize([
-... Row(label=1.0, features=Vectors.dense(1.0)),
-... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF()
->>> lr = LogisticRegression(maxIter=5, regParam=0.01)
+... Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
+... Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], 
[]))]).toDF()
+>>> lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
 >>> model = lr.fit(df)
 >>> model.weights
 DenseVector([5.5...])
@@ -80,12 +81,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,
  threshold=0.5, thresholds=None, probabilityCol="probability",
- rawPredictionCol="rawPrediction", standardization=True):
+ rawPredictionCol="rawPrediction", standardization=True, 
weightCol=None):
 """
 __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True, \
  threshold=0.5, thresholds=None, probabilityCol="probability", 
\
- rawPredictionCol="rawPrediction", standardization=True)
+ rawPredictionCol="rawPrediction", standardization=True, 
weightCol=None)
 If the threshold and thresholds Params are both set, they must be 
equivalent.
 """
 super(LogisticRegression, self).__init__()
@@ -105,12 +106,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
   maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,

spark git commit: [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 fa9d56f9e -> 78dc07cdf


[SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector

This PR makes the default read/write work with simple transformers/estimators 
that have params of type `Param[Vector]`. jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #9776 from mengxr/SPARK-11764.

(cherry picked from commit 3e9e6380236985ec5b51b459f8c61f964a76ff8b)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78dc07cd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78dc07cd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78dc07cd

Branch: refs/heads/branch-1.6
Commit: 78dc07cdf900cb0837e5a21e35e80af0ca6a7f26
Parents: fa9d56f
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Nov 17 14:04:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 15:22:51 2015 -0800

--
 .../org/apache/spark/ml/param/params.scala  | 12 +--
 .../org/apache/spark/ml/param/ParamsSuite.scala | 22 
 2 files changed, 28 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/78dc07cd/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala 
b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index c932570..d182b0a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -29,6 +29,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 /**
  * :: DeveloperApi ::
@@ -88,9 +89,11 @@ class Param[T](val parent: String, val name: String, val 
doc: String, val isVali
 value match {
   case x: String =>
 compact(render(JString(x)))
+  case v: Vector =>
+v.toJson
   case _ =>
 throw new NotImplementedError(
-  "The default jsonEncode only supports string. " +
+  "The default jsonEncode only supports string and vector. " +
 s"${this.getClass.getName} must override jsonEncode for 
${value.getClass.getName}.")
 }
   }
@@ -100,9 +103,14 @@ class Param[T](val parent: String, val name: String, val 
doc: String, val isVali
 parse(json) match {
   case JString(x) =>
 x.asInstanceOf[T]
+  case JObject(v) =>
+val keys = v.map(_._1)
+assert(keys.contains("type") && keys.contains("values"),
+  s"Expect a JSON serialized vector but cannot find fields 'type' and 
'values' in $json.")
+Vectors.fromJson(json).asInstanceOf[T]
   case _ =>
 throw new NotImplementedError(
-  "The default jsonDecode only supports string. " +
+  "The default jsonDecode only supports string and vector. " +
 s"${this.getClass.getName} must override jsonDecode to support its 
value type.")
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/78dc07cd/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index eeb03db..a1878be 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.ml.param
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 class ParamsSuite extends SparkFunSuite {
 
@@ -80,7 +81,7 @@ class ParamsSuite extends SparkFunSuite {
   }
 }
 
-{ // StringParam
+{ // Param[String]
   val param = new Param[String](dummy, "name", "doc")
   // Currently we do not support null.
   for (value <- Seq("", "1", "abc", "quote\"", "newline\n")) {
@@ -89,6 +90,19 @@ class ParamsSuite extends SparkFunSuite {
   }
 }
 
+{ // Param[Vector]
+  val param = new Param[Vector](dummy, "name", "doc")
+  val values = Seq(
+Vectors.dense(Array.empty[Double]),
+Vectors.dense(0.0, 2.0),
+Vectors.sparse(0, Array.empty, Array.empty),
+Vectors.sparse(2, Array(1), Array(2.0)))
+

spark git commit: [SPARK-11769][ML] Add save, load to all basic Transformers

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master d92514966 -> d98d1cb00


[SPARK-11769][ML] Add save, load to all basic Transformers

This excludes Estimators and ones which include Vector and other non-basic 
types for Params or data.  This adds:
* Bucketizer
* DCT
* HashingTF
* Interaction
* NGram
* Normalizer
* OneHotEncoder
* PolynomialExpansion
* QuantileDiscretizer
* RFormula
* SQLTransformer
* StopWordsRemover
* StringIndexer
* Tokenizer
* VectorAssembler
* VectorSlicer

CC: mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9755 from jkbradley/transformer-io.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d98d1cb0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d98d1cb0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d98d1cb0

Branch: refs/heads/master
Commit: d98d1cb000c8c4e391d73ae86efd09f15e5d165c
Parents: d925149
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 17 12:43:56 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 12:43:56 2015 -0800

--
 .../org/apache/spark/ml/feature/Binarizer.scala |  8 -
 .../apache/spark/ml/feature/Bucketizer.scala| 22 
 .../scala/org/apache/spark/ml/feature/DCT.scala | 19 +--
 .../org/apache/spark/ml/feature/HashingTF.scala | 20 +--
 .../apache/spark/ml/feature/Interaction.scala   | 29 +---
 .../org/apache/spark/ml/feature/NGram.scala | 19 +--
 .../apache/spark/ml/feature/Normalizer.scala| 20 +--
 .../apache/spark/ml/feature/OneHotEncoder.scala | 19 +--
 .../spark/ml/feature/PolynomialExpansion.scala  | 20 ---
 .../spark/ml/feature/QuantileDiscretizer.scala  | 22 
 .../spark/ml/feature/SQLTransformer.scala   | 27 +--
 .../spark/ml/feature/StopWordsRemover.scala | 19 +--
 .../apache/spark/ml/feature/StringIndexer.scala | 22 +---
 .../org/apache/spark/ml/feature/Tokenizer.scala | 35 +---
 .../spark/ml/feature/VectorAssembler.scala  | 18 +++---
 .../apache/spark/ml/feature/VectorSlicer.scala  | 22 
 .../spark/ml/feature/BinarizerSuite.scala   |  8 ++---
 .../spark/ml/feature/BucketizerSuite.scala  | 12 +--
 .../org/apache/spark/ml/feature/DCTSuite.scala  | 11 +-
 .../spark/ml/feature/HashingTFSuite.scala   | 11 +-
 .../spark/ml/feature/InteractionSuite.scala | 10 +-
 .../apache/spark/ml/feature/NGramSuite.scala| 11 +-
 .../spark/ml/feature/NormalizerSuite.scala  | 11 +-
 .../spark/ml/feature/OneHotEncoderSuite.scala   | 12 ++-
 .../ml/feature/PolynomialExpansionSuite.scala   | 12 ++-
 .../ml/feature/QuantileDiscretizerSuite.scala   | 13 +++-
 .../spark/ml/feature/SQLTransformerSuite.scala  | 10 +-
 .../ml/feature/StopWordsRemoverSuite.scala  | 14 +++-
 .../spark/ml/feature/StringIndexerSuite.scala   | 13 ++--
 .../spark/ml/feature/TokenizerSuite.scala   | 25 --
 .../spark/ml/feature/VectorAssemblerSuite.scala | 11 +-
 .../spark/ml/feature/VectorSlicerSuite.scala| 12 ++-
 32 files changed, 453 insertions(+), 84 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d98d1cb0/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
index e5c2557..e2be654 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Since, Experimental}
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.BinaryAttribute
 import org.apache.spark.ml.param._
@@ -87,10 +87,16 @@ final class Binarizer(override val uid: String)
 
   override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
 
+  @Since("1.6.0")
   override def write: Writer = new DefaultParamsWriter(this)
 }
 
+@Since("1.6.0")
 object Binarizer extends Readable[Binarizer] {
 
+  @Since("1.6.0")
   override def read: Reader[Binarizer] = new DefaultParamsReader[Binarizer]
+
+  @Since("1.6.0")
+  override def load(path: String): Binarizer = read.load(path)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/d98d1cb0/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feat

spark git commit: [SPARK-11769][ML] Add save, load to all basic Transformers

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 88431fb3e -> e7f901647


[SPARK-11769][ML] Add save, load to all basic Transformers

This excludes Estimators and ones which include Vector and other non-basic 
types for Params or data.  This adds:
* Bucketizer
* DCT
* HashingTF
* Interaction
* NGram
* Normalizer
* OneHotEncoder
* PolynomialExpansion
* QuantileDiscretizer
* RFormula
* SQLTransformer
* StopWordsRemover
* StringIndexer
* Tokenizer
* VectorAssembler
* VectorSlicer

CC: mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9755 from jkbradley/transformer-io.

(cherry picked from commit d98d1cb000c8c4e391d73ae86efd09f15e5d165c)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7f90164
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7f90164
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7f90164

Branch: refs/heads/branch-1.6
Commit: e7f90164740d920bbdca06e90098b8bd8b775715
Parents: 88431fb
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 17 12:43:56 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 12:44:04 2015 -0800

--
 .../org/apache/spark/ml/feature/Binarizer.scala |  8 -
 .../apache/spark/ml/feature/Bucketizer.scala| 22 
 .../scala/org/apache/spark/ml/feature/DCT.scala | 19 +--
 .../org/apache/spark/ml/feature/HashingTF.scala | 20 +--
 .../apache/spark/ml/feature/Interaction.scala   | 29 +---
 .../org/apache/spark/ml/feature/NGram.scala | 19 +--
 .../apache/spark/ml/feature/Normalizer.scala| 20 +--
 .../apache/spark/ml/feature/OneHotEncoder.scala | 19 +--
 .../spark/ml/feature/PolynomialExpansion.scala  | 20 ---
 .../spark/ml/feature/QuantileDiscretizer.scala  | 22 
 .../spark/ml/feature/SQLTransformer.scala   | 27 +--
 .../spark/ml/feature/StopWordsRemover.scala | 19 +--
 .../apache/spark/ml/feature/StringIndexer.scala | 22 +---
 .../org/apache/spark/ml/feature/Tokenizer.scala | 35 +---
 .../spark/ml/feature/VectorAssembler.scala  | 18 +++---
 .../apache/spark/ml/feature/VectorSlicer.scala  | 22 
 .../spark/ml/feature/BinarizerSuite.scala   |  8 ++---
 .../spark/ml/feature/BucketizerSuite.scala  | 12 +--
 .../org/apache/spark/ml/feature/DCTSuite.scala  | 11 +-
 .../spark/ml/feature/HashingTFSuite.scala   | 11 +-
 .../spark/ml/feature/InteractionSuite.scala | 10 +-
 .../apache/spark/ml/feature/NGramSuite.scala| 11 +-
 .../spark/ml/feature/NormalizerSuite.scala  | 11 +-
 .../spark/ml/feature/OneHotEncoderSuite.scala   | 12 ++-
 .../ml/feature/PolynomialExpansionSuite.scala   | 12 ++-
 .../ml/feature/QuantileDiscretizerSuite.scala   | 13 +++-
 .../spark/ml/feature/SQLTransformerSuite.scala  | 10 +-
 .../ml/feature/StopWordsRemoverSuite.scala  | 14 +++-
 .../spark/ml/feature/StringIndexerSuite.scala   | 13 ++--
 .../spark/ml/feature/TokenizerSuite.scala   | 25 --
 .../spark/ml/feature/VectorAssemblerSuite.scala | 11 +-
 .../spark/ml/feature/VectorSlicerSuite.scala| 12 ++-
 32 files changed, 453 insertions(+), 84 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e7f90164/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
index e5c2557..e2be654 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Since, Experimental}
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.BinaryAttribute
 import org.apache.spark.ml.param._
@@ -87,10 +87,16 @@ final class Binarizer(override val uid: String)
 
   override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
 
+  @Since("1.6.0")
   override def write: Writer = new DefaultParamsWriter(this)
 }
 
+@Since("1.6.0")
 object Binarizer extends Readable[Binarizer] {
 
+  @Since("1.6.0")
   override def read: Reader[Binarizer] = new DefaultParamsReader[Binarizer]
+
+  @Since("1.6.0")
+  override def load(path: String): Binarizer = read.load(path)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9016

spark git commit: [SPARK-11728] Replace example code in ml-ensembles.md using include_example

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 dbb2ea06d -> 47e58322c


[SPARK-11728] Replace example code in ml-ensembles.md using include_example

JIRA issue https://issues.apache.org/jira/browse/SPARK-11728.

The ml-ensembles.md file contains `OneVsRestExample`. Instead of writing new 
code files for the two `OneVsRestExample`s, I use two existing files in the 
examples directory: `OneVsRestExample.scala` and `JavaOneVsRestExample.scala`.

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9716 from yinxusen/SPARK-11728.

(cherry picked from commit 9154f89befb7a33d4853cea95efd7dc6b25d033b)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47e58322
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47e58322
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47e58322

Branch: refs/heads/branch-1.6
Commit: 47e58322cac2f319dec07a97f4eaf38f9fce2407
Parents: dbb2ea0
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Tue Nov 17 23:44:06 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 23:44:14 2015 -0800

--
 docs/ml-ensembles.md| 754 +--
 ...avaGradientBoostedTreeClassifierExample.java | 102 +++
 ...JavaGradientBoostedTreeRegressorExample.java |  90 +++
 .../spark/examples/ml/JavaOneVsRestExample.java |   4 +
 .../ml/JavaRandomForestClassifierExample.java   | 101 +++
 .../ml/JavaRandomForestRegressorExample.java|  90 +++
 .../gradient_boosted_tree_classifier_example.py |  77 ++
 .../gradient_boosted_tree_regressor_example.py  |  74 ++
 .../ml/random_forest_classifier_example.py  |  77 ++
 .../ml/random_forest_regressor_example.py   |  74 ++
 .../GradientBoostedTreeClassifierExample.scala  |  97 +++
 .../GradientBoostedTreeRegressorExample.scala   |  85 +++
 .../spark/examples/ml/OneVsRestExample.scala|   4 +
 .../ml/RandomForestClassifierExample.scala  |  97 +++
 .../ml/RandomForestRegressorExample.scala   |  84 +++
 15 files changed, 1070 insertions(+), 740 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/47e58322/docs/ml-ensembles.md
--
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index ce15f5e..f6c3c30 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -115,194 +115,21 @@ We use two feature transformers to prepare the data; 
these help index categories
 
 Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier)
 for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.RandomForestClassifier
-import org.apache.spark.ml.classification.RandomForestClassificationModel
-import org.apache.spark.ml.feature.{StringIndexer, IndexToString, 
VectorIndexer}
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-val labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data)
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as 
continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a RandomForest model.
-val rf = new RandomForestClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures")
-  .setNumTrees(10)
-
-// Convert indexed labels back to original labels.
-val labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels)
-
-// Chain indexers and forest in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
-
-// Train model.  This also runs the indexers.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val eva

spark git commit: [SPARK-11728] Replace example code in ml-ensembles.md using include_example

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 2f191c66b -> 9154f89be


[SPARK-11728] Replace example code in ml-ensembles.md using include_example

JIRA issue https://issues.apache.org/jira/browse/SPARK-11728.

The ml-ensembles.md file contains `OneVsRestExample`. Instead of writing new 
code files for the two `OneVsRestExample`s, I reuse two existing files in the examples 
directory: `OneVsRestExample.scala` and `JavaOneVsRestExample.scala`.

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9716 from yinxusen/SPARK-11728.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9154f89b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9154f89b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9154f89b

Branch: refs/heads/master
Commit: 9154f89befb7a33d4853cea95efd7dc6b25d033b
Parents: 2f191c6
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Tue Nov 17 23:44:06 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 23:44:06 2015 -0800

--
 docs/ml-ensembles.md| 754 +--
 ...avaGradientBoostedTreeClassifierExample.java | 102 +++
 ...JavaGradientBoostedTreeRegressorExample.java |  90 +++
 .../spark/examples/ml/JavaOneVsRestExample.java |   4 +
 .../ml/JavaRandomForestClassifierExample.java   | 101 +++
 .../ml/JavaRandomForestRegressorExample.java|  90 +++
 .../gradient_boosted_tree_classifier_example.py |  77 ++
 .../gradient_boosted_tree_regressor_example.py  |  74 ++
 .../ml/random_forest_classifier_example.py  |  77 ++
 .../ml/random_forest_regressor_example.py   |  74 ++
 .../GradientBoostedTreeClassifierExample.scala  |  97 +++
 .../GradientBoostedTreeRegressorExample.scala   |  85 +++
 .../spark/examples/ml/OneVsRestExample.scala|   4 +
 .../ml/RandomForestClassifierExample.scala  |  97 +++
 .../ml/RandomForestRegressorExample.scala   |  84 +++
 15 files changed, 1070 insertions(+), 740 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9154f89b/docs/ml-ensembles.md
--
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index ce15f5e..f6c3c30 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -115,194 +115,21 @@ We use two feature transformers to prepare the data; 
these help index categories
 
 Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier)
 for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.RandomForestClassifier
-import org.apache.spark.ml.classification.RandomForestClassificationModel
-import org.apache.spark.ml.feature.{StringIndexer, IndexToString, 
VectorIndexer}
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-val labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data)
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as 
continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a RandomForest model.
-val rf = new RandomForestClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures")
-  .setNumTrees(10)
-
-// Convert indexed labels back to original labels.
-val labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels)
-
-// Chain indexers and forest in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
-
-// Train model.  This also runs the indexers.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val evaluator = new MulticlassClassificationEvaluator()
-  .setLabelCol("indexedLabel")
-  .setPredictionCol("prediction&qu

spark git commit: [SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 6eb7008b7 -> 3e9e63802


[SPARK-11764][ML] make Param.jsonEncode/jsonDecode support Vector

This PR makes the default read/write work with simple transformers/estimators 
that have params of type `Param[Vector]`. jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #9776 from mengxr/SPARK-11764.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e9e6380
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e9e6380
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e9e6380

Branch: refs/heads/master
Commit: 3e9e6380236985ec5b51b459f8c61f964a76ff8b
Parents: 6eb7008
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Nov 17 14:04:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 14:04:49 2015 -0800

--
 .../org/apache/spark/ml/param/params.scala  | 12 +--
 .../org/apache/spark/ml/param/ParamsSuite.scala | 22 
 2 files changed, 28 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3e9e6380/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala 
b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index c932570..d182b0a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -29,6 +29,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 /**
  * :: DeveloperApi ::
@@ -88,9 +89,11 @@ class Param[T](val parent: String, val name: String, val 
doc: String, val isVali
 value match {
   case x: String =>
 compact(render(JString(x)))
+  case v: Vector =>
+v.toJson
   case _ =>
 throw new NotImplementedError(
-  "The default jsonEncode only supports string. " +
+  "The default jsonEncode only supports string and vector. " +
 s"${this.getClass.getName} must override jsonEncode for 
${value.getClass.getName}.")
 }
   }
@@ -100,9 +103,14 @@ class Param[T](val parent: String, val name: String, val 
doc: String, val isVali
 parse(json) match {
   case JString(x) =>
 x.asInstanceOf[T]
+  case JObject(v) =>
+val keys = v.map(_._1)
+assert(keys.contains("type") && keys.contains("values"),
+  s"Expect a JSON serialized vector but cannot find fields 'type' and 
'values' in $json.")
+Vectors.fromJson(json).asInstanceOf[T]
   case _ =>
 throw new NotImplementedError(
-  "The default jsonDecode only supports string. " +
+  "The default jsonDecode only supports string and vector. " +
 s"${this.getClass.getName} must override jsonDecode to support its 
value type.")
 }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/3e9e6380/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index eeb03db..a1878be 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.ml.param
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 
 class ParamsSuite extends SparkFunSuite {
 
@@ -80,7 +81,7 @@ class ParamsSuite extends SparkFunSuite {
   }
 }
 
-{ // StringParam
+{ // Param[String]
   val param = new Param[String](dummy, "name", "doc")
   // Currently we do not support null.
   for (value <- Seq("", "1", "abc", "quote\"", "newline\n")) {
@@ -89,6 +90,19 @@ class ParamsSuite extends SparkFunSuite {
   }
 }
 
+{ // Param[Vector]
+  val param = new Param[Vector](dummy, "name", "doc")
+  val values = Seq(
+Vectors.dense(Array.empty[Double]),
+Vectors.dense(0.0, 2.0),
+Vectors.sparse(0, Array.empty, Array.empty),
+Vectors.sparse(2, Array(1), Array(2.0)))
+  for (value <- values) {
+val json = param.jsonEncode(value)
+assert(param.jsonDecode(json) === value)
+

spark git commit: [SPARK-11763][ML] Add save, load to LogisticRegression Estimator

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 328eb49e6 -> 6eb7008b7


[SPARK-11763][ML] Add save,load to LogisticRegression Estimator

Add save/load to LogisticRegression Estimator, and refactor tests a little to 
make it easier to add similar support to other Estimator, Model pairs.

Moved LogisticRegressionReader/Writer to within LogisticRegressionModel

CC: mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9749 from jkbradley/lr-io-2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6eb7008b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6eb7008b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6eb7008b

Branch: refs/heads/master
Commit: 6eb7008b7f33a36b06d0615b68cc21ed90ad1d8a
Parents: 328eb49
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 17 14:03:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 14:03:49 2015 -0800

--
 .../ml/classification/LogisticRegression.scala  | 91 +++-
 .../org/apache/spark/ml/util/ReadWrite.scala|  1 +
 .../org/apache/spark/ml/PipelineSuite.scala |  7 --
 .../ml/classification/ClassifierSuite.scala | 32 +++
 .../LogisticRegressionSuite.scala   | 37 ++--
 .../ProbabilisticClassifierSuite.scala  | 14 +++
 .../spark/ml/util/DefaultReadWriteTest.scala| 50 ++-
 7 files changed, 173 insertions(+), 59 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6eb7008b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index a88f526..71c2533 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -157,7 +157,7 @@ private[classification] trait LogisticRegressionParams 
extends ProbabilisticClas
 @Experimental
 class LogisticRegression(override val uid: String)
   extends ProbabilisticClassifier[Vector, LogisticRegression, 
LogisticRegressionModel]
-  with LogisticRegressionParams with Logging {
+  with LogisticRegressionParams with Writable with Logging {
 
   def this() = this(Identifiable.randomUID("logreg"))
 
@@ -385,6 +385,12 @@ class LogisticRegression(override val uid: String)
   }
 
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
+
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+object LogisticRegression extends Readable[LogisticRegression] {
+  override def read: Reader[LogisticRegression] = new 
DefaultParamsReader[LogisticRegression]
 }
 
 /**
@@ -517,61 +523,62 @@ class LogisticRegressionModel private[ml] (
*
* For [[LogisticRegressionModel]], this does NOT currently save the 
training [[summary]].
* An option to save [[summary]] may be added in the future.
+   *
+   * This also does not save the [[parent]] currently.
*/
-  override def write: Writer = new LogisticRegressionWriter(this)
-}
-
-
-/** [[Writer]] instance for [[LogisticRegressionModel]] */
-private[classification] class LogisticRegressionWriter(instance: 
LogisticRegressionModel)
-  extends Writer with Logging {
-
-  private case class Data(
-  numClasses: Int,
-  numFeatures: Int,
-  intercept: Double,
-  coefficients: Vector)
-
-  override protected def saveImpl(path: String): Unit = {
-// Save metadata and Params
-DefaultParamsWriter.saveMetadata(instance, path, sc)
-// Save model data: numClasses, numFeatures, intercept, coefficients
-val data = Data(instance.numClasses, instance.numFeatures, 
instance.intercept,
-  instance.coefficients)
-val dataPath = new Path(path, "data").toString
-
sqlContext.createDataFrame(Seq(data)).write.format("parquet").save(dataPath)
-  }
+  override def write: Writer = new 
LogisticRegressionModel.LogisticRegressionModelWriter(this)
 }
 
 
 object LogisticRegressionModel extends Readable[LogisticRegressionModel] {
 
-  override def read: Reader[LogisticRegressionModel] = new 
LogisticRegressionReader
+  override def read: Reader[LogisticRegressionModel] = new 
LogisticRegressionModelReader
 
   override def load(path: String): LogisticRegressionModel = read.load(path)
-}
 
+  /** [[Writer]] instance for [[LogisticRegressionModel]] */
+  private[classification] class LogisticRegressionModelWriter(instance: 
LogisticRegressionModel)
+extends Writer with Logging {
+
+private case class Data(
+numClasses: Int,
+

spark git commit: [SPARK-11763][ML] Add save, load to LogisticRegression Estimator

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 c0ada5b93 -> a529427a1


[SPARK-11763][ML] Add save,load to LogisticRegression Estimator

Add save/load to LogisticRegression Estimator, and refactor tests a little to 
make it easier to add similar support to other Estimator, Model pairs.

Moved LogisticRegressionReader/Writer to within LogisticRegressionModel

CC: mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #9749 from jkbradley/lr-io-2.

(cherry picked from commit 6eb7008b7f33a36b06d0615b68cc21ed90ad1d8a)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a529427a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a529427a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a529427a

Branch: refs/heads/branch-1.6
Commit: a529427a1d819ceb68fb2d7ab2c61f12bdaf0273
Parents: c0ada5b
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Nov 17 14:03:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 14:03:56 2015 -0800

--
 .../ml/classification/LogisticRegression.scala  | 91 +++-
 .../org/apache/spark/ml/util/ReadWrite.scala|  1 +
 .../org/apache/spark/ml/PipelineSuite.scala |  7 --
 .../ml/classification/ClassifierSuite.scala | 32 +++
 .../LogisticRegressionSuite.scala   | 37 ++--
 .../ProbabilisticClassifierSuite.scala  | 14 +++
 .../spark/ml/util/DefaultReadWriteTest.scala| 50 ++-
 7 files changed, 173 insertions(+), 59 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a529427a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index a88f526..71c2533 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -157,7 +157,7 @@ private[classification] trait LogisticRegressionParams 
extends ProbabilisticClas
 @Experimental
 class LogisticRegression(override val uid: String)
   extends ProbabilisticClassifier[Vector, LogisticRegression, 
LogisticRegressionModel]
-  with LogisticRegressionParams with Logging {
+  with LogisticRegressionParams with Writable with Logging {
 
   def this() = this(Identifiable.randomUID("logreg"))
 
@@ -385,6 +385,12 @@ class LogisticRegression(override val uid: String)
   }
 
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
+
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+object LogisticRegression extends Readable[LogisticRegression] {
+  override def read: Reader[LogisticRegression] = new 
DefaultParamsReader[LogisticRegression]
 }
 
 /**
@@ -517,61 +523,62 @@ class LogisticRegressionModel private[ml] (
*
* For [[LogisticRegressionModel]], this does NOT currently save the 
training [[summary]].
* An option to save [[summary]] may be added in the future.
+   *
+   * This also does not save the [[parent]] currently.
*/
-  override def write: Writer = new LogisticRegressionWriter(this)
-}
-
-
-/** [[Writer]] instance for [[LogisticRegressionModel]] */
-private[classification] class LogisticRegressionWriter(instance: 
LogisticRegressionModel)
-  extends Writer with Logging {
-
-  private case class Data(
-  numClasses: Int,
-  numFeatures: Int,
-  intercept: Double,
-  coefficients: Vector)
-
-  override protected def saveImpl(path: String): Unit = {
-// Save metadata and Params
-DefaultParamsWriter.saveMetadata(instance, path, sc)
-// Save model data: numClasses, numFeatures, intercept, coefficients
-val data = Data(instance.numClasses, instance.numFeatures, 
instance.intercept,
-  instance.coefficients)
-val dataPath = new Path(path, "data").toString
-
sqlContext.createDataFrame(Seq(data)).write.format("parquet").save(dataPath)
-  }
+  override def write: Writer = new 
LogisticRegressionModel.LogisticRegressionModelWriter(this)
 }
 
 
 object LogisticRegressionModel extends Readable[LogisticRegressionModel] {
 
-  override def read: Reader[LogisticRegressionModel] = new 
LogisticRegressionReader
+  override def read: Reader[LogisticRegressionModel] = new 
LogisticRegressionModelReader
 
   override def load(path: String): LogisticRegressionModel = read.load(path)
-}
 
+  /** [[Writer]] instance for [[LogisticRegressionModel]] */
+  private[classification] class LogisticRegressionModelWrite

spark git commit: [SPARK-11729] Replace example code in ml-linear-methods.md using include_example

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 b468f8171 -> c0ada5b93


[SPARK-11729] Replace example code in ml-linear-methods.md using include_example

JIRA link: https://issues.apache.org/jira/browse/SPARK-11729

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9713 from yinxusen/SPARK-11729.

(cherry picked from commit 328eb49e671337e09188853b29c8f32fb157)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0ada5b9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0ada5b9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0ada5b9

Branch: refs/heads/branch-1.6
Commit: c0ada5b932c180c0ba72d30463bd4a8a622c8c65
Parents: b468f81
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Tue Nov 17 13:59:59 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 14:00:08 2015 -0800

--
 docs/ml-linear-methods.md   | 218 +--
 ...vaLinearRegressionWithElasticNetExample.java |  65 ++
 .../JavaLogisticRegressionSummaryExample.java   |  84 +++
 ...LogisticRegressionWithElasticNetExample.java |  55 +
 .../ml/linear_regression_with_elastic_net.py|  44 
 .../ml/logistic_regression_with_elastic_net.py  |  44 
 .../LinearRegressionWithElasticNetExample.scala |  61 ++
 .../ml/LogisticRegressionSummaryExample.scala   |  77 +++
 ...ogisticRegressionWithElasticNetExample.scala |  53 +
 9 files changed, 491 insertions(+), 210 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c0ada5b9/docs/ml-linear-methods.md
--
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 85edfd3..0c13d7d 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -57,77 +57,15 @@ $\alpha$ and `regParam` corresponds to $\lambda$.
 
 
 
-{% highlight scala %}
-import org.apache.spark.ml.classification.LogisticRegression
-
-// Load training data
-val training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-val lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.3)
-  .setElasticNetParam(0.8)
-
-// Fit the model
-val lrModel = lr.fit(training)
-
-// Print the coefficients and intercept for logistic regression
-println(s"Coefficients: ${lrModel.coefficients} Intercept: 
${lrModel.intercept}")
-{% endhighlight %}
+{% include_example 
scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
 %}
 
 
 
-{% highlight java %}
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
-
-public class LogisticRegressionWithElasticNetExample {
-  public static void main(String[] args) {
-SparkConf conf = new SparkConf()
-  .setAppName("Logistic Regression with Elastic Net Example");
-
-SparkContext sc = new SparkContext(conf);
-SQLContext sql = new SQLContext(sc);
-String path = "data/mllib/sample_libsvm_data.txt";
-
-// Load training data
-DataFrame training = sqlContext.read().format("libsvm").load(path);
-
-LogisticRegression lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.3)
-  .setElasticNetParam(0.8);
-
-// Fit the model
-LogisticRegressionModel lrModel = lr.fit(training);
-
-// Print the coefficients and intercept for logistic regression
-System.out.println("Coefficients: " + lrModel.coefficients() + " 
Intercept: " + lrModel.intercept());
-  }
-}
-{% endhighlight %}
+{% include_example 
java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
 %}
 
 
 
-{% highlight python %}
-from pyspark.ml.classification import LogisticRegression
-
-# Load training data
-training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
-
-# Fit the model
-lrModel = lr.fit(training)
-
-# Print the coefficients and intercept for logistic regression
-print("Coefficients: " + str(lrModel.coefficients))
-print("Intercept: " + str(lrModel.intercept))
-{% endhighlight %}
+{% include_example python/ml/logistic_regression_with_elastic_net.py %}
 
 
 
@@ -152,33 +90,7 @@ This will likely change when multiclass classification is 
supported.
 
 Continuing the earlier example:
 
-{% highlight scala %}
-import org.apache.spark.ml.classification.BinaryLogistic

spark git commit: [SPARK-11729] Replace example code in ml-linear-methods.md using include_example

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master fa603e08d -> 328eb49e6


[SPARK-11729] Replace example code in ml-linear-methods.md using include_example

JIRA link: https://issues.apache.org/jira/browse/SPARK-11729

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9713 from yinxusen/SPARK-11729.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/328eb49e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/328eb49e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/328eb49e

Branch: refs/heads/master
Commit: 328eb49e671337e09188853b29c8f32fb157
Parents: fa603e0
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Tue Nov 17 13:59:59 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 13:59:59 2015 -0800

--
 docs/ml-linear-methods.md   | 218 +--
 ...vaLinearRegressionWithElasticNetExample.java |  65 ++
 .../JavaLogisticRegressionSummaryExample.java   |  84 +++
 ...LogisticRegressionWithElasticNetExample.java |  55 +
 .../ml/linear_regression_with_elastic_net.py|  44 
 .../ml/logistic_regression_with_elastic_net.py  |  44 
 .../LinearRegressionWithElasticNetExample.scala |  61 ++
 .../ml/LogisticRegressionSummaryExample.scala   |  77 +++
 ...ogisticRegressionWithElasticNetExample.scala |  53 +
 9 files changed, 491 insertions(+), 210 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/328eb49e/docs/ml-linear-methods.md
--
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 85edfd3..0c13d7d 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -57,77 +57,15 @@ $\alpha$ and `regParam` corresponds to $\lambda$.
 
 
 
-{% highlight scala %}
-import org.apache.spark.ml.classification.LogisticRegression
-
-// Load training data
-val training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-val lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.3)
-  .setElasticNetParam(0.8)
-
-// Fit the model
-val lrModel = lr.fit(training)
-
-// Print the coefficients and intercept for logistic regression
-println(s"Coefficients: ${lrModel.coefficients} Intercept: 
${lrModel.intercept}")
-{% endhighlight %}
+{% include_example 
scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
 %}
 
 
 
-{% highlight java %}
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
-
-public class LogisticRegressionWithElasticNetExample {
-  public static void main(String[] args) {
-SparkConf conf = new SparkConf()
-  .setAppName("Logistic Regression with Elastic Net Example");
-
-SparkContext sc = new SparkContext(conf);
-SQLContext sql = new SQLContext(sc);
-String path = "data/mllib/sample_libsvm_data.txt";
-
-// Load training data
-DataFrame training = sqlContext.read().format("libsvm").load(path);
-
-LogisticRegression lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.3)
-  .setElasticNetParam(0.8);
-
-// Fit the model
-LogisticRegressionModel lrModel = lr.fit(training);
-
-// Print the coefficients and intercept for logistic regression
-System.out.println("Coefficients: " + lrModel.coefficients() + " 
Intercept: " + lrModel.intercept());
-  }
-}
-{% endhighlight %}
+{% include_example 
java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
 %}
 
 
 
-{% highlight python %}
-from pyspark.ml.classification import LogisticRegression
-
-# Load training data
-training = 
sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
-
-# Fit the model
-lrModel = lr.fit(training)
-
-# Print the coefficients and intercept for logistic regression
-print("Coefficients: " + str(lrModel.coefficients))
-print("Intercept: " + str(lrModel.intercept))
-{% endhighlight %}
+{% include_example python/ml/logistic_regression_with_elastic_net.py %}
 
 
 
@@ -152,33 +90,7 @@ This will likely change when multiclass classification is 
supported.
 
 Continuing the earlier example:
 
-{% highlight scala %}
-import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
-
-// Extract the summary from the returned LogisticRegressionModel instance 
trained in the earlier exam

spark git commit: [SPARK-7013][ML][TEST] Add unit test for spark.ml StandardScaler

2015-11-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 737f07172 -> 3f63f08f9


[SPARK-7013][ML][TEST] Add unit test for spark.ml StandardScaler

I have added a unit test for ML's StandardScaler by comparing with R's output. 
Please review it for me.
Thanks.

Author: RoyGaoVLIS <roy...@zju.edu.cn>

Closes #6665 from RoyGao/7013.

(cherry picked from commit 67a5132c21bc8338adbae80b33b85e8fa0ddda34)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f63f08f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f63f08f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f63f08f

Branch: refs/heads/branch-1.6
Commit: 3f63f08f9db6073ef9b6318ba20ebfbd1bbd263a
Parents: 737f071
Author: RoyGaoVLIS <roy...@zju.edu.cn>
Authored: Tue Nov 17 23:00:49 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 17 23:01:03 2015 -0800

--
 .../spark/ml/feature/StandardScalerSuite.scala  | 108 +++
 1 file changed, 108 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3f63f08f/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
new file mode 100644
index 000..879a3ae
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, 
Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext{
+
+  @transient var data: Array[Vector] = _
+  @transient var resWithStd: Array[Vector] = _
+  @transient var resWithMean: Array[Vector] = _
+  @transient var resWithBoth: Array[Vector] = _
+
+  override def beforeAll(): Unit = {
+super.beforeAll()
+
+data = Array(
+  Vectors.dense(-2.0, 2.3, 0.0),
+  Vectors.dense(0.0, -5.1, 1.0),
+  Vectors.dense(1.7, -0.6, 3.3)
+)
+resWithMean = Array(
+  Vectors.dense(-1.9, 3.4333, -1.4333),
+  Vectors.dense(0.1, -3.9667, -0.4333),
+  Vectors.dense(1.8, 0.5333, 1.8667)
+)
+resWithStd = Array(
+  Vectors.dense(-1.079898494312, 0.616834091415, 0.0),
+  Vectors.dense(0.0, -1.367762550529, 0.590968109266),
+  Vectors.dense(0.917913720165, -0.160913241239, 1.950194760579)
+)
+resWithBoth = Array(
+  Vectors.dense(-1.0259035695965, 0.920781324866, -0.8470542899497),
+  Vectors.dense(0.0539949247156, -1.063815317078, -0.256086180682),
+  Vectors.dense(0.9719086448809, 0.143033992212, 1.103140470631)
+)
+  }
+
+  def assertResult(dataframe: DataFrame): Unit = {
+dataframe.select("standarded_features", "expected").collect().foreach {
+  case Row(vector1: Vector, vector2: Vector) =>
+assert(vector1 ~== vector2 absTol 1E-5,
+  "The vector value is not correct after standardization.")
+}
+  }
+
+  test("Standardization with default parameter") {
+val df0 = 
sqlContext.createDataFrame(data.zip(resWithStd)).toDF("features", "expected")
+
+val standardscaler0 = new StandardScaler()
+  .setInputCol("features")
+  .setOutputCol("standarded_features")
+  .fit(df0)
+
+assertResult(standardscaler0.transform(df0))
+  }
+
+  test("Standardization with setter") {
+val df1 = 
sqlContext.createDataFrame(data.zip(resWithBot

< 1 2 3 4 5 6 7 8 9 10 >

301 - 400 of 1469 matches

Mail list logo