from:"meng"

[spark] branch master updated: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF

2020-05-12 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new e248bc7  [SPARK-31610][SPARK-31668][ML] Address hashingTF 
saving bug and expose hashFunc property in HashingTF
e248bc7 is described below

commit e248bc7af6086cde7dd89a51459ae6a221a600c8
Author: Weichen Xu 
AuthorDate: Tue May 12 08:54:28 2020 -0700

[SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and 
expose hashFunc property in HashingTF

### What changes were proposed in this pull request?
Expose hashFunc property in HashingTF

Some third-party libraries, such as MLeap, need to access it.
See background description here:
https://github.com/combust/mleap/pull/665#issuecomment-621258623

### Why are the changes needed?
See https://github.com/combust/mleap/pull/665#issuecomment-621258623

### Does this PR introduce any user-facing change?
No. This only adds a package-private constructor.

### How was this patch tested?
N/A

Closes #28413 from WeichenXu123/hashing_tf_expose_hashfunc.

Authored-by: Weichen Xu 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/ml/feature/HashingTF.scala| 40 +-
 .../apache/spark/ml/feature/HashingTFSuite.scala   |  4 +++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 80bf859..d2bb013 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -42,14 +42,17 @@ import org.apache.spark.util.VersionUtils.majorMinorVersion
  * otherwise the features will not be mapped evenly to the columns.
  */
 @Since("1.2.0")
-class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
+class HashingTF @Since("3.0.0") private[ml] (
+@Since("1.4.0") override val uid: String,
+@Since("3.1.0") val hashFuncVersion: Int)
   extends Transformer with HasInputCol with HasOutputCol with HasNumFeatures
 with DefaultParamsWritable {
 
-  private var hashFunc: Any => Int = FeatureHasher.murmur3Hash
-
   @Since("1.2.0")
-  def this() = this(Identifiable.randomUID("hashingTF"))
+  def this() = this(Identifiable.randomUID("hashingTF"), 
HashingTF.SPARK_3_MURMUR3_HASH)
+
+  @Since("1.4.0")
+  def this(uid: String) = this(uid, hashFuncVersion = 
HashingTF.SPARK_3_MURMUR3_HASH)
 
   /** @group setParam */
   @Since("1.4.0")
@@ -122,7 +125,12 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override 
val uid: String)
*/
   @Since("3.0.0")
   def indexOf(term: Any): Int = {
-Utils.nonNegativeMod(hashFunc(term), $(numFeatures))
+val hashValue = hashFuncVersion match {
+  case HashingTF.SPARK_2_MURMUR3_HASH => OldHashingTF.murmur3Hash(term)
+  case HashingTF.SPARK_3_MURMUR3_HASH => FeatureHasher.murmur3Hash(term)
+  case _ => throw new IllegalArgumentException("Illegal hash function 
version setting.")
+}
+Utils.nonNegativeMod(hashValue, $(numFeatures))
   }
 
   @Since("1.4.1")
@@ -132,27 +140,41 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override 
val uid: String)
   override def toString: String = {
 s"HashingTF: uid=$uid, binary=${$(binary)}, numFeatures=${$(numFeatures)}"
   }
+
+  @Since("3.0.0")
+  override def save(path: String): Unit = {
+require(hashFuncVersion == HashingTF.SPARK_3_MURMUR3_HASH,
+  "Cannot save model which is loaded from lower version spark saved model. 
We can address " +
+  "it by (1) use old spark version to save the model, or (2) use new 
version spark to " +
+  "re-train the pipeline.")
+super.save(path)
+  }
 }
 
 @Since("1.6.0")
 object HashingTF extends DefaultParamsReadable[HashingTF] {
 
+  private[ml] val SPARK_2_MURMUR3_HASH = 1
+  private[ml] val SPARK_3_MURMUR3_HASH = 2
+
   private class HashingTFReader extends MLReader[HashingTF] {
 
 private val className = classOf[HashingTF].getName
 
 override def load(path: String): HashingTF = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
-  val hashingTF = new HashingTF(metadata.uid)
-  metadata.getAndSetParams(hashingTF)
 
   // We support loading old `HashingTF` saved by previous Spark versions.
   // Previous `HashingTF` uses `mllib.feature.HashingTF.murmur3Hash`, but 
new `HashingTF` uses
   // `ml.Feature.FeatureHasher.murmur3Hash`.
   val (majorVersion, _) = majorMinorVersion(metadata.sparkVersion)
-  if (majorVersi

[spark] branch branch-3.0 updated: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF

2020-05-12 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new b50d53b  [SPARK-31610][SPARK-31668][ML] Address hashingTF 
saving bug and expose hashFunc property in HashingTF
b50d53b is described below

commit b50d53b1079ea32c75f9192f27b2b07cdec69641
Author: Weichen Xu 
AuthorDate: Tue May 12 08:54:28 2020 -0700

[SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and 
expose hashFunc property in HashingTF

### What changes were proposed in this pull request?
Expose hashFunc property in HashingTF

Some third-party libraries, such as MLeap, need to access it.
See background description here:
https://github.com/combust/mleap/pull/665#issuecomment-621258623

### Why are the changes needed?
See https://github.com/combust/mleap/pull/665#issuecomment-621258623

### Does this PR introduce any user-facing change?
No. This only adds a package-private constructor.

### How was this patch tested?
N/A

Closes #28413 from WeichenXu123/hashing_tf_expose_hashfunc.

Authored-by: Weichen Xu 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit e248bc7af6086cde7dd89a51459ae6a221a600c8)
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/ml/feature/HashingTF.scala| 40 +-
 .../apache/spark/ml/feature/HashingTFSuite.scala   |  4 +++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 80bf859..d2bb013 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -42,14 +42,17 @@ import org.apache.spark.util.VersionUtils.majorMinorVersion
  * otherwise the features will not be mapped evenly to the columns.
  */
 @Since("1.2.0")
-class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
+class HashingTF @Since("3.0.0") private[ml] (
+@Since("1.4.0") override val uid: String,
+@Since("3.1.0") val hashFuncVersion: Int)
   extends Transformer with HasInputCol with HasOutputCol with HasNumFeatures
 with DefaultParamsWritable {
 
-  private var hashFunc: Any => Int = FeatureHasher.murmur3Hash
-
   @Since("1.2.0")
-  def this() = this(Identifiable.randomUID("hashingTF"))
+  def this() = this(Identifiable.randomUID("hashingTF"), 
HashingTF.SPARK_3_MURMUR3_HASH)
+
+  @Since("1.4.0")
+  def this(uid: String) = this(uid, hashFuncVersion = 
HashingTF.SPARK_3_MURMUR3_HASH)
 
   /** @group setParam */
   @Since("1.4.0")
@@ -122,7 +125,12 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override 
val uid: String)
*/
   @Since("3.0.0")
   def indexOf(term: Any): Int = {
-Utils.nonNegativeMod(hashFunc(term), $(numFeatures))
+val hashValue = hashFuncVersion match {
+  case HashingTF.SPARK_2_MURMUR3_HASH => OldHashingTF.murmur3Hash(term)
+  case HashingTF.SPARK_3_MURMUR3_HASH => FeatureHasher.murmur3Hash(term)
+  case _ => throw new IllegalArgumentException("Illegal hash function 
version setting.")
+}
+Utils.nonNegativeMod(hashValue, $(numFeatures))
   }
 
   @Since("1.4.1")
@@ -132,27 +140,41 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override 
val uid: String)
   override def toString: String = {
 s"HashingTF: uid=$uid, binary=${$(binary)}, numFeatures=${$(numFeatures)}"
   }
+
+  @Since("3.0.0")
+  override def save(path: String): Unit = {
+require(hashFuncVersion == HashingTF.SPARK_3_MURMUR3_HASH,
+  "Cannot save model which is loaded from lower version spark saved model. 
We can address " +
+  "it by (1) use old spark version to save the model, or (2) use new 
version spark to " +
+  "re-train the pipeline.")
+super.save(path)
+  }
 }
 
 @Since("1.6.0")
 object HashingTF extends DefaultParamsReadable[HashingTF] {
 
+  private[ml] val SPARK_2_MURMUR3_HASH = 1
+  private[ml] val SPARK_3_MURMUR3_HASH = 2
+
   private class HashingTFReader extends MLReader[HashingTF] {
 
 private val className = classOf[HashingTF].getName
 
 override def load(path: String): HashingTF = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
-  val hashingTF = new HashingTF(metadata.uid)
-  metadata.getAndSetParams(hashingTF)
 
   // We support loading old `HashingTF` saved by previous Spark versions.
   // Previous `HashingTF` uses `mllib.feature.HashingTF.murmur3Hash`, but 
new `HashingTF` uses

[spark] branch branch-3.0 updated: [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model

2020-04-26 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 4421178  [SPARK-31497][ML][PYSPARK] Fix Pyspark 
CrossValidator/TrainValidationSplit with pipeline estimator cannot save and 
load model
4421178 is described below

commit 442117812ca6edc6e0ab271da829032b9637e89e
Author: Weichen Xu 
AuthorDate: Sun Apr 26 21:04:14 2020 -0700

[SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit 
with pipeline estimator cannot save and load model

### What changes were proposed in this pull request?
Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator 
cannot save and load model.

Most pyspark estimators/transformers inherit `JavaParams`, but some 
estimators are special (in order to support pure python implemented nested 
estimators/transformers):
* Pipeline
* OneVsRest
* CrossValidator
* TrainValidationSplit

But note that, currently, the PySpark model readers/writers of the estimators 
listed above do NOT support nested estimators/transformers implemented in pure 
Python, because they use the Java reader/writer wrappers as the Python-side 
reader/writer.

Pyspark CrossValidator/TrainValidationSplit model reader/writer require all 
estimators define the `_transfer_param_map_to_java` and 
`_transfer_param_map_from_java` (used in model read/write).

The OneVsRest class already defines these two methods, but Pipeline does not, 
which leads to this bug.

In this PR I add `_transfer_param_map_to_java` and 
`_transfer_param_map_from_java` into Pipeline class.

### Why are the changes needed?
Bug fix.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Unit test.

Manually test in pyspark shell:
1) CrossValidator with Simple Pipeline estimator
```
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, 
ParamGridBuilder

training = spark.createDataFrame([
(0, "a b c d e spark", 1.0),
(1, "b d", 0.0),
(2, "spark f g h", 1.0),
(3, "hadoop mapreduce", 0.0),
(4, "b spark who", 1.0),
(5, "g d a y", 0.0),
(6, "spark fly", 1.0),
(7, "was mapreduce", 0.0),
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, 
# hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), 
outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

paramGrid = ParamGridBuilder() \
.addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
.addGrid(lr.regParam, [0.1, 0.01]) \
.build()
crossval = CrossValidator(estimator=pipeline,
  estimatorParamMaps=paramGrid,
  evaluator=BinaryClassificationEvaluator(),
  numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

cvModel.save('/tmp/cv_model001')
CrossValidatorModel.load('/tmp/cv_model001')
```

2) CrossValidator with a Pipeline estimator which includes a OneVsRest 
estimator stage, where the OneVsRest estimator nests a LogisticRegression estimator.

```
from pyspark.ml.linalg import Vectors
from pyspark.ml import Estimator, Model
from pyspark.ml.classification import LogisticRegression, 
LogisticRegressionModel, OneVsRest
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.param import Param, Params
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, 
ParamGridBuilder, \
TrainValidationSplit, TrainValidationSplitModel
from pyspark.sql.functions import rand
from pyspark.testing.mlutils import SparkSessionTestCase

dataset = spark.createDataFrame(
[(Vectors.dense([0.0]), 0.0),
 (Vectors.dense([0.4]), 1.0),
 (Vectors.dense([0.5]), 0.0),
 (Vectors.dense([0.6]), 1.0),
 (Vectors.dense([1.0]), 1.0)] * 10,
["features", "label"])

ova = OneVsRest(classifier=LogisticRegression())
lr1 = LogisticReg

[spark] branch master updated: [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model

2020-04-26 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4a21c4c  [SPARK-31497][ML][PYSPARK] Fix Pyspark 
CrossValidator/TrainValidationSplit with pipeline estimator cannot save and 
load model
4a21c4c is described below

commit 4a21c4cc92805b034ade0593eea3c4a9b6122083
Author: Weichen Xu 
AuthorDate: Sun Apr 26 21:04:14 2020 -0700

[SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit 
with pipeline estimator cannot save and load model

### What changes were proposed in this pull request?
Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator 
cannot save and load model.

Most pyspark estimators/transformers inherit `JavaParams`, but some 
estimators are special (in order to support pure python implemented nested 
estimators/transformers):
* Pipeline
* OneVsRest
* CrossValidator
* TrainValidationSplit

But note that, currently, the PySpark model readers/writers of the estimators 
listed above do NOT support nested estimators/transformers implemented in pure 
Python, because they use the Java reader/writer wrappers as the Python-side 
reader/writer.

Pyspark CrossValidator/TrainValidationSplit model reader/writer require all 
estimators define the `_transfer_param_map_to_java` and 
`_transfer_param_map_from_java` (used in model read/write).

The OneVsRest class already defines these two methods, but Pipeline does not, 
which leads to this bug.

In this PR I add `_transfer_param_map_to_java` and 
`_transfer_param_map_from_java` into Pipeline class.

### Why are the changes needed?
Bug fix.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Unit test.

Manually test in pyspark shell:
1) CrossValidator with Simple Pipeline estimator
```
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, 
ParamGridBuilder

training = spark.createDataFrame([
(0, "a b c d e spark", 1.0),
(1, "b d", 0.0),
(2, "spark f g h", 1.0),
(3, "hadoop mapreduce", 0.0),
(4, "b spark who", 1.0),
(5, "g d a y", 0.0),
(6, "spark fly", 1.0),
(7, "was mapreduce", 0.0),
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, 
# hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), 
outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

paramGrid = ParamGridBuilder() \
.addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
.addGrid(lr.regParam, [0.1, 0.01]) \
.build()
crossval = CrossValidator(estimator=pipeline,
  estimatorParamMaps=paramGrid,
  evaluator=BinaryClassificationEvaluator(),
  numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

cvModel.save('/tmp/cv_model001')
CrossValidatorModel.load('/tmp/cv_model001')
```

2) CrossValidator with a Pipeline estimator which includes a OneVsRest 
estimator stage, where the OneVsRest estimator nests a LogisticRegression estimator.

```
from pyspark.ml.linalg import Vectors
from pyspark.ml import Estimator, Model
from pyspark.ml.classification import LogisticRegression, 
LogisticRegressionModel, OneVsRest
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.param import Param, Params
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, 
ParamGridBuilder, \
TrainValidationSplit, TrainValidationSplitModel
from pyspark.sql.functions import rand
from pyspark.testing.mlutils import SparkSessionTestCase

dataset = spark.createDataFrame(
[(Vectors.dense([0.0]), 0.0),
 (Vectors.dense([0.4]), 1.0),
 (Vectors.dense([0.5]), 0.0),
 (Vectors.dense([0.6]), 1.0),
 (Vectors.dense([1.0]), 1.0)] * 10,
["features", "label"])

ova = OneVsRest(classifier=LogisticRegression())
lr1 = LogisticReg

[spark] branch branch-3.0 updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext

2020-02-19 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new f482187  [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
f482187 is described below

commit f482187c127418d2ea538ac2551ae0fce1ddbc31
Author: sarthfrey-db 
AuthorDate: Thu Feb 13 16:15:00 2020 -0800

[SPARK-30667][CORE] Add allGather method to BarrierTaskContext

### What changes were proposed in this pull request?

The `allGather` method is added to the `BarrierTaskContext`. This method 
contains the same functionality as the `BarrierTaskContext.barrier` method; it 
blocks the task until all tasks make the call, at which time they may continue 
execution. In addition, the `allGather` method takes an input message. Upon 
returning from the `allGather` the task receives a list of all the messages 
sent by all the tasks that made the `allGather` call.

### Why are the changes needed?

There are many situations where having the tasks communicate in a 
synchronized way is useful. One simple example is if each task needs to start a 
server to serve requests from one another; first the tasks must find a free 
port (the result of which is undetermined beforehand) and then start making 
requests, but to do so they each must know the port chosen by the other task. 
An `allGather` method would allow them to inform each other of the port they 
will run on.

### Does this PR introduce any user-facing change?

Yes, a `BarrierTaskContext.allGather` method will be available through the 
Scala, Java, and Python APIs.

### How was this patch tested?

Most of the code path is already covered by tests to the `barrier` method, 
since this PR includes a refactor so that much code is shared by the `barrier` 
and `allGather` methods. However, a test is added to assert that an `allGather` 
on each task's partition ID will return a list of every partition ID.

An example through the Python API:
```python
>>> from pyspark import BarrierTaskContext
>>>
>>> def f(iterator):
... context = BarrierTaskContext.get()
... return [context.allGather('{}'.format(context.partitionId()))]
...
>>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0]
[u'3', u'1', u'0', u'2']
```

Closes #27395 from sarthfrey/master.

Lead-authored-by: sarthfrey-db 
Co-authored-by: sarthfrey 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002)
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/BarrierCoordinator.scala  | 113 +--
 .../org/apache/spark/BarrierTaskContext.scala  | 153 ++---
 .../org/apache/spark/api/python/PythonRunner.scala |  51 +--
 .../spark/scheduler/BarrierTaskContextSuite.scala  |  74 ++
 python/pyspark/taskcontext.py  |  49 ++-
 python/pyspark/tests/test_taskcontext.py   |  20 +++
 6 files changed, 381 insertions(+), 79 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala 
b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
index 4e41767..042a266 100644
--- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
@@ -17,12 +17,17 @@
 
 package org.apache.spark
 
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.{Timer, TimerTask}
 import java.util.concurrent.ConcurrentHashMap
 import java.util.function.Consumer
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods.{compact, render}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, 
SparkListenerStageCompleted}
@@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator(
 // reset when a barrier() call fails due to timeout.
 private var barrierEpoch: Int = 0
 
-// An array of RPCCallContexts for barrier tasks that are waiting for 
reply of a barrier()
-// call.
+// An Array of RPCCallContexts for barrier tasks that have made a blocking 
runBarrier() call
 private val requesters: ArrayBuffer[RpcCallContext] = new 
ArrayBuffer[RpcCallContext](numTasks)
 
+// An Array of allGather messages for barrier tasks that have made a 
blocking runBarrier() call
+private val allGatherMessages: ArrayBuffer[String] = new 
Array[String](numTasks).to[ArrayBuffer]
+
+// The blocking requestMethod called by tasks to sync up for this stage 
attempt
+private var requestMethodToSync: RequestMethod.Value = 
RequestM

[spark] branch master updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext

2020-02-19 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new af63971  [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
af63971 is described below

commit af63971cb7a5e7c7cb23ff1f87e5838d54c59a7d
Author: sarthfrey-db 
AuthorDate: Thu Feb 13 16:15:00 2020 -0800

[SPARK-30667][CORE] Add allGather method to BarrierTaskContext

### What changes were proposed in this pull request?

The `allGather` method is added to the `BarrierTaskContext`. This method 
contains the same functionality as the `BarrierTaskContext.barrier` method; it 
blocks the task until all tasks make the call, at which time they may continue 
execution. In addition, the `allGather` method takes an input message. Upon 
returning from the `allGather` the task receives a list of all the messages 
sent by all the tasks that made the `allGather` call.

### Why are the changes needed?

There are many situations where having the tasks communicate in a 
synchronized way is useful. One simple example is if each task needs to start a 
server to serve requests from one another; first the tasks must find a free 
port (the result of which is undetermined beforehand) and then start making 
requests, but to do so they each must know the port chosen by the other task. 
An `allGather` method would allow them to inform each other of the port they 
will run on.

### Does this PR introduce any user-facing change?

Yes, a `BarrierTaskContext.allGather` method will be available through the 
Scala, Java, and Python APIs.

### How was this patch tested?

Most of the code path is already covered by tests to the `barrier` method, 
since this PR includes a refactor so that much code is shared by the `barrier` 
and `allGather` methods. However, a test is added to assert that an `allGather` 
on each task's partition ID will return a list of every partition ID.

An example through the Python API:
```python
>>> from pyspark import BarrierTaskContext
>>>
>>> def f(iterator):
... context = BarrierTaskContext.get()
... return [context.allGather('{}'.format(context.partitionId()))]
...
>>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0]
[u'3', u'1', u'0', u'2']
```

Closes #27395 from sarthfrey/master.

Lead-authored-by: sarthfrey-db 
Co-authored-by: sarthfrey 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002)
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/BarrierCoordinator.scala  | 113 +--
 .../org/apache/spark/BarrierTaskContext.scala  | 153 ++---
 .../org/apache/spark/api/python/PythonRunner.scala |  51 +--
 .../spark/scheduler/BarrierTaskContextSuite.scala  |  74 ++
 python/pyspark/taskcontext.py  |  49 ++-
 python/pyspark/tests/test_taskcontext.py   |  20 +++
 6 files changed, 381 insertions(+), 79 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala 
b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
index 4e41767..042a266 100644
--- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
@@ -17,12 +17,17 @@
 
 package org.apache.spark
 
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.{Timer, TimerTask}
 import java.util.concurrent.ConcurrentHashMap
 import java.util.function.Consumer
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods.{compact, render}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, 
SparkListenerStageCompleted}
@@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator(
 // reset when a barrier() call fails due to timeout.
 private var barrierEpoch: Int = 0
 
-// An array of RPCCallContexts for barrier tasks that are waiting for 
reply of a barrier()
-// call.
+// An Array of RPCCallContexts for barrier tasks that have made a blocking 
runBarrier() call
 private val requesters: ArrayBuffer[RpcCallContext] = new 
ArrayBuffer[RpcCallContext](numTasks)
 
+// An Array of allGather messages for barrier tasks that have made a 
blocking runBarrier() call
+private val allGatherMessages: ArrayBuffer[String] = new 
Array[String](numTasks).to[ArrayBuffer]
+
+// The blocking requestMethod called by tasks to sync up for this stage 
attempt
+private var requestMethodToSync: RequestMethod.Value = 
RequestM

[spark] branch branch-3.0 updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext

2020-02-13 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 6001866  [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
6001866 is described below

commit 6001866cea1216da421c5acd71d6fc74228222ac
Author: sarthfrey-db 
AuthorDate: Thu Feb 13 16:15:00 2020 -0800

[SPARK-30667][CORE] Add allGather method to BarrierTaskContext

### What changes were proposed in this pull request?

The `allGather` method is added to the `BarrierTaskContext`. This method 
contains the same functionality as the `BarrierTaskContext.barrier` method; it 
blocks the task until all tasks make the call, at which time they may continue 
execution. In addition, the `allGather` method takes an input message. Upon 
returning from the `allGather` the task receives a list of all the messages 
sent by all the tasks that made the `allGather` call.

### Why are the changes needed?

There are many situations where having the tasks communicate in a 
synchronized way is useful. One simple example is if each task needs to start a 
server to serve requests from one another; first the tasks must find a free 
port (the result of which is undetermined beforehand) and then start making 
requests, but to do so they each must know the port chosen by the other task. 
An `allGather` method would allow them to inform each other of the port they 
will run on.

### Does this PR introduce any user-facing change?

Yes, a `BarrierTaskContext.allGather` method will be available through the 
Scala, Java, and Python APIs.

### How was this patch tested?

Most of the code path is already covered by tests to the `barrier` method, 
since this PR includes a refactor so that much code is shared by the `barrier` 
and `allGather` methods. However, a test is added to assert that an `allGather` 
on each task's partition ID will return a list of every partition ID.

An example through the Python API:
```python
>>> from pyspark import BarrierTaskContext
>>>
>>> def f(iterator):
... context = BarrierTaskContext.get()
... return [context.allGather('{}'.format(context.partitionId()))]
...
>>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0]
[u'3', u'1', u'0', u'2']
```

Closes #27395 from sarthfrey/master.

Lead-authored-by: sarthfrey-db 
Co-authored-by: sarthfrey 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002)
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/BarrierCoordinator.scala  | 113 +--
 .../org/apache/spark/BarrierTaskContext.scala  | 153 ++---
 .../org/apache/spark/api/python/PythonRunner.scala |  51 +--
 .../spark/scheduler/BarrierTaskContextSuite.scala  |  74 ++
 python/pyspark/taskcontext.py  |  49 ++-
 python/pyspark/tests/test_taskcontext.py   |  20 +++
 6 files changed, 381 insertions(+), 79 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala 
b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
index 4e41767..042a266 100644
--- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
@@ -17,12 +17,17 @@
 
 package org.apache.spark
 
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.{Timer, TimerTask}
 import java.util.concurrent.ConcurrentHashMap
 import java.util.function.Consumer
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods.{compact, render}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, 
SparkListenerStageCompleted}
@@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator(
 // reset when a barrier() call fails due to timeout.
 private var barrierEpoch: Int = 0
 
-// An array of RPCCallContexts for barrier tasks that are waiting for 
reply of a barrier()
-// call.
+// An Array of RPCCallContexts for barrier tasks that have made a blocking 
runBarrier() call
 private val requesters: ArrayBuffer[RpcCallContext] = new 
ArrayBuffer[RpcCallContext](numTasks)
 
+// An Array of allGather messages for barrier tasks that have made a 
blocking runBarrier() call
+private val allGatherMessages: ArrayBuffer[String] = new 
Array[String](numTasks).to[ArrayBuffer]
+
+// The blocking requestMethod called by tasks to sync up for this stage 
attempt
+private var requestMethodToSync: RequestMethod.Value = 
RequestM

[spark] branch master updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext

2020-02-13 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 57254c9  [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
57254c9 is described below

commit 57254c9719f9af9ad985596ed7fbbaafa4052002
Author: sarthfrey-db 
AuthorDate: Thu Feb 13 16:15:00 2020 -0800

[SPARK-30667][CORE] Add allGather method to BarrierTaskContext

### What changes were proposed in this pull request?

The `allGather` method is added to the `BarrierTaskContext`. This method 
contains the same functionality as the `BarrierTaskContext.barrier` method; it 
blocks the task until all tasks make the call, at which time they may continue 
execution. In addition, the `allGather` method takes an input message. Upon 
returning from the `allGather` the task receives a list of all the messages 
sent by all the tasks that made the `allGather` call.

### Why are the changes needed?

There are many situations where having the tasks communicate in a 
synchronized way is useful. One simple example is if each task needs to start a 
server to serve requests from one another; first the tasks must find a free 
port (the result of which is undetermined beforehand) and then start making 
requests, but to do so they each must know the port chosen by the other task. 
An `allGather` method would allow them to inform each other of the port they 
will run on.

### Does this PR introduce any user-facing change?

Yes, a `BarrierTaskContext.allGather` method will be available through the 
Scala, Java, and Python APIs.

### How was this patch tested?

Most of the code path is already covered by tests to the `barrier` method, 
since this PR includes a refactor so that much code is shared by the `barrier` 
and `allGather` methods. However, a test is added to assert that an `allGather` 
on each task's partition ID will return a list of every partition ID.

An example through the Python API:
```python
>>> from pyspark import BarrierTaskContext
>>>
>>> def f(iterator):
...     context = BarrierTaskContext.get()
...     return [context.allGather('{}'.format(context.partitionId()))]
...
>>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0]
[u'3', u'1', u'0', u'2']
```

Closes #27395 from sarthfrey/master.

Lead-authored-by: sarthfrey-db 
Co-authored-by: sarthfrey 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/BarrierCoordinator.scala  | 113 +--
 .../org/apache/spark/BarrierTaskContext.scala  | 153 ++---
 .../org/apache/spark/api/python/PythonRunner.scala |  51 +--
 .../spark/scheduler/BarrierTaskContextSuite.scala  |  74 ++
 python/pyspark/taskcontext.py  |  49 ++-
 python/pyspark/tests/test_taskcontext.py   |  20 +++
 6 files changed, 381 insertions(+), 79 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala 
b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
index 4e41767..042a266 100644
--- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
@@ -17,12 +17,17 @@
 
 package org.apache.spark
 
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.{Timer, TimerTask}
 import java.util.concurrent.ConcurrentHashMap
 import java.util.function.Consumer
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods.{compact, render}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, 
SparkListenerStageCompleted}
@@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator(
 // reset when a barrier() call fails due to timeout.
 private var barrierEpoch: Int = 0
 
-// An array of RPCCallContexts for barrier tasks that are waiting for 
reply of a barrier()
-// call.
+// An Array of RPCCallContexts for barrier tasks that have made a blocking 
runBarrier() call
 private val requesters: ArrayBuffer[RpcCallContext] = new 
ArrayBuffer[RpcCallContext](numTasks)
 
+// An Array of allGather messages for barrier tasks that have made a 
blocking runBarrier() call
+private val allGatherMessages: ArrayBuffer[String] = new 
Array[String](numTasks).to[ArrayBuffer]
+
+// The blocking requestMethod called by tasks to sync up for this stage 
attempt
+private var requestMethodToSync: RequestMethod.Value = 
RequestMethod.BARRIER
+
 // A timer task that ensures we may timeout for a barrier() call.
 private var timerTask:

[spark] branch master updated: [SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays

2020-01-06 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 88542bc  [SPARK-30154][ML] PySpark UDF to convert MLlib vectors to 
dense arrays
88542bc is described below

commit 88542bc3d9e506b1a0e852f3e9c632920d3fe553
Author: WeichenXu 
AuthorDate: Mon Jan 6 16:18:51 2020 -0800

[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays

### What changes were proposed in this pull request?

PySpark UDF to convert MLlib vectors to dense arrays.
Example:
```
from pyspark.ml.functions import vector_to_array
df.select(vector_to_array(col("features")))
```

### Why are the changes needed?
If a PySpark user wants to convert MLlib sparse/dense vectors in a 
DataFrame into dense arrays, an efficient approach is to do that in the JVM. 
However, that requires the PySpark user to write Scala code and register it as a UDF, 
which is often infeasible for a pure Python project.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
UT.

Closes #26910 from WeichenXu123/vector_to_array.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
---
 dev/sparktestsupport/modules.py|  1 +
 .../main/scala/org/apache/spark/ml/functions.scala | 48 +++
 .../scala/org/apache/spark/ml/FunctionsSuite.scala | 65 +
 python/docs/pyspark.ml.rst |  8 +++
 python/pyspark/ml/functions.py | 68 ++
 5 files changed, 190 insertions(+)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 1443584..4179359 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -460,6 +460,7 @@ pyspark_ml = Module(
 "pyspark.ml.evaluation",
 "pyspark.ml.feature",
 "pyspark.ml.fpm",
+"pyspark.ml.functions",
 "pyspark.ml.image",
 "pyspark.ml.linalg.__init__",
 "pyspark.ml.recommendation",
diff --git a/mllib/src/main/scala/org/apache/spark/ml/functions.scala 
b/mllib/src/main/scala/org/apache/spark/ml/functions.scala
new file mode 100644
index 000..1faf562
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/functions.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml
+
+import org.apache.spark.annotation.Since
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.mllib.linalg.{Vector => OldVector}
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions.udf
+
+// scalastyle:off
+@Since("3.0.0")
+object functions {
+// scalastyle:on
+
+  private val vectorToArrayUdf = udf { vec: Any =>
+vec match {
+  case v: Vector => v.toArray
+  case v: OldVector => v.toArray
+  case v => throw new IllegalArgumentException(
+"function vector_to_array requires a non-null input argument and input 
type must be " +
+"`org.apache.spark.ml.linalg.Vector` or 
`org.apache.spark.mllib.linalg.Vector`, " +
+s"but got ${ if (v == null) "null" else v.getClass.getName }.")
+}
+  }.asNonNullable()
+
+  /**
+   * Converts a column of MLlib sparse/dense vectors into a column of dense 
arrays.
+   *
+   * @since 3.0.0
+   */
+  def vector_to_array(v: Column): Column = vectorToArrayUdf(v)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala
new file mode 100644
index 000..2f5062c
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF lice

[spark] branch master updated: [SPARK-28978][ ] Support > 256 args to python udf

2019-11-08 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 8152a87  [SPARK-28978][ ] Support > 256 args to python udf
8152a87 is described below

commit 8152a87235a63a13969f7c1ff5ed038956e8ed76
Author: Bago Amirbekian 
AuthorDate: Fri Nov 8 19:19:14 2019 -0800

[SPARK-28978][ ] Support > 256 args to python udf

### What changes were proposed in this pull request?

On the worker we express lambda functions as strings and then eval them to 
create a "mapper" function. This makes the code hard to read and limits the number of 
arguments a UDF can support to 255 for Python <= 3.6.

This PR rewrites the mapper functions as nested functions instead of 
"lambda strings" and allows passing in more than 255 args.

### Why are the changes needed?
The jira ticket associated with this issue describes how MLflow uses udfs 
to consume columns as features. This pattern isn't unique and a limit of 255 
features is quite low.

### Does this PR introduce any user-facing change?
Users can now pass more than 255 cols to a udf function.

### How was this patch tested?
Added a unit test for passing in > 255 args to udf.

Closes #26442 from MrBago/replace-lambdas-on-worker.

Authored-by: Bago Amirbekian 
Signed-off-by: Xiangrui Meng 
---
 python/pyspark/sql/tests/test_udf.py | 13 
 python/pyspark/worker.py | 62 +---
 2 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/python/pyspark/sql/tests/test_udf.py 
b/python/pyspark/sql/tests/test_udf.py
index c274dc7..3b9f12f 100644
--- a/python/pyspark/sql/tests/test_udf.py
+++ b/python/pyspark/sql/tests/test_udf.py
@@ -629,6 +629,19 @@ class UDFTests(ReusedSQLTestCase):
 
 self.sc.parallelize(range(1), 1).mapPartitions(task).count()
 
+def test_udf_with_256_args(self):
+N = 256
+data = [["data-%d" % i for i in range(N)]] * 5
+df = self.spark.createDataFrame(data)
+
+def f(*a):
+return "success"
+
+fUdf = udf(f, StringType())
+
+r = df.select(fUdf(*df.columns))
+self.assertEqual(r.first()[0], "success")
+
 
 class UDFInitializationTests(unittest.TestCase):
 def tearDown(self):
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 3a1200e..bfa8d97 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -403,54 +403,50 @@ def read_udfs(pickleSer, infile, eval_type):
 idx += offsets_len
 return parsed
 
-udfs = {}
-call_udf = []
-mapper_str = ""
 if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
-# Create function like this:
-#   lambda a: f([a[0]], [a[0], a[1]])
-
 # We assume there is only one UDF here because grouped map doesn't
 # support combining multiple UDFs.
 assert num_udfs == 1
 
 # See FlatMapGroupsInPandasExec for how arg_offsets are used to
 # distinguish between grouping attributes and data attributes
-arg_offsets, udf = read_single_udf(
-pickleSer, infile, eval_type, runner_conf, udf_index=0)
-udfs['f'] = udf
+arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, 
runner_conf, udf_index=0)
 parsed_offsets = extract_key_value_indexes(arg_offsets)
-keys = ["a[%d]" % (o,) for o in parsed_offsets[0][0]]
-vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]]
-mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys), ", 
".join(vals))
+
+# Create function like this:
+#   mapper a: f([a[0]], [a[0], a[1]])
+def mapper(a):
+keys = [a[o] for o in parsed_offsets[0][0]]
+vals = [a[o] for o in parsed_offsets[0][1]]
+return f(keys, vals)
 elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
 # We assume there is only one UDF here because cogrouped map doesn't
 # support combining multiple UDFs.
 assert num_udfs == 1
-arg_offsets, udf = read_single_udf(
-pickleSer, infile, eval_type, runner_conf, udf_index=0)
-udfs['f'] = udf
+arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, 
runner_conf, udf_index=0)
+
 parsed_offsets = extract_key_value_indexes(arg_offsets)
-df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]]
-df1_vals = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][1]]
-df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]]
-df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]]
-mapper_s

[spark] branch master updated: [SPARK-29417][CORE] Resource Scheduling - add TaskContext.resource java api

2019-10-14 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new a42d894  [SPARK-29417][CORE] Resource Scheduling - add 
TaskContext.resource java api
a42d894 is described below

commit a42d894a4090c97a90ce23b0989163909ebf548d
Author: Thomas Graves 
AuthorDate: Mon Oct 14 13:27:34 2019 -0700

[SPARK-29417][CORE] Resource Scheduling - add TaskContext.resource java api

### What changes were proposed in this pull request?
We added a TaskContext.resources() API, but I realized that it returns a 
Scala Map, which is not ideal for access from Java.  Here I add a resourcesJMap 
function which returns a java.util.Map to make it easily accessible from Java.

### Why are the changes needed?
Java API access

### Does this PR introduce any user-facing change?

Yes, new TaskContext function to access from Java

### How was this patch tested?

new unit test

Closes #26083 from tgravescs/SPARK-29417.

Lead-authored-by: Thomas Graves 

Co-authored-by: Thomas Graves 
Co-authored-by: Thomas Graves 
Signed-off-by: Xiangrui Meng 
---
 core/src/main/scala/org/apache/spark/BarrierTaskContext.scala | 5 +
 core/src/main/scala/org/apache/spark/TaskContext.scala| 8 
 core/src/main/scala/org/apache/spark/TaskContextImpl.scala| 5 +
 .../java/test/org/apache/spark/JavaTaskContextCompileCheck.java   | 5 +
 project/MimaExcludes.scala| 3 +++
 5 files changed, 26 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index 5afd8a5..3d36980 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -19,6 +19,7 @@ package org.apache.spark
 
 import java.util.{Properties, Timer, TimerTask}
 
+import scala.collection.JavaConverters._
 import scala.concurrent.TimeoutException
 import scala.concurrent.duration._
 
@@ -211,6 +212,10 @@ class BarrierTaskContext private[spark] (
 
   override def resources(): Map[String, ResourceInformation] = 
taskContext.resources()
 
+  override def resourcesJMap(): java.util.Map[String, ResourceInformation] = {
+resources().asJava
+  }
+
   override private[spark] def killTaskIfInterrupted(): Unit = 
taskContext.killTaskIfInterrupted()
 
   override private[spark] def getKillReason(): Option[String] = 
taskContext.getKillReason()
diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala 
b/core/src/main/scala/org/apache/spark/TaskContext.scala
index 2299c54..fd41fac 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -185,6 +185,14 @@ abstract class TaskContext extends Serializable {
   @Evolving
   def resources(): Map[String, ResourceInformation]
 
+  /**
+   * (java-specific) Resources allocated to the task. The key is the resource 
name and the value
+   * is information about the resource. Please refer to
+   * [[org.apache.spark.resource.ResourceInformation]] for specifics.
+   */
+  @Evolving
+  def resourcesJMap(): java.util.Map[String, ResourceInformation]
+
   @DeveloperApi
   def taskMetrics(): TaskMetrics
 
diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala 
b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
index 516fb95..08a58a0 100644
--- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -20,6 +20,7 @@ package org.apache.spark
 import java.util.Properties
 import javax.annotation.concurrent.GuardedBy
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.executor.TaskMetrics
@@ -101,6 +102,10 @@ private[spark] class TaskContextImpl(
 this
   }
 
+  override def resourcesJMap(): java.util.Map[String, ResourceInformation] = {
+resources.asJava
+  }
+
   @GuardedBy("this")
   private[spark] override def markTaskFailed(error: Throwable): Unit = 
synchronized {
 if (failed) return
diff --git 
a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java 
b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java
index 62a0b85..5ce7937 100644
--- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java
+++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java
@@ -17,7 +17,10 @@
 
 package test.org.apache.spark;
 
+import java.util.Map;
+
 import org.apache.spark.TaskContext;
+import org.apache.spark.resource.ResourceInformation;
 import org.apache.spark.util.TaskCompletionListene

[spark] branch master updated: [SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API documentation

2019-07-05 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new fe75ff8  [SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API 
documentation
fe75ff8 is described below

commit fe75ff8bea3330a10aba1a61f3aba42e541195a8
Author: HyukjinKwon 
AuthorDate: Fri Jul 5 10:08:22 2019 -0700

[SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API documentation

## What changes were proposed in this pull request?

It seems we have generated the PySpark API documentation with Epydoc almost 
from the very beginning (see 
https://github.com/apache/spark/commit/85b8f2c64f0fc4be5645d8736629fc082cb3587b).

This fixes an actual issue:

Before:

![Screen Shot 2019-07-05 at 8 20 01 
PM](https://user-images.githubusercontent.com/6477701/60720491-e9879180-9f65-11e9-9562-100830a456cd.png)

After:

![Screen Shot 2019-07-05 at 8 20 05 
PM](https://user-images.githubusercontent.com/6477701/60720495-ec828200-9f65-11e9-8277-8f689e292cb0.png)

This appears to be a bug in the `epytext` plugin in the conversion 
between `@param` and `:param` syntax. See also [Epydoc 
syntax](http://epydoc.sourceforge.net/manual-epytext.html).

Actually, Epydoc syntax violates 
[PEP-257](https://www.python.org/dev/peps/pep-0257/) IIRC, and it also blocks us from 
enabling some rules for the doctest linter.

We should remove this legacy syntax, and I guess Spark 3 is a good time to do 
it.

## How was this patch tested?

Manually built the docs and checked each page.

I had to manually find the Epydoc syntax by `git grep -r "{L"`, for 
instance.

Closes #25060 from HyukjinKwon/SPARK-28206.

Authored-by: HyukjinKwon 
Signed-off-by: Xiangrui Meng 
---
 python/docs/conf.py  |   1 -
 python/docs/epytext.py   |  30 
 python/pyspark/accumulators.py   |  14 ++--
 python/pyspark/broadcast.py  |   6 +-
 python/pyspark/conf.py   |   8 +--
 python/pyspark/context.py|  56 +++
 python/pyspark/files.py  |   7 +-
 python/pyspark/ml/feature.py |   2 +-
 python/pyspark/ml/linalg/__init__.py |   8 +--
 python/pyspark/mllib/classification.py   |   4 +-
 python/pyspark/mllib/clustering.py   |   6 +-
 python/pyspark/mllib/linalg/__init__.py  |   8 +--
 python/pyspark/mllib/random.py   |   6 +-
 python/pyspark/mllib/stat/_statistics.py |   4 +-
 python/pyspark/mllib/util.py |   4 +-
 python/pyspark/rdd.py| 114 +++
 python/pyspark/serializers.py|  12 ++--
 python/pyspark/sql/dataframe.py  |  10 +--
 python/pyspark/sql/types.py  |   2 +-
 python/pyspark/streaming/context.py  |  42 ++--
 python/pyspark/streaming/dstream.py  |  50 +++---
 python/pyspark/taskcontext.py|   2 +-
 python/pyspark/testing/streamingutils.py |   6 +-
 23 files changed, 185 insertions(+), 217 deletions(-)

diff --git a/python/docs/conf.py b/python/docs/conf.py
index f507ee3..9e7afb7 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -31,7 +31,6 @@ needs_sphinx = '1.2'
 extensions = [
 'sphinx.ext.autodoc',
 'sphinx.ext.viewcode',
-'epytext',
 'sphinx.ext.mathjax',
 ]
 
diff --git a/python/docs/epytext.py b/python/docs/epytext.py
deleted file mode 100644
index 4bbbf65..000
--- a/python/docs/epytext.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import re
-
-RULES = (
-(r"<(!BLANKLINE)[\w.]+>", r""),
-(r"L{([\w.()]+)}", r":class:`\1`"),
-(r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"),
-(r"C{([\w.()]+)}", r":class:`\1`"),
-(r"[IBCM]{([^}]+)}", r"`\1`"),
-('pyspark.rdd.RDD', 'RDD'),
-)
-
-
-def _convert_epytext(line):
-"""
->>> _convert_epytext("L{A}")
-:class:`A`
-"""
-line = line.replace('@', ':')
-for p, sub in RULES:
-line = re.sub(p, sub, line)
-return line
-
-
-def _process_docstring(app, what, name, obj, options, lines):
-for i in range(len(lines)):
-lines[i] = _convert_epytext(lines[i])
-
-
-def setup(app):
-app.connect("autodoc-process-docstring", _process_docstring)
diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index 00ec094..a5d5132 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -123,13 +123,13 @@ class Accumulator(object):
 
 """
 A shared variable that can be accumulated, i.e., has a commutative and 
associative "add"
-operation. Worker tasks on a Spark clus

[spark] branch master updated: [SPARK-28115][CORE][TEST] Fix flaky test: SparkContextSuite.test resource scheduling under local-cluster mode

2019-06-20 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d98a5ce  [SPARK-28115][CORE][TEST] Fix flaky test: 
SparkContextSuite.test resource scheduling under local-cluster mode
d98a5ce is described below

commit d98a5ce34d6b4b098d30c26c89a9d65d931f930d
Author: Xingbo Jiang 
AuthorDate: Thu Jun 20 13:23:29 2019 -0700

[SPARK-28115][CORE][TEST] Fix flaky test: SparkContextSuite.test resource 
scheduling under local-cluster mode

## What changes were proposed in this pull request?

The test `SparkContextSuite.test resource scheduling under local-cluster 
mode` has been flaky, because it expects the size of 
`sc.statusTracker.getExecutorInfos` to be the same as the number of executors, 
while the returned list contains both the driver and the executors.

## How was this patch tested?

Updated existing tests.

Closes #24917 from jiangxb1987/getExecutorInfos.

Authored-by: Xingbo Jiang 
Signed-off-by: Xiangrui Meng 
---
 .../src/main/scala/org/apache/spark/SparkStatusTracker.scala |  1 +
 core/src/test/scala/org/apache/spark/SparkContextSuite.scala | 12 +++-
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala 
b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala
index 815237e..555c085 100644
--- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala
+++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala
@@ -99,6 +99,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext, 
store: AppStatusStore
   /**
* Returns information of all known executors, including host, port, 
cacheSize, numRunningTasks
* and memory metrics.
+   * Note this include information for both the driver and executors.
*/
   def getExecutorInfos: Array[SparkExecutorInfo] = {
 store.executorList(true).map { exec =>
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala 
b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index fa2c4bd..628ac60 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -750,9 +750,7 @@ class SparkContextSuite extends SparkFunSuite with 
LocalSparkContext with Eventu
   sc = new SparkContext(conf)
 
   // Ensure all executors has started
-  eventually(timeout(10.seconds)) {
-assert(sc.statusTracker.getExecutorInfos.size == 1)
-  }
+  TestUtils.waitUntilExecutorsUp(sc, 1, 1)
   assert(sc.resources.size === 1)
   assert(sc.resources.get(GPU).get.addresses === Array("5", "6"))
   assert(sc.resources.get(GPU).get.name === "gpu")
@@ -780,9 +778,7 @@ class SparkContextSuite extends SparkFunSuite with 
LocalSparkContext with Eventu
   sc = new SparkContext(conf)
 
   // Ensure all executors has started
-  eventually(timeout(10.seconds)) {
-assert(sc.statusTracker.getExecutorInfos.size == 1)
-  }
+  TestUtils.waitUntilExecutorsUp(sc, 1, 1)
   // driver gpu resources file should take precedence over the script
   assert(sc.resources.size === 1)
   assert(sc.resources.get(GPU).get.addresses === Array("0", "1", "8"))
@@ -855,9 +851,7 @@ class SparkContextSuite extends SparkFunSuite with 
LocalSparkContext with Eventu
   sc = new SparkContext(conf)
 
   // Ensure all executors has started
-  eventually(timeout(60.seconds)) {
-assert(sc.statusTracker.getExecutorInfos.size == 3)
-  }
+  TestUtils.waitUntilExecutorsUp(sc, 3, 6)
 
   val rdd = sc.makeRDD(1 to 10, 9).mapPartitions { it =>
 val context = TaskContext.get()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF

2019-06-17 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 1b2448b  [SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF
1b2448b is described below

commit 1b2448bc10a8ee732d08fa1abae6d64ae25e3a14
Author: Xiangrui Meng 
AuthorDate: Mon Jun 17 20:51:36 2019 -0700

[SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF

## What changes were proposed in this pull request?

Add docs for `SCALAR_ITER` Pandas UDF.

cc: WeichenXu123 HyukjinKwon

## How was this patch tested?

Tested example code manually.

Closes #24897 from mengxr/SPARK-28056.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 docs/sql-pyspark-pandas-with-arrow.md | 17 +++
 examples/src/main/python/sql/arrow.py | 86 +++
 2 files changed, 103 insertions(+)

diff --git a/docs/sql-pyspark-pandas-with-arrow.md 
b/docs/sql-pyspark-pandas-with-arrow.md
index 6cf280c..9cab4be 100644
--- a/docs/sql-pyspark-pandas-with-arrow.md
+++ b/docs/sql-pyspark-pandas-with-arrow.md
@@ -86,6 +86,23 @@ The following example shows how to create a scalar Pandas 
UDF that computes the
 
 
 
+### Scalar Iterator
+
+Scalar iterator (`SCALAR_ITER`) Pandas UDF is the same as scalar Pandas UDF 
above except that the
+underlying Python function takes an iterator of batches as input instead of a 
single batch and,
+instead of returning a single output batch, it yields output batches or 
returns an iterator of
+output batches.
+It is useful when the UDF execution requires initializing some states, e.g., 
loading an machine
+learning model file to apply inference to every input batch.
+
+The following example shows how to create scalar iterator Pandas UDFs:
+
+
+
+{% include_example scalar_iter_pandas_udf python/sql/arrow.py %}
+
+
+
 ### Grouped Map
 Grouped map Pandas UDFs are used with `groupBy().apply()` which implements the 
"split-apply-combine" pattern.
 Split-apply-combine consists of three steps:
diff --git a/examples/src/main/python/sql/arrow.py 
b/examples/src/main/python/sql/arrow.py
index c1e2d29..ede121b 100644
--- a/examples/src/main/python/sql/arrow.py
+++ b/examples/src/main/python/sql/arrow.py
@@ -86,6 +86,92 @@ def scalar_pandas_udf_example(spark):
 # $example off:scalar_pandas_udf$
 
 
+def scalar_iter_pandas_udf_example(spark):
+# $example on:scalar_iter_pandas_udf$
+import pandas as pd
+
+from pyspark.sql.functions import col, pandas_udf, struct, PandasUDFType
+
+pdf = pd.DataFrame([1, 2, 3], columns=["x"])
+df = spark.createDataFrame(pdf)
+
+# When the UDF is called with a single column that is not StructType,
+# the input to the underlying function is an iterator of pd.Series.
+@pandas_udf("long", PandasUDFType.SCALAR_ITER)
+def plus_one(batch_iter):
+for x in batch_iter:
+yield x + 1
+
+df.select(plus_one(col("x"))).show()
+# +---+
+# |plus_one(x)|
+# +---+
+# |  2|
+# |  3|
+# |  4|
+# +---+
+
+# When the UDF is called with more than one columns,
+# the input to the underlying function is an iterator of pd.Series tuple.
+@pandas_udf("long", PandasUDFType.SCALAR_ITER)
+def multiply_two_cols(batch_iter):
+for a, b in batch_iter:
+yield a * b
+
+df.select(multiply_two_cols(col("x"), col("x"))).show()
+# +---+
+# |multiply_two_cols(x, x)|
+# +---+
+# |  1|
+# |  4|
+# |  9|
+# +---+
+
+# When the UDF is called with a single column that is StructType,
+# the input to the underlying function is an iterator of pd.DataFrame.
+@pandas_udf("long", PandasUDFType.SCALAR_ITER)
+def multiply_two_nested_cols(pdf_iter):
+for pdf in pdf_iter:
+yield pdf["a"] * pdf["b"]
+
+df.select(
+multiply_two_nested_cols(
+struct(col("x").alias("a"), col("x").alias("b"))
+).alias("y")
+).show()
+# +---+
+# |  y|
+# +---+
+# |  1|
+# |  4|
+# |  9|
+# +---+
+
+# In the UDF, you can initialize some states before processing batches.
+# Wrap your code with try/finally or use context managers to ensure
+# the release of resources at the end.
+y_bc = spark.sparkContext.broadcast(1)
+
+@pandas_udf("long", PandasUDFType.SCALAR_ITER)
+def plus_y(batch_iter):
+y = y_bc.value  # initialize states
+try:
+for x in batch_iter:
+

[spark] branch master updated: [SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series

2019-06-15 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 6d441dc  [SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an 
iterator of pd.Series or an iterator of tuple of pd.Series
6d441dc is described below

commit 6d441dcdc68dae886e375794a55658f70cd18d9d
Author: WeichenXu 
AuthorDate: Sat Jun 15 08:29:20 2019 -0700

[SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of 
pd.Series or an iterator of tuple of pd.Series

## What changes were proposed in this pull request?

Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple 
of pd.Series.
Note that the UDF input args will always be one iterator:
* if the UDF takes only one column as input, the iterator's elements will be 
pd.Series (corresponding to the column values batch)
* if the UDF takes multiple columns as inputs, the iterator's elements will 
be tuples composed of multiple `pd.Series`s, each one corresponding to one of the 
input columns (in the same order). For example:
```
@pandas_udf("int", PandasUDFType.SCALAR_ITER)
def the_udf(iterator):
    for col1_batch, col2_batch in iterator:
        yield col1_batch + col2_batch

df.select(the_udf("col1", "col2"))
```
The udf above will add col1 and col2.

I haven't added unit tests yet, but manual tests show it works fine. So it is 
ready for a first-pass review.
We can test several typical cases:

```
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.functions import udf
from pyspark.taskcontext import TaskContext

df = spark.createDataFrame([(1, 20), (3, 40)], ["a", "b"])

pandas_udf("int", PandasUDFType.SCALAR_ITER)
def fi1(it):
pid = TaskContext.get().partitionId()
print("DBG: fi1: do init stuff, partitionId=" + str(pid))
for batch in it:
yield batch + 100
print("DBG: fi1: do close stuff, partitionId=" + str(pid))

pandas_udf("int", PandasUDFType.SCALAR_ITER)
def fi2(it):
pid = TaskContext.get().partitionId()
print("DBG: fi2: do init stuff, partitionId=" + str(pid))
for batch in it:
yield batch + 1
print("DBG: fi2: do close stuff, partitionId=" + str(pid))

pandas_udf("int", PandasUDFType.SCALAR_ITER)
def fi3(it):
pid = TaskContext.get().partitionId()
print("DBG: fi3: do init stuff, partitionId=" + str(pid))
for x, y in it:
yield x + y * 10 + 10
print("DBG: fi3: do close stuff, partitionId=" + str(pid))

pandas_udf("int", PandasUDFType.SCALAR)
def fp1(x):
return x + 1000

udf("int")
def fu1(x):
return x + 10

# test select "pandas iter udf/pandas udf/sql udf" expressions at the same 
time.
# Note this case the `fi1("a"), fi2("b"), fi3("a", "b")` will generate only 
one plan,
# and `fu1("a")`, `fp1("a")` will generate another two separate plans.
df.select(fi1("a"), fi2("b"), fi3("a", "b"), fu1("a"), fp1("a")).show()

# test chain two pandas iter udf together
# Note this case `fi2(fi1("a"))` will generate only one plan
# Also note the init stuff/close stuff call order will be like:
# (debug output following)
# DBG: fi2: do init stuff, partitionId=0
# DBG: fi1: do init stuff, partitionId=0
# DBG: fi1: do close stuff, partitionId=0
# DBG: fi2: do close stuff, partitionId=0
df.select(fi2(fi1("a"))).show()

# test more complex chain
# Note this case `fi1("a"), fi2("a")` will generate one plan,
# and `fi3(fi1_output, fi2_output)` will generate another plan
df.select(fi3(fi1("a"), fi2("a"))).show()
```

## How was this patch tested?

To be added.

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24643 from WeichenXu123/pandas_udf_iter.

Lead-authored-by: WeichenXu 
Co-authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/api/python/PythonRunner.scala |   2 +
 python/pyspark/rdd.py  |   1 +
 python/pyspark/sql/functions.py|   3 +
 python/pyspark/sql/tests/test_pandas_udf_scalar.py | 882 ++---
 python/pyspark/sql/udf.py  |  13 +-
 python/pyspark/worker.py

[spark] branch master updated: [SPARK-28030][SQL] convert filePath to URI in binary file data source

2019-06-12 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4f4829b  [SPARK-28030][SQL] convert filePath to URI in binary file 
data source
4f4829b is described below

commit 4f4829b4ae261a9fd656fbf1928e6440d31f8d8c
Author: Xiangrui Meng 
AuthorDate: Wed Jun 12 13:24:02 2019 -0700

[SPARK-28030][SQL] convert filePath to URI in binary file data source

## What changes were proposed in this pull request?

Convert `PartitionedFile.filePath` to URI first in binary file data source. 
Otherwise Spark will throw a FileNotFound exception because we create `Path` 
with URL encoded string, instead of wrapping it with URI.

## How was this patch tested?

Unit test.

Closes #24855 from mengxr/SPARK-28030.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 .../spark/sql/execution/datasources/FileScanRDD.scala  |  2 +-
 .../datasources/binaryfile/BinaryFileFormat.scala  |  3 ++-
 .../datasources/binaryfile/BinaryFileFormatSuite.scala | 14 ++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
index d92ea2e..9e98b0b 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
@@ -38,7 +38,7 @@ import org.apache.spark.util.NextIterator
  * that need to be prepended to each row.
  *
  * @param partitionValues value of partition columns to be prepended to each 
row.
- * @param filePath path of the file to read
+ * @param filePath URI of the file to read
  * @param start the beginning offset (in bytes) of the block.
  * @param length number of bytes to read.
  * @param locations locality information (list of nodes that have the data).
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
index cdc7cd5..fda4e14 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.binaryfile
 
+import java.net.URI
 import java.sql.Timestamp
 
 import com.google.common.io.{ByteStreams, Closeables}
@@ -100,7 +101,7 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
 val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH)
 
 file: PartitionedFile => {
-  val path = new Path(file.filePath)
+  val path = new Path(new URI(file.filePath))
   val fs = path.getFileSystem(broadcastedHadoopConf.value.value)
   val status = fs.getFileStatus(path)
   if (filterFuncs.forall(_.apply(status))) {
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala
index 01dc96c..9e2969b 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala
@@ -368,4 +368,18 @@ class BinaryFileFormatSuite extends QueryTest with 
SharedSQLContext with SQLTest
   assert(caught.getMessage.contains("exceeds the max length allowed"))
 }
   }
+
+  test("SPARK-28030: support chars in file names that require URL encoding") {
+withTempDir { dir =>
+  val file = new File(dir, "test space.txt")
+  val content = "123".getBytes
+  Files.write(file.toPath, content, StandardOpenOption.CREATE, 
StandardOpenOption.WRITE)
+  val df = spark.read.format(BINARY_FILE).load(dir.getPath)
+  df.select(col(PATH), col(CONTENT)).first() match {
+case Row(p: String, c: Array[Byte]) =>
+  assert(p.endsWith(file.getAbsolutePath), "should support space in 
file name")
+  assert(c === content, "should read file with space in file name")
+  }
+}
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly read the first row

2019-06-06 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4d770db  [SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly 
read the first row
4d770db is described below

commit 4d770db0eb252c56072f093eae318bad3d20b8d7
Author: Xiangrui Meng 
AuthorDate: Thu Jun 6 15:45:44 2019 -0700

[SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly read the first 
row

## What changes were proposed in this pull request?

The issue is fixed in https://github.com/apache/spark/pull/24734, but that PR 
might take longer to merge.

## How was this patch tested?

It should pass existing unit tests.

Closes #24816 from mengxr/SPARK-27968.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 .../sql/execution/python/ArrowEvalPythonExec.scala | 27 --
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
index 000ae97..73a43af 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
@@ -86,28 +86,11 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], 
resultAttrs: Seq[Attribute]
   sessionLocalTimeZone,
   pythonRunnerConf).compute(batchIter, context.partitionId(), context)
 
-new Iterator[InternalRow] {
-
-  private var currentIter = if (columnarBatchIter.hasNext) {
-val batch = columnarBatchIter.next()
-val actualDataTypes = (0 until batch.numCols()).map(i => 
batch.column(i).dataType())
-assert(outputTypes == actualDataTypes, "Invalid schema from 
pandas_udf: " +
-  s"expected ${outputTypes.mkString(", ")}, got 
${actualDataTypes.mkString(", ")}")
-batch.rowIterator.asScala
-  } else {
-Iterator.empty
-  }
-
-  override def hasNext: Boolean = currentIter.hasNext || {
-if (columnarBatchIter.hasNext) {
-  currentIter = columnarBatchIter.next().rowIterator.asScala
-  hasNext
-} else {
-  false
-}
-  }
-
-  override def next(): InternalRow = currentIter.next()
+columnarBatchIter.flatMap { batch =>
+  val actualDataTypes = (0 until batch.numCols()).map(i => 
batch.column(i).dataType())
+  assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: 
" +
+s"expected ${outputTypes.mkString(", ")}, got 
${actualDataTypes.mkString(", ")}")
+  batch.rowIterator.asScala
 }
   }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-27366][CORE] Support GPU Resources in Spark job scheduling

2019-06-04 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new ac808e2  [SPARK-27366][CORE] Support GPU Resources in Spark job 
scheduling
ac808e2 is described below

commit ac808e2a02d67ed6210986704b84c8079791b123
Author: Xingbo Jiang 
AuthorDate: Tue Jun 4 16:57:47 2019 -0700

[SPARK-27366][CORE] Support GPU Resources in Spark job scheduling

## What changes were proposed in this pull request?

This PR adds support for scheduling tasks with extra resource requirements 
(e.g. GPUs) on executors with available resources. It also introduces a new 
method `TaskContext.resources()` so tasks can access the resource 
addresses allocated to them.

## How was this patch tested?

* Added new end-to-end test cases in `SparkContextSuite`;
* Added new test case in `CoarseGrainedSchedulerBackendSuite`;
* Added new test case in `CoarseGrainedExecutorBackendSuite`;
* Added new test case in `TaskSchedulerImplSuite`;
* Added new test case in `TaskSetManagerSuite`;
* Updated existing tests.

Closes #24374 from jiangxb1987/gpu.

Authored-by: Xingbo Jiang 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/BarrierTaskContext.scala  |   2 +
 .../main/scala/org/apache/spark/SparkConf.scala|  33 ++---
 .../main/scala/org/apache/spark/SparkContext.scala |  84 ++---
 .../main/scala/org/apache/spark/TaskContext.scala  |   9 +-
 .../scala/org/apache/spark/TaskContextImpl.scala   |   3 +-
 .../main/scala/org/apache/spark/TestUtils.scala|  11 ++
 .../executor/CoarseGrainedExecutorBackend.scala|  14 ++-
 .../scala/org/apache/spark/executor/Executor.scala |   3 +-
 .../spark/scheduler/ExecutorResourceInfo.scala | 101 
 .../scala/org/apache/spark/scheduler/Task.scala|   7 +-
 .../apache/spark/scheduler/TaskDescription.scala   |  51 +++-
 .../apache/spark/scheduler/TaskSchedulerImpl.scala |  33 -
 .../apache/spark/scheduler/TaskSetManager.scala|  14 ++-
 .../org/apache/spark/scheduler/WorkerOffer.scala   |   5 +-
 .../cluster/CoarseGrainedClusterMessage.scala  |  15 ++-
 .../cluster/CoarseGrainedSchedulerBackend.scala|  34 +-
 .../spark/scheduler/cluster/ExecutorData.scala |   5 +-
 .../scheduler/local/LocalSchedulerBackend.scala|   1 +
 .../apache/spark/JavaTaskContextCompileCheck.java  |   4 +
 ...g.apache.spark.scheduler.ExternalClusterManager |   1 +
 .../scala/org/apache/spark/ResourceName.scala} |  18 +--
 .../scala/org/apache/spark/SparkConfSuite.scala|  24 
 .../scala/org/apache/spark/SparkContextSuite.scala |  79 ++--
 .../CoarseGrainedExecutorBackendSuite.scala|  63 +-
 .../org/apache/spark/executor/ExecutorSuite.scala  |   2 +
 .../CoarseGrainedSchedulerBackendSuite.scala   | 133 -
 .../scheduler/ExecutorResourceInfoSuite.scala  |  91 ++
 .../apache/spark/scheduler/TaskContextSuite.scala  |   4 +-
 .../spark/scheduler/TaskDescriptionSuite.scala |  17 +++
 .../spark/scheduler/TaskSchedulerImplSuite.scala   |  40 ++-
 .../spark/scheduler/TaskSetManagerSuite.scala  |  21 
 project/MimaExcludes.scala |   3 +
 .../MesosFineGrainedSchedulerBackendSuite.scala|   6 +-
 33 files changed, 825 insertions(+), 106 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index a354f44..cf957ff 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -185,6 +185,8 @@ class BarrierTaskContext private[spark] (
 taskContext.getMetricsSources(sourceName)
   }
 
+  override def resources(): Map[String, ResourceInformation] = 
taskContext.resources()
+
   override private[spark] def killTaskIfInterrupted(): Unit = 
taskContext.killTaskIfInterrupted()
 
   override private[spark] def getKillReason(): Option[String] = 
taskContext.getKillReason()
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala 
b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 15f1730..227f4a5 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -508,6 +508,15 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable 
with Logging with Seria
   }
 
   /**
+   * Get task resource requirements.
+   */
+  private[spark] def getTaskResourceRequirements(): Map[String, Int] = {
+getAllWithPrefix(SPARK_TASK_RESOURCE_PREFIX)
+  .withFilter { case (k, v) => k.endsWith(SPARK_RESOURCE_COUNT_SUFFIX)}
+  .map { case (k, v) => (k.dropRight(SPARK_RESOURCE_COUNT_SUFFIX.length), 
v.toInt)}.toMap
+  }
+
+  /**
* Checks for i

[spark] branch master updated: [SPARK-27488][CORE] Driver interface to support GPU resources

2019-05-23 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 74e5e41  [SPARK-27488][CORE] Driver interface to support GPU resources
74e5e41 is described below

commit 74e5e41eebf9ed596b48e6db52a2a9c642e5cbc3
Author: Thomas Graves 
AuthorDate: Thu May 23 11:46:13 2019 -0700

[SPARK-27488][CORE] Driver interface to support GPU resources

## What changes were proposed in this pull request?

Added the driver functionality to get the resources.

The user interface is: SparkContext.resources  - I called it this to match 
the TaskContext.resources API proposed in the other PR. Originally it was going 
to be called SparkContext.getResources, but I changed it to be consistent; if 
people have strong feelings I can change it.

There are 2 ways the driver can discover what resources it has.
  1) user specifies a discoveryScript, this is similar to the executors and 
is meant for yarn and k8s where they don't tell you what you were allocated but 
you are running in isolated environment.
  2) read the config spark.driver.resource.resourceName.addresses.  The 
config is meant to be used with standalone mode where the Worker will have to 
assign what GPU addresses the Driver is allowed to use by setting that config.

When the user runs a Spark application, if they want the driver to have 
GPUs they would specify the conf spark.driver.resource.gpu.count=X, where X is 
the number they want.  If they are running on yarn or k8s they will also have 
to specify the discoveryScript as described above; if they are on standalone 
mode and the cluster is set up properly they wouldn't have to specify anything else. 
 We could potentially get rid of the spark.driver.resource.gpu.addresses 
config which is really meant [...]

- This PR also has changes to be consistent about using resourceName 
everywhere.
- change the config names from POSTFIX to SUFFIX to be more consistent with 
other areas in Spark
- Moved the config checks around a bit since now used by both executor and 
driver. Note those might overlap a bit with 
https://github.com/apache/spark/pull/24374 so we will have to figure out which 
one should go in first.

## How was this patch tested?

Unit tests and manually test the interface.

Closes #24615 from tgravescs/SPARK-27488.

Authored-by: Thomas Graves 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/ResourceDiscoverer.scala  | 88 ++-
 .../main/scala/org/apache/spark/SparkConf.scala| 60 +
 .../main/scala/org/apache/spark/SparkContext.scala | 47 +++
 .../executor/CoarseGrainedExecutorBackend.scala| 64 +++---
 .../org/apache/spark/internal/config/package.scala |  5 +-
 .../org/apache/spark/ResourceDiscovererSuite.scala | 83 ++
 .../scala/org/apache/spark/SparkConfSuite.scala| 43 ++
 .../scala/org/apache/spark/SparkContextSuite.scala | 98 ++
 .../CoarseGrainedExecutorBackendSuite.scala| 61 --
 docs/configuration.md  | 29 +--
 10 files changed, 436 insertions(+), 142 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala 
b/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala
index 1963942..d3b3860 100644
--- a/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala
+++ b/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala
@@ -29,10 +29,10 @@ import org.apache.spark.internal.config._
 import org.apache.spark.util.Utils.executeAndGetOutput
 
 /**
- * Discovers resources (GPUs/FPGAs/etc). It currently only supports resources 
that have
- * addresses.
+ * Discovers information about resources (GPUs/FPGAs/etc). It currently only 
supports
+ * resources that have addresses.
  * This class finds resources by running and parsing the output of the user 
specified script
- * from the config 
spark.{driver/executor}.resource.{resourceType}.discoveryScript.
+ * from the config 
spark.{driver/executor}.resource.{resourceName}.discoveryScript.
  * The output of the script it runs is expected to be JSON in the format of the
  * ResourceInformation class.
  *
@@ -42,28 +42,41 @@ private[spark] object ResourceDiscoverer extends Logging {
 
   private implicit val formats = DefaultFormats
 
-  def findResources(sparkConf: SparkConf, isDriver: Boolean): Map[String, 
ResourceInformation] = {
-val prefix = if (isDriver) {
-  SPARK_DRIVER_RESOURCE_PREFIX
-} else {
-  SPARK_EXECUTOR_RESOURCE_PREFIX
-}
-// get unique resource types by grabbing first part config with multiple 
periods,
-// ie resourceType.count, grab resourceType part
-val resourceNames = sparkConf.getAllWithPrefix(prefix).map { case (k

[spark] branch master updated: [SPARK-27588] Binary file data source fails fast and doesn't attempt to read very large files

2019-04-29 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 618d6bf  [SPARK-27588] Binary file data source fails fast and doesn't 
attempt to read very large files
618d6bf is described below

commit 618d6bff71073c8c93501ab7392c3cc579730f0b
Author: Xiangrui Meng 
AuthorDate: Mon Apr 29 16:24:49 2019 -0700

[SPARK-27588] Binary file data source fails fast and doesn't attempt to 
read very large files

## What changes were proposed in this pull request?

If a file is too big (>2GB), we should fail fast and not try to read the 
file.

## How was this patch tested?

(Please explain how this patch was tested. E.g. unit tests, integration 
tests, manual tests)
(If this patch involves UI changes, please attach a screenshot; otherwise, 
remove this)

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24483 from mengxr/SPARK-27588.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 .../org/apache/spark/sql/internal/SQLConf.scala|  8 ++
 .../datasources/binaryfile/BinaryFileFormat.scala  |  8 ++
 .../binaryfile/BinaryFileFormatSuite.scala | 31 +-
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 96d3f5c..87bce1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1744,6 +1744,14 @@ object SQLConf {
  "and from_utc_timestamp() functions.")
 .booleanConf
 .createWithDefault(false)
+
+  val SOURCES_BINARY_FILE_MAX_LENGTH = 
buildConf("spark.sql.sources.binaryFile.maxLength")
+.doc("The max length of a file that can be read by the binary file data 
source. " +
+  "Spark will fail fast and not attempt to read the file if its length 
exceeds this value. " +
+  "The theoretical max is Int.MaxValue, though VMs might implement a 
smaller max.")
+.internal()
+.intConf
+.createWithDefault(Int.MaxValue)
 }
 
 /**
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
index db93268..2637784 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -24,11 +24,13 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path}
 import org.apache.hadoop.mapreduce.Job
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
 import org.apache.spark.sql.execution.datasources.{FileFormat, 
OutputWriterFactory, PartitionedFile}
+import org.apache.spark.sql.internal.SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH
 import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, 
GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -99,6 +101,7 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
 val binaryFileSourceOptions = new BinaryFileSourceOptions(options)
 val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter
 val filterFuncs = filters.map(filter => createFilterFunction(filter))
+val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH)
 
 file: PartitionedFile => {
   val path = new Path(file.filePath)
@@ -115,6 +118,11 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
 case (MODIFICATION_TIME, i) =>
   writer.write(i, 
DateTimeUtils.fromMillis(status.getModificationTime))
 case (CONTENT, i) =>
+  if (status.getLen > maxLength) {
+throw new SparkException(
+  s"The length of ${status.getPath} is ${status.getLen}, " +
+s"which exceeds the max length allowed: ${maxLength}.")
+  }
   val stream = fs.open(status.getPath)
   try {
 writer.write(i, ByteStreams.toByteArray(stream))
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/exec

[spark] branch master updated: [SPARK-27472] add user guide for binary file data source

2019-04-29 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new fbc7942  [SPARK-27472] add user guide for binary file data source
fbc7942 is described below

commit fbc794268340bec868a0abcae3516e4ae3714286
Author: Xiangrui Meng 
AuthorDate: Mon Apr 29 08:58:56 2019 -0700

[SPARK-27472] add user guide for binary file data source

## What changes were proposed in this pull request?

Add user guide for binary file data source.

Screenshot: https://user-images.githubusercontent.com/829644/56877594-0488d300-6a04-11e9-9064-5047dfedd913.png

## How was this patch tested?

(Please explain how this patch was tested. E.g. unit tests, integration 
tests, manual tests)
(If this patch involves UI changes, please attach a screenshot; otherwise, 
remove this)

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24484 from mengxr/SPARK-27472.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 docs/sql-data-sources-binaryFile.md | 80 +
 docs/sql-data-sources.md|  1 +
 2 files changed, 81 insertions(+)

diff --git a/docs/sql-data-sources-binaryFile.md 
b/docs/sql-data-sources-binaryFile.md
new file mode 100644
index 000..d861a24
--- /dev/null
+++ b/docs/sql-data-sources-binaryFile.md
@@ -0,0 +1,80 @@
+---
+layout: global
+title: Binary File Data Source
+displayTitle: Binary File Data Source
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+Since Spark 3.0, Spark supports binary file data source,
+which reads binary files and converts each file into a single record that 
contains the raw content
+and metadata of the file.
+It produces a DataFrame with the following columns and possibly partition 
columns:
+* `path`: StringType
+* `modificationTime`: TimestampType
+* `length`: LongType
+* `content`: BinaryType
+
+It supports the following read option:
+
+  Property 
NameDefaultMeaning
+  
+pathGlobFilter
+none (accepts all)
+
+An optional glob pattern to only include files with paths matching the 
pattern.
+The syntax follows org.apache.hadoop.fs.GlobFilter.
+It does not change the behavior of partition discovery.
+
+  
+
+
+To read whole binary files, you need to specify the data source `format` as 
`binaryFile`.
+For example, the following code reads all PNG files from the input directory:
+
+
+
+{% highlight scala %}
+
+spark.read.format("binaryFile").option("pathGlobFilter", 
"*.png").load("/path/to/data")
+
+{% endhighlight %}
+
+
+
+{% highlight java %}
+
+spark.read().format("binaryFile").option("pathGlobFilter", 
"*.png").load("/path/to/data");
+
+{% endhighlight %}
+
+
+{% highlight python %}
+
+spark.read.format("binaryFile").option("pathGlobFilter", 
"*.png").load("/path/to/data")
+
+{% endhighlight %}
+
+
+{% highlight r %}
+
+read.df("/path/to/data", source = "binaryFile", pathGlobFilter = "*.png")
+
+{% endhighlight %}
+
+
+
+Binary file data source does not support writing a DataFrame back to the 
original files.
diff --git a/docs/sql-data-sources.md b/docs/sql-data-sources.md
index d908aac..079c540 100644
--- a/docs/sql-data-sources.md
+++ b/docs/sql-data-sources.md
@@ -54,4 +54,5 @@ goes into specific options that are available for the 
built-in data sources.
   * [Compatibility with Databricks 
spark-avro](sql-data-sources-avro.html#compatibility-with-databricks-spark-avro)
   * [Supported types for Avro -> Spark SQL 
conversion](sql-data-sources-avro.html#supported-types-for-avro---spark-sql-conversion)
   * [Supported types for Spark SQL -> Avro 
conversion](sql-data-sources-avro.html#supported-types-for-spark-sql---avro-conversion)
+* [Whole Binary Files](sql-data-sources-binaryFile.html)
 * [Troubleshooting](sql-data-sources-troubleshooting.html)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-27534][SQL] Do not load `content` column in binary data source if it is not selected

2019-04-28 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 20a3ef7  [SPARK-27534][SQL] Do not load `content` column in binary 
data source if it is not selected
20a3ef7 is described below

commit 20a3ef7259490e0c9f6348f13db1e99da5f0df83
Author: Xiangrui Meng 
AuthorDate: Sun Apr 28 07:57:03 2019 -0700

[SPARK-27534][SQL] Do not load `content` column in binary data source if it 
is not selected

## What changes were proposed in this pull request?

A follow-up task from SPARK-25348. To save I/O cost, Spark shouldn't 
attempt to read the file if users didn't request the `content` column. For 
example:
```
spark.read.format("binaryFile").load(path).filter($"length" < 
100).count()
```

## How was this patch tested?

Unit test added.

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24473 from WeichenXu123/SPARK-27534.

Lead-authored-by: Xiangrui Meng 
Co-authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
---
 .../datasources/binaryfile/BinaryFileFormat.scala  | 74 +-
 .../binaryfile/BinaryFileFormatSuite.scala | 63 --
 2 files changed, 89 insertions(+), 48 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
index 8617ae3..db93268 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -26,12 +26,10 @@ import org.apache.hadoop.mapreduce.Job
 
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
 import org.apache.spark.sql.execution.datasources.{FileFormat, 
OutputWriterFactory, PartitionedFile}
-import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, 
GreaterThan,
-  GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or}
+import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, 
GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.SerializableConfiguration
@@ -80,7 +78,7 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
 false
   }
 
-  override def shortName(): String = "binaryFile"
+  override def shortName(): String = BINARY_FILE
 
   override protected def buildReader(
   sparkSession: SparkSession,
@@ -90,54 +88,43 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
   filters: Seq[Filter],
   options: Map[String, String],
   hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
+require(dataSchema.sameType(schema),
+  s"""
+ |Binary file data source expects dataSchema: $schema,
+ |but got: $dataSchema.
+""".stripMargin)
 
 val broadcastedHadoopConf =
   sparkSession.sparkContext.broadcast(new 
SerializableConfiguration(hadoopConf))
-
 val binaryFileSourceOptions = new BinaryFileSourceOptions(options)
-
 val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter
-
 val filterFuncs = filters.map(filter => createFilterFunction(filter))
 
 file: PartitionedFile => {
-  val path = file.filePath
-  val fsPath = new Path(path)
-
+  val path = new Path(file.filePath)
   // TODO: Improve performance here: each file will recompile the glob 
pattern here.
-  if (pathGlobPattern.forall(new GlobFilter(_).accept(fsPath))) {
-val fs = fsPath.getFileSystem(broadcastedHadoopConf.value.value)
-val fileStatus = fs.getFileStatus(fsPath)
-val length = fileStatus.getLen
-val modificationTime = fileStatus.getModificationTime
-
-if (filterFuncs.forall(_.apply(fileStatus))) {
-  val stream = fs.open(fsPath)
-  val content = try {
-ByteStreams.toByteArray(stream)
-  } finally {
-Closeables.close(stream, true)
-  }
-
-  val fullOutput = dataSchema.map { f =>
-AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()
-  }
-  val requiredOutput

[spark] branch master updated: [SPARK-27473][SQL] Support filter push down for status fields in binary file data source

2019-04-21 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9793d9e  [SPARK-27473][SQL] Support filter push down for status fields 
in binary file data source
9793d9e is described below

commit 9793d9ec22ff7d9778554e4fa3f03ef4f93d473d
Author: WeichenXu 
AuthorDate: Sun Apr 21 12:45:59 2019 -0700

[SPARK-27473][SQL] Support filter push down for status fields in binary 
file data source

## What changes were proposed in this pull request?

Support 4 kinds of filters:
- LessThan
- LessThanOrEqual
- GreaterThan
- GreaterThanOrEqual

Support filters applied on 2 columns:
- modificationTime
- length

Note:
In order to support datasource filter push-down, I flatten schema to be:
```
val schema = StructType(
StructField("path", StringType, false) ::
StructField("modificationTime", TimestampType, false) ::
StructField("length", LongType, false) ::
StructField("content", BinaryType, true) :: Nil)
```

## How was this patch tested?

To be added.

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24387 from WeichenXu123/binary_ds_filter.

Lead-authored-by: WeichenXu 
Co-authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 .../datasources/binaryfile/BinaryFileFormat.scala  | 134 ++-
 .../binaryfile/BinaryFileFormatSuite.scala | 188 ++---
 2 files changed, 256 insertions(+), 66 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
index ad9292a..8617ae3 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources.binaryfile
 
+import java.sql.Timestamp
+
 import com.google.common.io.{ByteStreams, Closeables}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path}
@@ -28,7 +30,8 @@ import 
org.apache.spark.sql.catalyst.expressions.AttributeReference
 import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
 import org.apache.spark.sql.execution.datasources.{FileFormat, 
OutputWriterFactory, PartitionedFile}
-import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
+import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, 
GreaterThan,
+  GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.SerializableConfiguration
@@ -55,10 +58,12 @@ import org.apache.spark.util.SerializableConfiguration
  */
 class BinaryFileFormat extends FileFormat with DataSourceRegister {
 
+  import BinaryFileFormat._
+
   override def inferSchema(
   sparkSession: SparkSession,
   options: Map[String, String],
-  files: Seq[FileStatus]): Option[StructType] = 
Some(BinaryFileFormat.schema)
+  files: Seq[FileStatus]): Option[StructType] = Some(schema)
 
   override def prepareWrite(
   sparkSession: SparkSession,
@@ -84,7 +89,7 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
   requiredSchema: StructType,
   filters: Seq[Filter],
   options: Map[String, String],
-  hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = 
{
+  hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
 
 val broadcastedHadoopConf =
   sparkSession.sparkContext.broadcast(new 
SerializableConfiguration(hadoopConf))
@@ -93,46 +98,49 @@ class BinaryFileFormat extends FileFormat with 
DataSourceRegister {
 
 val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter
 
-(file: PartitionedFile) => {
+val filterFuncs = filters.map(filter => createFilterFunction(filter))
+
+file: PartitionedFile => {
   val path = file.filePath
   val fsPath = new Path(path)
 
   // TODO: Improve performance here: each file will recompile the glob 
pattern here.
-  val globFilter = pathGlobPattern.map(new GlobFilter(_))
-  if (!globFilter.isDefined || globFilter.get.accept(fsPath)) {
+  if (pathGlobPattern.forall(new GlobFilter(_).accept(fsPath))) {
 val fs = fsPath.getFileSystem(broadcastedHadoopConf.value.value)
 val fileStatus = fs.getFil

[spark] branch master updated: [SPARK-25348][SQL] Data source for binary files

2019-04-16 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 1bb0c8e  [SPARK-25348][SQL] Data source for binary files
1bb0c8e is described below

commit 1bb0c8e407e0fcd1283f0eb2f742ba2567eda87e
Author: WeichenXu 
AuthorDate: Tue Apr 16 15:41:32 2019 -0700

[SPARK-25348][SQL] Data source for binary files

## What changes were proposed in this pull request?

Implement binary file data source in Spark.

Format name: "binaryFile" (case-insensitive)

Schema:
- content: BinaryType
- status: StructType
  - path: StringType
  - modificationTime: TimestampType
  - length: LongType

Options:
* pathGlobFilter (instead of pathFilterRegex) to rely on GlobFilter 
behavior
* maxBytesPerPartition is not implemented since it is controlled by two SQL 
confs: maxPartitionBytes and openCostInBytes.

## How was this patch tested?

Unit test added.

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24354 from WeichenXu123/binary_file_datasource.

Lead-authored-by: WeichenXu 
Co-authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
---
 ...org.apache.spark.sql.sources.DataSourceRegister |   1 +
 .../datasources/binaryfile/BinaryFileFormat.scala  | 177 +
 .../binaryfile/BinaryFileFormatSuite.scala | 143 +
 3 files changed, 321 insertions(+)

diff --git 
a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
 
b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
index be9cb81..d988287 100644
--- 
a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
+++ 
b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -8,3 +8,4 @@ 
org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2
 org.apache.spark.sql.execution.streaming.ConsoleSinkProvider
 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider
 org.apache.spark.sql.execution.streaming.sources.TextSocketSourceProvider
+org.apache.spark.sql.execution.datasources.binaryfile.BinaryFileFormat
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
new file mode 100644
index 000..ad9292a
--- /dev/null
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.binaryfile
+
+import com.google.common.io.{ByteStreams, Closeables}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path}
+import org.apache.hadoop.mapreduce.Job
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import 
org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.execution.datasources.{FileFormat, 
OutputWriterFactory, PartitionedFile}
+import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.util.SerializableConfiguration
+
+
+/**
+ * The binary file data source.
+ *
+ * It reads binary files and converts each file into a single record that 
contains the raw content
+ * and metadata of the file.
+ *
+ * Example:
+ * {{{
+ *   // Scala
+ *   val df = spark.read.format("binaryFile")
+ * .option("pathGlobFilter", "*.png")
+ * .load("/path/to/fileDir")
+ *
+ *   // Jav

[spark] branch master updated: [SPARK-27454][ML][SQL] Spark image datasource fail when encounter some illegal images

2019-04-15 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d35e81f  [SPARK-27454][ML][SQL] Spark image datasource fail when 
encounter some illegal images
d35e81f is described below

commit d35e81f4bc561598676a508319ec872f7361b069
Author: WeichenXu 
AuthorDate: Mon Apr 15 11:55:51 2019 -0700

[SPARK-27454][ML][SQL] Spark image datasource fail when encounter some 
illegal images

## What changes were proposed in this pull request?

Fix the Spark image datasource failing when it encounters some illegal images.

This is related to bugs inside `ImageIO.read`, so exception handling for it is 
added in the Spark code.

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a 
pull request.

Closes #24362 from WeichenXu123/fix_image_ds_bug.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
---
 mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala 
b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
index 0b13eef..a7ddf2f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
@@ -133,7 +133,13 @@ object ImageSchema {
*/
   private[spark] def decode(origin: String, bytes: Array[Byte]): Option[Row] = 
{
 
-val img = ImageIO.read(new ByteArrayInputStream(bytes))
+val img = try {
+  ImageIO.read(new ByteArrayInputStream(bytes))
+} catch {
+  // Catch runtime exception because `ImageIO` may throw unexcepted 
`RuntimeException`.
+  // But do not catch the declared `IOException` (regarded as FileSystem 
failure)
+  case _: RuntimeException => null
+}
 
 if (img == null) {
   None


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-25970][ML] Add Instrumentation to PrefixSpan

2018-12-20 Thread meng

This is an automated email from the ASF dual-hosted git repository.

meng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new aa0d4ca  [SPARK-25970][ML] Add Instrumentation to PrefixSpan
aa0d4ca is described below

commit aa0d4ca8bab08a467645080a5b8a28bf6dd8a042
Author: zhengruifeng 
AuthorDate: Thu Dec 20 11:22:49 2018 -0800

[SPARK-25970][ML] Add Instrumentation to PrefixSpan

## What changes were proposed in this pull request?
Add Instrumentation to PrefixSpan

## How was this patch tested?
existing tests

Closes #22971 from zhengruifeng/log_PrefixSpan.

Authored-by: zhengruifeng 
Signed-off-by: Xiangrui Meng 
---
 mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala 
b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
index 2a34135..b0006a8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ml.fpm
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.util.Instrumentation.instrumented
 import org.apache.spark.mllib.fpm.{PrefixSpan => mllibPrefixSpan}
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.col
@@ -135,7 +136,10 @@ final class PrefixSpan(@Since("2.4.0") override val uid: 
String) extends Params
*  - `freq: Long`
*/
   @Since("2.4.0")
-  def findFrequentSequentialPatterns(dataset: Dataset[_]): DataFrame = {
+  def findFrequentSequentialPatterns(dataset: Dataset[_]): DataFrame = 
instrumented { instr =>
+instr.logDataset(dataset)
+instr.logParams(this, params: _*)
+
 val sequenceColParam = $(sequenceCol)
 val inputType = dataset.schema(sequenceColParam).dataType
 require(inputType.isInstanceOf[ArrayType] &&


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25321][ML] Fix local LDA model constructor

2018-09-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.4 138a63165 -> 1303eb5c8


[SPARK-25321][ML] Fix local LDA model constructor

## What changes were proposed in this pull request?

change back the constructor to:
```
class LocalLDAModel private[ml] (
uid: String,
vocabSize: Int,
private[clustering] val oldLocalModel : OldLocalLDAModel,
sparkSession: SparkSession)
```

Although it is marked `private[ml]`, it is used in `mleap` and the master 
change breaks `mleap` building.
See mleap code 
[here](https://github.com/combust/mleap/blob/c7860af328d519cf56441b4a7cd8e6ec9d9fee59/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/clustering/LDAModelOp.scala#L57)
## How was this patch tested?

Manual.

Closes #22510 from WeichenXu123/LDA_fix.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 40edab209bdefe793b59b650099cea026c244484)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1303eb5c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1303eb5c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1303eb5c

Branch: refs/heads/branch-2.4
Commit: 1303eb5c8d976748ba3da23b66abb8eb6512ea5d
Parents: 138a631
Author: WeichenXu 
Authored: Fri Sep 21 13:08:01 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Sep 21 13:08:11 2018 -0700

--
 mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1303eb5c/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 50867f7..84e73dc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -570,13 +570,11 @@ abstract class LDAModel private[ml] (
 class LocalLDAModel private[ml] (
 uid: String,
 vocabSize: Int,
-private[clustering] val oldLocalModel_ : OldLocalLDAModel,
+private[clustering] val oldLocalModel : OldLocalLDAModel,
 sparkSession: SparkSession)
   extends LDAModel(uid, vocabSize, sparkSession) {
 
-  override private[clustering] def oldLocalModel: OldLocalLDAModel = {
-oldLocalModel_.setSeed(getSeed)
-  }
+  oldLocalModel.setSeed(getSeed)
 
   @Since("1.6.0")
   override def copy(extra: ParamMap): LocalLDAModel = {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25321][ML] Fix local LDA model constructor

2018-09-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 4a1120953 -> 40edab209


[SPARK-25321][ML] Fix local LDA model constructor

## What changes were proposed in this pull request?

change back the constructor to:
```
class LocalLDAModel private[ml] (
uid: String,
vocabSize: Int,
private[clustering] val oldLocalModel : OldLocalLDAModel,
sparkSession: SparkSession)
```

Although it is marked `private[ml]`, it is used in `mleap` and the master 
change breaks `mleap` building.
See mleap code 
[here](https://github.com/combust/mleap/blob/c7860af328d519cf56441b4a7cd8e6ec9d9fee59/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/clustering/LDAModelOp.scala#L57)
## How was this patch tested?

Manual.

Closes #22510 from WeichenXu123/LDA_fix.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/40edab20
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/40edab20
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/40edab20

Branch: refs/heads/master
Commit: 40edab209bdefe793b59b650099cea026c244484
Parents: 4a11209
Author: WeichenXu 
Authored: Fri Sep 21 13:08:01 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Sep 21 13:08:01 2018 -0700

--
 mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/40edab20/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 50867f7..84e73dc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -570,13 +570,11 @@ abstract class LDAModel private[ml] (
 class LocalLDAModel private[ml] (
 uid: String,
 vocabSize: Int,
-private[clustering] val oldLocalModel_ : OldLocalLDAModel,
+private[clustering] val oldLocalModel : OldLocalLDAModel,
 sparkSession: SparkSession)
   extends LDAModel(uid, vocabSize, sparkSession) {
 
-  override private[clustering] def oldLocalModel: OldLocalLDAModel = {
-oldLocalModel_.setSeed(getSeed)
-  }
+  oldLocalModel.setSeed(getSeed)
 
   @Since("1.6.0")
   override def copy(extra: ParamMap): LocalLDAModel = {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change

2018-09-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.4 ce6636112 -> 138a63165


[SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change

## What changes were proposed in this pull request?

Revert SPARK-14681 to avoid API breaking change. PR [SPARK-14681] will break 
mleap.

## How was this patch tested?

N/A

Closes #22492 from WeichenXu123/revert_tree_change.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/138a6316
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/138a6316
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/138a6316

Branch: refs/heads/branch-2.4
Commit: 138a63165ce90f8400e0a5c7503894662ead03c5
Parents: ce66361
Author: WeichenXu 
Authored: Fri Sep 21 13:05:24 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Sep 21 13:05:24 2018 -0700

--
 .../classification/DecisionTreeClassifier.scala |  14 +-
 .../spark/ml/classification/GBTClassifier.scala |   6 +-
 .../classification/RandomForestClassifier.scala |   6 +-
 .../ml/regression/DecisionTreeRegressor.scala   |  13 +-
 .../spark/ml/regression/GBTRegressor.scala  |   6 +-
 .../ml/regression/RandomForestRegressor.scala   |   6 +-
 .../scala/org/apache/spark/ml/tree/Node.scala   | 247 ---
 .../spark/ml/tree/impl/RandomForest.scala   |  10 +-
 .../org/apache/spark/ml/tree/treeModels.scala   |  36 +--
 .../DecisionTreeClassifierSuite.scala   |  31 +--
 .../ml/classification/GBTClassifierSuite.scala  |   4 +-
 .../RandomForestClassifierSuite.scala   |   5 +-
 .../regression/DecisionTreeRegressorSuite.scala |  14 --
 .../spark/ml/tree/impl/RandomForestSuite.scala  |  22 +-
 .../apache/spark/ml/tree/impl/TreeTests.scala   |  12 +-
 project/MimaExcludes.scala  |   7 -
 16 files changed, 107 insertions(+), 332 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/138a6316/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 8a57bfc..6648e78 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -168,7 +168,7 @@ object DecisionTreeClassifier extends 
DefaultParamsReadable[DecisionTreeClassifi
 @Since("1.4.0")
 class DecisionTreeClassificationModel private[ml] (
 @Since("1.4.0")override val uid: String,
-@Since("1.4.0")override val rootNode: ClassificationNode,
+@Since("1.4.0")override val rootNode: Node,
 @Since("1.6.0")override val numFeatures: Int,
 @Since("1.5.0")override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, 
DecisionTreeClassificationModel]
@@ -181,7 +181,7 @@ class DecisionTreeClassificationModel private[ml] (
* Construct a decision tree classification model.
* @param rootNode  Root node of tree, with other nodes attached.
*/
-  private[ml] def this(rootNode: ClassificationNode, numFeatures: Int, 
numClasses: Int) =
+  private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) =
 this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses)
 
   override def predict(features: Vector): Double = {
@@ -279,9 +279,8 @@ object DecisionTreeClassificationModel extends 
MLReadable[DecisionTreeClassifica
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
   val numFeatures = (metadata.metadata \ "numFeatures").extract[Int]
   val numClasses = (metadata.metadata \ "numClasses").extract[Int]
-  val root = loadTreeNodes(path, metadata, sparkSession, isClassification 
= true)
-  val model = new DecisionTreeClassificationModel(metadata.uid,
-root.asInstanceOf[ClassificationNode], numFeatures, numClasses)
+  val root = loadTreeNodes(path, metadata, sparkSession)
+  val model = new DecisionTreeClassificationModel(metadata.uid, root, 
numFeatures, numClasses)
   metadata.getAndSetParams(model)
   model
 }
@@ -296,10 +295,9 @@ object DecisionTreeClassificationModel extends 
MLReadable[DecisionTreeClassifica
 require(oldModel.algo == OldAlgo.Classification,
   s"Cannot convert non-classification DecisionTreeModel (old API) to" +
 s" DecisionTreeClassificationModel (new API).  Algo is: 
${oldModel.algo}")
-val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures, 
isClassif

spark git commit: [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation

2018-09-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master cb1b55cf7 -> 6f681d429


[SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema 
representation

## What changes were proposed in this pull request?

Improve testcase "image datasource test: read non image" to tolerate different 
schema representation.
Because file:/path and file:///path are both valid URI forms of the same path, 
the test case fails in some environments.

## How was this patch tested?

Manual.

Closes #22449 from WeichenXu123/image_url.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f681d42
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f681d42
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f681d42

Branch: refs/heads/master
Commit: 6f681d42964884d19bf22deb614550d712223117
Parents: cb1b55c
Author: WeichenXu 
Authored: Wed Sep 19 15:16:20 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Sep 19 15:16:20 2018 -0700

--
 .../spark/ml/source/image/ImageFileFormatSuite.scala | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f681d42/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
index 1a6a8d6..38e2513 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.source.image
 
+import java.net.URI
 import java.nio.file.Paths
 
 import org.apache.spark.SparkFunSuite
@@ -58,8 +59,14 @@ class ImageFileFormatSuite extends SparkFunSuite with 
MLlibTestSparkContext {
   .load(filePath)
 assert(df2.count() === 1)
 val result = df2.head()
-assert(result === invalidImageRow(
-  Paths.get(filePath).toAbsolutePath().normalize().toUri().toString))
+
+val resultOrigin = result.getStruct(0).getString(0)
+// covert `origin` to `java.net.URI` object and then compare.
+// because `file:/path` and `file:///path` are both valid URI-ifications
+assert(new URI(resultOrigin) === 
Paths.get(filePath).toAbsolutePath().normalize().toUri())
+
+// Compare other columns in the row to be the same with the 
`invalidImageRow`
+assert(result === invalidImageRow(resultOrigin))
   }
 
   test("image datasource partition test") {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation

2018-09-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.4 9fefb47fe -> 83a75a83c


[SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema 
representation

## What changes were proposed in this pull request?

Improve testcase "image datasource test: read non image" to tolerate different 
schema representation.
Because file:/path and file:///path are both valid URI forms of the same path, 
the test case fails in some environments.

## How was this patch tested?

Manual.

Closes #22449 from WeichenXu123/image_url.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 6f681d42964884d19bf22deb614550d712223117)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83a75a83
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83a75a83
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83a75a83

Branch: refs/heads/branch-2.4
Commit: 83a75a83cb24d20d4c2df5389bb8db34ad0335d9
Parents: 9fefb47
Author: WeichenXu 
Authored: Wed Sep 19 15:16:20 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Sep 19 15:16:30 2018 -0700

--
 .../spark/ml/source/image/ImageFileFormatSuite.scala | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/83a75a83/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
index 1a6a8d6..38e2513 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.source.image
 
+import java.net.URI
 import java.nio.file.Paths
 
 import org.apache.spark.SparkFunSuite
@@ -58,8 +59,14 @@ class ImageFileFormatSuite extends SparkFunSuite with 
MLlibTestSparkContext {
   .load(filePath)
 assert(df2.count() === 1)
 val result = df2.head()
-assert(result === invalidImageRow(
-  Paths.get(filePath).toAbsolutePath().normalize().toUri().toString))
+
+val resultOrigin = result.getStruct(0).getString(0)
+// covert `origin` to `java.net.URI` object and then compare.
+// because `file:/path` and `file:///path` are both valid URI-ifications
+assert(new URI(resultOrigin) === 
Paths.get(filePath).toAbsolutePath().normalize().toUri())
+
+// Compare other columns in the row to be the same with the 
`invalidImageRow`
+assert(result === invalidImageRow(resultOrigin))
   }
 
   test("image datasource partition test") {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25345][ML] Deprecate public APIs from ImageSchema

2018-09-08 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 01c3dfab1 -> 08c02e637


[SPARK-25345][ML] Deprecate public APIs from ImageSchema

## What changes were proposed in this pull request?

Deprecate public APIs from ImageSchema.

## How was this patch tested?

N/A

Closes #22349 from WeichenXu123/image_api_deprecate.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08c02e63
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08c02e63
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08c02e63

Branch: refs/heads/master
Commit: 08c02e637ac601df2fe890b8b5a7a049bdb4541b
Parents: 01c3dfa
Author: WeichenXu 
Authored: Sat Sep 8 09:09:14 2018 -0700
Committer: Xiangrui Meng 
Committed: Sat Sep 8 09:09:14 2018 -0700

--
 .../main/scala/org/apache/spark/ml/image/ImageSchema.scala   | 4 
 python/pyspark/ml/image.py   | 8 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/08c02e63/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala 
b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
index dcc40b6..0b13eef 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
@@ -198,6 +198,8 @@ object ImageSchema {
* @return DataFrame with a single column "image" of images;
* see ImageSchema for the details
*/
+  @deprecated("use `spark.read.format(\"image\").load(path)` and this 
`readImages` will be " +
+"removed in 3.0.0.", "2.4.0")
   def readImages(path: String): DataFrame = readImages(path, null, false, -1, 
false, 1.0, 0)
 
   /**
@@ -218,6 +220,8 @@ object ImageSchema {
* @return DataFrame with a single column "image" of images;
* see ImageSchema for the details
*/
+  @deprecated("use `spark.read.format(\"image\").load(path)` and this 
`readImages` will be " +
+"removed in 3.0.0.", "2.4.0")
   def readImages(
   path: String,
   sparkSession: SparkSession,

http://git-wip-us.apache.org/repos/asf/spark/blob/08c02e63/python/pyspark/ml/image.py
--
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index ef6785b..edb90a3 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -25,8 +25,10 @@
 """
 
 import sys
+import warnings
 
 import numpy as np
+
 from pyspark import SparkContext
 from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string
 from pyspark.sql import DataFrame, SparkSession
@@ -207,6 +209,9 @@ class _ImageSchema(object):
 .. note:: If sample ratio is less than 1, sampling uses a PathFilter 
that is efficient but
 potentially non-deterministic.
 
+.. note:: Deprecated in 2.4.0. Use 
`spark.read.format("image").load(path)` instead and
+this `readImages` will be removed in 3.0.0.
+
 :param str path: Path to the image directory.
 :param bool recursive: Recursive search flag.
 :param int numPartitions: Number of DataFrame partitions.
@@ -222,7 +227,8 @@ class _ImageSchema(object):
 
 .. versionadded:: 2.3.0
 """
-
+warnings.warn("`ImageSchema.readImage` is deprecated. " +
+  "Use `spark.read.format(\"image\").load(path)` 
instead.", DeprecationWarning)
 spark = SparkSession.builder.getOrCreate()
 image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema
 jsession = spark._jsparkSession


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25345][ML] Deprecate public APIs from ImageSchema

2018-09-08 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.4 80567fad4 -> 904192ad1


[SPARK-25345][ML] Deprecate public APIs from ImageSchema

## What changes were proposed in this pull request?

Deprecate public APIs from ImageSchema.

## How was this patch tested?

N/A

Closes #22349 from WeichenXu123/image_api_deprecate.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 08c02e637ac601df2fe890b8b5a7a049bdb4541b)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/904192ad
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/904192ad
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/904192ad

Branch: refs/heads/branch-2.4
Commit: 904192ad18ff09cc5874e09b03447dd5f7754963
Parents: 80567fa
Author: WeichenXu 
Authored: Sat Sep 8 09:09:14 2018 -0700
Committer: Xiangrui Meng 
Committed: Sat Sep 8 09:09:33 2018 -0700

--
 .../main/scala/org/apache/spark/ml/image/ImageSchema.scala   | 4 
 python/pyspark/ml/image.py   | 8 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/904192ad/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala 
b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
index dcc40b6..0b13eef 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala
@@ -198,6 +198,8 @@ object ImageSchema {
* @return DataFrame with a single column "image" of images;
* see ImageSchema for the details
*/
+  @deprecated("use `spark.read.format(\"image\").load(path)` and this 
`readImages` will be " +
+"removed in 3.0.0.", "2.4.0")
   def readImages(path: String): DataFrame = readImages(path, null, false, -1, 
false, 1.0, 0)
 
   /**
@@ -218,6 +220,8 @@ object ImageSchema {
* @return DataFrame with a single column "image" of images;
* see ImageSchema for the details
*/
+  @deprecated("use `spark.read.format(\"image\").load(path)` and this 
`readImages` will be " +
+"removed in 3.0.0.", "2.4.0")
   def readImages(
   path: String,
   sparkSession: SparkSession,

http://git-wip-us.apache.org/repos/asf/spark/blob/904192ad/python/pyspark/ml/image.py
--
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index ef6785b..edb90a3 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -25,8 +25,10 @@
 """
 
 import sys
+import warnings
 
 import numpy as np
+
 from pyspark import SparkContext
 from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string
 from pyspark.sql import DataFrame, SparkSession
@@ -207,6 +209,9 @@ class _ImageSchema(object):
 .. note:: If sample ratio is less than 1, sampling uses a PathFilter 
that is efficient but
 potentially non-deterministic.
 
+.. note:: Deprecated in 2.4.0. Use 
`spark.read.format("image").load(path)` instead and
+this `readImages` will be removed in 3.0.0.
+
 :param str path: Path to the image directory.
 :param bool recursive: Recursive search flag.
 :param int numPartitions: Number of DataFrame partitions.
@@ -222,7 +227,8 @@ class _ImageSchema(object):
 
 .. versionadded:: 2.3.0
 """
-
+warnings.warn("`ImageSchema.readImage` is deprecated. " +
+  "Use `spark.read.format(\"image\").load(path)` 
instead.", DeprecationWarning)
 spark = SparkSession.builder.getOrCreate()
 image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema
 jsession = spark._jsparkSession


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-22666][ML][SQL] Spark datasource for image format

2018-09-05 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c66eef844 -> 925449283


[SPARK-22666][ML][SQL] Spark datasource for image format

## What changes were proposed in this pull request?

Implement an image schema datasource.

This image datasource supports:
  - partition discovery (loading partitioned images)
  - dropImageFailures (the same behavior as `ImageSchema.readImage`)
  - path wildcard matching (the same behavior as `ImageSchema.readImage`)
  - loading recursively from a directory (different from `ImageSchema.readImage`; 
use a path such as `/path/to/dir/**`)

This datasource does **NOT** support:
  - specifying `numPartitions` (it will be determined by the datasource automatically)
  - sampling (you can use `df.sample` later, but the sampling operator won't be 
pushed down to the datasource)

## How was this patch tested?
Unit tests.

## Benchmark
I benchmarked and compared the time cost of the old `ImageSchema.read` API and my 
image datasource.

**cluster**: 4 nodes, each with 64GB memory, 8 cores CPU
**test dataset**: Flickr8k_Dataset (about 8091 images)

**time cost**:
- My image datasource time (automatically generate 258 partitions):  38.04s
- `ImageSchema.read` time (set 16 partitions): 68.4s
- `ImageSchema.read` time (set 258 partitions):  90.6s

**time cost when doubling the number of images (clone Flickr8k_Dataset and 
load twice as many images)**:
- My image datasource time (automatically generate 515 partitions):  95.4s
- `ImageSchema.read` (set 32 partitions): 109s
- `ImageSchema.read` (set 515 partitions):  105s

So we can see that my image datasource implementation (this PR) brings some 
performance improvement compared with the old `ImageSchema.read` API.

Closes #22328 from WeichenXu123/image_datasource.

Authored-by: WeichenXu 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92544928
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92544928
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92544928

Branch: refs/heads/master
Commit: 925449283dcaef80e0f77e60aea6ef988bd697b4
Parents: c66eef8
Author: WeichenXu 
Authored: Wed Sep 5 11:59:00 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Sep 5 11:59:00 2018 -0700

--
 .../images/kittens/29.5.a_b_EGDP022204.jpg  | Bin 27295 -> 0 bytes
 data/mllib/images/kittens/54893.jpg | Bin 35914 -> 0 bytes
 data/mllib/images/kittens/DP153539.jpg  | Bin 26354 -> 0 bytes
 data/mllib/images/kittens/DP802813.jpg  | Bin 30432 -> 0 bytes
 data/mllib/images/kittens/not-image.txt |   1 -
 data/mllib/images/multi-channel/BGRA.png| Bin 683 -> 0 bytes
 .../images/multi-channel/BGRA_alpha_60.png  | Bin 747 -> 0 bytes
 data/mllib/images/multi-channel/chr30.4.184.jpg | Bin 59472 -> 0 bytes
 data/mllib/images/multi-channel/grayscale.jpg   | Bin 36728 -> 0 bytes
 .../origin/kittens/29.5.a_b_EGDP022204.jpg  | Bin 0 -> 27295 bytes
 data/mllib/images/origin/kittens/54893.jpg  | Bin 0 -> 35914 bytes
 data/mllib/images/origin/kittens/DP153539.jpg   | Bin 0 -> 26354 bytes
 data/mllib/images/origin/kittens/DP802813.jpg   | Bin 0 -> 30432 bytes
 data/mllib/images/origin/kittens/not-image.txt  |   1 +
 data/mllib/images/origin/license.txt|  13 ++
 data/mllib/images/origin/multi-channel/BGRA.png | Bin 0 -> 683 bytes
 .../origin/multi-channel/BGRA_alpha_60.png  | Bin 0 -> 747 bytes
 .../images/origin/multi-channel/chr30.4.184.jpg | Bin 0 -> 59472 bytes
 .../images/origin/multi-channel/grayscale.jpg   | Bin 0 -> 36728 bytes
 .../date=2018-01/29.5.a_b_EGDP022204.jpg| Bin 0 -> 27295 bytes
 .../cls=kittens/date=2018-01/not-image.txt  |   1 +
 .../cls=kittens/date=2018-02/54893.jpg  | Bin 0 -> 35914 bytes
 .../cls=kittens/date=2018-02/DP153539.jpg   | Bin 0 -> 26354 bytes
 .../cls=kittens/date=2018-02/DP802813.jpg   | Bin 0 -> 30432 bytes
 .../cls=multichannel/date=2018-01/BGRA.png  | Bin 0 -> 683 bytes
 .../date=2018-01/BGRA_alpha_60.png  | Bin 0 -> 747 bytes
 .../date=2018-02/chr30.4.184.jpg| Bin 0 -> 59472 bytes
 .../cls=multichannel/date=2018-02/grayscale.jpg | Bin 0 -> 36728 bytes
 apache.spark.sql.sources.DataSourceRegister |   1 +
 .../spark/ml/source/image/ImageDataSource.scala |  53 +
 .../spark/ml/source/image/ImageFileFormat.scala | 100 
 .../spark/ml/source/image/ImageOptions.scala|  32 +
 .../spark/ml/image/ImageSchemaSuite.scala   |   2 +-
 .../ml/source/image/ImageFileFormatSuite.scala  | 119 +++
 python/pyspark/ml/image.py  |   2 +-
 python/pyspark/ml/tests.py  |   4 +-
 36 files changed, 324 insertions(+), 5 deletions(-)
---

spark git commit: [SPARK-25248][CORE] Audit barrier Scala APIs for 2.4

2018-09-04 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3aa60282c -> 061bb01d9


[SPARK-25248][CORE] Audit barrier Scala APIs for 2.4

## What changes were proposed in this pull request?

I made one pass over barrier APIs added to Spark 2.4 and updated some scopes 
and docs. I will update Python docs once the Scala doc is reviewed.

One major issue is that `BarrierTaskContext` extends `TaskContextImpl`, which 
exposes some public methods. And internally there were several direct 
references to `TaskContextImpl` methods instead of `TaskContext`. This PR moved 
some methods from `TaskContextImpl` to `TaskContext`, remaining package 
private, and used delegate methods to avoid inheriting `TaskContextImpl` and 
exposing unnecessary APIs.

TODOs:
- [x] scala doc
- [x] python doc (#22261 ).

Closes #22240 from mengxr/SPARK-25248.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/061bb01d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/061bb01d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/061bb01d

Branch: refs/heads/master
Commit: 061bb01d9b99911353e66a90abc3164c467fcae1
Parents: 3aa6028
Author: Xiangrui Meng 
Authored: Tue Sep 4 09:55:53 2018 -0700
Committer: Xiangrui Meng 
Committed: Tue Sep 4 09:55:53 2018 -0700

--
 .../org/apache/spark/BarrierTaskContext.scala   | 114 +++
 .../org/apache/spark/BarrierTaskInfo.scala  |   2 +-
 .../scala/org/apache/spark/TaskContext.scala|  14 +++
 .../org/apache/spark/TaskContextImpl.scala  |  15 +--
 .../apache/spark/api/python/PythonRunner.scala  |   2 +-
 .../main/scala/org/apache/spark/rdd/RDD.scala   |  10 +-
 .../scala/org/apache/spark/rdd/RDDBarrier.scala |  22 ++--
 .../scala/org/apache/spark/scheduler/Task.scala |  35 +++---
 .../scala/org/apache/spark/util/Utils.scala |   2 +-
 project/MimaExcludes.scala  |   7 ++
 .../spark/sql/internal/ReadOnlySQLConf.scala|   4 +-
 11 files changed, 163 insertions(+), 64 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/061bb01d/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index 3901f96..90a5c41 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -24,25 +24,22 @@ import scala.language.postfixOps
 
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.internal.Logging
 import org.apache.spark.memory.TaskMemoryManager
-import org.apache.spark.metrics.MetricsSystem
+import org.apache.spark.metrics.source.Source
 import org.apache.spark.rpc.{RpcEndpointRef, RpcTimeout}
-import org.apache.spark.util.{RpcUtils, Utils}
-
-/** A [[TaskContext]] with extra info and tooling for a barrier stage. */
-class BarrierTaskContext(
-override val stageId: Int,
-override val stageAttemptNumber: Int,
-override val partitionId: Int,
-override val taskAttemptId: Long,
-override val attemptNumber: Int,
-override val taskMemoryManager: TaskMemoryManager,
-localProperties: Properties,
-@transient private val metricsSystem: MetricsSystem,
-// The default value is only used in tests.
-override val taskMetrics: TaskMetrics = TaskMetrics.empty)
-  extends TaskContextImpl(stageId, stageAttemptNumber, partitionId, 
taskAttemptId, attemptNumber,
-  taskMemoryManager, localProperties, metricsSystem, taskMetrics) {
+import org.apache.spark.shuffle.FetchFailedException
+import org.apache.spark.util._
+
+/**
+ * :: Experimental ::
+ * A [[TaskContext]] with extra contextual info and tooling for tasks in a 
barrier stage.
+ * Use [[BarrierTaskContext#get]] to obtain the barrier context for a running 
barrier task.
+ */
+@Experimental
+@Since("2.4.0")
+class BarrierTaskContext private[spark] (
+taskContext: TaskContext) extends TaskContext with Logging {
 
   // Find the driver side RPCEndpointRef of the coordinator that handles all 
the barrier() calls.
   private val barrierCoordinator: RpcEndpointRef = {
@@ -68,7 +65,7 @@ class BarrierTaskContext(
*
* CAUTION! In a barrier stage, each task must have the same number of 
barrier() calls, in all
* possible code branches. Otherwise, you may get the job hanging or a 
SparkException after
-   * timeout. Some examples of misuses listed below:
+   * timeout. Some examples of '''misuses''' are listed below:
* 1. Only call barrier() function on a subset of all the tasks in the same 
barrier stage, it

spark git commit: [SPARK-25248][.1][PYSPARK] update barrier Python API

2018-08-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3864480e1 -> 20b7c684c


[SPARK-25248][.1][PYSPARK] update barrier Python API

## What changes were proposed in this pull request?

I made one pass over the Python APIs for barrier mode and updated them to match 
the Scala doc in #22240 . Major changes:

* export the public classes
* expand the docs
* add doc for BarrierTaskInfo.address

cc: jiangxb1987

Closes #22261 from mengxr/SPARK-25248.1.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20b7c684
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20b7c684
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20b7c684

Branch: refs/heads/master
Commit: 20b7c684cc4a8136b9a9c56390a4948de04e7c34
Parents: 3864480
Author: Xiangrui Meng 
Authored: Wed Aug 29 07:22:03 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Aug 29 07:22:03 2018 -0700

--
 python/pyspark/__init__.py| 12 +---
 python/pyspark/rdd.py | 22 ++
 python/pyspark/taskcontext.py | 26 +-
 3 files changed, 44 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/20b7c684/python/pyspark/__init__.py
--
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index 5821891..ee153af 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -36,7 +36,12 @@ Public classes:
   Finer-grained cache persistence levels.
   - :class:`TaskContext`:
   Information about the current running task, available on the workers and 
experimental.
-
+  - :class:`RDDBarrier`:
+  Wraps an RDD under a barrier stage for barrier execution.
+  - :class:`BarrierTaskContext`:
+  A :class:`TaskContext` that provides extra info and tooling for barrier 
execution.
+  - :class:`BarrierTaskInfo`:
+  Information about a barrier task.
 """
 
 from functools import wraps
@@ -44,14 +49,14 @@ import types
 
 from pyspark.conf import SparkConf
 from pyspark.context import SparkContext
-from pyspark.rdd import RDD
+from pyspark.rdd import RDD, RDDBarrier
 from pyspark.files import SparkFiles
 from pyspark.storagelevel import StorageLevel
 from pyspark.accumulators import Accumulator, AccumulatorParam
 from pyspark.broadcast import Broadcast
 from pyspark.serializers import MarshalSerializer, PickleSerializer
 from pyspark.status import *
-from pyspark.taskcontext import TaskContext
+from pyspark.taskcontext import TaskContext, BarrierTaskContext, 
BarrierTaskInfo
 from pyspark.profiler import Profiler, BasicProfiler
 from pyspark.version import __version__
 from pyspark._globals import _NoValue
@@ -113,4 +118,5 @@ __all__ = [
 "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", 
"Broadcast",
 "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer",
 "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", 
"BasicProfiler", "TaskContext",
+"RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo",
 ]

http://git-wip-us.apache.org/repos/asf/spark/blob/20b7c684/python/pyspark/rdd.py
--
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 380475e..b317156 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2390,7 +2390,18 @@ class RDD(object):
 """
 .. note:: Experimental
 
-Indicates that Spark must launch the tasks together for the current 
stage.
+Marks the current stage as a barrier stage, where Spark must launch 
all tasks together.
+In case of a task failure, instead of only restarting the failed task, 
Spark will abort the
+entire stage and relaunch all tasks for this stage.
+The barrier execution mode feature is experimental and it only handles 
limited scenarios.
+Please read the linked SPIP and design docs to understand the 
limitations and future plans.
+
+:return: an :class:`RDDBarrier` instance that provides actions within 
a barrier stage.
+
+.. seealso:: :class:`BarrierTaskContext`
+.. seealso:: `SPIP: Barrier Execution Mode \
+<http://jira.apache.org/jira/browse/SPARK-24374>`_
+.. seealso:: `Design Doc 
<https://jira.apache.org/jira/browse/SPARK-24582>`_
 
 .. versionadded:: 2.4.0
 """
@@ -2430,8 +2441,8 @@ class RDDBarrier(object):
 """
 .. note:: Exper

spark git commit: [SPARK-25266][CORE] Fix memory leak in Barrier Execution Mode

2018-08-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 1fd59c129 -> 3864480e1


[SPARK-25266][CORE] Fix memory leak in Barrier Execution Mode

## What changes were proposed in this pull request?

BarrierCoordinator uses Timer and TimerTask. `TimerTask#cancel()` is invoked in 
ContextBarrierState#cancelTimerTask but `Timer#purge()` is never invoked.

Once a TimerTask is scheduled, the reference to it is not released until 
`Timer#purge()` is invoked even though `TimerTask#cancel()` is invoked.

## How was this patch tested?

I checked the number of instances related to the TimerTask using jmap.

Closes #22258 from sarutak/fix-barrierexec-oom.

Authored-by: sarutak 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3864480e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3864480e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3864480e

Branch: refs/heads/master
Commit: 3864480e14a4961720cc1be43635c7c7dec08c09
Parents: 1fd59c1
Author: sarutak 
Authored: Wed Aug 29 07:13:13 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Aug 29 07:13:13 2018 -0700

--
 core/src/main/scala/org/apache/spark/BarrierCoordinator.scala | 1 +
 core/src/main/scala/org/apache/spark/BarrierTaskContext.scala | 1 +
 2 files changed, 2 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3864480e/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
--
diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala 
b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
index 5e546c6..6439ca5 100644
--- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala
@@ -123,6 +123,7 @@ private[spark] class BarrierCoordinator(
 private def cancelTimerTask(): Unit = {
   if (timerTask != null) {
 timerTask.cancel()
+timer.purge()
 timerTask = null
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/3864480e/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index de82798..3901f96 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -140,6 +140,7 @@ class BarrierTaskContext(
 throw e
 } finally {
   timerTask.cancel()
+  timer.purge()
 }
   }
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25234][SPARKR] avoid integer overflow in parallelize

2018-08-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.3 fcc9bd632 -> 42c1fdd22


[SPARK-25234][SPARKR] avoid integer overflow in parallelize

## What changes were proposed in this pull request?

`parallelize` uses integer multiplication to determine the split indices. It 
might cause integer overflow.

## How was this patch tested?

unit test

Closes #5 from mengxr/SPARK-25234.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 
(cherry picked from commit 9714fa547325ed7b6a8066a88957537936b233dd)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42c1fdd2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42c1fdd2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42c1fdd2

Branch: refs/heads/branch-2.3
Commit: 42c1fdd229b3cf19ff804b7516eae9d36ae50c81
Parents: fcc9bd6
Author: Xiangrui Meng 
Authored: Fri Aug 24 15:03:00 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Aug 24 15:04:11 2018 -0700

--
 R/pkg/R/context.R| 9 -
 R/pkg/tests/fulltests/test_context.R | 7 +++
 2 files changed, 11 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/42c1fdd2/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 443c2ff..25e2d15 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -138,11 +138,10 @@ parallelize <- function(sc, coll, numSlices = 1) {
 
   sizeLimit <- getMaxAllocationLimit(sc)
   objectSize <- object.size(coll)
+  len <- length(coll)
 
   # For large objects we make sure the size of each slice is also smaller than 
sizeLimit
-  numSerializedSlices <- max(numSlices, ceiling(objectSize / sizeLimit))
-  if (numSerializedSlices > length(coll))
-numSerializedSlices <- length(coll)
+  numSerializedSlices <- min(len, max(numSlices, ceiling(objectSize / 
sizeLimit)))
 
   # Generate the slice ids to put each row
   # For instance, for numSerializedSlices of 22, length of 50
@@ -153,8 +152,8 @@ parallelize <- function(sc, coll, numSlices = 1) {
   splits <- if (numSerializedSlices > 0) {
 unlist(lapply(0: (numSerializedSlices - 1), function(x) {
   # nolint start
-  start <- trunc((x * length(coll)) / numSerializedSlices)
-  end <- trunc(((x + 1) * length(coll)) / numSerializedSlices)
+  start <- trunc((as.numeric(x) * len) / numSerializedSlices)
+  end <- trunc(((as.numeric(x) + 1) * len) / numSerializedSlices)
   # nolint end
   rep(start, end - start)
 }))

http://git-wip-us.apache.org/repos/asf/spark/blob/42c1fdd2/R/pkg/tests/fulltests/test_context.R
--
diff --git a/R/pkg/tests/fulltests/test_context.R 
b/R/pkg/tests/fulltests/test_context.R
index f0d0a51..288a271 100644
--- a/R/pkg/tests/fulltests/test_context.R
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -240,3 +240,10 @@ test_that("add and get file to be downloaded with Spark 
job on every node", {
   unlink(path, recursive = TRUE)
   sparkR.session.stop()
 })
+
+test_that("SPARK-25234: parallelize should not have integer overflow", {
+  sc <- sparkR.sparkContext(master = sparkRTestMaster)
+  # 47000 * 47000 exceeds integer range
+  parallelize(sc, 1:47000, 47000)
+  sparkR.session.stop()
+})


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25234][SPARKR] avoid integer overflow in parallelize

2018-08-24 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master f8346d2fc -> 9714fa547


[SPARK-25234][SPARKR] avoid integer overflow in parallelize

## What changes were proposed in this pull request?

`parallelize` uses integer multiplication to determine the split indices. It 
might cause integer overflow.

## How was this patch tested?

unit test

Closes #5 from mengxr/SPARK-25234.

Authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9714fa54
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9714fa54
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9714fa54

Branch: refs/heads/master
Commit: 9714fa547325ed7b6a8066a88957537936b233dd
Parents: f8346d2
Author: Xiangrui Meng 
Authored: Fri Aug 24 15:03:00 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Aug 24 15:03:00 2018 -0700

--
 R/pkg/R/context.R| 9 -
 R/pkg/tests/fulltests/test_context.R | 7 +++
 2 files changed, 11 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9714fa54/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 7e77ea4..f168ca7 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -138,11 +138,10 @@ parallelize <- function(sc, coll, numSlices = 1) {
 
   sizeLimit <- getMaxAllocationLimit(sc)
   objectSize <- object.size(coll)
+  len <- length(coll)
 
   # For large objects we make sure the size of each slice is also smaller than 
sizeLimit
-  numSerializedSlices <- max(numSlices, ceiling(objectSize / sizeLimit))
-  if (numSerializedSlices > length(coll))
-numSerializedSlices <- length(coll)
+  numSerializedSlices <- min(len, max(numSlices, ceiling(objectSize / 
sizeLimit)))
 
   # Generate the slice ids to put each row
   # For instance, for numSerializedSlices of 22, length of 50
@@ -153,8 +152,8 @@ parallelize <- function(sc, coll, numSlices = 1) {
   splits <- if (numSerializedSlices > 0) {
 unlist(lapply(0: (numSerializedSlices - 1), function(x) {
   # nolint start
-  start <- trunc((x * length(coll)) / numSerializedSlices)
-  end <- trunc(((x + 1) * length(coll)) / numSerializedSlices)
+  start <- trunc((as.numeric(x) * len) / numSerializedSlices)
+  end <- trunc(((as.numeric(x) + 1) * len) / numSerializedSlices)
   # nolint end
   rep(start, end - start)
 }))

http://git-wip-us.apache.org/repos/asf/spark/blob/9714fa54/R/pkg/tests/fulltests/test_context.R
--
diff --git a/R/pkg/tests/fulltests/test_context.R 
b/R/pkg/tests/fulltests/test_context.R
index f0d0a51..288a271 100644
--- a/R/pkg/tests/fulltests/test_context.R
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -240,3 +240,10 @@ test_that("add and get file to be downloaded with Spark 
job on every node", {
   unlink(path, recursive = TRUE)
   sparkR.session.stop()
 })
+
+test_that("SPARK-25234: parallelize should not have integer overflow", {
+  sc <- sparkR.sparkContext(master = sparkRTestMaster)
+  # 47000 * 47000 exceeds integer range
+  parallelize(sc, 1:47000, 47000)
+  sparkR.session.stop()
+})


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25095][PYSPARK] Python support for BarrierTaskContext

2018-08-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 42035a4fe -> ad45299d0


[SPARK-25095][PYSPARK] Python support for BarrierTaskContext

## What changes were proposed in this pull request?

Add method `barrier()` and `getTaskInfos()` in python TaskContext, these two 
methods are only allowed for barrier tasks.

## How was this patch tested?

Add new tests in `tests.py`

Closes #22085 from jiangxb1987/python.barrier.

Authored-by: Xingbo Jiang 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad45299d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad45299d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad45299d

Branch: refs/heads/master
Commit: ad45299d047c10472fd3a86103930fe7c54a4cf1
Parents: 42035a4
Author: Xingbo Jiang 
Authored: Tue Aug 21 15:54:30 2018 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 21 15:54:30 2018 -0700

--
 .../apache/spark/api/python/PythonRunner.scala  | 106 ++
 python/pyspark/serializers.py   |   7 +
 python/pyspark/taskcontext.py   | 144 +++
 python/pyspark/tests.py |  36 -
 python/pyspark/worker.py|  16 ++-
 5 files changed, 305 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ad45299d/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala 
b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
index 7b31857..f824191 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
@@ -20,12 +20,14 @@ package org.apache.spark.api.python
 import java.io._
 import java.net._
 import java.nio.charset.StandardCharsets
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.concurrent.atomic.AtomicBoolean
 
 import scala.collection.JavaConverters._
 
 import org.apache.spark._
 import org.apache.spark.internal.Logging
+import org.apache.spark.security.SocketAuthHelper
 import org.apache.spark.util._
 
 
@@ -76,6 +78,12 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
   // TODO: support accumulator in multiple UDF
   protected val accumulator = funcs.head.funcs.head.accumulator
 
+  // Expose a ServerSocket to support method calls via socket from Python side.
+  private[spark] var serverSocket: Option[ServerSocket] = None
+
+  // Authentication helper used when serving method calls via socket from 
Python side.
+  private lazy val authHelper = new SocketAuthHelper(SparkEnv.get.conf)
+
   def compute(
   inputIterator: Iterator[IN],
   partitionIndex: Int,
@@ -180,7 +188,73 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
 dataOut.writeInt(partitionIndex)
 // Python version of driver
 PythonRDD.writeUTF(pythonVer, dataOut)
+// Init a ServerSocket to accept method calls from Python side.
+val isBarrier = context.isInstanceOf[BarrierTaskContext]
+if (isBarrier) {
+  serverSocket = Some(new ServerSocket(/* port */ 0,
+/* backlog */ 1,
+InetAddress.getByName("localhost")))
+  // A call to accept() for ServerSocket shall block infinitely.
+  serverSocket.map(_.setSoTimeout(0))
+  new Thread("accept-connections") {
+setDaemon(true)
+
+override def run(): Unit = {
+  while (!serverSocket.get.isClosed()) {
+var sock: Socket = null
+try {
+  sock = serverSocket.get.accept()
+  // Wait for function call from python side.
+  sock.setSoTimeout(1)
+  val input = new DataInputStream(sock.getInputStream())
+  input.readInt() match {
+case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION =>
+  // The barrier() function may wait infinitely, socket 
shall not timeout
+  // before the function finishes.
+  sock.setSoTimeout(0)
+  barrierAndServe(sock)
+
+case _ =>
+  val out = new DataOutputStream(new BufferedOutputStream(
+sock.getOutputStream))
+  
writeUTF(BarrierTaskContextMessageProtocol.ERROR_UNRECOGNIZED_FUNCTION, out)
+  }
+} catch {
+  case e: SocketException if e.getMessage.contains("Socket 
closed") =>
+// It is possible that the ServerSo

spark git commit: [SPARK-25161][CORE] Fix several bugs in failure handling of barrier execution mode

2018-08-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master b8788b3e7 -> 5059255d9


[SPARK-25161][CORE] Fix several bugs in failure handling of barrier execution 
mode

## What changes were proposed in this pull request?

Fix several bugs in failure handling of barrier execution mode:
* Mark TaskSet for a barrier stage as zombie when a task attempt fails;
* Multiple barrier task failures from a single barrier stage should not trigger 
multiple stage retries;
* Barrier task failure from a previous failed stage attempt should not trigger 
stage retry;
* Fail the job when a task from a barrier ResultStage failed;
* RDD.isBarrier() should not rely on `ShuffleDependency`s.

## How was this patch tested?

Added corresponding test cases in `DAGSchedulerSuite` and 
`TaskSchedulerImplSuite`.

Closes #22158 from jiangxb1987/failure.

Authored-by: Xingbo Jiang 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5059255d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5059255d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5059255d

Branch: refs/heads/master
Commit: 5059255d91fc7a9810e013eba39e12d30291dd08
Parents: b8788b3
Author: Xingbo Jiang 
Authored: Tue Aug 21 08:25:02 2018 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 21 08:25:02 2018 -0700

--
 .../main/scala/org/apache/spark/rdd/RDD.scala   |   3 +-
 .../apache/spark/scheduler/DAGScheduler.scala   | 125 +++
 .../apache/spark/scheduler/TaskSetManager.scala |   4 +
 .../spark/scheduler/DAGSchedulerSuite.scala | 106 
 .../scheduler/TaskSchedulerImplSuite.scala  |  18 +++
 5 files changed, 200 insertions(+), 56 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5059255d/core/src/main/scala/org/apache/spark/rdd/RDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index cbc1143..374b846 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1863,7 +1863,8 @@ abstract class RDD[T: ClassTag](
 
   // From performance concern, cache the value to avoid repeatedly compute 
`isBarrier()` on a long
   // RDD chain.
-  @transient protected lazy val isBarrier_ : Boolean = 
dependencies.exists(_.rdd.isBarrier())
+  @transient protected lazy val isBarrier_ : Boolean =
+dependencies.filter(!_.isInstanceOf[ShuffleDependency[_, _, 
_]]).exists(_.rdd.isBarrier())
 }
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/5059255d/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 2b0ca13..6787250 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -1478,9 +1478,11 @@ private[spark] class DAGScheduler(
 
mapOutputTracker.unregisterAllMapOutput(failedMapStage.shuffleDep.shuffleId)
 
   case failedResultStage: ResultStage =>
-// Mark all the partitions of the result stage to be not 
finished, to ensure retry
-// all the tasks on resubmitted stage attempt.
-failedResultStage.activeJob.map(_.resetAllPartitions())
+// Abort the failed result stage since we may have committed 
output for some
+// partitions.
+val reason = "Could not recover from a failed barrier 
ResultStage. Most recent " +
+  s"failure reason: $failureMessage"
+abortStage(failedResultStage, reason, None)
 }
   }
 
@@ -1553,62 +1555,75 @@ private[spark] class DAGScheduler(
 
 // Always fail the current stage and retry all the tasks when a 
barrier task fail.
 val failedStage = stageIdToStage(task.stageId)
-logInfo(s"Marking $failedStage (${failedStage.name}) as failed due to 
a barrier task " +
-  "failed.")
-val message = s"Stage failed because barrier task $task finished 
unsuccessfully.\n" +
-  failure.toErrorString
-try {
-  // killAllTaskAttempts will fail if a SchedulerBackend does not 
implement killTask.
-  val reason = s"Task $task from barrier stage $failedStage 
(${failedStage.name}) failed."
-  taskScheduler.killAllTaskAttempts(stageId, interruptThread = false, 
reason)
-} catch {
-  case e: UnsupportedOperationExcepti

spark git commit: [SPARK-24819][CORE] Fail fast when no enough slots to launch the barrier stage on job submitted

2018-08-15 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 4d8ae0d1c -> bfb74394a


[SPARK-24819][CORE] Fail fast when no enough slots to launch the barrier stage 
on job submitted

## What changes were proposed in this pull request?

We shall check whether the barrier stage requires more slots (to be able to 
launch all tasks in the barrier stage together) than the total number of active 
slots currently, and fail fast if trying to submit a barrier stage that 
requires more slots than current total number.

This PR proposes to add a new method `getNumSlots()` to try to get the total 
number of currently active slots in `SchedulerBackend`, support of this new 
method has been added to all the first-class scheduler backends except 
`MesosFineGrainedSchedulerBackend`.

## How was this patch tested?

Added new test cases in `BarrierStageOnSubmittedSuite`.

Closes #22001 from jiangxb1987/SPARK-24819.

Lead-authored-by: Xingbo Jiang 
Co-authored-by: Xiangrui Meng 
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfb74394
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfb74394
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfb74394

Branch: refs/heads/master
Commit: bfb74394a5513134ea1da9fcf4a1783b77dd64e4
Parents: 4d8ae0d
Author: Xingbo Jiang 
Authored: Wed Aug 15 13:31:28 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Aug 15 13:31:28 2018 -0700

--
 .../scala/org/apache/spark/SparkContext.scala   |  9 ++
 .../apache/spark/internal/config/package.scala  | 27 ++
 .../scheduler/BarrierJobAllocationFailed.scala  | 62 +
 .../apache/spark/scheduler/DAGScheduler.scala   | 88 ++-
 .../spark/scheduler/SchedulerBackend.scala  |  9 ++
 .../cluster/CoarseGrainedSchedulerBackend.scala |  6 ++
 .../scheduler/local/LocalSchedulerBackend.scala |  2 +
 .../spark/BarrierStageOnSubmittedSuite.scala| 91 ++--
 .../spark/ExecutorAllocationManagerSuite.scala  |  2 +
 .../org/apache/spark/SparkContextSuite.scala|  1 +
 .../CoarseGrainedSchedulerBackendSuite.scala| 89 ++-
 .../spark/scheduler/DAGSchedulerSuite.scala |  2 +-
 .../scheduler/ExternalClusterManagerSuite.scala |  1 +
 .../scheduler/SchedulerIntegrationSuite.scala   |  2 +
 .../scheduler/TaskSchedulerImplSuite.scala  |  1 +
 .../MesosFineGrainedSchedulerBackend.scala  |  4 +
 16 files changed, 364 insertions(+), 32 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bfb74394/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index a7ffb35..e5b1e0e 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1603,6 +1603,15 @@ class SparkContext(config: SparkConf) extends Logging {
   }
 
   /**
+   * Get the max number of tasks that can be concurrent launched currently.
+   * Note that please don't cache the value returned by this method, because 
the number can change
+   * due to add/remove executors.
+   *
+   * @return The max number of tasks that can be concurrent launched currently.
+   */
+  private[spark] def maxNumConcurrentTasks(): Int = 
schedulerBackend.maxNumConcurrentTasks()
+
+  /**
* Update the cluster manager on our scheduling needs. Three bits of 
information are included
* to help it make decisions.
* @param numExecutors The total number of executors we'd like to have. The 
cluster manager

http://git-wip-us.apache.org/repos/asf/spark/blob/bfb74394/core/src/main/scala/org/apache/spark/internal/config/package.scala
--
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala 
b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index eb08628..a8aa691 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -577,4 +577,31 @@ package object config {
   .timeConf(TimeUnit.SECONDS)
   .checkValue(v => v > 0, "The value should be a positive time value.")
   .createWithDefaultString("365d")
+
+  private[spark] val BARRIER_MAX_CONCURRENT_TASKS_CHECK_INTERVAL =
+ConfigBuilder("spark.scheduler.barrier.maxConcurrentTasksCheck.interval")
+  .doc("Time in seconds to wait between a max concurrent tasks check 
failure and the next " +
+"check. A max concurrent tasks check ensures the cluster can launch 
more concurrent " +
+

spark git commit: [SPARK-25045][CORE] Make `RDDBarrier.mapParititions` similar to `RDD.mapPartitions`

2018-08-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 66699c5c3 -> d90f1336d


[SPARK-25045][CORE] Make `RDDBarrier.mapParititions` similar to 
`RDD.mapPartitions`

## What changes were proposed in this pull request?

Signature of the function passed to `RDDBarrier.mapPartitions()` is different 
from that of `RDD.mapPartitions`. The latter doesn't take a `TaskContext`. We 
shall make the function signature the same to avoid confusion and misusage.

This PR proposes the following API changes:
- In `RDDBarrier`, migrate `mapPartitions` from
   ```
def mapPartitions[S: ClassTag](
f: (Iterator[T], BarrierTaskContext) => Iterator[S],
preservesPartitioning: Boolean = false): RDD[S]
}
   ```
to
   ```
def mapPartitions[S: ClassTag](
f: Iterator[T] => Iterator[S],
preservesPartitioning: Boolean = false): RDD[S]
}
   ```
- Add new static method to get a `BarrierTaskContext`:
   ```
object BarrierTaskContext {
   def get(): BarrierTaskContext
}
   ```

## How was this patch tested?

Existing test cases.

Author: Xingbo Jiang 

Closes #22026 from jiangxb1987/mapPartitions.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d90f1336
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d90f1336
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d90f1336

Branch: refs/heads/master
Commit: d90f1336d87199aac56fe227a0fe14ab0ae3a332
Parents: 66699c5
Author: Xingbo Jiang 
Authored: Tue Aug 7 17:32:41 2018 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 7 17:32:41 2018 -0700

--
 .../org/apache/spark/BarrierTaskContext.scala   | 14 +++--
 .../scala/org/apache/spark/rdd/RDDBarrier.scala |  7 +++
 .../spark/BarrierStageOnSubmittedSuite.scala| 22 ++--
 .../org/apache/spark/SparkContextSuite.scala|  6 --
 .../org/apache/spark/rdd/RDDBarrierSuite.scala  |  6 +++---
 .../scheduler/BarrierTaskContextSuite.scala | 15 -
 .../spark/scheduler/DAGSchedulerSuite.scala |  4 ++--
 7 files changed, 45 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d90f1336/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index 8e2b155..de82798 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -72,7 +72,8 @@ class BarrierTaskContext(
* 1. Only call barrier() function on a subset of all the tasks in the same 
barrier stage, it
* shall lead to timeout of the function call.
* {{{
-   *   rdd.barrier().mapPartitions { (iter, context) =>
+   *   rdd.barrier().mapPartitions { iter =>
+   *   val context = BarrierTaskContext.get()
*   if (context.partitionId() == 0) {
*   // Do nothing.
*   } else {
@@ -85,7 +86,8 @@ class BarrierTaskContext(
* 2. Include barrier() function in a try-catch code block, this may lead to 
timeout of the
* second function call.
* {{{
-   *   rdd.barrier().mapPartitions { (iter, context) =>
+   *   rdd.barrier().mapPartitions { iter =>
+   *   val context = BarrierTaskContext.get()
*   try {
*   // Do something that might throw an Exception.
*   doSomething()
@@ -152,3 +154,11 @@ class BarrierTaskContext(
 addressesStr.split(",").map(_.trim()).map(new BarrierTaskInfo(_))
   }
 }
+
+object BarrierTaskContext {
+  /**
+   * Return the currently active BarrierTaskContext. This can be called inside 
of user functions to
+   * access contextual information about running barrier tasks.
+   */
+  def get(): BarrierTaskContext = 
TaskContext.get().asInstanceOf[BarrierTaskContext]
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/d90f1336/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala
--
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala 
b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala
index 71f38bf..978e7c0 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala
@@ -28,7 +28,7 @@ class RDDBarrier[T: ClassTag](rdd: RDD[T]) {
 
   /**
* :: Experimental ::
-   * Maps partitions together with a provided 
[[org.apache.spark.BarrierTaskContext]].
+   * Generate a new barrier RDD by applying a function to each partitions of 
the prev RDD.
*
* `preservesPartitioning` indicates wheth

spark git commit: [SPARK-24954][CORE] Fail fast on job submit if run a barrier stage with dynamic resource allocation enabled

2018-08-03 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c32dbd6bd -> 92b48842b


[SPARK-24954][CORE] Fail fast on job submit if run a barrier stage with dynamic 
resource allocation enabled

## What changes were proposed in this pull request?

We don't support running a barrier stage with dynamic resource allocation 
enabled, because it can lead to confusing behavior (e.g., with dynamic resource 
allocation enabled, we may acquire some executors (but not enough to launch all 
the tasks in a barrier stage), later release them when the executor idle 
timeout expires, and then acquire them again).

We perform the check on job submit and fail fast if running a barrier stage 
with dynamic resource allocation enabled.

## How was this patch tested?

Added new test suite `BarrierStageOnSubmittedSuite` to cover all the fail fast 
cases that submitted a job containing one or more barrier stages.

Author: Xingbo Jiang 

Closes #21915 from jiangxb1987/SPARK-24954.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92b48842
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92b48842
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92b48842

Branch: refs/heads/master
Commit: 92b48842b944a3e430472294cdc3c481bad6b804
Parents: c32dbd6
Author: Xingbo Jiang 
Authored: Fri Aug 3 09:36:56 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Aug 3 09:36:56 2018 -0700

--
 .../apache/spark/scheduler/DAGScheduler.scala   | 25 +
 .../spark/BarrierStageOnSubmittedSuite.scala| 57 
 2 files changed, 71 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/92b48842/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 3dd0718..cf1fcbc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -364,6 +364,7 @@ class DAGScheduler(
*/
   def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: 
Int): ShuffleMapStage = {
 val rdd = shuffleDep.rdd
+checkBarrierStageWithDynamicAllocation(rdd)
 checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions)
 val numTasks = rdd.partitions.length
 val parents = getOrCreateParentStages(rdd, jobId)
@@ -385,6 +386,23 @@ class DAGScheduler(
   }
 
   /**
+   * We don't support run a barrier stage with dynamic resource allocation 
enabled, it shall lead
+   * to some confusing behaviors (eg. with dynamic resource allocation 
enabled, it may happen that
+   * we acquire some executors (but not enough to launch all the tasks in a 
barrier stage) and
+   * later release them due to executor idle time expire, and then acquire 
again).
+   *
+   * We perform the check on job submit and fail fast if running a barrier 
stage with dynamic
+   * resource allocation enabled.
+   *
+   * TODO SPARK-24942 Improve cluster resource management with jobs containing 
barrier stage
+   */
+  private def checkBarrierStageWithDynamicAllocation(rdd: RDD[_]): Unit = {
+if (rdd.isBarrier() && Utils.isDynamicAllocationEnabled(sc.getConf)) {
+  throw new 
SparkException(DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION)
+}
+  }
+
+  /**
* Create a ResultStage associated with the provided jobId.
*/
   private def createResultStage(
@@ -393,6 +411,7 @@ class DAGScheduler(
   partitions: Array[Int],
   jobId: Int,
   callSite: CallSite): ResultStage = {
+checkBarrierStageWithDynamicAllocation(rdd)
 checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size)
 val parents = getOrCreateParentStages(rdd, jobId)
 val id = nextStageId.getAndIncrement()
@@ -2001,4 +2020,10 @@ private[spark] object DAGScheduler {
   "PartitionPruningRDD). A workaround for first()/take() can be 
barrierRdd.collect().head " +
   "(scala) or barrierRdd.collect()[0] (python).\n" +
   "2. An RDD that depends on multiple barrier RDDs (eg. 
barrierRdd1.zip(barrierRdd2))."
+
+  // Error message when running a barrier stage with dynamic resource 
allocation enabled.
+  val ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION =
+"[SPARK-24942]: Barrier execution mode does not support dynamic resource 
allocation for " +
+  "now. You can disable dynamic resource allocation by setting Spark conf 
" +
+  "\"spark.dynamicAllocation.enabled\" to \"false\"."
 }

http://git-wip-us.apache.org/repos/asf/

spark git commit: [SPARK-24795][CORE][FOLLOWUP] Combine BarrierTaskContext with BarrierTaskContextImpl

2018-08-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master bbdcc3bf6 -> 29077a1d1


[SPARK-24795][CORE][FOLLOWUP] Combine BarrierTaskContext with 
BarrierTaskContextImpl

## What changes were proposed in this pull request?

According to https://github.com/apache/spark/pull/21758#discussion_r206746905 , 
current declaration of `BarrierTaskContext` didn't extend methods from 
`TaskContext`. Since `TaskContext` is an abstract class and we don't want to 
change it to a trait, we have to define class `BarrierTaskContext` directly.

## How was this patch tested?

Existing tests.

Author: Xingbo Jiang 

Closes #21972 from jiangxb1987/BarrierTaskContext.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29077a1d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29077a1d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29077a1d

Branch: refs/heads/master
Commit: 29077a1d15e49dfafe7f2eab963830ba9cc6b29a
Parents: bbdcc3b
Author: Xingbo Jiang 
Authored: Thu Aug 2 17:19:42 2018 -0700
Committer: Xiangrui Meng 
Committed: Thu Aug 2 17:19:42 2018 -0700

--
 .../org/apache/spark/BarrierTaskContext.scala   | 60 +++-
 .../apache/spark/BarrierTaskContextImpl.scala   | 49 
 .../scala/org/apache/spark/rdd/RDDBarrier.scala |  2 +-
 .../scala/org/apache/spark/scheduler/Task.scala |  2 +-
 4 files changed, 59 insertions(+), 54 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/29077a1d/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
index 4c35862..ba30368 100644
--- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -17,20 +17,71 @@
 
 package org.apache.spark
 
+import java.util.Properties
+
 import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.memory.TaskMemoryManager
+import org.apache.spark.metrics.MetricsSystem
 
 /** A [[TaskContext]] with extra info and tooling for a barrier stage. */
-trait BarrierTaskContext extends TaskContext {
+class BarrierTaskContext(
+override val stageId: Int,
+override val stageAttemptNumber: Int,
+override val partitionId: Int,
+override val taskAttemptId: Long,
+override val attemptNumber: Int,
+override val taskMemoryManager: TaskMemoryManager,
+localProperties: Properties,
+@transient private val metricsSystem: MetricsSystem,
+// The default value is only used in tests.
+override val taskMetrics: TaskMetrics = TaskMetrics.empty)
+  extends TaskContextImpl(stageId, stageAttemptNumber, partitionId, 
taskAttemptId, attemptNumber,
+  taskMemoryManager, localProperties, metricsSystem, taskMetrics) {
 
   /**
* :: Experimental ::
* Sets a global barrier and waits until all tasks in this stage hit this 
barrier. Similar to
* MPI_Barrier function in MPI, the barrier() function call blocks until all 
tasks in the same
* stage have reached this routine.
+   *
+   * CAUTION! In a barrier stage, each task must have the same number of 
barrier() calls, in all
+   * possible code branches. Otherwise, you may get the job hanging or a 
SparkException after
+   * timeout. Some examples of misuses listed below:
+   * 1. Only call barrier() function on a subset of all the tasks in the same 
barrier stage, it
+   * shall lead to timeout of the function call.
+   * {{{
+   *   rdd.barrier().mapPartitions { (iter, context) =>
+   *   if (context.partitionId() == 0) {
+   *   // Do nothing.
+   *   } else {
+   *   context.barrier()
+   *   }
+   *   iter
+   *   }
+   * }}}
+   *
+   * 2. Include barrier() function in a try-catch code block, this may lead to 
timeout of the
+   * second function call.
+   * {{{
+   *   rdd.barrier().mapPartitions { (iter, context) =>
+   *   try {
+   *   // Do something that might throw an Exception.
+   *   doSomething()
+   *   context.barrier()
+   *   } catch {
+   *   case e: Exception => logWarning("...", e)
+   *   }
+   *   context.barrier()
+   *   iter
+   *   }
+   * }}}
*/
   @Experimental
   @Since("2.4.0")
-  def barrier(): Unit
+  def barrier(): Unit = {
+// TODO SPARK-24817 implement global barrier.
+  }
 
   /**
* :: Experimental ::
@@ -38,5 +89,8 @@ trait BarrierTaskContext extends TaskContext {
*/
   @Experimental
   @Since("2.4.0")
-  def getTaskInfos(): Array[BarrierTaskInfo]
+  def getTaskInfos(): Array[

spark git commit: [SPARK-24820][SPARK-24821][CORE] Fail fast when submitted job contains a barrier stage with unsupported RDD chain pattern

2018-08-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master ad2e63662 -> 38e4699c9


[SPARK-24820][SPARK-24821][CORE] Fail fast when submitted job contains a 
barrier stage with unsupported RDD chain pattern

## What changes were proposed in this pull request?

Check on job submit to make sure we don't launch a barrier stage with 
unsupported RDD chain pattern. The following patterns are not supported:
- Ancestor RDDs that have different number of partitions from the resulting RDD 
(eg. union()/coalesce()/first()/PartitionPruningRDD);
- An RDD that depends on multiple barrier RDDs (eg. 
barrierRdd1.zip(barrierRdd2)).

## How was this patch tested?

Add test cases in `BarrierStageOnSubmittedSuite`.

Author: Xingbo Jiang 

Closes #21927 from jiangxb1987/SPARK-24820.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/38e4699c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/38e4699c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/38e4699c

Branch: refs/heads/master
Commit: 38e4699c978e56a0f24b8efb94fd3206cdd8b3fe
Parents: ad2e636
Author: Xingbo Jiang 
Authored: Thu Aug 2 09:36:26 2018 -0700
Committer: Xiangrui Meng 
Committed: Thu Aug 2 09:36:26 2018 -0700

--
 .../apache/spark/scheduler/DAGScheduler.scala   |  55 ++-
 .../spark/BarrierStageOnSubmittedSuite.scala| 153 +++
 2 files changed, 207 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/38e4699c/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 4858af7..3dd0718 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -39,7 +39,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.partial.{ApproximateActionListener, 
ApproximateEvaluator, PartialResult}
-import org.apache.spark.rdd.{RDD, RDDCheckpointData}
+import org.apache.spark.rdd.{PartitionPruningRDD, RDD, RDDCheckpointData}
 import org.apache.spark.rpc.RpcTimeout
 import org.apache.spark.storage._
 import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
@@ -341,6 +341,22 @@ class DAGScheduler(
   }
 
   /**
+   * Check to make sure we don't launch a barrier stage with unsupported RDD 
chain pattern. The
+   * following patterns are not supported:
+   * 1. Ancestor RDDs that have different number of partitions from the 
resulting RDD (eg.
+   * union()/coalesce()/first()/take()/PartitionPruningRDD);
+   * 2. An RDD that depends on multiple barrier RDDs (eg. 
barrierRdd1.zip(barrierRdd2)).
+   */
+  private def checkBarrierStageWithRDDChainPattern(rdd: RDD[_], 
numTasksInStage: Int): Unit = {
+val predicate: RDD[_] => Boolean = (r =>
+  r.getNumPartitions == numTasksInStage && 
r.dependencies.filter(_.rdd.isBarrier()).size <= 1)
+if (rdd.isBarrier() && !traverseParentRDDsWithinStage(rdd, predicate)) {
+  throw new SparkException(
+
DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
+}
+  }
+
+  /**
* Creates a ShuffleMapStage that generates the given shuffle dependency's 
partitions. If a
* previously run stage generated the same shuffle data, this function will 
copy the output
* locations that are still available from the previous shuffle to avoid 
unnecessarily
@@ -348,6 +364,7 @@ class DAGScheduler(
*/
   def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: 
Int): ShuffleMapStage = {
 val rdd = shuffleDep.rdd
+checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions)
 val numTasks = rdd.partitions.length
 val parents = getOrCreateParentStages(rdd, jobId)
 val id = nextStageId.getAndIncrement()
@@ -376,6 +393,7 @@ class DAGScheduler(
   partitions: Array[Int],
   jobId: Int,
   callSite: CallSite): ResultStage = {
+checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size)
 val parents = getOrCreateParentStages(rdd, jobId)
 val id = nextStageId.getAndIncrement()
 val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, 
callSite)
@@ -451,6 +469,32 @@ class DAGScheduler(
 parents
   }
 
+  /**
+   * Traverses the given RDD and its ancestors within the same stage and 
checks whether all of the
+   * RDDs satisfy a given predicate.
+   */
+  private def traverseParentRDDsWithinStage(rdd: RDD[_], predicate: RDD[_] => 
Boolean): Boolean = {
+val visited = new HashSet[RDD[_]]

spark git commit: [SPARK-24557][ML] ClusteringEvaluator support array input

2018-08-02 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 166f34618 -> 57d994994


[SPARK-24557][ML] ClusteringEvaluator support array input

## What changes were proposed in this pull request?
ClusteringEvaluator support array input

## How was this patch tested?
added tests

Author: zhengruifeng 

Closes #21563 from zhengruifeng/clu_eval_support_array.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57d99499
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57d99499
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57d99499

Branch: refs/heads/master
Commit: 57d994994d27154f57f2724924c42beb2ab2e0e7
Parents: 166f346
Author: zhengruifeng 
Authored: Wed Aug 1 23:46:01 2018 -0700
Committer: Xiangrui Meng 
Committed: Wed Aug 1 23:46:01 2018 -0700

--
 .../spark/ml/evaluation/ClusteringEvaluator.scala| 15 +--
 .../ml/evaluation/ClusteringEvaluatorSuite.scala | 15 ++-
 2 files changed, 23 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/57d99499/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala
index 4353c46..a6d6b4e 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala
@@ -21,11 +21,10 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.ml.attribute.AttributeGroup
-import org.apache.spark.ml.linalg.{BLAS, DenseVector, SparseVector, Vector, 
Vectors, VectorUDT}
+import org.apache.spark.ml.linalg.{BLAS, DenseVector, SparseVector, Vector, 
Vectors}
 import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
 import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
-import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, 
Identifiable,
-  SchemaUtils}
+import org.apache.spark.ml.util._
 import org.apache.spark.sql.{Column, DataFrame, Dataset}
 import org.apache.spark.sql.functions.{avg, col, udf}
 import org.apache.spark.sql.types.DoubleType
@@ -107,15 +106,19 @@ class ClusteringEvaluator @Since("2.3.0") 
(@Since("2.3.0") override val uid: Str
 
   @Since("2.3.0")
   override def evaluate(dataset: Dataset[_]): Double = {
-SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
+SchemaUtils.validateVectorCompatibleColumn(dataset.schema, $(featuresCol))
 SchemaUtils.checkNumericType(dataset.schema, $(predictionCol))
 
+val vectorCol = DatasetUtils.columnToVector(dataset, $(featuresCol))
+val df = dataset.select(col($(predictionCol)),
+  vectorCol.as($(featuresCol), dataset.schema($(featuresCol)).metadata))
+
 ($(metricName), $(distanceMeasure)) match {
   case ("silhouette", "squaredEuclidean") =>
 SquaredEuclideanSilhouette.computeSilhouetteScore(
-  dataset, $(predictionCol), $(featuresCol))
+  df, $(predictionCol), $(featuresCol))
   case ("silhouette", "cosine") =>
-CosineSilhouette.computeSilhouetteScore(dataset, $(predictionCol), 
$(featuresCol))
+CosineSilhouette.computeSilhouetteScore(df, $(predictionCol), 
$(featuresCol))
 }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/57d99499/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala
index 2c175ff..e2d7756 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala
@@ -21,7 +21,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.Dataset
@@ -33,10 +33,17 @@ class ClusteringEvaluatorSuite
   import testImplicits._
 
   @tran

spark git commit: [SPARK-15064][ML] Locale support in StopWordsRemover

2018-06-12 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 1d7db65e9 -> 5d6a53d98


[SPARK-15064][ML] Locale support in StopWordsRemover

## What changes were proposed in this pull request?

Add locale support for `StopWordsRemover`.

## How was this patch tested?

[Scala|Python] unit tests.

Author: Lee Dongjin 

Closes #21501 from dongjinleekr/feature/SPARK-15064.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d6a53d9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d6a53d9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d6a53d9

Branch: refs/heads/master
Commit: 5d6a53d9831cc1e2115560db5cebe0eea2565dcd
Parents: 1d7db65
Author: Lee Dongjin 
Authored: Tue Jun 12 08:16:37 2018 -0700
Committer: Xiangrui Meng 
Committed: Tue Jun 12 08:16:37 2018 -0700

--
 .../spark/ml/feature/StopWordsRemover.scala | 30 ++--
 .../ml/feature/StopWordsRemoverSuite.scala  | 51 
 python/pyspark/ml/feature.py| 30 ++--
 python/pyspark/ml/tests.py  |  7 +++
 4 files changed, 109 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5d6a53d9/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index 3fcd84c..0f946dd 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.ml.feature
 
+import java.util.Locale
+
 import org.apache.spark.annotation.Since
 import org.apache.spark.ml.Transformer
-import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam}
+import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -84,7 +86,27 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") 
override val uid: String
   @Since("1.5.0")
   def getCaseSensitive: Boolean = $(caseSensitive)
 
-  setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), 
caseSensitive -> false)
+  /**
+   * Locale of the input for case insensitive matching. Ignored when 
[[caseSensitive]]
+   * is true.
+   * Default: Locale.getDefault.toString
+   * @group param
+   */
+  @Since("2.4.0")
+  val locale: Param[String] = new Param[String](this, "locale",
+"Locale of the input for case insensitive matching. Ignored when 
caseSensitive is true.",
+
ParamValidators.inArray[String](Locale.getAvailableLocales.map(_.toString)))
+
+  /** @group setParam */
+  @Since("2.4.0")
+  def setLocale(value: String): this.type = set(locale, value)
+
+  /** @group getParam */
+  @Since("2.4.0")
+  def getLocale: String = $(locale)
+
+  setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"),
+caseSensitive -> false, locale -> Locale.getDefault.toString)
 
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
@@ -95,8 +117,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") 
override val uid: String
 terms.filter(s => !stopWordsSet.contains(s))
   }
 } else {
-  // TODO: support user locale (SPARK-15064)
-  val toLower = (s: String) => if (s != null) s.toLowerCase else s
+  val lc = new Locale($(locale))
+  val toLower = (s: String) => if (s != null) s.toLowerCase(lc) else s
   val lowerStopWords = $(stopWords).map(toLower(_)).toSet
   udf { terms: Seq[String] =>
 terms.filter(s => !lowerStopWords.contains(toLower(s)))

http://git-wip-us.apache.org/repos/asf/spark/blob/5d6a53d9/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 21259a5..20972d1 100755
--- 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -65,6 +65,57 @@ class StopWordsRemoverSuite extends MLTest with 
DefaultReadWriteTest {
 testStopWordsRemover(remover, dataSet)
   }
 
+  test("StopWordsRemover with localed input (case insensitive)") {
+

spark git commit: [SPARK-19826][ML][PYTHON] add spark.ml Python API for PIC

2018-06-11 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 3e5b4ae63 -> a99d284c1


[SPARK-19826][ML][PYTHON] add spark.ml Python API for PIC

## What changes were proposed in this pull request?

add spark.ml Python API for PIC

## How was this patch tested?

add doctest

Author: Huaxin Gao 

Closes #21513 from huaxingao/spark--19826.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a99d284c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a99d284c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a99d284c

Branch: refs/heads/master
Commit: a99d284c16cc4e00ce7c83ecdc3db6facd467552
Parents: 3e5b4ae
Author: Huaxin Gao 
Authored: Mon Jun 11 12:15:14 2018 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 11 12:15:14 2018 -0700

--
 python/pyspark/ml/clustering.py | 184 ++-
 1 file changed, 179 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a99d284c/python/pyspark/ml/clustering.py
--
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b3d5fb1..4aa1cf8 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -19,14 +19,15 @@ import sys
 
 from pyspark import since, keyword_only
 from pyspark.ml.util import *
-from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
+from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, 
JavaWrapper
 from pyspark.ml.param.shared import *
 from pyspark.ml.common import inherit_doc
+from pyspark.sql import DataFrame
 
 __all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary',
'KMeans', 'KMeansModel',
'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary',
-   'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
+   'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel', 
'PowerIterationClustering']
 
 
 class ClusteringSummary(JavaWrapper):
@@ -836,7 +837,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, 
HasSeed, HasCheckpointInter
 
 Terminology:
 
- - "term" = "word": an el
+ - "term" = "word": an element of the vocabulary
  - "token": instance of a term appearing in a document
  - "topic": multinomial distribution over terms representing some concept
  - "document": one piece of text, corresponding to one row in the input 
data
@@ -938,7 +939,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, 
HasSeed, HasCheckpointInter
   k=10, optimizer="online", learningOffset=1024.0, 
learningDecay=0.51,\
   subsamplingRate=0.05, optimizeDocConcentration=True,\
   docConcentration=None, topicConcentration=None,\
-  topicDistributionCol="topicDistribution", 
keepLastCheckpoint=True):
+  topicDistributionCol="topicDistribution", 
keepLastCheckpoint=True)
 """
 super(LDA, self).__init__()
 self._java_obj = 
self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid)
@@ -967,7 +968,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, 
HasSeed, HasCheckpointInter
   k=10, optimizer="online", learningOffset=1024.0, 
learningDecay=0.51,\
   subsamplingRate=0.05, optimizeDocConcentration=True,\
   docConcentration=None, topicConcentration=None,\
-  topicDistributionCol="topicDistribution", 
keepLastCheckpoint=True):
+  topicDistributionCol="topicDistribution", 
keepLastCheckpoint=True)
 
 Sets params for LDA.
 """
@@ -1156,6 +1157,179 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, 
HasSeed, HasCheckpointInter
 return self.getOrDefault(self.keepLastCheckpoint)
 
 
+@inherit_doc
+class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, 
JavaMLReadable,
+   JavaMLWritable):
+"""
+.. note:: Experimental
+
+Power Iteration Clustering (PIC), a scalable graph clustering algorithm 
developed by
+<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the 
abstract:
+PIC finds a very low-dimensional embedding of a dataset using truncated 
power
+iteration on a normalized pair-wise similarity matrix of the data.
+
+This class is not yet an Estimator/Transformer, use 
:py:func:`assignClusters` method
+to run the PowerIterationClustering algorithm.
+
+.. seealso:: `Wikipedia on Spectral clustering \
+<http://en.wikipedia.org/wiki/Spectral_clustering>

spark git commit: [SPARK-24477][SPARK-24454][ML][PYTHON] Imports submodule in ml/__init__.py and add ImageSchema into __all__

2018-06-08 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master a5d775a1f -> 173fe450d


[SPARK-24477][SPARK-24454][ML][PYTHON] Imports submodule in ml/__init__.py and 
add ImageSchema into __all__

## What changes were proposed in this pull request?

This PR attaches submodules to ml's `__init__.py` module.

Also, adds `ImageSchema` into `image.py` explicitly.

## How was this patch tested?

Before:

```python
>>> from pyspark import ml
>>> ml.image
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'module' object has no attribute 'image'
>>> ml.image.ImageSchema
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'module' object has no attribute 'image'
```

```python
>>> "image" in globals()
False
>>> from pyspark.ml import *
>>> "image" in globals()
False
>>> image
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'image' is not defined
```

After:

```python
>>> from pyspark import ml
>>> ml.image

>>> ml.image.ImageSchema

```

```python
>>> "image" in globals()
False
>>> from pyspark.ml import *
>>> "image" in globals()
True
>>> image

```

Author: hyukjinkwon 

Closes #21483 from HyukjinKwon/SPARK-24454.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/173fe450
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/173fe450
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/173fe450

Branch: refs/heads/master
Commit: 173fe450df203b262b58f7e71c6b52a79db95ee0
Parents: a5d775a
Author: hyukjinkwon 
Authored: Fri Jun 8 09:32:11 2018 -0700
Committer: Xiangrui Meng 
Committed: Fri Jun 8 09:32:11 2018 -0700

--
 python/pyspark/ml/__init__.py | 8 +++-
 python/pyspark/ml/image.py| 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/173fe450/python/pyspark/ml/__init__.py
--
diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py
index 129d7d6..d99a253 100644
--- a/python/pyspark/ml/__init__.py
+++ b/python/pyspark/ml/__init__.py
@@ -21,5 +21,11 @@ machine learning pipelines.
 """
 from pyspark.ml.base import Estimator, Model, Transformer, UnaryTransformer
 from pyspark.ml.pipeline import Pipeline, PipelineModel
+from pyspark.ml import classification, clustering, evaluation, feature, fpm, \
+image, pipeline, recommendation, regression, stat, tuning, util, linalg, 
param
 
-__all__ = ["Transformer", "UnaryTransformer", "Estimator", "Model", 
"Pipeline", "PipelineModel"]
+__all__ = [
+"Transformer", "UnaryTransformer", "Estimator", "Model", "Pipeline", 
"PipelineModel",
+"classification", "clustering", "evaluation", "feature", "fpm", "image",
+"recommendation", "regression", "stat", "tuning", "util", "linalg", 
"param",
+]

http://git-wip-us.apache.org/repos/asf/spark/blob/173fe450/python/pyspark/ml/image.py
--
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index 96d702f..5f0c57e 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -31,6 +31,8 @@ from pyspark import SparkContext
 from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string
 from pyspark.sql import DataFrame, SparkSession
 
+__all__ = ["ImageSchema"]
+
 
 class _ImageSchema(object):
 """


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-24300][ML] change the way to set seed in ml.cluster.LDASuite.generateLDAData

2018-06-04 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master b24d3dba6 -> ff0501b0c


[SPARK-24300][ML] change the way to set seed in 
ml.cluster.LDASuite.generateLDAData

## What changes were proposed in this pull request?

Using different RNG in all different partitions.

## How was this patch tested?

manually

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG 

Closes #21492 from ludatabricks/SPARK-24300.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff0501b0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff0501b0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff0501b0

Branch: refs/heads/master
Commit: ff0501b0c27dc8149bd5fb38a19d9b0056698766
Parents: b24d3db
Author: Lu WANG 
Authored: Mon Jun 4 16:08:27 2018 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 4 16:08:27 2018 -0700

--
 .../src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ff0501b0/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
index 096b541..db92132 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
@@ -34,9 +34,8 @@ object LDASuite {
   vocabSize: Int): DataFrame = {
 val avgWC = 1  // average instances of each word in a doc
 val sc = spark.sparkContext
-val rng = new java.util.Random()
-rng.setSeed(1)
 val rdd = sc.parallelize(1 to rows).map { i =>
+  val rng = new java.util.Random(i)
   Vectors.dense(Array.fill(vocabSize)(rng.nextInt(2 * avgWC).toDouble))
 }.map(v => new TestRow(v))
 spark.createDataFrame(rdd)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-24290][ML] add support for Array input for instrumentation.logNamedValue

2018-06-04 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 7297ae04d -> b24d3dba6


[SPARK-24290][ML] add support for Array input for instrumentation.logNamedValue

## What changes were proposed in this pull request?

Extend instrumentation.logNamedValue to support Array input, and
change the logging for "clusterSizes" to use the new method.

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG 

Closes #21347 from ludatabricks/SPARK-24290.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b24d3dba
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b24d3dba
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b24d3dba

Branch: refs/heads/master
Commit: b24d3dba6571fd3c9e2649aceeaadc3f9c6cc90f
Parents: 7297ae0
Author: Lu WANG 
Authored: Mon Jun 4 14:54:31 2018 -0700
Committer: Xiangrui Meng 
Committed: Mon Jun 4 14:54:31 2018 -0700

--
 .../apache/spark/ml/clustering/BisectingKMeans.scala   |  3 +--
 .../apache/spark/ml/clustering/GaussianMixture.scala   |  3 +--
 .../scala/org/apache/spark/ml/clustering/KMeans.scala  |  3 +--
 .../org/apache/spark/ml/util/Instrumentation.scala | 13 +
 4 files changed, 16 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 1ad4e09..9c96145 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -276,8 +276,7 @@ class BisectingKMeans @Since("2.0.0") (
 val summary = new BisectingKMeansSummary(
   model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 model.setSummary(Some(summary))
-// TODO: need to extend logNamedValue to support Array
-instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
+instr.logNamedValue("clusterSizes", summary.clusterSizes)
 instr.logSuccess(model)
 model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 3091bb5..64ecc1e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -426,8 +426,7 @@ class GaussianMixture @Since("2.0.0") (
   $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
 model.setSummary(Some(summary))
 instr.logNamedValue("logLikelihood", logLikelihood)
-// TODO: need to extend logNamedValue to support Array
-instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
+instr.logNamedValue("clusterSizes", summary.clusterSizes)
 instr.logSuccess(model)
 model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index e72d7f9..1704412 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -359,8 +359,7 @@ class KMeans @Since("1.5.0") (
   model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 
 model.setSummary(Some(summary))
-// TODO: need to extend logNamedValue to support Array
-instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
+instr.logNamedValue("clusterSizes", summary.clusterSizes)
 instr.logSuccess(model)
 if (handlePersistence) {
   instances.unpersist()

http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentatio

spark git commit: [SPARK-20114][ML][FOLLOW-UP] spark.ml parity for sequential pattern mining - PrefixSpan

2018-05-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master a40ffc656 -> df125062c


[SPARK-20114][ML][FOLLOW-UP] spark.ml parity for sequential pattern mining - 
PrefixSpan

## What changes were proposed in this pull request?

Change `PrefixSpan` into a class with param setter/getters.
This addresses the issues mentioned here:
https://github.com/apache/spark/pull/20973#discussion_r186931806

## How was this patch tested?

UT.

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: WeichenXu <weichen...@databricks.com>

Closes #21393 from WeichenXu123/fix_prefix_span.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df125062
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df125062
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df125062

Branch: refs/heads/master
Commit: df125062c8dac9fee3328d67dd438a456b7a3b74
Parents: a40ffc6
Author: WeichenXu <weichen...@databricks.com>
Authored: Wed May 23 11:00:23 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed May 23 11:00:23 2018 -0700

--
 .../org/apache/spark/ml/fpm/PrefixSpan.scala| 127 +++
 .../apache/spark/ml/fpm/PrefixSpanSuite.scala   |  28 ++--
 2 files changed, 119 insertions(+), 36 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/df125062/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala 
b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
index 02168fe..41716c6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala
@@ -18,6 +18,8 @@
 package org.apache.spark.ml.fpm
 
 import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.fpm.{PrefixSpan => mllibPrefixSpan}
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.col
@@ -29,13 +31,97 @@ import org.apache.spark.sql.types.{ArrayType, LongType, 
StructField, StructType}
  * The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining 
Sequential Patterns
  * Efficiently by Prefix-Projected Pattern Growth
  * (see <a href="http://doi.org/10.1109/ICDE.2001.914830">here</a>).
+ * This class is not yet an Estimator/Transformer, use 
`findFrequentSequentialPatterns` method to
+ * run the PrefixSpan algorithm.
  *
  * @see <a href="https://en.wikipedia.org/wiki/Sequential_Pattern_Mining">Sequential 
Pattern Mining
  * (Wikipedia)</a>
  */
 @Since("2.4.0")
 @Experimental
-object PrefixSpan {
+final class PrefixSpan(@Since("2.4.0") override val uid: String) extends 
Params {
+
+  @Since("2.4.0")
+  def this() = this(Identifiable.randomUID("prefixSpan"))
+
+  /**
+   * Param for the minimal support level (default: `0.1`).
+   * Sequential patterns that appear more than (minSupport * 
size-of-the-dataset) times are
+   * identified as frequent sequential patterns.
+   * @group param
+   */
+  @Since("2.4.0")
+  val minSupport = new DoubleParam(this, "minSupport", "The minimal support 
level of the " +
+"sequential pattern. Sequential pattern that appears more than " +
+"(minSupport * size-of-the-dataset)." +
+"times will be output.", ParamValidators.gtEq(0.0))
+
+  /** @group getParam */
+  @Since("2.4.0")
+  def getMinSupport: Double = $(minSupport)
+
+  /** @group setParam */
+  @Since("2.4.0")
+  def setMinSupport(value: Double): this.type = set(minSupport, value)
+
+  /**
+   * Param for the maximal pattern length (default: `10`).
+   * @group param
+   */
+  @Since("2.4.0")
+  val maxPatternLength = new IntParam(this, "maxPatternLength",
+"The maximal length of the sequential pattern.",
+ParamValidators.gt(0))
+
+  /** @group getParam */
+  @Since("2.4.0")
+  def getMaxPatternLength: Int = $(maxPatternLength)
+
+  /** @group setParam */
+  @Since("2.4.0")
+  def setMaxPatternLength(value: Int): this.type = set(maxPatternLength, value)
+
+  /**
+   * Param for the maximum number of items (including delimiters used in the 
internal storage
+   * format) allowed in a projected database before local processing (default: 
`3200`).
+   * If a projected database exceeds this size, another iteration of 
distributed prefix growth
+   * is run.
+   * @group param
+   */
+  @Since("2.4.0")
+  val maxLocalProjDBSize = new LongParam(this, "maxLocalProjDBSize",
+"T

spark git commit: [SPARK-22884][ML] ML tests for StructuredStreaming: spark.ml.clustering

2018-05-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 439c69511 -> d4a0895c6


[SPARK-22884][ML] ML tests for StructuredStreaming: spark.ml.clustering

## What changes were proposed in this pull request?

Converting clustering tests to also check code with structured streaming, using 
the ML testing infrastructure implemented in SPARK-22882.

This PR is a new version of https://github.com/apache/spark/pull/20319

Author: Sandor Murakozi <smurak...@gmail.com>
Author: Joseph K. Bradley <jos...@databricks.com>

Closes #21358 from jkbradley/smurakozi-SPARK-22884.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4a0895c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4a0895c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4a0895c

Branch: refs/heads/master
Commit: d4a0895c628ca854895c3c35c46ed990af36ec61
Parents: 439c695
Author: Sandor Murakozi <smurak...@gmail.com>
Authored: Thu May 17 16:33:06 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu May 17 16:33:06 2018 -0700

--
 .../ml/clustering/BisectingKMeansSuite.scala| 41 ++--
 .../ml/clustering/GaussianMixtureSuite.scala| 22 ---
 .../spark/ml/clustering/KMeansSuite.scala   | 31 +++
 .../apache/spark/ml/clustering/LDASuite.scala   | 21 --
 4 files changed, 50 insertions(+), 65 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d4a0895c/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index f3ff2af..81842af 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -19,17 +19,18 @@ package org.apache.spark.ml.clustering
 
 import scala.language.existentials
 
-import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.SparkException
 import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
 import org.apache.spark.mllib.clustering.DistanceMeasure
-import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.Dataset
 
-class BisectingKMeansSuite
-  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
+
+class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
+
+  import testImplicits._
 
   final val k = 5
   @transient var dataset: Dataset[_] = _
@@ -68,10 +69,13 @@ class BisectingKMeansSuite
 
 // Verify fit does not fail on very sparse data
 val model = bkm.fit(sparseDataset)
-val result = model.transform(sparseDataset)
-val numClusters = result.select("prediction").distinct().collect().length
-// Verify we hit the edge case
-assert(numClusters < k && numClusters > 1)
+
+testTransformerByGlobalCheckFunc[Tuple1[Vector]](sparseDataset.toDF(), 
model, "prediction") {
+  rows =>
+val numClusters = rows.distinct.length
+// Verify we hit the edge case
+assert(numClusters < k && numClusters > 1)
+}
   }
 
   test("setter/getter") {
@@ -104,19 +108,16 @@ class BisectingKMeansSuite
 val bkm = new 
BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1)
 val model = bkm.fit(dataset)
 assert(model.clusterCenters.length === k)
-
-val transformed = model.transform(dataset)
-val expectedColumns = Array("features", predictionColName)
-expectedColumns.foreach { column =>
-  assert(transformed.columns.contains(column))
-}
-val clusters =
-  
transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
-assert(clusters.size === k)
-assert(clusters === Set(0, 1, 2, 3, 4))
 assert(model.computeCost(dataset) < 0.1)
 assert(model.hasParent)
 
+testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataset.toDF(), model,
+  "features", predictionColName) { rows =>
+  val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
+  assert(clusters.size === k)
+  assert(clusters === Set(0, 1, 2, 3, 4))
+}
+
 // Check validity of model summary
 val numRows = dataset.count()
 assert(model.hasSum

spark git commit: [SPARK-24115] Have logging pass through instrumentation class.

2018-05-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 8a837bf4f -> a7a9b1837


[SPARK-24115] Have logging pass through instrumentation class.

## What changes were proposed in this pull request?

Fixes to tuning instrumentation.

## How was this patch tested?

Existing tests.

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Bago Amirbekian <b...@databricks.com>

Closes #21340 from MrBago/tunning-instrumentation.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7a9b183
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7a9b183
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7a9b183

Branch: refs/heads/master
Commit: a7a9b1837808b281f47643490abcf054f6de7b50
Parents: 8a837bf
Author: Bago Amirbekian <b...@databricks.com>
Authored: Thu May 17 11:13:16 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu May 17 11:13:16 2018 -0700

--
 .../scala/org/apache/spark/ml/tuning/CrossValidator.scala | 10 +-
 .../org/apache/spark/ml/tuning/TrainValidationSplit.scala | 10 +-
 .../scala/org/apache/spark/ml/util/Instrumentation.scala  |  7 +++
 3 files changed, 17 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a7a9b183/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 5e916cc..f327f37 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -144,7 +144,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") 
override val uid: String)
 val metrics = splits.zipWithIndex.map { case ((training, validation), 
splitIndex) =>
   val trainingDataset = sparkSession.createDataFrame(training, 
schema).cache()
   val validationDataset = sparkSession.createDataFrame(validation, 
schema).cache()
-  logDebug(s"Train split $splitIndex with multiple sets of parameters.")
+  instr.logDebug(s"Train split $splitIndex with multiple sets of 
parameters.")
 
   // Fit models in a Future for training in parallel
   val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, 
paramIndex) =>
@@ -155,7 +155,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") 
override val uid: String)
   }
   // TODO: duplicate evaluator to take extra params from input
   val metric = eval.evaluate(model.transform(validationDataset, 
paramMap))
-  logDebug(s"Got metric $metric for model trained with $paramMap.")
+  instr.logDebug(s"Got metric $metric for model trained with 
$paramMap.")
   metric
 } (executionContext)
   }
@@ -169,12 +169,12 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") 
override val uid: String)
   foldMetrics
 }.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all 
splits
 
-logInfo(s"Average cross-validation metrics: ${metrics.toSeq}")
+instr.logInfo(s"Average cross-validation metrics: ${metrics.toSeq}")
 val (bestMetric, bestIndex) =
   if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1)
   else metrics.zipWithIndex.minBy(_._1)
-logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
-logInfo(s"Best cross-validation metric: $bestMetric.")
+instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
+instr.logInfo(s"Best cross-validation metric: $bestMetric.")
 val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
 instr.logSuccess(bestModel)
 copyValues(new CrossValidatorModel(uid, bestModel, metrics)

http://git-wip-us.apache.org/repos/asf/spark/blob/a7a9b183/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
index 13369c4..14d6a69 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
@@ -143,7 +143,7 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") 
override val uid: St
 } else None
 
 // Fit models in a Future for training in parallel
-

spark git commit: [SPARK-24155][ML] Instrumentation improvements for clustering

2018-05-14 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c26f67325 -> 075d678c8


[SPARK-24155][ML] Instrumentation improvements for clustering

## What changes were proposed in this pull request?

Changed the instrumentation for all of the clustering methods.

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG <lu.w...@databricks.com>

Closes #21218 from ludatabricks/SPARK-23686-1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/075d678c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/075d678c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/075d678c

Branch: refs/heads/master
Commit: 075d678c8844614910b50abca07282bde31ef7e0
Parents: c26f673
Author: Lu WANG <lu.w...@databricks.com>
Authored: Mon May 14 13:35:54 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon May 14 13:35:54 2018 -0700

--
 .../org/apache/spark/ml/clustering/BisectingKMeans.scala  | 7 +--
 .../org/apache/spark/ml/clustering/GaussianMixture.scala  | 5 -
 .../main/scala/org/apache/spark/ml/clustering/KMeans.scala| 4 +++-
 3 files changed, 12 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 438e53b..1ad4e09 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -261,8 +261,9 @@ class BisectingKMeans @Since("2.0.0") (
 transformSchema(dataset.schema, logging = true)
 val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
 
-val instr = Instrumentation.create(this, rdd)
-instr.logParams(featuresCol, predictionCol, k, maxIter, seed, 
minDivisibleClusterSize)
+val instr = Instrumentation.create(this, dataset)
+instr.logParams(featuresCol, predictionCol, k, maxIter, seed,
+  minDivisibleClusterSize, distanceMeasure)
 
 val bkm = new MLlibBisectingKMeans()
   .setK($(k))
@@ -275,6 +276,8 @@ class BisectingKMeans @Since("2.0.0") (
 val summary = new BisectingKMeansSummary(
   model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 model.setSummary(Some(summary))
+// TODO: need to extend logNamedValue to support Array
+instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
 instr.logSuccess(model)
 model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88d618c..3091bb5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -352,7 +352,7 @@ class GaussianMixture @Since("2.0.0") (
   s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of 
the covariance" +
   s" matrix is quadratic in the number of features.")
 
-val instr = Instrumentation.create(this, instances)
+val instr = Instrumentation.create(this, dataset)
 instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, 
seed, tol)
 instr.logNumFeatures(numFeatures)
 
@@ -425,6 +425,9 @@ class GaussianMixture @Since("2.0.0") (
 val summary = new GaussianMixtureSummary(model.transform(dataset),
   $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
 model.setSummary(Some(summary))
+instr.logNamedValue("logLikelihood", logLikelihood)
+// TODO: need to extend logNamedValue to support Array
+instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
 instr.logSuccess(model)
 model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 97f246f..e7

spark git commit: [SPARK-24132][ML] Instrumentation improvement for classification

2018-05-08 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 9498e528d -> 7e7350285


[SPARK-24132][ML] Instrumentation improvement for classification

## What changes were proposed in this pull request?

- Add OptionalInstrumentation as argument for getNumClasses in 
ml.classification.Classifier

- Change the function call for getNumClasses in train() in 
ml.classification.DecisionTreeClassifier, 
ml.classification.RandomForestClassifier, and ml.classification.NaiveBayes

- Modify the instrumentation creation in ml.classification.LinearSVC

- Change the log call in ml.classification.OneVsRest and 
ml.classification.LinearSVC

## How was this patch tested?

Manual.

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG <lu.w...@databricks.com>

Closes #21204 from ludatabricks/SPARK-23686.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e735028
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e735028
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e735028

Branch: refs/heads/master
Commit: 7e7350285dc22764f599671d874617c0eea093e5
Parents: 9498e52
Author: Lu WANG <lu.w...@databricks.com>
Authored: Tue May 8 21:20:58 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue May 8 21:20:58 2018 -0700

--
 .../spark/ml/classification/DecisionTreeClassifier.scala| 9 ++---
 .../org/apache/spark/ml/classification/LinearSVC.scala  | 9 ++---
 .../org/apache/spark/ml/classification/NaiveBayes.scala | 3 ++-
 .../org/apache/spark/ml/classification/OneVsRest.scala  | 4 ++--
 .../spark/ml/classification/RandomForestClassifier.scala| 4 +++-
 5 files changed, 19 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7e735028/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 57797d1..c9786f1 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -97,9 +97,11 @@ class DecisionTreeClassifier @Since("1.4.0") (
   override def setSeed(value: Long): this.type = set(seed, value)
 
   override protected def train(dataset: Dataset[_]): 
DecisionTreeClassificationModel = {
+val instr = Instrumentation.create(this, dataset)
 val categoricalFeatures: Map[Int, Int] =
   MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
 val numClasses: Int = getNumClasses(dataset)
+instr.logNumClasses(numClasses)
 
 if (isDefined(thresholds)) {
   require($(thresholds).length == numClasses, this.getClass.getSimpleName +
@@ -110,8 +112,8 @@ class DecisionTreeClassifier @Since("1.4.0") (
 val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, 
numClasses)
 val strategy = getOldStrategy(categoricalFeatures, numClasses)
 
-val instr = Instrumentation.create(this, oldDataset)
-instr.logParams(params: _*)
+instr.logParams(maxDepth, maxBins, minInstancesPerNode, minInfoGain, 
maxMemoryInMB,
+  cacheNodeIds, checkpointInterval, impurity, seed)
 
 val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, 
featureSubsetStrategy = "all",
   seed = $(seed), instr = Some(instr), parentUID = Some(uid))
@@ -125,7 +127,8 @@ class DecisionTreeClassifier @Since("1.4.0") (
   private[ml] def train(data: RDD[LabeledPoint],
   oldStrategy: OldStrategy): DecisionTreeClassificationModel = {
 val instr = Instrumentation.create(this, data)
-instr.logParams(params: _*)
+instr.logParams(maxDepth, maxBins, minInstancesPerNode, minInfoGain, 
maxMemoryInMB,
+  cacheNodeIds, checkpointInterval, impurity, seed)
 
 val trees = RandomForest.run(data, oldStrategy, numTrees = 1, 
featureSubsetStrategy = "all",
   seed = 0L, instr = Some(instr), parentUID = Some(uid))

http://git-wip-us.apache.org/repos/asf/spark/blob/7e735028/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
index 80c537e..38eb045 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -170,7 +170,7

spark git commit: [SPARK-23975][ML] Add support of array input for all clustering methods

2018-05-07 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 76ecd0950 -> 0d63eb888


[SPARK-23975][ML] Add support of array input for all clustering methods

## What changes were proposed in this pull request?

Add support for array input to all of the clustering methods.

## How was this patch tested?

unit tests added

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG <lu.w...@databricks.com>

Closes #21195 from ludatabricks/SPARK-23975-1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d63eb88
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d63eb88
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d63eb88

Branch: refs/heads/master
Commit: 0d63ebd17df747fb41d7ba254718bb7af3ae
Parents: 76ecd09
Author: Lu WANG <lu.w...@databricks.com>
Authored: Mon May 7 20:08:41 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon May 7 20:08:41 2018 -0700

--
 .../spark/ml/clustering/BisectingKMeans.scala   | 21 -
 .../spark/ml/clustering/GaussianMixture.scala   | 12 +++--
 .../org/apache/spark/ml/clustering/KMeans.scala | 31 +++--
 .../org/apache/spark/ml/clustering/LDA.scala|  9 ++--
 .../org/apache/spark/ml/util/DatasetUtils.scala | 13 +-
 .../org/apache/spark/ml/util/SchemaUtils.scala  | 16 ++-
 .../ml/clustering/BisectingKMeansSuite.scala| 21 -
 .../ml/clustering/GaussianMixtureSuite.scala| 21 -
 .../spark/ml/clustering/KMeansSuite.scala   | 48 ++--
 .../apache/spark/ml/clustering/LDASuite.scala   | 20 +++-
 .../apache/spark/ml/util/MLTestingUtils.scala   | 23 +-
 11 files changed, 147 insertions(+), 88 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0d63eb88/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index addc12ac..438e53b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -22,17 +22,15 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
-import org.apache.spark.ml.linalg.{Vector, VectorUDT}
+import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.clustering.{BisectingKMeans => 
MLlibBisectingKMeans,
   BisectingKMeansModel => MLlibBisectingKMeansModel}
-import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => 
OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.functions.udf
 import org.apache.spark.sql.types.{IntegerType, StructType}
 
 
@@ -75,7 +73,7 @@ private[clustering] trait BisectingKMeansParams extends 
Params with HasMaxIter
* @return output schema
*/
   protected def validateAndTransformSchema(schema: StructType): StructType = {
-SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT)
+SchemaUtils.validateVectorCompatibleColumn(schema, getFeaturesCol)
 SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType)
   }
 }
@@ -113,7 +111,8 @@ class BisectingKMeansModel private[ml] (
   override def transform(dataset: Dataset[_]): DataFrame = {
 transformSchema(dataset.schema, logging = true)
 val predictUDF = udf((vector: Vector) => predict(vector))
-dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol
+dataset.withColumn($(predictionCol),
+  predictUDF(DatasetUtils.columnToVector(dataset, getFeaturesCol)))
   }
 
   @Since("2.0.0")
@@ -132,9 +131,9 @@ class BisectingKMeansModel private[ml] (
*/
   @Since("2.0.0")
   def computeCost(dataset: Dataset[_]): Double = {
-SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-val data = dataset.select(col($(featuresCol))).rdd.map { case Row(point: 
Vector) => point }
-parentModel.computeCost(data.map(OldVectors.fromML))
+SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
+val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
+parentModel.computeCost(data)
   }
 
   @Since("2.0.0")
@@ -260,9 +259,7 @@ class BisectingKMeans @Sin

spark git commit: [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.

2018-01-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.3 29ed71873 -> f8f522c01


[SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.

## What changes were proposed in this pull request?

Added documentation for new transformer.

Author: Bago Amirbekian <b...@databricks.com>

Closes #20285 from MrBago/sizeHintDocs.

(cherry picked from commit 05839d164836e544af79c13de25802552eadd636)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8f522c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8f522c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8f522c0

Branch: refs/heads/branch-2.3
Commit: f8f522c01025e78eca1724c909c749374f855039
Parents: 29ed718
Author: Bago Amirbekian <b...@databricks.com>
Authored: Tue Jan 23 14:11:23 2018 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jan 23 14:11:48 2018 -0800

--
 docs/ml-features.md | 51 +
 .../examples/ml/JavaVectorSizeHintExample.java  | 79 
 .../main/python/ml/vector_size_hint_example.py  | 57 ++
 .../examples/ml/VectorSizeHintExample.scala | 63 
 4 files changed, 250 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f8f522c0/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 466a8fb..3370eb3 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1291,6 +1291,57 @@ for more details on the API.
 
 
 
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors for a 
column of
+`VectorType`. For example, `VectorAssembler` uses size information from its 
input columns to
+produce size information and metadata for its output column. While in some 
cases this information
+can be obtained by inspecting the contents of the column, in a streaming 
dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to 
explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that 
might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. 
Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for 
`inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this 
size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which 
controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. 
By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. 
This parameter can
+also be set to "skip", indicating that rows containing invalid values should 
be filtered out from
+the resulting dataframe, or "optimistic", indicating that the column should 
not be checked for
+invalid values and all rows should be kept. Note that the use of "optimistic" 
can cause the
+resulting dataframe to be in an inconsistent state, meaning the metadata for 
the column
+`VectorSizeHint` was applied to does not match the contents of that column. 
Users should take care
+to avoid this kind of inconsistent state.
+
+
+
+
+Refer to the [VectorSizeHint Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example 
scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+
+
+
+
+Refer to the [VectorSizeHint Java 
docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html)
+for more details on the API.
+
+{% include_example 
java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %}
+
+
+
+
+Refer to the [VectorSizeHint Python 
docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+
+
+
 ## QuantileDiscretizer
 
 `QuantileDiscretizer` takes a column with continuous features and outputs a 
column with binned

http://git-wip-us.apache.org/repos/asf/spark/blob/f8f522c0/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
new file mode 100644
index 000..d649a2c
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
@@ -0,0 +1,79 @@

spark git commit: [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.

2018-01-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master dc4761fd8 -> 05839d164


[SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.

## What changes were proposed in this pull request?

Added documentation for new transformer.

Author: Bago Amirbekian <b...@databricks.com>

Closes #20285 from MrBago/sizeHintDocs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05839d16
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05839d16
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05839d16

Branch: refs/heads/master
Commit: 05839d164836e544af79c13de25802552eadd636
Parents: dc4761f
Author: Bago Amirbekian <b...@databricks.com>
Authored: Tue Jan 23 14:11:23 2018 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jan 23 14:11:23 2018 -0800

--
 docs/ml-features.md | 51 +
 .../examples/ml/JavaVectorSizeHintExample.java  | 79 
 .../main/python/ml/vector_size_hint_example.py  | 57 ++
 .../examples/ml/VectorSizeHintExample.scala | 63 
 4 files changed, 250 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/05839d16/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 466a8fb..3370eb3 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1291,6 +1291,57 @@ for more details on the API.
 
 
 
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors for a 
column of
+`VectorType`. For example, `VectorAssembler` uses size information from its 
input columns to
+produce size information and metadata for its output column. While in some 
cases this information
+can be obtained by inspecting the contents of the column, in a streaming 
dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to 
explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that 
might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. 
Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for 
`inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this 
size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which 
controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. 
By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. 
This parameter can
+also be set to "skip", indicating that rows containing invalid values should 
be filtered out from
+the resulting dataframe, or "optimistic", indicating that the column should 
not be checked for
+invalid values and all rows should be kept. Note that the use of "optimistic" 
can cause the
+resulting dataframe to be in an inconsistent state, meaning the metadata for 
the column
+`VectorSizeHint` was applied to does not match the contents of that column. 
Users should take care
+to avoid this kind of inconsistent state.
+
+
+
+
+Refer to the [VectorSizeHint Scala 
docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example 
scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+
+
+
+
+Refer to the [VectorSizeHint Java 
docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html)
+for more details on the API.
+
+{% include_example 
java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %}
+
+
+
+
+Refer to the [VectorSizeHint Python 
docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+
+
+
 ## QuantileDiscretizer
 
 `QuantileDiscretizer` takes a column with continuous features and outputs a 
column with binned

http://git-wip-us.apache.org/repos/asf/spark/blob/05839d16/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
new file mode 100644
index 000..d649a2c
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distri

spark git commit: [SPARK-20088] Do not create new SparkContext in SparkR createSparkContext

2017-03-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 890493458 -> 0588dc7c0


[SPARK-20088] Do not create new SparkContext in SparkR createSparkContext

## What changes were proposed in this pull request?
Instead of creating new `JavaSparkContext` we use `SparkContext.getOrCreate`.

## How was this patch tested?
Existing tests

Author: Hossein <hoss...@databricks.com>

Closes #17423 from falaki/SPARK-20088.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0588dc7c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0588dc7c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0588dc7c

Branch: refs/heads/master
Commit: 0588dc7c0a9f3180dddae0dc202a6d41eb43464f
Parents: 8904934
Author: Hossein <hoss...@databricks.com>
Authored: Mon Mar 27 08:53:45 2017 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Mar 27 08:53:45 2017 -0700

--
 core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0588dc7c/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala 
b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index 72ae034..295355c 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -136,7 +136,7 @@ private[r] object RRDD {
  .mkString(File.separator))
 }
 
-val jsc = new JavaSparkContext(sparkConf)
+val jsc = new JavaSparkContext(SparkContext.getOrCreate(sparkConf))
 jars.foreach { jar =>
   jsc.addJar(jar)
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes

2016-12-13 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.1 25b97589e -> 5693ac8e5


[SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes

## What changes were proposed in this pull request?

Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content 
minimal since users can type `?spark.randomForest` to see the full doc.

cc: jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #16264 from mengxr/SPARK-18793.

(cherry picked from commit 594b14f1ebd0b3db9f630e504be92228f11b4d9f)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5693ac8e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5693ac8e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5693ac8e

Branch: refs/heads/branch-2.1
Commit: 5693ac8e5bd5df8aca1b0d6df0be072a45abcfbd
Parents: 25b9758
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Dec 13 16:59:09 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Dec 13 16:59:15 2016 -0800

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++
 1 file changed, 32 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5693ac8e/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 625b759..334daa5 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -449,6 +449,10 @@ SparkR supports the following machine learning models and 
algorithms.
 
 * Generalized Linear Model (GLM)
 
+* Random Forest
+
+* Gradient-Boosted Trees (GBT)
+
 * Naive Bayes Model
 
 * $k$-means Clustering
@@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF)
 head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 ```
 
+ Random Forest
+
+`spark.randomForest` fits a [random 
forest](https://en.wikipedia.org/wiki/Random_forest) classification or 
regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to 
make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+In the following example, we use the `longley` dataset to train a random 
forest and make predictions:
+
+```{r, warning=FALSE}
+df <- createDataFrame(longley)
+rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth 
= 2, numTrees = 2)
+summary(rfModel)
+predictions <- predict(rfModel, df)
+```
+
+ Gradient-Boosted Trees
+
+`spark.gbt` fits a [gradient-boosted 
tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or 
regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to 
make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+Similar to the random forest example above, we use the `longley` dataset to 
train a gradient-boosted tree and make predictions:
+
+```{r, warning=FALSE}
+df <- createDataFrame(longley)
+gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, 
maxIter = 2)
+summary(gbtModel)
+predictions <- predict(gbtModel, df)
+```
+
  Naive Bayes Model
 
 Naive Bayes model assumes independence among the features. `spark.naiveBayes` 
fits a [Bernoulli naive Bayes 
model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes)
 against a SparkDataFrame. The data should be all categorical. These models are 
often used for document classification.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes

2016-12-13 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c68fb426d -> 594b14f1e


[SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes

## What changes were proposed in this pull request?

Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content 
minimal since users can type `?spark.randomForest` to see the full doc.

cc: jkbradley

Author: Xiangrui Meng <m...@databricks.com>

Closes #16264 from mengxr/SPARK-18793.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/594b14f1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/594b14f1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/594b14f1

Branch: refs/heads/master
Commit: 594b14f1ebd0b3db9f630e504be92228f11b4d9f
Parents: c68fb42
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Dec 13 16:59:09 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Dec 13 16:59:09 2016 -0800

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++
 1 file changed, 32 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/594b14f1/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 625b759..334daa5 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -449,6 +449,10 @@ SparkR supports the following machine learning models and 
algorithms.
 
 * Generalized Linear Model (GLM)
 
+* Random Forest
+
+* Gradient-Boosted Trees (GBT)
+
 * Naive Bayes Model
 
 * $k$-means Clustering
@@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF)
 head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 ```
 
+ Random Forest
+
+`spark.randomForest` fits a [random 
forest](https://en.wikipedia.org/wiki/Random_forest) classification or 
regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to 
make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+In the following example, we use the `longley` dataset to train a random 
forest and make predictions:
+
+```{r, warning=FALSE}
+df <- createDataFrame(longley)
+rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth 
= 2, numTrees = 2)
+summary(rfModel)
+predictions <- predict(rfModel, df)
+```
+
+ Gradient-Boosted Trees
+
+`spark.gbt` fits a [gradient-boosted 
tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or 
regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to 
make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+Similar to the random forest example above, we use the `longley` dataset to 
train a gradient-boosted tree and make predictions:
+
+```{r, warning=FALSE}
+df <- createDataFrame(longley)
+gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, 
maxIter = 2)
+summary(gbtModel)
+predictions <- predict(gbtModel, df)
+```
+
  Naive Bayes Model
 
 Naive Bayes model assumes independence among the features. `spark.naiveBayes` 
fits a [Bernoulli naive Bayes 
model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes)
 against a SparkDataFrame. The data should be all categorical. These models are 
often used for document classification.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes

2016-12-12 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.1 9dc5fa5f7 -> 9f0e3be62


[SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes

## What changes were proposed in this pull request?
spark.logit is added in 2.1. We need to update spark-vignettes to reflect the 
changes. This is part of SparkR QA work.

## How was this patch tested?

Manual build html. Please see attached image for the result.
![test](https://cloud.githubusercontent.com/assets/5033592/21032237/01b565fe-bd5d-11e6-8b59-4de4b6ef611d.jpeg)

Author: wm...@hotmail.com <wm...@hotmail.com>

Closes #16222 from wangmiao1981/veg.

(cherry picked from commit 2aa16d03db79a642cbe21f387441c34fc51a8236)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f0e3be6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f0e3be6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f0e3be6

Branch: refs/heads/branch-2.1
Commit: 9f0e3be622c77f7a677ce2c930b6dba2f652df00
Parents: 9dc5fa5
Author: wm...@hotmail.com <wm...@hotmail.com>
Authored: Mon Dec 12 22:41:11 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 12 22:41:20 2016 -0800

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 45 ++-
 1 file changed, 38 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9f0e3be6/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index a36f8fc..625b759 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -565,7 +565,7 @@ head(aftPredictions)
 
  Gaussian Mixture Model
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.gaussianMixture` fits multivariate [Gaussian Mixture 
Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model)
 (GMM) against a `SparkDataFrame`. 
[Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
 (EM) is used to approximate the maximum likelihood estimator (MLE) of the 
model.
 
@@ -584,7 +584,7 @@ head(select(gmmFitted, "V1", "V2", "prediction"))
 
  Latent Dirichlet Allocation
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.lda` fits a [Latent Dirichlet 
Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on 
a `SparkDataFrame`. It is often used in topic modeling in which topics are 
inferred from a collection of text documents. LDA can be thought of as a 
clustering algorithm as follows:
 
@@ -657,7 +657,7 @@ perplexity
 
  Multilayer Perceptron
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 Multilayer perceptron classifier (MLPC) is a classifier based on the 
[feedforward artificial neural 
network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC 
consists of multiple layers of nodes. Each layer is fully connected to the next 
layer in the network. Nodes in the input layer represent the input data. All 
other nodes map inputs to outputs by a linear combination of the inputs with 
the node's weights $w$ and bias $b$ and applying an activation function. This 
can be written in matrix form for MLPC with $K+1$ layers as follows:
 $$
@@ -694,7 +694,7 @@ MLPC employs backpropagation for learning the model. We use 
the logistic loss fu
 
  Collaborative Filtering
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.als` learns latent factors in [collaborative 
filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering)
 via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).
 
@@ -725,7 +725,7 @@ head(predicted)
 
  Isotonic Regression Model
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.isoreg` fits an [Isotonic 
Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a 
`SparkDataFrame`. It solves a weighted univariate a regression problem under a 
complete order constraint. Specifically, given a set of real observed responses 
$y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and 
optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone 
(piecewise linear) function $f$ to  minimize
 $$
@@ -768,8 +768,39 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2)))
 head(predict(isoregModel, newDF))
 ```
 
- What's More?
-We also expect Decision Tree, Random Forest, Kolmogorov-Smirnov Test coming in 
the next version 2.1.0.
+### Logistic Regression Model
+
+(Added in 2.1.0)
+
+[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) is a 
widely-used model when the response is categorical. It can be seen as

spark git commit: [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes

2016-12-12 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 417e45c58 -> 2aa16d03d


[SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes

## What changes were proposed in this pull request?
spark.logit is added in 2.1. We need to update spark-vignettes to reflect the 
changes. This is part of SparkR QA work.

## How was this patch tested?

Manual build html. Please see attached image for the result.
![test](https://cloud.githubusercontent.com/assets/5033592/21032237/01b565fe-bd5d-11e6-8b59-4de4b6ef611d.jpeg)

Author: wm...@hotmail.com <wm...@hotmail.com>

Closes #16222 from wangmiao1981/veg.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2aa16d03
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2aa16d03
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2aa16d03

Branch: refs/heads/master
Commit: 2aa16d03db79a642cbe21f387441c34fc51a8236
Parents: 417e45c
Author: wm...@hotmail.com <wm...@hotmail.com>
Authored: Mon Dec 12 22:41:11 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Dec 12 22:41:11 2016 -0800

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 45 ++-
 1 file changed, 38 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2aa16d03/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index a36f8fc..625b759 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -565,7 +565,7 @@ head(aftPredictions)
 
  Gaussian Mixture Model
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.gaussianMixture` fits multivariate [Gaussian Mixture 
Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model)
 (GMM) against a `SparkDataFrame`. 
[Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
 (EM) is used to approximate the maximum likelihood estimator (MLE) of the 
model.
 
@@ -584,7 +584,7 @@ head(select(gmmFitted, "V1", "V2", "prediction"))
 
  Latent Dirichlet Allocation
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.lda` fits a [Latent Dirichlet 
Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on 
a `SparkDataFrame`. It is often used in topic modeling in which topics are 
inferred from a collection of text documents. LDA can be thought of as a 
clustering algorithm as follows:
 
@@ -657,7 +657,7 @@ perplexity
 
  Multilayer Perceptron
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 Multilayer perceptron classifier (MLPC) is a classifier based on the 
[feedforward artificial neural 
network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC 
consists of multiple layers of nodes. Each layer is fully connected to the next 
layer in the network. Nodes in the input layer represent the input data. All 
other nodes map inputs to outputs by a linear combination of the inputs with 
the node's weights $w$ and bias $b$ and applying an activation function. This 
can be written in matrix form for MLPC with $K+1$ layers as follows:
 $$
@@ -694,7 +694,7 @@ MLPC employs backpropagation for learning the model. We use 
the logistic loss fu
 
  Collaborative Filtering
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.als` learns latent factors in [collaborative 
filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering)
 via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).
 
@@ -725,7 +725,7 @@ head(predicted)
 
  Isotonic Regression Model
 
-(Coming in 2.1.0)
+(Added in 2.1.0)
 
 `spark.isoreg` fits an [Isotonic 
Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a 
`SparkDataFrame`. It solves a weighted univariate a regression problem under a 
complete order constraint. Specifically, given a set of real observed responses 
$y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and 
optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone 
(piecewise linear) function $f$ to  minimize
 $$
@@ -768,8 +768,39 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2)))
 head(predict(isoregModel, newDF))
 ```
 
- What's More?
-We also expect Decision Tree, Random Forest, Kolmogorov-Smirnov Test coming in 
the next version 2.1.0.
+### Logistic Regression Model
+
+(Added in 2.1.0)
+
+[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) is a 
widely-used model when the response is categorical. It can be seen as a special 
case of the [Generalized Linear Predictive 
Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
+We provide `

spark git commit: [SPARK-18812][MLLIB] explain "Spark ML"

2016-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.1 562507ef0 -> e45345d91


[SPARK-18812][MLLIB] explain "Spark ML"

## What changes were proposed in this pull request?

There has been some confusion around "Spark ML" vs. "MLlib". This PR adds some 
FAQ-like entries to the MLlib user guide to explain "Spark ML" and reduce the 
confusion.

I checked the [Spark FAQ page](http://spark.apache.org/faq.html), which seems too 
high-level for the content here. So I added it to the MLlib user guide instead.

cc: mateiz

Author: Xiangrui Meng <m...@databricks.com>

Closes #16241 from mengxr/SPARK-18812.

(cherry picked from commit d2493a203e852adf63dde4e1fc993e8d11efec3d)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e45345d9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e45345d9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e45345d9

Branch: refs/heads/branch-2.1
Commit: e45345d91e333e0b5f9219e857affeda461863c6
Parents: 562507e
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Dec 9 17:34:52 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Dec 9 17:34:58 2016 -0800

--
 docs/ml-guide.md | 12 
 1 file changed, 12 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e45345d9/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index ddf81be..9717619 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -35,6 +35,18 @@ The primary Machine Learning API for Spark is now the 
[DataFrame](sql-programmin
 * The DataFrame-based API for MLlib provides a uniform API across ML 
algorithms and across multiple languages.
 * DataFrames facilitate practical ML Pipelines, particularly feature 
transformations.  See the [Pipelines guide](ml-pipeline.html) for details.
 
+*What is "Spark ML"?*
+
+* "Spark ML" is not an official name but occasionally used to refer to the 
MLlib DataFrame-based API.
+  This is majorly due to the `org.apache.spark.ml` Scala package name used by 
the DataFrame-based API, 
+  and the "Spark ML Pipelines" term we used initially to emphasize the 
pipeline concept.
+  
+*Is MLlib deprecated?*
+
+* No. MLlib includes both the RDD-based API and the DataFrame-based API.
+  The RDD-based API is now in maintenance mode.
+  But neither API is deprecated, nor MLlib as a whole.
+
 # Dependencies
 
 MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), 
which depends on


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-18812][MLLIB] explain "Spark ML"

2016-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master cf33a8628 -> d2493a203


[SPARK-18812][MLLIB] explain "Spark ML"

## What changes were proposed in this pull request?

There has been some confusion around "Spark ML" vs. "MLlib". This PR adds some 
FAQ-like entries to the MLlib user guide to explain "Spark ML" and reduce the 
confusion.

I checked the [Spark FAQ page](http://spark.apache.org/faq.html), which seems too 
high-level for the content here. So I added it to the MLlib user guide instead.

cc: mateiz

Author: Xiangrui Meng <m...@databricks.com>

Closes #16241 from mengxr/SPARK-18812.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2493a20
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2493a20
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2493a20

Branch: refs/heads/master
Commit: d2493a203e852adf63dde4e1fc993e8d11efec3d
Parents: cf33a86
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Dec 9 17:34:52 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Dec 9 17:34:52 2016 -0800

--
 docs/ml-guide.md | 12 
 1 file changed, 12 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d2493a20/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index ddf81be..9717619 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -35,6 +35,18 @@ The primary Machine Learning API for Spark is now the 
[DataFrame](sql-programmin
 * The DataFrame-based API for MLlib provides a uniform API across ML 
algorithms and across multiple languages.
 * DataFrames facilitate practical ML Pipelines, particularly feature 
transformations.  See the [Pipelines guide](ml-pipeline.html) for details.
 
+*What is "Spark ML"?*
+
+* "Spark ML" is not an official name but occasionally used to refer to the 
MLlib DataFrame-based API.
+  This is majorly due to the `org.apache.spark.ml` Scala package name used by 
the DataFrame-based API, 
+  and the "Spark ML Pipelines" term we used initially to emphasize the 
pipeline concept.
+  
+*Is MLlib deprecated?*
+
+* No. MLlib includes both the RDD-based API and the DataFrame-based API.
+  The RDD-based API is now in maintenance mode.
+  But neither API is deprecated, nor MLlib as a whole.
+
 # Dependencies
 
 MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), 
which depends on


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

2016-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 44df6d2ce -> 65b4b0561


[SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

## What changes were proposed in this pull request?

* This PR changes `JVMObjectTracker` from `object` to `class` and associates an 
instance with each `RBackend`, so we can manage the lifecycle of JVM 
objects when there are multiple `RBackend` sessions. `RBackend.close` will 
clear the object tracker explicitly.
* I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, 
which could be wrong.
* Small refactor of `SerDe.sqlSerDe` to increase readability.

## How was this patch tested?

* Added unit tests for `JVMObjectTracker`.
* Wait for Jenkins to run full tests.

Author: Xiangrui Meng <m...@databricks.com>

Closes #16154 from mengxr/SPARK-17822.

(cherry picked from commit fd48d80a6145ea94f03e7fc6e4d724a0fbccac58)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65b4b056
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65b4b056
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65b4b056

Branch: refs/heads/branch-2.0
Commit: 65b4b05616bf8f5cf70a618cc15d379634e9b42d
Parents: 44df6d2
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Dec 9 07:51:46 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Dec 9 07:55:58 2016 -0800

--
 .../apache/spark/api/r/JVMObjectTracker.scala   | 87 ++
 .../scala/org/apache/spark/api/r/RBackend.scala |  4 +
 .../apache/spark/api/r/RBackendHandler.scala| 54 ++--
 .../scala/org/apache/spark/api/r/RRunner.scala  |  2 +-
 .../scala/org/apache/spark/api/r/SerDe.scala| 92 
 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 
 .../org/apache/spark/api/r/RBackendSuite.scala  | 31 +++
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 12 +--
 8 files changed, 264 insertions(+), 91 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65b4b056/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala 
b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
new file mode 100644
index 000..3432700
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.r
+
+import java.util.concurrent.atomic.AtomicInteger
+import java.util.concurrent.ConcurrentHashMap
+
+/** JVM object ID wrapper */
+private[r] case class JVMObjectId(id: String) {
+  require(id != null, "Object ID cannot be null.")
+}
+
+/**
+ * Counter that tracks JVM objects returned to R.
+ * This is useful for referencing these objects in RPC calls.
+ */
+private[r] class JVMObjectTracker {
+
+  private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]()
+  private[this] val objCounter = new AtomicInteger()
+
+  /**
+   * Returns the JVM object associated with the input key or None if not found.
+   */
+  final def get(id: JVMObjectId): Option[Object] = this.synchronized {
+if (objMap.containsKey(id)) {
+  Some(objMap.get(id))
+} else {
+  None
+}
+  }
+
+  /**
+   * Returns the JVM object associated with the input key or throws an 
exception if not found.
+   */
+  @throws[NoSuchElementException]("if key does not exist.")
+  final def apply(id: JVMObjectId): Object = {
+get(id).getOrElse(
+  throw new NoSuchElementException(s"$id does not exist.")
+)
+  }
+
+  /**
+   * Adds a JVM object to track and returns assigned ID, which is unique 
within this tracker.
+   */
+  final def addAndGetId(obj: Object): JVMObjectId = {
+val id = JVMObjectId(objCounter.getAndIncrement().toString)
+obj

spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

2016-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.1 b226f10e3 -> 0c6415aec


[SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

## What changes were proposed in this pull request?

* This PR changes `JVMObjectTracker` from `object` to `class` and associates an 
instance with each `RBackend`, so we can manage the lifecycle of JVM 
objects when there are multiple `RBackend` sessions. `RBackend.close` will 
clear the object tracker explicitly.
* I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, 
which could be wrong.
* Small refactor of `SerDe.sqlSerDe` to increase readability.

## How was this patch tested?

* Added unit tests for `JVMObjectTracker`.
* Wait for Jenkins to run full tests.

Author: Xiangrui Meng <m...@databricks.com>

Closes #16154 from mengxr/SPARK-17822.

(cherry picked from commit fd48d80a6145ea94f03e7fc6e4d724a0fbccac58)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c6415ae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c6415ae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c6415ae

Branch: refs/heads/branch-2.1
Commit: 0c6415aeca7a5c2fc5462c483c60d770f0236efe
Parents: b226f10
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Dec 9 07:51:46 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Dec 9 07:51:58 2016 -0800

--
 .../apache/spark/api/r/JVMObjectTracker.scala   | 87 ++
 .../scala/org/apache/spark/api/r/RBackend.scala |  6 +-
 .../apache/spark/api/r/RBackendHandler.scala| 54 ++--
 .../scala/org/apache/spark/api/r/RRunner.scala  |  2 +-
 .../scala/org/apache/spark/api/r/SerDe.scala| 92 
 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 
 .../org/apache/spark/api/r/RBackendSuite.scala  | 31 +++
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 12 +--
 8 files changed, 265 insertions(+), 92 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0c6415ae/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala 
b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
new file mode 100644
index 000..3432700
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.r
+
+import java.util.concurrent.atomic.AtomicInteger
+import java.util.concurrent.ConcurrentHashMap
+
+/** JVM object ID wrapper */
+private[r] case class JVMObjectId(id: String) {
+  require(id != null, "Object ID cannot be null.")
+}
+
+/**
+ * Counter that tracks JVM objects returned to R.
+ * This is useful for referencing these objects in RPC calls.
+ */
+private[r] class JVMObjectTracker {
+
+  private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]()
+  private[this] val objCounter = new AtomicInteger()
+
+  /**
+   * Returns the JVM object associated with the input key or None if not found.
+   */
+  final def get(id: JVMObjectId): Option[Object] = this.synchronized {
+if (objMap.containsKey(id)) {
+  Some(objMap.get(id))
+} else {
+  None
+}
+  }
+
+  /**
+   * Returns the JVM object associated with the input key or throws an 
exception if not found.
+   */
+  @throws[NoSuchElementException]("if key does not exist.")
+  final def apply(id: JVMObjectId): Object = {
+get(id).getOrElse(
+  throw new NoSuchElementException(s"$id does not exist.")
+)
+  }
+
+  /**
+   * Adds a JVM object to track and returns assigned ID, which is unique 
within this tracker.
+   */
+  final def addAndGetId(obj: Object): JVMObjectId = {
+val id = JVMObjectId(objCounter.getAndIncrement().toString)
+obj

spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

2016-12-09 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master b162cc0c2 -> fd48d80a6


[SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend

## What changes were proposed in this pull request?

* This PR changes `JVMObjectTracker` from `object` to `class` and associates an 
instance with each `RBackend`, so we can manage the lifecycle of JVM 
objects when there are multiple `RBackend` sessions. `RBackend.close` will 
clear the object tracker explicitly.
* I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, 
which could be wrong.
* Small refactor of `SerDe.sqlSerDe` to increase readability.

## How was this patch tested?

* Added unit tests for `JVMObjectTracker`.
* Wait for Jenkins to run full tests.

Author: Xiangrui Meng <m...@databricks.com>

Closes #16154 from mengxr/SPARK-17822.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd48d80a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd48d80a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd48d80a

Branch: refs/heads/master
Commit: fd48d80a6145ea94f03e7fc6e4d724a0fbccac58
Parents: b162cc0
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Dec 9 07:51:46 2016 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Dec 9 07:51:46 2016 -0800

--
 .../apache/spark/api/r/JVMObjectTracker.scala   | 87 ++
 .../scala/org/apache/spark/api/r/RBackend.scala |  6 +-
 .../apache/spark/api/r/RBackendHandler.scala| 54 ++--
 .../scala/org/apache/spark/api/r/RRunner.scala  |  2 +-
 .../scala/org/apache/spark/api/r/SerDe.scala| 92 
 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 
 .../org/apache/spark/api/r/RBackendSuite.scala  | 31 +++
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 12 +--
 8 files changed, 265 insertions(+), 92 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fd48d80a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala 
b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
new file mode 100644
index 000..3432700
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.r
+
+import java.util.concurrent.atomic.AtomicInteger
+import java.util.concurrent.ConcurrentHashMap
+
+/** JVM object ID wrapper */
+private[r] case class JVMObjectId(id: String) {
+  require(id != null, "Object ID cannot be null.")
+}
+
+/**
+ * Counter that tracks JVM objects returned to R.
+ * This is useful for referencing these objects in RPC calls.
+ */
+private[r] class JVMObjectTracker {
+
+  private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]()
+  private[this] val objCounter = new AtomicInteger()
+
+  /**
+   * Returns the JVM object associated with the input key or None if not found.
+   */
+  final def get(id: JVMObjectId): Option[Object] = this.synchronized {
+if (objMap.containsKey(id)) {
+  Some(objMap.get(id))
+} else {
+  None
+}
+  }
+
+  /**
+   * Returns the JVM object associated with the input key or throws an 
exception if not found.
+   */
+  @throws[NoSuchElementException]("if key does not exist.")
+  final def apply(id: JVMObjectId): Object = {
+get(id).getOrElse(
+  throw new NoSuchElementException(s"$id does not exist.")
+)
+  }
+
+  /**
+   * Adds a JVM object to track and returns assigned ID, which is unique 
within this tracker.
+   */
+  final def addAndGetId(obj: Object): JVMObjectId = {
+val id = JVMObjectId(objCounter.getAndIncrement().toString)
+objMap.put(id, obj)
+id
+  }
+
+  /**
+   * Removes and returns a JVM object with the specific ID from the tracker, 
or None if not found.
+   */
+

spark git commit: [SPARKR][MINOR] Fix LDA doc

2016-08-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 08913ce00 -> 6a0fda2c0


[SPARKR][MINOR] Fix LDA doc

## What changes were proposed in this pull request?

This PR tries to fix the name of the `SparkDataFrame` used in the example. 
It also gives a reference URL for an example data file that users can 
experiment with.

## How was this patch tested?

Manual test.

Author: Junyang Qian <junya...@databricks.com>

Closes #14853 from junyangq/SPARKR-FixLDADoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a0fda2c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a0fda2c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a0fda2c

Branch: refs/heads/master
Commit: 6a0fda2c0590b455e8713da79cd5f2413e5d0f28
Parents: 08913ce
Author: Junyang Qian <junya...@databricks.com>
Authored: Mon Aug 29 10:23:10 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Aug 29 10:23:10 2016 -0700

--
 R/pkg/R/mllib.R | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6a0fda2c/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 6808aae..64d19fa 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -994,18 +994,22 @@ setMethod("spark.survreg", signature(data = 
"SparkDataFrame", formula = "formula
 #' @export
 #' @examples
 #' \dontrun{
-#' text <- read.df("path/to/data", source = "libsvm")
+#' # nolint start
+#' # An example "path/to/file" can be
+#' # paste0(Sys.getenv("SPARK_HOME"), "/data/mllib/sample_lda_libsvm_data.txt")
+#' # nolint end
+#' text <- read.df("path/to/file", source = "libsvm")
 #' model <- spark.lda(data = text, optimizer = "em")
 #'
 #' # get a summary of the model
 #' summary(model)
 #'
 #' # compute posterior probabilities
-#' posterior <- spark.posterior(model, df)
+#' posterior <- spark.posterior(model, text)
 #' showDF(posterior)
 #'
 #' # compute perplexity
-#' perplexity <- spark.perplexity(model, df)
+#' perplexity <- spark.perplexity(model, text)
 #'
 #' # save and load the model
 #' path <- "path/to/model"


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARKR][MINOR] Update R DESCRIPTION file

2016-08-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 eaea1c86b -> d16f9a0b7


[SPARKR][MINOR] Update R DESCRIPTION file

## What changes were proposed in this pull request?

Update DESCRIPTION

## How was this patch tested?

Run install and CRAN tests

Author: Felix Cheung <felixcheun...@hotmail.com>

Closes #14764 from felixcheung/rpackagedescription.

(cherry picked from commit d2b3d3e63e1a9217de6ef507c350308017664a62)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d16f9a0b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d16f9a0b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d16f9a0b

Branch: refs/heads/branch-2.0
Commit: d16f9a0b7c464728d7b11899740908e23820a797
Parents: eaea1c8
Author: Felix Cheung <felixcheun...@hotmail.com>
Authored: Mon Aug 22 20:15:03 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Aug 22 20:15:14 2016 -0700

--
 R/pkg/DESCRIPTION | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d16f9a0b/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index d81f1a3..e5afed2 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -3,10 +3,15 @@ Type: Package
 Title: R Frontend for Apache Spark
 Version: 2.0.0
 Date: 2016-07-07
-Author: The Apache Software Foundation
-Maintainer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
-Xiangrui Meng <m...@databricks.com>
-Felix Cheung <felixcheun...@hotmail.com>
+Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
+    email = "shiva...@cs.berkeley.edu"),
+ person("Xiangrui", "Meng", role = "aut",
+email = "m...@databricks.com"),
+ person("Felix", "Cheung", role = "aut",
+email = "felixche...@apache.org"),
+ person(family = "The Apache Software Foundation", role = c("aut", 
"cph")))
+URL: http://www.apache.org/ http://spark.apache.org/
+BugReports: 
https://issues.apache.org/jira/secure/CreateIssueDetails!init.jspa?pid=12315420&components=12325400&issuetype=4
 Depends:
 R (>= 3.0),
 methods


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore

2016-08-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 faff9297d -> 26d5a8b0d


[MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore

## What changes were proposed in this pull request?

Ignore temp files generated by `check-cran.sh`.

Author: Xiangrui Meng <m...@databricks.com>

Closes #14740 from mengxr/R-gitignore.

(cherry picked from commit ab7143463daf2056736c85e3a943c826b5992623)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26d5a8b0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26d5a8b0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26d5a8b0

Branch: refs/heads/branch-2.0
Commit: 26d5a8b0dab10310ec76b91465b3b4ff465e9746
Parents: faff929
Author: Xiangrui Meng <m...@databricks.com>
Authored: Sun Aug 21 10:31:25 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Aug 21 10:31:32 2016 -0700

--
 R/.gitignore | 2 ++
 1 file changed, 2 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/26d5a8b0/R/.gitignore
--
diff --git a/R/.gitignore b/R/.gitignore
index 9a5889b..c98504a 100644
--- a/R/.gitignore
+++ b/R/.gitignore
@@ -4,3 +4,5 @@
 lib
 pkg/man
 pkg/html
+SparkR.Rcheck/
+SparkR_*.tar.gz


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore

2016-08-21 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e328f577e -> ab7143463


[MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore

## What changes were proposed in this pull request?

Ignore temp files generated by `check-cran.sh`.

Author: Xiangrui Meng <m...@databricks.com>

Closes #14740 from mengxr/R-gitignore.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab714346
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab714346
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab714346

Branch: refs/heads/master
Commit: ab7143463daf2056736c85e3a943c826b5992623
Parents: e328f57
Author: Xiangrui Meng <m...@databricks.com>
Authored: Sun Aug 21 10:31:25 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Sun Aug 21 10:31:25 2016 -0700

--
 R/.gitignore | 2 ++
 1 file changed, 2 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ab714346/R/.gitignore
--
diff --git a/R/.gitignore b/R/.gitignore
index 9a5889b..c98504a 100644
--- a/R/.gitignore
+++ b/R/.gitignore
@@ -4,3 +4,5 @@
 lib
 pkg/man
 pkg/html
+SparkR.Rcheck/
+SparkR_*.tar.gz


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16443][SPARKR] Alternating Least Squares (ALS) wrapper

2016-08-19 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master cf0cce903 -> acac7a508


[SPARK-16443][SPARKR] Alternating Least Squares (ALS) wrapper

## What changes were proposed in this pull request?

Add Alternating Least Squares wrapper in SparkR. Unit tests have been updated.

## How was this patch tested?

SparkR unit tests.

(If this patch involves UI changes, please attach a screenshot; otherwise, 
remove this)

![screen shot 2016-07-27 at 3 50 31 
pm](https://cloud.githubusercontent.com/assets/15318264/17195347/f7a6352a-5411-11e6-8e21-61a48070192a.png)
![screen shot 2016-07-27 at 3 50 46 
pm](https://cloud.githubusercontent.com/assets/15318264/17195348/f7a7d452-5411-11e6-845f-6d292283bc28.png)

Author: Junyang Qian <junya...@databricks.com>

Closes #14384 from junyangq/SPARK-16443.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acac7a50
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acac7a50
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acac7a50

Branch: refs/heads/master
Commit: acac7a508a29d0f75d86ee2e4ca83ebf01a36cf8
Parents: cf0cce9
Author: Junyang Qian <junya...@databricks.com>
Authored: Fri Aug 19 14:24:09 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Aug 19 14:24:09 2016 -0700

--
 R/pkg/NAMESPACE |   3 +-
 R/pkg/R/generics.R  |   4 +
 R/pkg/R/mllib.R | 159 ++-
 R/pkg/inst/tests/testthat/test_mllib.R  |  40 +
 .../org/apache/spark/ml/r/ALSWrapper.scala  | 119 ++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 6 files changed, 322 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 4404cff..e1b87b2 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -29,7 +29,8 @@ exportMethods("glm",
   "spark.posterior",
   "spark.perplexity",
   "spark.isoreg",
-  "spark.gaussianMixture")
+  "spark.gaussianMixture",
+  "spark.als")
 
 # Job group lifecycle management methods
 export("setJobGroup",

http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index fe04bcf..693aa31 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1332,3 +1332,7 @@ setGeneric("spark.gaussianMixture",
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })
+
+#' @rdname spark.als
+#' @export
+setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })

http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index b952741..36f38fc 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -74,6 +74,13 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @note GaussianMixtureModel since 2.1.0
 setClass("GaussianMixtureModel", representation(jobj = "jobj"))
 
+#' S4 class that represents an ALSModel
+#'
+#' @param jobj a Java object reference to the backing Scala ALSWrapper
+#' @export
+#' @note ALSModel since 2.1.0
+setClass("ALSModel", representation(jobj = "jobj"))
+
 #' Saves the MLlib model to the input path
 #'
 #' Saves the MLlib model to the input path. For more information, see the 
specific
@@ -82,8 +89,8 @@ setClass("GaussianMixtureModel", representation(jobj = 
"jobj"))
 #' @name write.ml
 #' @export
 #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
-#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, 
\link{spark.survreg}, \link{spark.lda}
-#' @seealso \link{spark.isoreg}
+#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.lda}, 
\link{spark.naiveBayes}
+#' @seealso \link{spark.survreg}, \link{spark.isoreg}
 #' @seealso \link{read.ml}
 NULL
 
@@ -95,10 +102,11 @@ NULL
 #' @name predict
 #' @export
 #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
-#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
+#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.naiveBayes}, 
\link{spark.survreg}
 #' @seealso \link{spark.isoreg}
 NULL
 
+
 #

spark git commit: [SPARK-16446][SPARKR][ML] Gaussian Mixture Model wrapper in SparkR

2016-08-17 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master e3fec51fa -> 4d92af310


[SPARK-16446][SPARKR][ML] Gaussian Mixture Model wrapper in SparkR

## What changes were proposed in this pull request?
Gaussian Mixture Model wrapper in SparkR, similarly to R's ```mvnormalmixEM```.

## How was this patch tested?
Unit test.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #14392 from yanboliang/spark-16446.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4d92af31
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4d92af31
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4d92af31

Branch: refs/heads/master
Commit: 4d92af310ad29ade039e4130f91f2a3d9180deef
Parents: e3fec51
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Aug 17 11:18:33 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Aug 17 11:18:33 2016 -0700

--
 R/pkg/NAMESPACE |   3 +-
 R/pkg/R/generics.R  |   7 +
 R/pkg/R/mllib.R | 139 ++-
 R/pkg/inst/tests/testthat/test_mllib.R  |  62 +
 .../spark/ml/r/GaussianMixtureWrapper.scala | 128 +
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 6 files changed, 338 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1e23b23..c71eec5 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -25,7 +25,8 @@ exportMethods("glm",
   "fitted",
   "spark.naiveBayes",
   "spark.survreg",
-  "spark.isoreg")
+  "spark.isoreg",
+  "spark.gaussianMixture")
 
 # Job group lifecycle management methods
 export("setJobGroup",

http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ebacc11..06bb25d 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1308,6 +1308,13 @@ setGeneric("spark.survreg", function(data, formula, ...) 
{ standardGeneric("spar
 #' @export
 setGeneric("spark.isoreg", function(data, formula, ...) { 
standardGeneric("spark.isoreg") })
 
+#' @rdname spark.gaussianMixture
+#' @export
+setGeneric("spark.gaussianMixture",
+   function(data, formula, ...) {
+ standardGeneric("spark.gaussianMixture")
+   })
+
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })

http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 0dcc54d..db74046 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -60,6 +60,13 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' @note IsotonicRegressionModel since 2.1.0
 setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
+#' S4 class that represents a GaussianMixtureModel
+#'
+#' @param jobj a Java object reference to the backing Scala 
GaussianMixtureModel
+#' @export
+#' @note GaussianMixtureModel since 2.1.0
+setClass("GaussianMixtureModel", representation(jobj = "jobj"))
+
 #' Saves the MLlib model to the input path
 #'
 #' Saves the MLlib model to the input path. For more information, see the 
specific
@@ -67,7 +74,7 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @rdname write.ml
 #' @name write.ml
 #' @export
-#' @seealso \link{spark.glm}, \link{glm}
+#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
 #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
 #' @seealso \link{spark.isoreg}
 #' @seealso \link{read.ml}
@@ -80,7 +87,7 @@ NULL
 #' @rdname predict
 #' @name predict
 #' @export
-#' @seealso \link{spark.glm}, \link{glm}
+#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
 #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
 #' @seealso \link{spark.isoreg}
 NULL
@@ -649,6 +656,25 @@ setMethod("write.ml", signature(object = 
"IsotonicRegressionModel", path = "char
invisible(callJMethod(writer, "save", path))
   })
 
+#  Save fitted MLlib model to the input path
+
+#' @param path the directory

spark git commit: [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin

2016-06-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 b52bd8070 -> a54852350


[SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin

## What changes were proposed in this pull request?

This PR adds labelling support for the `include_example` Jekyll plugin, so that 
we may split a single source file into multiple line blocks with different 
labels, and include them in multiple code snippets in the generated HTML page.

## How was this patch tested?

Manually tested.

https://cloud.githubusercontent.com/assets/230655/16451099/66a76db2-3e33-11e6-84fb-63104c2f0688.png

Author: Cheng Lian <l...@databricks.com>

Closes #13972 from liancheng/include-example-with-labels.

(cherry picked from commit bde1d6a61593aeb62370f526542cead94919b0c0)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5485235
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5485235
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5485235

Branch: refs/heads/branch-2.0
Commit: a54852350346cacae61d851d796bc3a7abd3a048
Parents: b52bd80
Author: Cheng Lian <l...@databricks.com>
Authored: Wed Jun 29 22:50:53 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 29 22:51:04 2016 -0700

--
 docs/_plugins/include_example.rb| 25 +---
 docs/sql-programming-guide.md   | 41 +++-
 .../apache/spark/examples/sql/JavaSparkSQL.java |  5 +++
 examples/src/main/python/sql.py |  5 +++
 .../apache/spark/examples/sql/RDDRelation.scala | 10 -
 5 files changed, 43 insertions(+), 43 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a5485235/docs/_plugins/include_example.rb
--
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index f748582..306 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -32,8 +32,18 @@ module Jekyll
   @code_dir = File.join(site.source, config_dir)
 
   clean_markup = @markup.strip
-  @file = File.join(@code_dir, clean_markup)
-  @lang = clean_markup.split('.').last
+
+  parts = clean_markup.strip.split(' ')
+  if parts.length > 1 then
+@snippet_label = ':' + parts[0]
+snippet_file = parts[1]
+  else
+@snippet_label = ''
+snippet_file = parts[0]
+  end
+
+  @file = File.join(@code_dir, snippet_file)
+  @lang = snippet_file.split('.').last
 
   code = File.open(@file).read.encode("UTF-8")
   code = select_lines(code)
@@ -41,7 +51,7 @@ module Jekyll
   rendered_code = Pygments.highlight(code, :lexer => @lang)
 
   hint = "Find full example code at " \
-"\"examples/src/main/#{clean_markup}\" in the Spark 
repo."
+"\"examples/src/main/#{snippet_file}\" in the Spark 
repo."
 
   rendered_code + hint
 end
@@ -66,13 +76,13 @@ module Jekyll
   # Select the array of start labels from code.
   startIndices = lines
 .each_with_index
-.select { |l, i| l.include? "$example on$" }
+.select { |l, i| l.include? "$example on#{@snippet_label}$" }
 .map { |l, i| i }
 
   # Select the array of end labels from code.
   endIndices = lines
 .each_with_index
-.select { |l, i| l.include? "$example off$" }
+.select { |l, i| l.include? "$example off#{@snippet_label}$" }
 .map { |l, i| i }
 
   raise "Start indices amount is not equal to end indices amount, see 
#{@file}." \
@@ -92,7 +102,10 @@ module Jekyll
 if start == endline
 lastIndex = endline
 range = Range.new(start + 1, endline - 1)
-result += trim_codeblock(lines[range]).join
+trimmed = trim_codeblock(lines[range])
+# Filter out possible example tags of overlapped labels.
+taggs_filtered = trimmed.select { |l| !l.include? '$example ' }
+result += taggs_filtered.join
 result += "\n"
   end
   result

http://git-wip-us.apache.org/repos/asf/spark/blob/a5485235/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 6c6bc8d..68419e1 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -63,52 +63,23 @@ Throughout this document, we will often refer to Scala/Java 
Datasets of `Row`s a
 
 
 
-The entry point into all functionality in Spark is the 
[`SparkSession`](api/sca

spark git commit: [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin

2016-06-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master d3af6731f -> bde1d6a61


[SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin

## What changes were proposed in this pull request?

This PR adds labelling support for the `include_example` Jekyll plugin, so that 
we may split a single source file into multiple line blocks with different 
labels, and include them in multiple code snippets in the generated HTML page.

## How was this patch tested?

Manually tested.

https://cloud.githubusercontent.com/assets/230655/16451099/66a76db2-3e33-11e6-84fb-63104c2f0688.png

Author: Cheng Lian <l...@databricks.com>

Closes #13972 from liancheng/include-example-with-labels.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bde1d6a6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bde1d6a6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bde1d6a6

Branch: refs/heads/master
Commit: bde1d6a61593aeb62370f526542cead94919b0c0
Parents: d3af673
Author: Cheng Lian <l...@databricks.com>
Authored: Wed Jun 29 22:50:53 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 29 22:50:53 2016 -0700

--
 docs/_plugins/include_example.rb| 25 +---
 docs/sql-programming-guide.md   | 41 +++-
 .../apache/spark/examples/sql/JavaSparkSQL.java |  5 +++
 examples/src/main/python/sql.py |  5 +++
 .../apache/spark/examples/sql/RDDRelation.scala | 10 -
 5 files changed, 43 insertions(+), 43 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bde1d6a6/docs/_plugins/include_example.rb
--
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index f748582..306 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -32,8 +32,18 @@ module Jekyll
   @code_dir = File.join(site.source, config_dir)
 
   clean_markup = @markup.strip
-  @file = File.join(@code_dir, clean_markup)
-  @lang = clean_markup.split('.').last
+
+  parts = clean_markup.strip.split(' ')
+  if parts.length > 1 then
+@snippet_label = ':' + parts[0]
+snippet_file = parts[1]
+  else
+@snippet_label = ''
+snippet_file = parts[0]
+  end
+
+  @file = File.join(@code_dir, snippet_file)
+  @lang = snippet_file.split('.').last
 
   code = File.open(@file).read.encode("UTF-8")
   code = select_lines(code)
@@ -41,7 +51,7 @@ module Jekyll
   rendered_code = Pygments.highlight(code, :lexer => @lang)
 
   hint = "Find full example code at " \
-"\"examples/src/main/#{clean_markup}\" in the Spark 
repo."
+"\"examples/src/main/#{snippet_file}\" in the Spark 
repo."
 
   rendered_code + hint
 end
@@ -66,13 +76,13 @@ module Jekyll
   # Select the array of start labels from code.
   startIndices = lines
 .each_with_index
-.select { |l, i| l.include? "$example on$" }
+.select { |l, i| l.include? "$example on#{@snippet_label}$" }
 .map { |l, i| i }
 
   # Select the array of end labels from code.
   endIndices = lines
 .each_with_index
-.select { |l, i| l.include? "$example off$" }
+.select { |l, i| l.include? "$example off#{@snippet_label}$" }
 .map { |l, i| i }
 
   raise "Start indices amount is not equal to end indices amount, see 
#{@file}." \
@@ -92,7 +102,10 @@ module Jekyll
 if start == endline
 lastIndex = endline
 range = Range.new(start + 1, endline - 1)
-result += trim_codeblock(lines[range]).join
+trimmed = trim_codeblock(lines[range])
+# Filter out possible example tags of overlapped labels.
+taggs_filtered = trimmed.select { |l| !l.include? '$example ' }
+result += taggs_filtered.join
 result += "\n"
   end
   result

http://git-wip-us.apache.org/repos/asf/spark/blob/bde1d6a6/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 6c6bc8d..68419e1 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -63,52 +63,23 @@ Throughout this document, we will often refer to Scala/Java 
Datasets of `Row`s a
 
 
 
-The entry point into all functionality in Spark is the 
[`SparkSession`](api/scala/index.html#org.apache.spark.sql.SparkSession) class. 
To create a basic `SparkSession`, just use `SparkSession.build()`:
-
-{% hig

spark git commit: [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc

2016-06-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 d96e8c2dd -> 1cde325e2


[SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc

https://issues.apache.org/jira/browse/SPARK-16140

## What changes were proposed in this pull request?

Group the R doc of spark.kmeans, predict(KM), summary(KM), read/write.ml(KM) 
under Rd spark.kmeans. The example code was updated.

## How was this patch tested?

Tested on my local machine

And on my laptop `jekyll build` is failing to build API docs, so here I can 
only show you the html I manually generated from Rd files, with no CSS applied, 
but the doc content should be there.

![screenshotkmeans](https://cloud.githubusercontent.com/assets/3925641/16403203/c2c9ca1e-3ca7-11e6-9e29-f2164aee75fc.png)

Author: Xin Ren <iamsh...@126.com>

Closes #13921 from keypointt/SPARK-16140.

(cherry picked from commit 8c9cd0a7a719ce4286f77f35bb787e2b626a472e)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cde325e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cde325e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cde325e

Branch: refs/heads/branch-2.0
Commit: 1cde325e29286a8c6631b0b32351994aad7db567
Parents: d96e8c2
Author: Xin Ren <iamsh...@126.com>
Authored: Wed Jun 29 11:25:00 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 29 11:25:07 2016 -0700

--
 R/pkg/R/generics.R |  2 ++
 R/pkg/R/mllib.R| 72 +++--
 2 files changed, 35 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1cde325e/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 27dfd67..0e4350f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1247,6 +1247,7 @@ setGeneric("spark.glm", function(data, formula, ...) { 
standardGeneric("spark.gl
 #' @export
 setGeneric("glm")
 
+#' predict
 #' @rdname predict
 #' @export
 setGeneric("predict", function(object, ...) { standardGeneric("predict") })
@@ -1271,6 +1272,7 @@ setGeneric("spark.naiveBayes", function(data, formula, 
...) { standardGeneric("s
 #' @export
 setGeneric("spark.survreg", function(data, formula, ...) { 
standardGeneric("spark.survreg") })
 
+#' write.ml
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })

http://git-wip-us.apache.org/repos/asf/spark/blob/1cde325e/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 897a376..4fe7367 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -267,9 +267,10 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 return(list(apriori = apriori, tables = tables))
   })
 
-#' Fit a k-means model
+#' K-Means Clustering Model
 #'
-#' Fit a k-means model, similarly to R's kmeans().
+#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's 
kmeans().
+#' Users can print, make predictions on the produced model and save the model 
to the input path.
 #'
 #' @param data SparkDataFrame for training
 #' @param formula A symbolic description of the model to be fitted. Currently 
only a few formula
@@ -278,14 +279,32 @@ setMethod("summary", signature(object = 
"NaiveBayesModel"),
 #' @param k Number of centers
 #' @param maxIter Maximum iteration number
 #' @param initMode The initialization algorithm choosen to fit the model
-#' @return A fitted k-means model
+#' @return \code{spark.kmeans} returns a fitted k-means model
 #' @rdname spark.kmeans
+#' @name spark.kmeans
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random")
+#' sparkR.session()
+#' data(iris)
+#' df <- createDataFrame(iris)
+#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = 
"random")
+#' summary(model)
+#'
+#' # fitted values on training data
+#' fitted <- predict(model, df)
+#' head(select(fitted, "Sepal_Length", "prediction"))
+#'
+#' # save fitted model to input path
+#' path <- "path/to/model"
+#' write.ml(model, path)
+#'
+#' # can also read back the saved model and print
+#' savedModel <- read.ml(path)
+#' summary(savedModel)
 #' }
 #' @note spark.kmeans since 2.0.0
+#' @seealso \link{predict}, \link{read.ml}, \link{write.ml}
 setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = 
"formula&qu

spark git commit: [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc

2016-06-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c6a220d75 -> 8c9cd0a7a


[SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc

https://issues.apache.org/jira/browse/SPARK-16140

## What changes were proposed in this pull request?

Group the R doc of spark.kmeans, predict(KM), summary(KM), read/write.ml(KM) 
under Rd spark.kmeans. The example code was updated.

## How was this patch tested?

Tested on my local machine

And on my laptop `jekyll build` is failing to build API docs, so here I can 
only show you the html I manually generated from Rd files, with no CSS applied, 
but the doc content should be there.

![screenshotkmeans](https://cloud.githubusercontent.com/assets/3925641/16403203/c2c9ca1e-3ca7-11e6-9e29-f2164aee75fc.png)

Author: Xin Ren <iamsh...@126.com>

Closes #13921 from keypointt/SPARK-16140.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c9cd0a7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c9cd0a7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c9cd0a7

Branch: refs/heads/master
Commit: 8c9cd0a7a719ce4286f77f35bb787e2b626a472e
Parents: c6a220d
Author: Xin Ren <iamsh...@126.com>
Authored: Wed Jun 29 11:25:00 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 29 11:25:00 2016 -0700

--
 R/pkg/R/generics.R |  2 ++
 R/pkg/R/mllib.R| 72 +++--
 2 files changed, 35 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8c9cd0a7/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 27dfd67..0e4350f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1247,6 +1247,7 @@ setGeneric("spark.glm", function(data, formula, ...) { 
standardGeneric("spark.gl
 #' @export
 setGeneric("glm")
 
+#' predict
 #' @rdname predict
 #' @export
 setGeneric("predict", function(object, ...) { standardGeneric("predict") })
@@ -1271,6 +1272,7 @@ setGeneric("spark.naiveBayes", function(data, formula, 
...) { standardGeneric("s
 #' @export
 setGeneric("spark.survreg", function(data, formula, ...) { 
standardGeneric("spark.survreg") })
 
+#' write.ml
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })

http://git-wip-us.apache.org/repos/asf/spark/blob/8c9cd0a7/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 897a376..4fe7367 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -267,9 +267,10 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 return(list(apriori = apriori, tables = tables))
   })
 
-#' Fit a k-means model
+#' K-Means Clustering Model
 #'
-#' Fit a k-means model, similarly to R's kmeans().
+#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's 
kmeans().
+#' Users can print, make predictions on the produced model and save the model 
to the input path.
 #'
 #' @param data SparkDataFrame for training
 #' @param formula A symbolic description of the model to be fitted. Currently 
only a few formula
@@ -278,14 +279,32 @@ setMethod("summary", signature(object = 
"NaiveBayesModel"),
 #' @param k Number of centers
 #' @param maxIter Maximum iteration number
 #' @param initMode The initialization algorithm choosen to fit the model
-#' @return A fitted k-means model
+#' @return \code{spark.kmeans} returns a fitted k-means model
 #' @rdname spark.kmeans
+#' @name spark.kmeans
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random")
+#' sparkR.session()
+#' data(iris)
+#' df <- createDataFrame(iris)
+#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = 
"random")
+#' summary(model)
+#'
+#' # fitted values on training data
+#' fitted <- predict(model, df)
+#' head(select(fitted, "Sepal_Length", "prediction"))
+#'
+#' # save fitted model to input path
+#' path <- "path/to/model"
+#' write.ml(model, path)
+#'
+#' # can also read back the saved model and print
+#' savedModel <- read.ml(path)
+#' summary(savedModel)
 #' }
 #' @note spark.kmeans since 2.0.0
+#' @seealso \link{predict}, \link{read.ml}, \link{write.ml}
 setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = 
"formula"),
   function(data, formula, k = 2, maxIter = 20, initMode = 
c("k-means||", "random")) {

spark git commit: [MINOR][SPARKR] Fix arguments of survreg in SparkR

2016-06-29 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 ba71cf451 -> d96e8c2dd


[MINOR][SPARKR] Fix arguments of survreg in SparkR

## What changes were proposed in this pull request?
Fix the incorrect argument descriptions of ```survreg``` in SparkR.

## How was this patch tested?
```Arguments``` section of ```survreg``` doc before this PR (with wrong 
description for ```path``` and missing ```overwrite```):
![image](https://cloud.githubusercontent.com/assets/1962026/16447548/fe7a5ed4-3da1-11e6-8b96-b5bf2083b07e.png)

After this PR:
![image](https://cloud.githubusercontent.com/assets/1962026/16447617/368e0b18-3da2-11e6-8277-45640fb11859.png)

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13970 from yanboliang/spark-16143-followup.

(cherry picked from commit c6a220d756f23ee89a0d1366b20259890c9d67c9)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d96e8c2d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d96e8c2d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d96e8c2d

Branch: refs/heads/branch-2.0
Commit: d96e8c2dd0a9949751d3074b6ab61eee12f5d622
Parents: ba71cf4
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Jun 29 11:20:35 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 29 11:20:41 2016 -0700

--
 R/pkg/R/mllib.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d96e8c2d/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 8e6c2dd..897a376 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -442,11 +442,11 @@ setMethod("write.ml", signature(object = 
"NaiveBayesModel", path = "character"),
 
 # Saves the AFT survival regression model to the input path.
 
-#' @param path The directory where the model is savedist containing the 
model's coefficien
+#' @param path The directory where the model is saved
+#' @param overwrite Overwrites or not if the output path already exists. 
Default is FALSE
 #'  which means throw exception if the output path exists.
 #'
 #' @rdname spark.survreg
-#' @name write.ml
 #' @export
 #' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0
 #' @seealso \link{read.ml}


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA

2016-06-28 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 dd70a115c -> 22b4072e7


[SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA

## What changes were proposed in this pull request?
Add model loading backward compatibility for ml.feature.PCA.

## How was this patch tested?
Existing unit tests and a manual test for loading models saved by Spark 1.6.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13937 from yanboliang/spark-16245.

(cherry picked from commit 0df5ce1bc1387a58b33cd185008f4022bd3dcc69)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/22b4072e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/22b4072e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/22b4072e

Branch: refs/heads/branch-2.0
Commit: 22b4072e704f9a68a605e9a4cebf54d2122fe448
Parents: dd70a11
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue Jun 28 19:53:07 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jun 28 19:53:16 2016 -0700

--
 .../scala/org/apache/spark/ml/feature/PCA.scala   | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/22b4072e/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 72167b5..ef8b085 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -206,24 +206,22 @@ object PCAModel extends MLReadable[PCAModel] {
 override def load(path: String): PCAModel = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
 
-  // explainedVariance field is not present in Spark <= 1.6
-  val versionRegex = "([0-9]+)\\.([0-9]+).*".r
-  val hasExplainedVariance = metadata.sparkVersion match {
-case versionRegex(major, minor) =>
-  major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6)
-case _ => false
-  }
+  val versionRegex = "([0-9]+)\\.(.+)".r
+  val versionRegex(major, _) = metadata.sparkVersion
 
   val dataPath = new Path(path, "data").toString
-  val model = if (hasExplainedVariance) {
+  val model = if (major.toInt >= 2) {
 val Row(pc: DenseMatrix, explainedVariance: DenseVector) =
   sparkSession.read.parquet(dataPath)
 .select("pc", "explainedVariance")
 .head()
 new PCAModel(metadata.uid, pc, explainedVariance)
   } else {
-val Row(pc: DenseMatrix) = 
sparkSession.read.parquet(dataPath).select("pc").head()
-new PCAModel(metadata.uid, pc, 
Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector])
+// pc field is the old matrix format in Spark <= 1.6
+// explainedVariance field is not present in Spark <= 1.6
+val Row(pc: OldDenseMatrix) = 
sparkSession.read.parquet(dataPath).select("pc").head()
+new PCAModel(metadata.uid, pc.asML,
+  Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector])
   }
   DefaultParamsReader.getAndSetParams(model, metadata)
   model


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA

2016-06-28 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 363bcedee -> 0df5ce1bc


[SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA

## What changes were proposed in this pull request?
Add model loading backward compatibility for ml.feature.PCA.

## How was this patch tested?
Existing unit tests and a manual test for loading models saved by Spark 1.6.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13937 from yanboliang/spark-16245.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0df5ce1b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0df5ce1b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0df5ce1b

Branch: refs/heads/master
Commit: 0df5ce1bc1387a58b33cd185008f4022bd3dcc69
Parents: 363bced
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue Jun 28 19:53:07 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jun 28 19:53:07 2016 -0700

--
 .../scala/org/apache/spark/ml/feature/PCA.scala   | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0df5ce1b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 72167b5..ef8b085 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -206,24 +206,22 @@ object PCAModel extends MLReadable[PCAModel] {
 override def load(path: String): PCAModel = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
 
-  // explainedVariance field is not present in Spark <= 1.6
-  val versionRegex = "([0-9]+)\\.([0-9]+).*".r
-  val hasExplainedVariance = metadata.sparkVersion match {
-case versionRegex(major, minor) =>
-  major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6)
-case _ => false
-  }
+  val versionRegex = "([0-9]+)\\.(.+)".r
+  val versionRegex(major, _) = metadata.sparkVersion
 
   val dataPath = new Path(path, "data").toString
-  val model = if (hasExplainedVariance) {
+  val model = if (major.toInt >= 2) {
 val Row(pc: DenseMatrix, explainedVariance: DenseVector) =
   sparkSession.read.parquet(dataPath)
 .select("pc", "explainedVariance")
 .head()
 new PCAModel(metadata.uid, pc, explainedVariance)
   } else {
-val Row(pc: DenseMatrix) = 
sparkSession.read.parquet(dataPath).select("pc").head()
-new PCAModel(metadata.uid, pc, 
Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector])
+// pc field is the old matrix format in Spark <= 1.6
+// explainedVariance field is not present in Spark <= 1.6
+val Row(pc: OldDenseMatrix) = 
sparkSession.read.parquet(dataPath).select("pc").head()
+new PCAModel(metadata.uid, pc.asML,
+  Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector])
   }
   DefaultParamsReader.getAndSetParams(model, metadata)
   model


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16143][R] group AFT survival regression methods docs in a single Rd

2016-06-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 c7704099d -> 4c4f7775c


[SPARK-16143][R] group AFT survival regression methods docs in a single Rd

## What changes were proposed in this pull request?

This PR groups `spark.survreg`, `summary(AFT)`, `predict(AFT)`, `write.ml(AFT)` 
for survival regression into a single Rd.

## How was this patch tested?

Manually checked generated HTML doc. See attached screenshots.

![screen shot 2016-06-27 at 10 28 20 
am](https://cloud.githubusercontent.com/assets/15318264/16392008/a14cf472-3c5e-11e6-9ce5-490ed1a52249.png)
![screen shot 2016-06-27 at 10 28 35 
am](https://cloud.githubusercontent.com/assets/15318264/16392009/a14e333c-3c5e-11e6-8bd7-c2e9ba71f8e2.png)

Author: Junyang Qian <junya...@databricks.com>

Closes #13927 from junyangq/SPARK-16143.

(cherry picked from commit 1b7fc5817203db5a56489b289fb1a0dd44b2e26b)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c4f7775
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c4f7775
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c4f7775

Branch: refs/heads/branch-2.0
Commit: 4c4f7775cbf5dd69e688350ee59a9319bcaa56fe
Parents: c770409
Author: Junyang Qian <junya...@databricks.com>
Authored: Mon Jun 27 20:32:27 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jun 27 20:32:35 2016 -0700

--
 R/pkg/R/mllib.R | 88 +---
 1 file changed, 42 insertions(+), 46 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4c4f7775/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 853cfce..8e6c2dd 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -233,9 +233,10 @@ setMethod("predict", signature(object = 
"GeneralizedLinearRegressionModel"),
 # Makes predictions from a naive Bayes model or a model produced by 
spark.naiveBayes(),
 # similarly to R package e1071's predict.
 
-#' @rdname spark.naiveBayes
+#' @param newData A SparkDataFrame for testing
 #' @return \code{predict} returns a SparkDataFrame containing predicted 
labeled in a column named
 #' "prediction"
+#' @rdname spark.naiveBayes
 #' @export
 #' @note predict(NaiveBayesModel) since 2.0.0
 setMethod("predict", signature(object = "NaiveBayesModel"),
@@ -439,25 +440,16 @@ setMethod("write.ml", signature(object = 
"NaiveBayesModel", path = "character"),
 invisible(callJMethod(writer, "save", path))
   })
 
-#' Save fitted MLlib model to the input path
-#'
-#' Save the AFT survival regression model to the input path.
-#'
-#' @param object A fitted AFT survival regression model
-#' @param path The directory where the model is saved
-#' @param overwrite Overwrites or not if the output path already exists. 
Default is FALSE
+# Saves the AFT survival regression model to the input path.
+
+#' @param path The directory where the model is savedist containing the 
model's coefficien
 #'  which means throw exception if the output path exists.
 #'
-#' @rdname write.ml
+#' @rdname spark.survreg
 #' @name write.ml
 #' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx)
-#' path <- "path/to/model"
-#' write.ml(model, path)
-#' }
 #' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0
+#' @seealso \link{read.ml}
 setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = 
"character"),
   function(object, path, overwrite = FALSE) {
 writer <- callJMethod(object@jobj, "write")
@@ -542,15 +534,18 @@ read.ml <- function(path) {
   }
 }
 
-#' Fit an accelerated failure time (AFT) survival regression model.
+#' Accelerated Failure Time (AFT) Survival Regression Model
 #'
-#' Fit an accelerated failure time (AFT) survival regression model on a Spark 
DataFrame.
+#' \code{spark.survreg} fits an accelerated failure time (AFT) survival 
regression model on
+#' a SparkDataFrame. Users can call \code{summary} to get a summary of the 
fitted AFT model,
+#' \code{predict} to make predictions on new data, and 
\code{write.ml}/\code{read.ml} to
+#' save/load fitted models.
 #'
-#' @param data SparkDataFrame for training.
+#' @param data A SparkDataFrame for training
 #' @param formula A symbolic description of the model to be fitted. Currently 
only a few formula
 #'operators are supported, including '~', ':', '+', and '-'.
-#'Note that operator '.' is not supported currently.
-#' @return a

spark git commit: [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors

2016-06-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 e4bb31fb3 -> 27f3462d0


[SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML 
style vectors

## What changes were proposed in this pull request?
Need to convert ML Vectors to the old MLlib style before doing 
Statistics.colStats operations on the DataFrame

## How was this patch tested?
Ran example, local tests

Author: Bryan Cutler <cutl...@gmail.com>

Closes #13928 from BryanCutler/pyspark-ml-example-vector-conv-SPARK-16231.

(cherry picked from commit 1aa191e58e905f470f73663fc1c35f36e05e929a)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27f3462d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27f3462d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27f3462d

Branch: refs/heads/branch-2.0
Commit: 27f3462d0e11b4768140e452f02ab043438b8e86
Parents: e4bb31f
Author: Bryan Cutler <cutl...@gmail.com>
Authored: Mon Jun 27 12:58:39 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jun 27 14:12:31 2016 -0700

--
 examples/src/main/python/ml/dataframe_example.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/27f3462d/examples/src/main/python/ml/dataframe_example.py
--
diff --git a/examples/src/main/python/ml/dataframe_example.py 
b/examples/src/main/python/ml/dataframe_example.py
index a7d8b90..c1818d7 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -28,6 +28,7 @@ import shutil
 
 from pyspark.sql import SparkSession
 from pyspark.mllib.stat import Statistics
+from pyspark.mllib.util import MLUtils
 
 if __name__ == "__main__":
 if len(sys.argv) > 2:
@@ -55,7 +56,8 @@ if __name__ == "__main__":
 labelSummary.show()
 
 # Convert features column to an RDD of vectors.
-features = df.select("features").rdd.map(lambda r: r.features)
+features = MLUtils.convertVectorColumnsFromML(df, "features") \
+.select("features").rdd.map(lambda r: r.features)
 summary = Statistics.colStats(features)
 print("Selected features column with average values:\n" +
   str(summary.mean()))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors

2016-06-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c17b1abff -> 1aa191e58


[SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML 
style vectors

## What changes were proposed in this pull request?
Need to convert ML Vectors to the old MLlib style before doing 
Statistics.colStats operations on the DataFrame

## How was this patch tested?
Ran example, local tests

Author: Bryan Cutler <cutl...@gmail.com>

Closes #13928 from BryanCutler/pyspark-ml-example-vector-conv-SPARK-16231.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1aa191e5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1aa191e5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1aa191e5

Branch: refs/heads/master
Commit: 1aa191e58e905f470f73663fc1c35f36e05e929a
Parents: c17b1ab
Author: Bryan Cutler <cutl...@gmail.com>
Authored: Mon Jun 27 12:58:39 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jun 27 12:58:39 2016 -0700

--
 examples/src/main/python/ml/dataframe_example.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1aa191e5/examples/src/main/python/ml/dataframe_example.py
--
diff --git a/examples/src/main/python/ml/dataframe_example.py 
b/examples/src/main/python/ml/dataframe_example.py
index a7d8b90..c1818d7 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -28,6 +28,7 @@ import shutil
 
 from pyspark.sql import SparkSession
 from pyspark.mllib.stat import Statistics
+from pyspark.mllib.util import MLUtils
 
 if __name__ == "__main__":
 if len(sys.argv) > 2:
@@ -55,7 +56,8 @@ if __name__ == "__main__":
 labelSummary.show()
 
 # Convert features column to an RDD of vectors.
-features = df.select("features").rdd.map(lambda r: r.features)
+features = MLUtils.convertVectorColumnsFromML(df, "features") \
+.select("features").rdd.map(lambda r: r.features)
 summary = Statistics.colStats(features)
 print("Selected features column with average values:\n" +
   str(summary.mean()))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java

2016-06-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 f2017c59b -> e4bb31fb3


[SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16187
This provides conversion utils between old/new matrix columns in a 
DataFrame, so that users can migrate their datasets and pipelines manually.

## How was this patch tested?

Java and Scala unit tests

Author: Yuhao Yang <yuhao.y...@intel.com>

Closes #13888 from hhbyyh/matComp.

(cherry picked from commit c17b1abff8f8c6d24cb0cf4ff4f8c14a780c64b0)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4bb31fb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4bb31fb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4bb31fb

Branch: refs/heads/branch-2.0
Commit: e4bb31fb3afeaf6b6ddc1af4c9c07f1f7001b7cc
Parents: f2017c5
Author: Yuhao Yang <yuhao.y...@intel.com>
Authored: Mon Jun 27 12:27:39 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jun 27 12:27:47 2016 -0700

--
 .../org/apache/spark/ml/linalg/MatrixUDT.scala  |   2 +-
 .../org/apache/spark/mllib/util/MLUtils.scala   | 107 ++-
 .../spark/mllib/util/JavaMLUtilsSuite.java  |  29 -
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  56 +-
 4 files changed, 187 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e4bb31fb/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala 
b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
index 521a216..a1e5366 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types._
  * User-defined type for [[Matrix]] in [[mllib-local]] which allows easy 
interaction with SQL
  * via [[org.apache.spark.sql.Dataset]].
  */
-private[ml] class MatrixUDT extends UserDefinedType[Matrix] {
+private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
 
   override def sqlType: StructType = {
 // type: 0 = sparse, 1 = dense

http://git-wip-us.apache.org/repos/asf/spark/blob/e4bb31fb/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 7d5bdff..e96c2bc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -23,7 +23,7 @@ import scala.reflect.ClassTag
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
-import org.apache.spark.ml.linalg.{VectorUDT => MLVectorUDT}
+import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => 
MLVectorUDT}
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.linalg.BLAS.dot
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -309,8 +309,8 @@ object MLUtils extends Logging {
   }
 
   /**
-   * Converts vector columns in an input Dataset to the 
[[org.apache.spark.ml.linalg.Vector]] type
-   * from the new [[org.apache.spark.mllib.linalg.Vector]] type under the 
`spark.ml` package.
+   * Converts vector columns in an input Dataset to the 
[[org.apache.spark.mllib.linalg.Vector]]
+   * type from the new [[org.apache.spark.ml.linalg.Vector]] type under the 
`spark.ml` package.
* @param dataset input dataset
* @param cols a list of vector columns to be converted. Old vector columns 
will be ignored. If
* unspecified, all new vector columns will be converted except 
nested ones.
@@ -361,6 +361,107 @@ object MLUtils extends Logging {
   }
 
   /**
+   * Converts Matrix columns in an input Dataset from the 
[[org.apache.spark.mllib.linalg.Matrix]]
+   * type to the new [[org.apache.spark.ml.linalg.Matrix]] type under the 
`spark.ml` package.
+   * @param dataset input dataset
+   * @param cols a list of matrix columns to be converted. New matrix columns 
will be ignored. If
+   * unspecified, all old matrix columns will be converted except 
nested ones.
+   * @return the input [[DataFrame]] with old matrix columns converted to the 
new matrix type
+   */
+  @Since("2.0.0")
+  @varargs
+  def convertMatrixColumnsToML(dataset: Dataset[_], cols: String*): DataFrame 
= {
+

spark git commit: [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java

2016-06-27 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master c48c8ebc0 -> c17b1abff


[SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16187
This provides conversion utils between old/new matrix columns in a 
DataFrame, so that users can migrate their datasets and pipelines manually.

## How was this patch tested?

Java and Scala unit tests

Author: Yuhao Yang <yuhao.y...@intel.com>

Closes #13888 from hhbyyh/matComp.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c17b1abf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c17b1abf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c17b1abf

Branch: refs/heads/master
Commit: c17b1abff8f8c6d24cb0cf4ff4f8c14a780c64b0
Parents: c48c8eb
Author: Yuhao Yang <yuhao.y...@intel.com>
Authored: Mon Jun 27 12:27:39 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Jun 27 12:27:39 2016 -0700

--
 .../org/apache/spark/ml/linalg/MatrixUDT.scala  |   2 +-
 .../org/apache/spark/mllib/util/MLUtils.scala   | 107 ++-
 .../spark/mllib/util/JavaMLUtilsSuite.java  |  29 -
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  56 +-
 4 files changed, 187 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c17b1abf/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala 
b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
index 521a216..a1e5366 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types._
  * User-defined type for [[Matrix]] in [[mllib-local]] which allows easy 
interaction with SQL
  * via [[org.apache.spark.sql.Dataset]].
  */
-private[ml] class MatrixUDT extends UserDefinedType[Matrix] {
+private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
 
   override def sqlType: StructType = {
 // type: 0 = sparse, 1 = dense

http://git-wip-us.apache.org/repos/asf/spark/blob/c17b1abf/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 7d5bdff..e96c2bc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -23,7 +23,7 @@ import scala.reflect.ClassTag
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
-import org.apache.spark.ml.linalg.{VectorUDT => MLVectorUDT}
+import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => 
MLVectorUDT}
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.linalg.BLAS.dot
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -309,8 +309,8 @@ object MLUtils extends Logging {
   }
 
   /**
-   * Converts vector columns in an input Dataset to the 
[[org.apache.spark.ml.linalg.Vector]] type
-   * from the new [[org.apache.spark.mllib.linalg.Vector]] type under the 
`spark.ml` package.
+   * Converts vector columns in an input Dataset to the 
[[org.apache.spark.mllib.linalg.Vector]]
+   * type from the new [[org.apache.spark.ml.linalg.Vector]] type under the 
`spark.ml` package.
* @param dataset input dataset
* @param cols a list of vector columns to be converted. Old vector columns 
will be ignored. If
* unspecified, all new vector columns will be converted except 
nested ones.
@@ -361,6 +361,107 @@ object MLUtils extends Logging {
   }
 
   /**
+   * Converts Matrix columns in an input Dataset from the 
[[org.apache.spark.mllib.linalg.Matrix]]
+   * type to the new [[org.apache.spark.ml.linalg.Matrix]] type under the 
`spark.ml` package.
+   * @param dataset input dataset
+   * @param cols a list of matrix columns to be converted. New matrix columns 
will be ignored. If
+   * unspecified, all old matrix columns will be converted except 
nested ones.
+   * @return the input [[DataFrame]] with old matrix columns converted to the 
new matrix type
+   */
+  @Since("2.0.0")
+  @varargs
+  def convertMatrixColumnsToML(dataset: Dataset[_], cols: String*): DataFrame 
= {
+val schema = dataset.schema
+val colSet = if (cols.nonEmpty) {
+  cols.flatMap { c =>
+val dataType = schema(c).dataTyp

spark git commit: [SPARK-16133][ML] model loading backward compatibility for ml.feature

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 557eee5b6 -> 3d8d95644


[SPARK-16133][ML] model loading backward compatibility for ml.feature

## What changes were proposed in this pull request?

Add model loading backward compatibility for ml.feature.

## How was this patch tested?

Existing unit tests and a manual test for loading Spark 1.6 models.

Author: Yuhao Yang <yuhao.y...@intel.com>
Author: Yuhao Yang <hhb...@gmail.com>

Closes #13844 from hhbyyh/featureComp.

(cherry picked from commit cc6778ee0bf4fa7a78abd30542c4a6f80ea371c5)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d8d9564
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d8d9564
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d8d9564

Branch: refs/heads/branch-2.0
Commit: 3d8d956448fd3b7ae8d380e655bfa245b11c4ea0
Parents: 557eee5
Author: Yuhao Yang <yuhao.y...@intel.com>
Authored: Thu Jun 23 21:50:25 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 21:50:32 2016 -0700

--
 mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala  | 3 ++-
 .../scala/org/apache/spark/ml/feature/MinMaxScaler.scala| 9 ++---
 .../scala/org/apache/spark/ml/feature/StandardScaler.scala  | 4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index 02d4e6a..5d6287f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -27,6 +27,7 @@ import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
 import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => 
OldVectors}
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions._
@@ -180,9 +181,9 @@ object IDFModel extends MLReadable[IDFModel] {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.parquet(dataPath)
+  val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf")
 .select("idf")
 .head()
-  val idf = data.getAs[Vector](0)
   val model = new IDFModel(metadata.uid, new 
feature.IDFModel(OldVectors.fromML(idf)))
   DefaultParamsReader.getAndSetParams(model, metadata)
   model

http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
index 562b3f3..d5ad5ab 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
@@ -28,6 +28,7 @@ import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => 
OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions._
@@ -232,9 +233,11 @@ object MinMaxScalerModel extends 
MLReadable[MinMaxScalerModel] {
 override def load(path: String): MinMaxScalerModel = {
   val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
   val dataPath = new Path(path, "data").toString
-  val Row(originalMin: Vector, originalMax: Vector) = 
sparkSession.read.parquet(dataPath)
-.select("originalMin", "originalMax")
-.head()
+  val data = sparkSession.read.parquet(dataPath)
+  val Row(originalMin: Vector, originalMax: Vector) =
+MLUtils.convertVectorColumnsToML(data, "originalMin", "originalMax")
+  .select("originalMin", "originalMax")
+  .head()
   val model = new MinMaxScalerModel(metadata.uid, originalMin, originalMax)
   DefaultParamsReader.getAndSetParams(model, metadata)
   model

http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/StandardS

spark git commit: [SPARK-16142][R] group naiveBayes method docs in a single Rd

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 14bc5a7f3 -> 4a40d43bb


[SPARK-16142][R] group naiveBayes method docs in a single Rd

## What changes were proposed in this pull request?

This PR groups `spark.naiveBayes`, `summary(NB)`, `predict(NB)`, and 
`write.ml(NB)` into a single Rd.

## How was this patch tested?

Manually checked generated HTML doc. See attached screenshots.

![screen shot 2016-06-23 at 2 11 00 
pm](https://cloud.githubusercontent.com/assets/829644/16320452/a5885e92-394c-11e6-994f-2ab5cddad86f.png)

![screen shot 2016-06-23 at 2 11 15 
pm](https://cloud.githubusercontent.com/assets/829644/16320455/aad1f6d8-394c-11e6-8ef4-13bee989f52f.png)

Author: Xiangrui Meng <m...@databricks.com>

Closes #13877 from mengxr/SPARK-16142.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a40d43b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a40d43b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a40d43b

Branch: refs/heads/master
Commit: 4a40d43bb29704734b8128bf2a3f27802ae34e17
Parents: 14bc5a7
Author: Xiangrui Meng <m...@databricks.com>
Authored: Thu Jun 23 21:43:13 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 21:43:13 2016 -0700

--
 R/pkg/R/mllib.R | 90 
 1 file changed, 42 insertions(+), 48 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4a40d43b/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index dbff1b9..853cfce 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -218,9 +218,10 @@ print.summary.GeneralizedLinearRegressionModel <- 
function(x, ...) {
 
 #  Makes predictions from a generalized linear model produced by glm() or 
spark.glm(),
 #  similarly to R's predict().
-#'
+
 #' @param newData SparkDataFrame for testing
-#' @return \code{predict} returns a SparkDataFrame containing predicted labels 
in a column named "prediction"
+#' @return \code{predict} returns a SparkDataFrame containing predicted labels 
in a column named
+#' "prediction"
 #' @rdname spark.glm
 #' @export
 #' @note predict(GeneralizedLinearRegressionModel) since 1.5.0
@@ -229,41 +230,26 @@ setMethod("predict", signature(object = 
"GeneralizedLinearRegressionModel"),
 return(dataFrame(callJMethod(object@jobj, "transform", 
newData@sdf)))
   })
 
-#' Predicted values based on model
-#'
-#' Makes predictions from a naive Bayes model or a model produced by 
spark.naiveBayes(),
-#' similarly to R package e1071's predict.
-#'
-#' @param object A fitted naive Bayes model
-#' @rdname predict
+# Makes predictions from a naive Bayes model or a model produced by 
spark.naiveBayes(),
+# similarly to R package e1071's predict.
+
+#' @rdname spark.naiveBayes
+#' @return \code{predict} returns a SparkDataFrame containing predicted 
labeled in a column named
+#' "prediction"
 #' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.naiveBayes(trainingData, y ~ x)
-#' predicted <- predict(model, testData)
-#' showDF(predicted)
-#'}
 #' @note predict(NaiveBayesModel) since 2.0.0
 setMethod("predict", signature(object = "NaiveBayesModel"),
   function(object, newData) {
 return(dataFrame(callJMethod(object@jobj, "transform", 
newData@sdf)))
   })
 
-#' Get the summary of a naive Bayes model
-#'
-#' Returns the summary of a naive Bayes model produced by spark.naiveBayes(),
-#' similarly to R's summary().
-#'
-#' @param object A fitted MLlib model
-#' @return a list containing 'apriori', the label distribution, and 'tables', 
conditional
-#  probabilities given the target label
-#' @rdname summary
+# Returns the summary of a naive Bayes model produced by 
\code{spark.naiveBayes}
+
+#' @param object A naive Bayes model fitted by \code{spark.naiveBayes}
+#' @return \code{summary} returns a list containing \code{apriori}, the label 
distribution, and
+#' \code{tables}, conditional probabilities given the target label
+#' @rdname spark.naiveBayes
 #' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.naiveBayes(trainingData, y ~ x)
-#' summary(model)
-#'}
 #' @note summary(NaiveBayesModel) since 2.0.0
 setMethod("summary", signature(object = "NaiveBayesModel"),
   function(object, ...) {
@@ -390,23 +376,41 @@ setMethod("predict", signature(object = "KMeansModel"),
 return(dataFrame(callJMethod(object@jobj, "transform", 
newData@sdf)))
   })
 
-#' Fit a Bernoulli naive Bayes model
+#' Naive Bayes Models
 #'
-#' Fit a Bernoulli naive Bayes model o

spark git commit: [SPARK-16142][R] group naiveBayes method docs in a single Rd

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 ea0cf93d3 -> 557eee5b6


[SPARK-16142][R] group naiveBayes method docs in a single Rd

## What changes were proposed in this pull request?

This PR groups `spark.naiveBayes`, `summary(NB)`, `predict(NB)`, and 
`write.ml(NB)` into a single Rd.

## How was this patch tested?

Manually checked generated HTML doc. See attached screenshots.

![screen shot 2016-06-23 at 2 11 00 
pm](https://cloud.githubusercontent.com/assets/829644/16320452/a5885e92-394c-11e6-994f-2ab5cddad86f.png)

![screen shot 2016-06-23 at 2 11 15 
pm](https://cloud.githubusercontent.com/assets/829644/16320455/aad1f6d8-394c-11e6-8ef4-13bee989f52f.png)

Author: Xiangrui Meng <m...@databricks.com>

Closes #13877 from mengxr/SPARK-16142.

(cherry picked from commit 4a40d43bb29704734b8128bf2a3f27802ae34e17)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/557eee5b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/557eee5b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/557eee5b

Branch: refs/heads/branch-2.0
Commit: 557eee5b6d07f8a17257cd9aae5d7830b4de4690
Parents: ea0cf93
Author: Xiangrui Meng <m...@databricks.com>
Authored: Thu Jun 23 21:43:13 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 21:43:21 2016 -0700

--
 R/pkg/R/mllib.R | 90 
 1 file changed, 42 insertions(+), 48 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/557eee5b/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index dbff1b9..853cfce 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -218,9 +218,10 @@ print.summary.GeneralizedLinearRegressionModel <- 
function(x, ...) {
 
 #  Makes predictions from a generalized linear model produced by glm() or 
spark.glm(),
 #  similarly to R's predict().
-#'
+
 #' @param newData SparkDataFrame for testing
-#' @return \code{predict} returns a SparkDataFrame containing predicted labels 
in a column named "prediction"
+#' @return \code{predict} returns a SparkDataFrame containing predicted labels 
in a column named
+#' "prediction"
 #' @rdname spark.glm
 #' @export
 #' @note predict(GeneralizedLinearRegressionModel) since 1.5.0
@@ -229,41 +230,26 @@ setMethod("predict", signature(object = 
"GeneralizedLinearRegressionModel"),
 return(dataFrame(callJMethod(object@jobj, "transform", 
newData@sdf)))
   })
 
-#' Predicted values based on model
-#'
-#' Makes predictions from a naive Bayes model or a model produced by 
spark.naiveBayes(),
-#' similarly to R package e1071's predict.
-#'
-#' @param object A fitted naive Bayes model
-#' @rdname predict
+# Makes predictions from a naive Bayes model or a model produced by 
spark.naiveBayes(),
+# similarly to R package e1071's predict.
+
+#' @rdname spark.naiveBayes
+#' @return \code{predict} returns a SparkDataFrame containing predicted 
labeled in a column named
+#' "prediction"
 #' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.naiveBayes(trainingData, y ~ x)
-#' predicted <- predict(model, testData)
-#' showDF(predicted)
-#'}
 #' @note predict(NaiveBayesModel) since 2.0.0
 setMethod("predict", signature(object = "NaiveBayesModel"),
   function(object, newData) {
 return(dataFrame(callJMethod(object@jobj, "transform", 
newData@sdf)))
   })
 
-#' Get the summary of a naive Bayes model
-#'
-#' Returns the summary of a naive Bayes model produced by spark.naiveBayes(),
-#' similarly to R's summary().
-#'
-#' @param object A fitted MLlib model
-#' @return a list containing 'apriori', the label distribution, and 'tables', 
conditional
-#  probabilities given the target label
-#' @rdname summary
+# Returns the summary of a naive Bayes model produced by 
\code{spark.naiveBayes}
+
+#' @param object A naive Bayes model fitted by \code{spark.naiveBayes}
+#' @return \code{summary} returns a list containing \code{apriori}, the label 
distribution, and
+#' \code{tables}, conditional probabilities given the target label
+#' @rdname spark.naiveBayes
 #' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.naiveBayes(trainingData, y ~ x)
-#' summary(model)
-#'}
 #' @note summary(NaiveBayesModel) since 2.0.0
 setMethod("summary", signature(object = "NaiveBayesModel"),
   function(object, ...) {
@@ -390,23 +376,41 @@ setMethod("predict", signature(object = "KMeansModel"),
 return(dataFrame(callJMethod(object@jobj, "transform",

spark git commit: [SPARK-16177][ML] model loading backward compatibility for ml.regression

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a6edec2c5 -> ea0cf93d3


[SPARK-16177][ML] model loading backward compatibility for ml.regression

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16177
model loading backward compatibility for ml.regression

## How was this patch tested?

Existing unit tests and a manual test for loading 1.6 models.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #13879 from hhbyyh/regreComp.

(cherry picked from commit 14bc5a7f36bed19cd714a4c725a83feaccac3468)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea0cf93d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea0cf93d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea0cf93d

Branch: refs/heads/branch-2.0
Commit: ea0cf93d3969845e9df8305c0ce54326cdfb2bbd
Parents: a6edec2
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Thu Jun 23 20:43:19 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 20:43:29 2016 -0700

--
 .../apache/spark/ml/regression/AFTSurvivalRegression.scala  | 9 +
 .../org/apache/spark/ml/regression/LinearRegression.scala   | 8 +---
 2 files changed, 10 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ea0cf93d/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 2dbac49..7c51845 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -33,6 +33,7 @@ import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
@@ -389,10 +390,10 @@ object AFTSurvivalRegressionModel extends 
MLReadable[AFTSurvivalRegressionModel]
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.parquet(dataPath)
-.select("coefficients", "intercept", "scale").head()
-  val coefficients = data.getAs[Vector](0)
-  val intercept = data.getDouble(1)
-  val scale = data.getDouble(2)
+  val Row(coefficients: Vector, intercept: Double, scale: Double) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("coefficients", "intercept", "scale")
+  .head()
   val model = new AFTSurvivalRegressionModel(metadata.uid, coefficients, 
intercept, scale)
 
   DefaultParamsReader.getAndSetParams(model, metadata)

http://git-wip-us.apache.org/repos/asf/spark/blob/ea0cf93d/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 2723f74..0a4d98c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -39,6 +39,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
@@ -500,9 +501,10 @@ object LinearRegressionModel extends 
MLReadable[LinearRegressionModel] {
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.format("parquet").load(dataPath)
-.select("intercept", "coefficients").head()
-  val intercept = data.getDouble(0)
-  val coefficients = data.getAs[Vector](1)
+  val Row(intercept: Double, coefficients: Vector) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("intercept", "coef

spark git commit: [SPARK-16177][ML] model loading backward compatibility for ml.regression

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 6a3c6276f -> 14bc5a7f3


[SPARK-16177][ML] model loading backward compatibility for ml.regression

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16177
model loading backward compatibility for ml.regression

## How was this patch tested?

Existing unit tests and a manual test for loading 1.6 models.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #13879 from hhbyyh/regreComp.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14bc5a7f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14bc5a7f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14bc5a7f

Branch: refs/heads/master
Commit: 14bc5a7f36bed19cd714a4c725a83feaccac3468
Parents: 6a3c627
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Thu Jun 23 20:43:19 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 20:43:19 2016 -0700

--
 .../apache/spark/ml/regression/AFTSurvivalRegression.scala  | 9 +
 .../org/apache/spark/ml/regression/LinearRegression.scala   | 8 +---
 2 files changed, 10 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/14bc5a7f/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 2dbac49..7c51845 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -33,6 +33,7 @@ import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
@@ -389,10 +390,10 @@ object AFTSurvivalRegressionModel extends 
MLReadable[AFTSurvivalRegressionModel]
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.parquet(dataPath)
-.select("coefficients", "intercept", "scale").head()
-  val coefficients = data.getAs[Vector](0)
-  val intercept = data.getDouble(1)
-  val scale = data.getDouble(2)
+  val Row(coefficients: Vector, intercept: Double, scale: Double) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("coefficients", "intercept", "scale")
+  .head()
   val model = new AFTSurvivalRegressionModel(metadata.uid, coefficients, 
intercept, scale)
 
   DefaultParamsReader.getAndSetParams(model, metadata)

http://git-wip-us.apache.org/repos/asf/spark/blob/14bc5a7f/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 2723f74..0a4d98c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -39,6 +39,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
@@ -500,9 +501,10 @@ object LinearRegressionModel extends 
MLReadable[LinearRegressionModel] {
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.format("parquet").load(dataPath)
-.select("intercept", "coefficients").head()
-  val intercept = data.getDouble(0)
-  val coefficients = data.getAs[Vector](1)
+  val Row(intercept: Double, coefficients: Vector) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("intercept", "coefficients")
+  .head()
   val model = new LinearRegressionModel(metadata.uid, coefficients, 
intercept)
 
   DefaultParamsReader.getAndSetParams(model, metadata)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 738f134bf -> 91b1ef28d


[SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with 
child predicate first

## What changes were proposed in this pull request?

This PR changes `CombineFilters` to compose the final predicate condition by 
using (`child predicate` AND `parent predicate`) instead of (`parent predicate` 
AND `child predicate`). This is a best effort approach. Some other optimization 
rules may destroy this order by reorganizing conjunctive predicates.

**Reported Error Scenario**
Chris McCubbin reported a bug when he used StringIndexer in an ML pipeline with 
additional filters. It seems that during filter pushdown, we changed the 
ordering in the logical plan.
```scala
import org.apache.spark.ml.feature._
val df1 = (0 until 3).map(_.toString).toDF
val indexer = new StringIndexer()
  .setInputCol("value")
  .setOutputCol("idx")
  .setHandleInvalid("skip")
  .fit(df1)
val df2 = (0 until 5).map(_.toString).toDF
val predictions = indexer.transform(df2)
predictions.show() // this is okay
predictions.where('idx > 2).show() // this will throw an exception
```

Please see the notebook at 
https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1233855/2159162931615821/588180/latest.html
 for error messages.

## How was this patch tested?

Pass the Jenkins tests (including a new testcase).

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #13872 from dongjoon-hyun/SPARK-16164.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91b1ef28
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91b1ef28
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91b1ef28

Branch: refs/heads/master
Commit: 91b1ef28d134313d7b6faaffa1c390f3ca4455d0
Parents: 738f134
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Thu Jun 23 15:27:43 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 15:27:43 2016 -0700

--
 .../spark/sql/catalyst/optimizer/Optimizer.scala  |  2 +-
 .../catalyst/optimizer/FilterPushdownSuite.scala  | 18 ++
 2 files changed, 19 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/91b1ef28/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 6e78ad0..2bca31d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1002,7 +1002,7 @@ object CombineFilters extends Rule[LogicalPlan] with 
PredicateHelper {
   (ExpressionSet(splitConjunctivePredicates(fc)) --
 ExpressionSet(splitConjunctivePredicates(nc))).reduceOption(And) match 
{
 case Some(ac) =>
-  Filter(And(ac, nc), grandChild)
+  Filter(And(nc, ac), grandChild)
 case None =>
   nf
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/91b1ef28/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index b8f28e8..9cb49e7 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -94,6 +94,24 @@ class FilterPushdownSuite extends PlanTest {
 comparePlans(optimized, correctAnswer)
   }
 
+  test("SPARK-16164: Filter pushdown should keep the ordering in the logical 
plan") {
+val originalQuery =
+  testRelation
+.where('a === 1)
+.select('a, 'b)
+.where('b === 1)
+
+val optimized = Optimize.execute(originalQuery.analyze)
+val correctAnswer =
+  testRelation
+.where('a === 1 && 'b === 1)
+.select('a, 'b)
+.analyze
+
+// We can not use comparePlans here because it normalized the plan.
+assert(optimized == correctAnswer)
+  }
+
   test("can't push without rewrite") {
 val originalQuery =
   testRelation


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 2ce240cfe -> 6cb24de99


[SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with 
child predicate first

## What changes were proposed in this pull request?

This PR changes `CombineFilters` to compose the final predicate condition by 
using (`child predicate` AND `parent predicate`) instead of (`parent predicate` 
AND `child predicate`). This is a best effort approach. Some other optimization 
rules may destroy this order by reorganizing conjunctive predicates.

**Reported Error Scenario**
Chris McCubbin reported a bug when he used StringIndexer in an ML pipeline with 
additional filters. It seems that during filter pushdown, we changed the 
ordering in the logical plan.
```scala
import org.apache.spark.ml.feature._
val df1 = (0 until 3).map(_.toString).toDF
val indexer = new StringIndexer()
  .setInputCol("value")
  .setOutputCol("idx")
  .setHandleInvalid("skip")
  .fit(df1)
val df2 = (0 until 5).map(_.toString).toDF
val predictions = indexer.transform(df2)
predictions.show() // this is okay
predictions.where('idx > 2).show() // this will throw an exception
```

Please see the notebook at 
https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1233855/2159162931615821/588180/latest.html
 for error messages.

## How was this patch tested?

Pass the Jenkins tests (including a new testcase).

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #13872 from dongjoon-hyun/SPARK-16164.

(cherry picked from commit 91b1ef28d134313d7b6faaffa1c390f3ca4455d0)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cb24de9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cb24de9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cb24de9

Branch: refs/heads/branch-2.0
Commit: 6cb24de99e011ce97fb7d3513a2760b0d1a85a45
Parents: 2ce240c
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Thu Jun 23 15:27:43 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 15:27:50 2016 -0700

--
 .../spark/sql/catalyst/optimizer/Optimizer.scala  |  2 +-
 .../catalyst/optimizer/FilterPushdownSuite.scala  | 18 ++
 2 files changed, 19 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6cb24de9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 6190f7a..6b10484 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -963,7 +963,7 @@ object CombineFilters extends Rule[LogicalPlan] with 
PredicateHelper {
   (ExpressionSet(splitConjunctivePredicates(fc)) --
 ExpressionSet(splitConjunctivePredicates(nc))).reduceOption(And) match 
{
 case Some(ac) =>
-  Filter(And(ac, nc), grandChild)
+  Filter(And(nc, ac), grandChild)
 case None =>
   nf
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/6cb24de9/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index b8f28e8..9cb49e7 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -94,6 +94,24 @@ class FilterPushdownSuite extends PlanTest {
 comparePlans(optimized, correctAnswer)
   }
 
+  test("SPARK-16164: Filter pushdown should keep the ordering in the logical 
plan") {
+val originalQuery =
+  testRelation
+.where('a === 1)
+.select('a, 'b)
+.where('b === 1)
+
+val optimized = Optimize.execute(originalQuery.analyze)
+val correctAnswer =
+  testRelation
+.where('a === 1 && 'b === 1)
+.select('a, 'b)
+.analyze
+
+// We can not use comparePlans here because it normalized the plan.
+assert(optimized == correctAnswer)
+  }
+
   test("can't push without rewrite") {
 val originalQuery =
   testRelation


---

spark git commit: [SPARK-16130][ML] model loading backward compatibility for ml.classfication.LogisticRegression

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master d85bb10ce -> 60398dabc


[SPARK-16130][ML] model loading backward compatibility for 
ml.classfication.LogisticRegression

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16130
Model loading backward compatibility for ml.classification.LogisticRegression.

## How was this patch tested?
Existing unit tests and a manual test for loading old models.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #13841 from hhbyyh/lrcomp.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60398dab
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60398dab
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60398dab

Branch: refs/heads/master
Commit: 60398dabc50d402bbab4190fbe94ebed6d3a48dc
Parents: d85bb10
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Thu Jun 23 11:00:00 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 11:00:00 2016 -0700

--
 .../spark/ml/classification/LogisticRegression.scala  | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/60398dab/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index be69d46..9c9f5ce 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -674,12 +674,12 @@ object LogisticRegressionModel extends 
MLReadable[LogisticRegressionModel] {
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.format("parquet").load(dataPath)
-.select("numClasses", "numFeatures", "intercept", 
"coefficients").head()
+
   // We will need numClasses, numFeatures in the future for multinomial 
logreg support.
-  // val numClasses = data.getInt(0)
-  // val numFeatures = data.getInt(1)
-  val intercept = data.getDouble(2)
-  val coefficients = data.getAs[Vector](3)
+  val Row(numClasses: Int, numFeatures: Int, intercept: Double, 
coefficients: Vector) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("numClasses", "numFeatures", "intercept", "coefficients")
+  .head()
   val model = new LogisticRegressionModel(metadata.uid, coefficients, 
intercept)
 
   DefaultParamsReader.getAndSetParams(model, metadata)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16130][ML] model loading backward compatibility for ml.classfication.LogisticRegression

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 63fd3301c -> dff3d75db


[SPARK-16130][ML] model loading backward compatibility for 
ml.classfication.LogisticRegression

## What changes were proposed in this pull request?
jira: https://issues.apache.org/jira/browse/SPARK-16130
Model loading backward compatibility for ml.classification.LogisticRegression.

## How was this patch tested?
Existing unit tests and a manual test for loading old models.

Author: Yuhao Yang <hhb...@gmail.com>

Closes #13841 from hhbyyh/lrcomp.

(cherry picked from commit 60398dabc50d402bbab4190fbe94ebed6d3a48dc)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dff3d75d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dff3d75d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dff3d75d

Branch: refs/heads/branch-2.0
Commit: dff3d75db4c2848a43ed8a3084c75f38c93138af
Parents: 63fd330
Author: Yuhao Yang <hhb...@gmail.com>
Authored: Thu Jun 23 11:00:00 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 11:00:06 2016 -0700

--
 .../spark/ml/classification/LogisticRegression.scala  | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dff3d75d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index be69d46..9c9f5ce 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -674,12 +674,12 @@ object LogisticRegressionModel extends 
MLReadable[LogisticRegressionModel] {
 
   val dataPath = new Path(path, "data").toString
   val data = sparkSession.read.format("parquet").load(dataPath)
-.select("numClasses", "numFeatures", "intercept", 
"coefficients").head()
+
   // We will need numClasses, numFeatures in the future for multinomial 
logreg support.
-  // val numClasses = data.getInt(0)
-  // val numFeatures = data.getInt(1)
-  val intercept = data.getDouble(2)
-  val coefficients = data.getAs[Vector](3)
+  val Row(numClasses: Int, numFeatures: Int, intercept: Double, 
coefficients: Vector) =
+MLUtils.convertVectorColumnsToML(data, "coefficients")
+  .select("numClasses", "numFeatures", "intercept", "coefficients")
+  .head()
   val model = new LogisticRegressionModel(metadata.uid, coefficients, 
intercept)
 
   DefaultParamsReader.getAndSetParams(model, metadata)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/master 5bf2889bf -> 65d1f0f71


[SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs

## What changes were proposed in this pull request?

Since we decided to switch the spark.mllib package into maintenance mode in 2.0, it 
would be nice to update the package docs to reflect this change.

## How was this patch tested?

Manually checked generated APIs.

Author: Xiangrui Meng <m...@databricks.com>

Closes #13859 from mengxr/SPARK-16154.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65d1f0f7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65d1f0f7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65d1f0f7

Branch: refs/heads/master
Commit: 65d1f0f716f50dd14b5dfe1e7fac772f1b4d2be0
Parents: 5bf2889
Author: Xiangrui Meng <m...@databricks.com>
Authored: Thu Jun 23 08:26:17 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 08:26:17 2016 -0700

--
 .../scala/org/apache/spark/ml/package-info.java |  7 ++---
 .../scala/org/apache/spark/ml/package.scala |  4 +--
 .../org/apache/spark/mllib/JavaPackage.java | 31 
 .../org/apache/spark/mllib/package-info.java| 22 +-
 .../scala/org/apache/spark/mllib/package.scala  | 17 ++-
 5 files changed, 72 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/ml/package-info.java
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package-info.java 
b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
index 9a40f5dd..cb97382 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package-info.java
+++ b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
@@ -16,10 +16,7 @@
  */
 
 /**
- * Spark ML is a component that adds a new set of machine learning APIs to let 
users quickly
- * assemble and configure practical machine learning pipelines.
+ * DataFrame-based machine learning APIs to let users quickly assemble and 
configure practical
+ * machine learning pipelines.
  */
-@Experimental
 package org.apache.spark.ml;
-
-import org.apache.spark.annotation.Experimental;

http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/ml/package.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala 
b/mllib/src/main/scala/org/apache/spark/ml/package.scala
index 5cc328b..a445c67 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala
@@ -18,8 +18,8 @@
 package org.apache.spark
 
 /**
- * Spark ML is a component that adds a new set of machine learning APIs to let 
users quickly
- * assemble and configure practical machine learning pipelines.
+ * DataFrame-based machine learning APIs to let users quickly assemble and 
configure practical
+ * machine learning pipelines.
  *
  * @groupname param Parameters
  * @groupdesc param A list of (hyper-)parameter keys this algorithm can take. 
Users can set and get

http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java 
b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
new file mode 100644
index 000..22e3452
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib;
+
+import org.apache.spark.annotation.AlphaComponent;
+
+/**
+ * A dummy class as a workaround to show the package doc of 
spark.mllib in generated
+ * Java API docs.
+ * @see http://bugs.java.com/bugdatabase/view_bug.do?bug_id=449265

spark git commit: [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs

2016-06-23 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 4ad731ed6 -> 567093596


[SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs

## What changes were proposed in this pull request?

Since we decided to switch the spark.mllib package into maintenance mode in 2.0, it 
would be nice to update the package docs to reflect this change.

## How was this patch tested?

Manually checked generated APIs.

Author: Xiangrui Meng <m...@databricks.com>

Closes #13859 from mengxr/SPARK-16154.

(cherry picked from commit 65d1f0f716f50dd14b5dfe1e7fac772f1b4d2be0)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56709359
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56709359
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56709359

Branch: refs/heads/branch-2.0
Commit: 567093596057eb77d940d53c88b82da128acfd9b
Parents: 4ad731e
Author: Xiangrui Meng <m...@databricks.com>
Authored: Thu Jun 23 08:26:17 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Jun 23 08:26:25 2016 -0700

--
 .../scala/org/apache/spark/ml/package-info.java |  7 ++---
 .../scala/org/apache/spark/ml/package.scala |  4 +--
 .../org/apache/spark/mllib/JavaPackage.java | 31 
 .../org/apache/spark/mllib/package-info.java| 22 +-
 .../scala/org/apache/spark/mllib/package.scala  | 17 ++-
 5 files changed, 72 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/ml/package-info.java
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package-info.java 
b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
index 9a40f5dd..cb97382 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package-info.java
+++ b/mllib/src/main/scala/org/apache/spark/ml/package-info.java
@@ -16,10 +16,7 @@
  */
 
 /**
- * Spark ML is a component that adds a new set of machine learning APIs to let 
users quickly
- * assemble and configure practical machine learning pipelines.
+ * DataFrame-based machine learning APIs to let users quickly assemble and 
configure practical
+ * machine learning pipelines.
  */
-@Experimental
 package org.apache.spark.ml;
-
-import org.apache.spark.annotation.Experimental;

http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/ml/package.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala 
b/mllib/src/main/scala/org/apache/spark/ml/package.scala
index 5cc328b..a445c67 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala
@@ -18,8 +18,8 @@
 package org.apache.spark
 
 /**
- * Spark ML is a component that adds a new set of machine learning APIs to let 
users quickly
- * assemble and configure practical machine learning pipelines.
+ * DataFrame-based machine learning APIs to let users quickly assemble and 
configure practical
+ * machine learning pipelines.
  *
  * @groupname param Parameters
  * @groupdesc param A list of (hyper-)parameter keys this algorithm can take. 
Users can set and get

http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java 
b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
new file mode 100644
index 000..22e3452
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib;
+
+import org.apache.spark.annotation.AlphaComponent;
+
+/**
+ * A dummy class as a workaround to sh

spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs

2016-06-22 Thread meng

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 02435acf3 -> 1d3c56e77


[SPARK-16155][DOC] remove package grouping in Java docs

## What changes were proposed in this pull request?

In 1.4 and earlier releases, we have package grouping in the generated Java API 
docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this 
disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html.

Rather than fixing it, I'd suggest removing grouping, because it might take 
some time to fix and it is a manual process to update the grouping in 
`SparkBuild.scala`. I didn't find anyone complaining about missing groups since 
1.5.0 on Google.

Manually checked the generated Java API docs and confirmed that they are the 
same as in master.

Author: Xiangrui Meng <m...@databricks.com>

Closes #13856 from mengxr/SPARK-16155.

(cherry picked from commit 857ecff1d8268b28bb287e47cda370c87afe9d41)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3c56e7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3c56e7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3c56e7

Branch: refs/heads/branch-2.0
Commit: 1d3c56e778b28ad4587d07765896814bfc1201f4
Parents: 02435ac
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Jun 22 15:52:37 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Jun 22 15:52:47 2016 -0700

--
 project/SparkBuild.scala | 20 
 1 file changed, 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1d3c56e7/project/SparkBuild.scala
--
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index bce7f1d..4b44469 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -684,11 +684,6 @@ object Unidoc {
   import sbtunidoc.Plugin._
   import UnidocKeys._
 
-  // for easier specification of JavaDoc package groups
-  private def packageList(names: String*): String = {
-names.map(s => "org.apache.spark." + s).mkString(":")
-  }
-
   private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): 
Seq[Seq[File]] = {
 packages
   .map(_.filterNot(_.getName.contains("$")))
@@ -731,21 +726,6 @@ object Unidoc {
 javacOptions in doc := Seq(
   "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " 
JavaDoc",
   "-public",
-  "-group", "Core Java API", packageList("api.java", "api.java.function"),
-  "-group", "Spark Streaming", packageList(
-"streaming.api.java", "streaming.flume", "streaming.kafka", 
"streaming.kinesis"
-  ),
-  "-group", "MLlib", packageList(
-"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", 
"mllib.linalg",
-"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", 
"mllib.recommendation",
-"mllib.regression", "mllib.stat", "mllib.tree", 
"mllib.tree.configuration",
-"mllib.tree.impurity", "mllib.tree.model", "mllib.util",
-"mllib.evaluation", "mllib.feature", "mllib.random", 
"mllib.stat.correlation",
-"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss",
-"ml", "ml.attribute", "ml.classification", "ml.clustering", 
"ml.evaluation", "ml.feature",
-"ml.param", "ml.recommendation", "ml.regression", "ml.tuning"
-  ),
-  "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", 
"sql.hive.api.java"),
   "-noqualifier", "java.lang"
 ),
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1469 matches

Mail list logo