[spark] branch master updated: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e248bc7 [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF e248bc7 is described below commit e248bc7af6086cde7dd89a51459ae6a221a600c8 Author: Weichen Xu AuthorDate: Tue May 12 08:54:28 2020 -0700 [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF ### What changes were proposed in this pull request? Expose hashFunc property in HashingTF Some third-party library such as mleap need to access it. See background description here: https://github.com/combust/mleap/pull/665#issuecomment-621258623 ### Why are the changes needed? See https://github.com/combust/mleap/pull/665#issuecomment-621258623 ### Does this PR introduce any user-facing change? No. Only add a package private constructor. ### How was this patch tested? N/A Closes #28413 from WeichenXu123/hashing_tf_expose_hashfunc. Authored-by: Weichen Xu Signed-off-by: Xiangrui Meng --- .../org/apache/spark/ml/feature/HashingTF.scala| 40 +- .../apache/spark/ml/feature/HashingTFSuite.scala | 4 +++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 80bf859..d2bb013 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -42,14 +42,17 @@ import org.apache.spark.util.VersionUtils.majorMinorVersion * otherwise the features will not be mapped evenly to the columns. 
*/ @Since("1.2.0") -class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) +class HashingTF @Since("3.0.0") private[ml] ( +@Since("1.4.0") override val uid: String, +@Since("3.1.0") val hashFuncVersion: Int) extends Transformer with HasInputCol with HasOutputCol with HasNumFeatures with DefaultParamsWritable { - private var hashFunc: Any => Int = FeatureHasher.murmur3Hash - @Since("1.2.0") - def this() = this(Identifiable.randomUID("hashingTF")) + def this() = this(Identifiable.randomUID("hashingTF"), HashingTF.SPARK_3_MURMUR3_HASH) + + @Since("1.4.0") + def this(uid: String) = this(uid, hashFuncVersion = HashingTF.SPARK_3_MURMUR3_HASH) /** @group setParam */ @Since("1.4.0") @@ -122,7 +125,12 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) */ @Since("3.0.0") def indexOf(term: Any): Int = { -Utils.nonNegativeMod(hashFunc(term), $(numFeatures)) +val hashValue = hashFuncVersion match { + case HashingTF.SPARK_2_MURMUR3_HASH => OldHashingTF.murmur3Hash(term) + case HashingTF.SPARK_3_MURMUR3_HASH => FeatureHasher.murmur3Hash(term) + case _ => throw new IllegalArgumentException("Illegal hash function version setting.") +} +Utils.nonNegativeMod(hashValue, $(numFeatures)) } @Since("1.4.1") @@ -132,27 +140,41 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) override def toString: String = { s"HashingTF: uid=$uid, binary=${$(binary)}, numFeatures=${$(numFeatures)}" } + + @Since("3.0.0") + override def save(path: String): Unit = { +require(hashFuncVersion == HashingTF.SPARK_3_MURMUR3_HASH, + "Cannot save model which is loaded from lower version spark saved model. 
We can address " + + "it by (1) use old spark version to save the model, or (2) use new version spark to " + + "re-train the pipeline.") +super.save(path) + } } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { + private[ml] val SPARK_2_MURMUR3_HASH = 1 + private[ml] val SPARK_3_MURMUR3_HASH = 2 + private class HashingTFReader extends MLReader[HashingTF] { private val className = classOf[HashingTF].getName override def load(path: String): HashingTF = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - val hashingTF = new HashingTF(metadata.uid) - metadata.getAndSetParams(hashingTF) // We support loading old `HashingTF` saved by previous Spark versions. // Previous `HashingTF` uses `mllib.feature.HashingTF.murmur3Hash`, but new `HashingTF` uses // `ml.Feature.FeatureHasher.murmur3Hash`. val (majorVersion, _) = majorMinorVersion(metadata.sparkVersion) - if (majorVersi
[spark] branch branch-3.0 updated: [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new b50d53b [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF b50d53b is described below commit b50d53b1079ea32c75f9192f27b2b07cdec69641 Author: Weichen Xu AuthorDate: Tue May 12 08:54:28 2020 -0700 [SPARK-31610][SPARK-31668][ML] Address hashingTF saving bug and expose hashFunc property in HashingTF ### What changes were proposed in this pull request? Expose hashFunc property in HashingTF Some third-party library such as mleap need to access it. See background description here: https://github.com/combust/mleap/pull/665#issuecomment-621258623 ### Why are the changes needed? See https://github.com/combust/mleap/pull/665#issuecomment-621258623 ### Does this PR introduce any user-facing change? No. Only add a package private constructor. ### How was this patch tested? N/A Closes #28413 from WeichenXu123/hashing_tf_expose_hashfunc. Authored-by: Weichen Xu Signed-off-by: Xiangrui Meng (cherry picked from commit e248bc7af6086cde7dd89a51459ae6a221a600c8) Signed-off-by: Xiangrui Meng --- .../org/apache/spark/ml/feature/HashingTF.scala| 40 +- .../apache/spark/ml/feature/HashingTFSuite.scala | 4 +++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 80bf859..d2bb013 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -42,14 +42,17 @@ import org.apache.spark.util.VersionUtils.majorMinorVersion * otherwise the features will not be mapped evenly to the columns. 
*/ @Since("1.2.0") -class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) +class HashingTF @Since("3.0.0") private[ml] ( +@Since("1.4.0") override val uid: String, +@Since("3.1.0") val hashFuncVersion: Int) extends Transformer with HasInputCol with HasOutputCol with HasNumFeatures with DefaultParamsWritable { - private var hashFunc: Any => Int = FeatureHasher.murmur3Hash - @Since("1.2.0") - def this() = this(Identifiable.randomUID("hashingTF")) + def this() = this(Identifiable.randomUID("hashingTF"), HashingTF.SPARK_3_MURMUR3_HASH) + + @Since("1.4.0") + def this(uid: String) = this(uid, hashFuncVersion = HashingTF.SPARK_3_MURMUR3_HASH) /** @group setParam */ @Since("1.4.0") @@ -122,7 +125,12 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) */ @Since("3.0.0") def indexOf(term: Any): Int = { -Utils.nonNegativeMod(hashFunc(term), $(numFeatures)) +val hashValue = hashFuncVersion match { + case HashingTF.SPARK_2_MURMUR3_HASH => OldHashingTF.murmur3Hash(term) + case HashingTF.SPARK_3_MURMUR3_HASH => FeatureHasher.murmur3Hash(term) + case _ => throw new IllegalArgumentException("Illegal hash function version setting.") +} +Utils.nonNegativeMod(hashValue, $(numFeatures)) } @Since("1.4.1") @@ -132,27 +140,41 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String) override def toString: String = { s"HashingTF: uid=$uid, binary=${$(binary)}, numFeatures=${$(numFeatures)}" } + + @Since("3.0.0") + override def save(path: String): Unit = { +require(hashFuncVersion == HashingTF.SPARK_3_MURMUR3_HASH, + "Cannot save model which is loaded from lower version spark saved model. 
We can address " + + "it by (1) use old spark version to save the model, or (2) use new version spark to " + + "re-train the pipeline.") +super.save(path) + } } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { + private[ml] val SPARK_2_MURMUR3_HASH = 1 + private[ml] val SPARK_3_MURMUR3_HASH = 2 + private class HashingTFReader extends MLReader[HashingTF] { private val className = classOf[HashingTF].getName override def load(path: String): HashingTF = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - val hashingTF = new HashingTF(metadata.uid) - metadata.getAndSetParams(hashingTF) // We support loading old `HashingTF` saved by previous Spark versions. // Previous `HashingTF` uses `mllib.feature.HashingTF.murmur3Hash`, but new `HashingTF` uses
[spark] branch branch-3.0 updated: [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 4421178 [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model 4421178 is described below commit 442117812ca6edc6e0ab271da829032b9637e89e Author: Weichen Xu AuthorDate: Sun Apr 26 21:04:14 2020 -0700 [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model ### What changes were proposed in this pull request? Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model. Most pyspark estimators/transformers inherit `JavaParams`, but some estimators are special (in order to support pure python implemented nested estimators/transformers): * Pipeline * OneVsRest * CrossValidator * TrainValidationSplit But note that, currently, in pyspark, the model readers/writers of the estimators listed above do NOT support pure python implemented nested estimators/transformers, because they use the java reader/writer wrapper as the python side reader/writer. Pyspark CrossValidator/TrainValidationSplit model reader/writer require all estimators to define the `_transfer_param_map_to_java` and `_transfer_param_map_from_java` methods (used in model read/write). The OneVsRest class already defines the two methods, but Pipeline does not, which leads to this bug. In this PR I add `_transfer_param_map_to_java` and `_transfer_param_map_from_java` to the Pipeline class. ### Why are the changes needed? Bug fix. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Unit test. 
Manually test in pyspark shell: 1) CrossValidator with Simple Pipeline estimator ``` from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder training = spark.createDataFrame([ (0, "a b c d e spark", 1.0), (1, "b d", 0.0), (2, "spark f g h", 1.0), (3, "hadoop mapreduce", 0.0), (4, "b spark who", 1.0), (5, "g d a y", 0.0), (6, "spark fly", 1.0), (7, "was mapreduce", 0.0), ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \ .addGrid(lr.regParam, [0.1, 0.01]) \ .build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=2) # use 3+ folds in practice # Run cross-validation, and choose the best set of parameters. cvModel = crossval.fit(training) cvModel.save('/tmp/cv_model001') CrossValidatorModel.load('/tmp/cv_model001') ``` 2) CrossValidator with a Pipeline estimator which includes a OneVsRest estimator stage, where the OneVsRest estimator nests a LogisticRegression estimator. 
``` from pyspark.ml.linalg import Vectors from pyspark.ml import Estimator, Model from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, OneVsRest from pyspark.ml.evaluation import BinaryClassificationEvaluator, \ MulticlassClassificationEvaluator, RegressionEvaluator from pyspark.ml.linalg import Vectors from pyspark.ml.param import Param, Params from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder, \ TrainValidationSplit, TrainValidationSplitModel from pyspark.sql.functions import rand from pyspark.testing.mlutils import SparkSessionTestCase dataset = spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) ova = OneVsRest(classifier=LogisticRegression()) lr1 = LogisticReg
[spark] branch master updated: [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4a21c4c [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model 4a21c4c is described below commit 4a21c4cc92805b034ade0593eea3c4a9b6122083 Author: Weichen Xu AuthorDate: Sun Apr 26 21:04:14 2020 -0700 [SPARK-31497][ML][PYSPARK] Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model ### What changes were proposed in this pull request? Fix Pyspark CrossValidator/TrainValidationSplit with pipeline estimator cannot save and load model. Most pyspark estimators/transformers inherit `JavaParams`, but some estimators are special (in order to support pure python implemented nested estimators/transformers): * Pipeline * OneVsRest * CrossValidator * TrainValidationSplit But note that, currently, in pyspark, the model readers/writers of the estimators listed above do NOT support pure python implemented nested estimators/transformers, because they use the java reader/writer wrapper as the python side reader/writer. Pyspark CrossValidator/TrainValidationSplit model reader/writer require all estimators to define the `_transfer_param_map_to_java` and `_transfer_param_map_from_java` methods (used in model read/write). The OneVsRest class already defines the two methods, but Pipeline does not, which leads to this bug. In this PR I add `_transfer_param_map_to_java` and `_transfer_param_map_from_java` to the Pipeline class. ### Why are the changes needed? Bug fix. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Unit test. 
Manually test in pyspark shell: 1) CrossValidator with Simple Pipeline estimator ``` from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder training = spark.createDataFrame([ (0, "a b c d e spark", 1.0), (1, "b d", 0.0), (2, "spark f g h", 1.0), (3, "hadoop mapreduce", 0.0), (4, "b spark who", 1.0), (5, "g d a y", 0.0), (6, "spark fly", 1.0), (7, "was mapreduce", 0.0), ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \ .addGrid(lr.regParam, [0.1, 0.01]) \ .build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=2) # use 3+ folds in practice # Run cross-validation, and choose the best set of parameters. cvModel = crossval.fit(training) cvModel.save('/tmp/cv_model001') CrossValidatorModel.load('/tmp/cv_model001') ``` 2) CrossValidator with a Pipeline estimator which includes a OneVsRest estimator stage, where the OneVsRest estimator nests a LogisticRegression estimator. 
``` from pyspark.ml.linalg import Vectors from pyspark.ml import Estimator, Model from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, OneVsRest from pyspark.ml.evaluation import BinaryClassificationEvaluator, \ MulticlassClassificationEvaluator, RegressionEvaluator from pyspark.ml.linalg import Vectors from pyspark.ml.param import Param, Params from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder, \ TrainValidationSplit, TrainValidationSplitModel from pyspark.sql.functions import rand from pyspark.testing.mlutils import SparkSessionTestCase dataset = spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) ova = OneVsRest(classifier=LogisticRegression()) lr1 = LogisticReg
[spark] branch branch-3.0 updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new f482187 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext f482187 is described below commit f482187c127418d2ea538ac2551ae0fce1ddbc31 Author: sarthfrey-db AuthorDate: Thu Feb 13 16:15:00 2020 -0800 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext ### What changes were proposed in this pull request? The `allGather` method is added to the `BarrierTaskContext`. This method contains the same functionality as the `BarrierTaskContext.barrier` method; it blocks the task until all tasks make the call, at which time they may continue execution. In addition, the `allGather` method takes an input message. Upon returning from the `allGather` the task receives a list of all the messages sent by all the tasks that made the `allGather` call. ### Why are the changes needed? There are many situations where having the tasks communicate in a synchronized way is useful. One simple example is if each task needs to start a server to serve requests from one another; first the tasks must find a free port (the result of which is undetermined beforehand) and then start making requests, but to do so they each must know the port chosen by the other task. An `allGather` method would allow them to inform each other of the port they will run on. ### Does this PR introduce any user-facing change? Yes, a `BarrierTaskContext.allGather` method will be available through the Scala, Java, and Python APIs. ### How was this patch tested? Most of the code path is already covered by tests to the `barrier` method, since this PR includes a refactor so that much code is shared by the `barrier` and `allGather` methods. However, a test is added to assert that an all gather on each task's partition ID will return a list of every partition ID. 
An example through the Python API: ```python >>> from pyspark import BarrierTaskContext >>> >>> def f(iterator): ... context = BarrierTaskContext.get() ... return [context.allGather('{}'.format(context.partitionId()))] ... >>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0] [u'3', u'1', u'0', u'2'] ``` Closes #27395 from sarthfrey/master. Lead-authored-by: sarthfrey-db Co-authored-by: sarthfrey Signed-off-by: Xiangrui Meng (cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002) Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierCoordinator.scala | 113 +-- .../org/apache/spark/BarrierTaskContext.scala | 153 ++--- .../org/apache/spark/api/python/PythonRunner.scala | 51 +-- .../spark/scheduler/BarrierTaskContextSuite.scala | 74 ++ python/pyspark/taskcontext.py | 49 ++- python/pyspark/tests/test_taskcontext.py | 20 +++ 6 files changed, 381 insertions(+), 79 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 4e41767..042a266 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,12 +17,17 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. 
private var barrierEpoch: Int = 0 -// An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() -// call. +// An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) +// An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call +private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] + +// The blocking requestMethod called by tasks to sync up for this stage attempt +private var requestMethodToSync: RequestMethod.Value = RequestM
[spark] branch master updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new af63971 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext af63971 is described below commit af63971cb7a5e7c7cb23ff1f87e5838d54c59a7d Author: sarthfrey-db AuthorDate: Thu Feb 13 16:15:00 2020 -0800 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext ### What changes were proposed in this pull request? The `allGather` method is added to the `BarrierTaskContext`. This method contains the same functionality as the `BarrierTaskContext.barrier` method; it blocks the task until all tasks make the call, at which time they may continue execution. In addition, the `allGather` method takes an input message. Upon returning from the `allGather` the task receives a list of all the messages sent by all the tasks that made the `allGather` call. ### Why are the changes needed? There are many situations where having the tasks communicate in a synchronized way is useful. One simple example is if each task needs to start a server to serve requests from one another; first the tasks must find a free port (the result of which is undetermined beforehand) and then start making requests, but to do so they each must know the port chosen by the other task. An `allGather` method would allow them to inform each other of the port they will run on. ### Does this PR introduce any user-facing change? Yes, a `BarrierTaskContext.allGather` method will be available through the Scala, Java, and Python APIs. ### How was this patch tested? Most of the code path is already covered by tests to the `barrier` method, since this PR includes a refactor so that much code is shared by the `barrier` and `allGather` methods. However, a test is added to assert that an all gather on each task's partition ID will return a list of every partition ID. 
An example through the Python API: ```python >>> from pyspark import BarrierTaskContext >>> >>> def f(iterator): ... context = BarrierTaskContext.get() ... return [context.allGather('{}'.format(context.partitionId()))] ... >>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0] [u'3', u'1', u'0', u'2'] ``` Closes #27395 from sarthfrey/master. Lead-authored-by: sarthfrey-db Co-authored-by: sarthfrey Signed-off-by: Xiangrui Meng (cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002) Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierCoordinator.scala | 113 +-- .../org/apache/spark/BarrierTaskContext.scala | 153 ++--- .../org/apache/spark/api/python/PythonRunner.scala | 51 +-- .../spark/scheduler/BarrierTaskContextSuite.scala | 74 ++ python/pyspark/taskcontext.py | 49 ++- python/pyspark/tests/test_taskcontext.py | 20 +++ 6 files changed, 381 insertions(+), 79 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 4e41767..042a266 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,12 +17,17 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. 
private var barrierEpoch: Int = 0 -// An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() -// call. +// An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) +// An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call +private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] + +// The blocking requestMethod called by tasks to sync up for this stage attempt +private var requestMethodToSync: RequestMethod.Value = RequestM
[spark] branch branch-3.0 updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 6001866 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext 6001866 is described below commit 6001866cea1216da421c5acd71d6fc74228222ac Author: sarthfrey-db AuthorDate: Thu Feb 13 16:15:00 2020 -0800 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext ### What changes were proposed in this pull request? The `allGather` method is added to the `BarrierTaskContext`. This method contains the same functionality as the `BarrierTaskContext.barrier` method; it blocks the task until all tasks make the call, at which time they may continue execution. In addition, the `allGather` method takes an input message. Upon returning from the `allGather` the task receives a list of all the messages sent by all the tasks that made the `allGather` call. ### Why are the changes needed? There are many situations where having the tasks communicate in a synchronized way is useful. One simple example is if each task needs to start a server to serve requests from one another; first the tasks must find a free port (the result of which is undetermined beforehand) and then start making requests, but to do so they each must know the port chosen by the other task. An `allGather` method would allow them to inform each other of the port they will run on. ### Does this PR introduce any user-facing change? Yes, a `BarrierTaskContext.allGather` method will be available through the Scala, Java, and Python APIs. ### How was this patch tested? Most of the code path is already covered by tests to the `barrier` method, since this PR includes a refactor so that much code is shared by the `barrier` and `allGather` methods. However, a test is added to assert that an all gather on each task's partition ID will return a list of every partition ID. 
An example through the Python API: ```python >>> from pyspark import BarrierTaskContext >>> >>> def f(iterator): ... context = BarrierTaskContext.get() ... return [context.allGather('{}'.format(context.partitionId()))] ... >>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0] [u'3', u'1', u'0', u'2'] ``` Closes #27395 from sarthfrey/master. Lead-authored-by: sarthfrey-db Co-authored-by: sarthfrey Signed-off-by: Xiangrui Meng (cherry picked from commit 57254c9719f9af9ad985596ed7fbbaafa4052002) Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierCoordinator.scala | 113 +-- .../org/apache/spark/BarrierTaskContext.scala | 153 ++--- .../org/apache/spark/api/python/PythonRunner.scala | 51 +-- .../spark/scheduler/BarrierTaskContextSuite.scala | 74 ++ python/pyspark/taskcontext.py | 49 ++- python/pyspark/tests/test_taskcontext.py | 20 +++ 6 files changed, 381 insertions(+), 79 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 4e41767..042a266 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,12 +17,17 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. 
private var barrierEpoch: Int = 0 -// An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() -// call. +// An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) +// An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call +private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] + +// The blocking requestMethod called by tasks to sync up for this stage attempt +private var requestMethodToSync: RequestMethod.Value = RequestM
[spark] branch master updated: [SPARK-30667][CORE] Add allGather method to BarrierTaskContext
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 57254c9 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext 57254c9 is described below commit 57254c9719f9af9ad985596ed7fbbaafa4052002 Author: sarthfrey-db AuthorDate: Thu Feb 13 16:15:00 2020 -0800 [SPARK-30667][CORE] Add allGather method to BarrierTaskContext ### What changes were proposed in this pull request? The `allGather` method is added to the `BarrierTaskContext`. This method contains the same functionality as the `BarrierTaskContext.barrier` method; it blocks the task until all tasks make the call, at which time they may continue execution. In addition, the `allGather` method takes an input message. Upon returning from the `allGather` the task receives a list of all the messages sent by all the tasks that made the `allGather` call. ### Why are the changes needed? There are many situations where having the tasks communicate in a synchronized way is useful. One simple example is if each task needs to start a server to serve requests from one another; first the tasks must find a free port (the result of which is undetermined beforehand) and then start making requests, but to do so they each must know the port chosen by the other task. An `allGather` method would allow them to inform each other of the port they will run on. ### Does this PR introduce any user-facing change? Yes, a `BarrierTaskContext.allGather` method will be available through the Scala, Java, and Python APIs. ### How was this patch tested? Most of the code path is already covered by tests of the `barrier` method, since this PR includes a refactor so that much code is shared by the `barrier` and `allGather` methods. However, a test is added to assert that an all gather on each task's partition ID will return a list of every partition ID. 
An example through the Python API: ```python >>> from pyspark import BarrierTaskContext >>> >>> def f(iterator): ... context = BarrierTaskContext.get() ... return [context.allGather('{}'.format(context.partitionId()))] ... >>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0] [u'3', u'1', u'0', u'2'] ``` Closes #27395 from sarthfrey/master. Lead-authored-by: sarthfrey-db Co-authored-by: sarthfrey Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierCoordinator.scala | 113 +-- .../org/apache/spark/BarrierTaskContext.scala | 153 ++--- .../org/apache/spark/api/python/PythonRunner.scala | 51 +-- .../spark/scheduler/BarrierTaskContextSuite.scala | 74 ++ python/pyspark/taskcontext.py | 49 ++- python/pyspark/tests/test_taskcontext.py | 20 +++ 6 files changed, 381 insertions(+), 79 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 4e41767..042a266 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,12 +17,17 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. private var barrierEpoch: Int = 0 -// An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() -// call. 
+// An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) +// An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call +private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] + +// The blocking requestMethod called by tasks to sync up for this stage attempt +private var requestMethodToSync: RequestMethod.Value = RequestMethod.BARRIER + // A timer task that ensures we may timeout for a barrier() call. private var timerTask:
[spark] branch master updated: [SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 88542bc [SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays 88542bc is described below commit 88542bc3d9e506b1a0e852f3e9c632920d3fe553 Author: WeichenXu AuthorDate: Mon Jan 6 16:18:51 2020 -0800 [SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features"))) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in the JVM. However, it requires the PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. 
Authored-by: WeichenXu Signed-off-by: Xiangrui Meng --- dev/sparktestsupport/modules.py| 1 + .../main/scala/org/apache/spark/ml/functions.scala | 48 +++ .../scala/org/apache/spark/ml/FunctionsSuite.scala | 65 + python/docs/pyspark.ml.rst | 8 +++ python/pyspark/ml/functions.py | 68 ++ 5 files changed, 190 insertions(+) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 1443584..4179359 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -460,6 +460,7 @@ pyspark_ml = Module( "pyspark.ml.evaluation", "pyspark.ml.feature", "pyspark.ml.fpm", +"pyspark.ml.functions", "pyspark.ml.image", "pyspark.ml.linalg.__init__", "pyspark.ml.recommendation", diff --git a/mllib/src/main/scala/org/apache/spark/ml/functions.scala b/mllib/src/main/scala/org/apache/spark/ml/functions.scala new file mode 100644 index 000..1faf562 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/functions.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml + +import org.apache.spark.annotation.Since +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.mllib.linalg.{Vector => OldVector} +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions.udf + +// scalastyle:off +@Since("3.0.0") +object functions { +// scalastyle:on + + private val vectorToArrayUdf = udf { vec: Any => +vec match { + case v: Vector => v.toArray + case v: OldVector => v.toArray + case v => throw new IllegalArgumentException( +"function vector_to_array requires a non-null input argument and input type must be " + +"`org.apache.spark.ml.linalg.Vector` or `org.apache.spark.mllib.linalg.Vector`, " + +s"but got ${ if (v == null) "null" else v.getClass.getName }.") +} + }.asNonNullable() + + /** + * Converts a column of MLlib sparse/dense vectors into a column of dense arrays. + * + * @since 3.0.0 + */ + def vector_to_array(v: Column): Column = vectorToArrayUdf(v) +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala new file mode 100644 index 000..2f5062c --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF lice
[spark] branch master updated: [SPARK-28978][ ] Support > 256 args to python udf
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 8152a87 [SPARK-28978][ ] Support > 256 args to python udf 8152a87 is described below commit 8152a87235a63a13969f7c1ff5ed038956e8ed76 Author: Bago Amirbekian AuthorDate: Fri Nov 8 19:19:14 2019 -0800 [SPARK-28978][ ] Support > 256 args to python udf ### What changes were proposed in this pull request? On the worker we express lambda functions as strings and then eval them to create a "mapper" function. This makes the code hard to read & limits the # of arguments a udf can support to 255 for python <= 3.6. This PR rewrites the mapper functions as nested functions instead of "lambda strings" and allows passing in more than 255 args. ### Why are the changes needed? The jira ticket associated with this issue describes how MLflow uses udfs to consume columns as features. This pattern isn't unique and a limit of 255 features is quite low. ### Does this PR introduce any user-facing change? Users can now pass more than 255 cols to a udf function. ### How was this patch tested? Added a unit test for passing in > 255 args to udf. Closes #26442 from MrBago/replace-lambdas-on-worker. 
Authored-by: Bago Amirbekian Signed-off-by: Xiangrui Meng --- python/pyspark/sql/tests/test_udf.py | 13 python/pyspark/worker.py | 62 +--- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index c274dc7..3b9f12f 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -629,6 +629,19 @@ class UDFTests(ReusedSQLTestCase): self.sc.parallelize(range(1), 1).mapPartitions(task).count() +def test_udf_with_256_args(self): +N = 256 +data = [["data-%d" % i for i in range(N)]] * 5 +df = self.spark.createDataFrame(data) + +def f(*a): +return "success" + +fUdf = udf(f, StringType()) + +r = df.select(fUdf(*df.columns)) +self.assertEqual(r.first()[0], "success") + class UDFInitializationTests(unittest.TestCase): def tearDown(self): diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 3a1200e..bfa8d97 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -403,54 +403,50 @@ def read_udfs(pickleSer, infile, eval_type): idx += offsets_len return parsed -udfs = {} -call_udf = [] -mapper_str = "" if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: -# Create function like this: -# lambda a: f([a[0]], [a[0], a[1]]) - # We assume there is only one UDF here because grouped map doesn't # support combining multiple UDFs. 
assert num_udfs == 1 # See FlatMapGroupsInPandasExec for how arg_offsets are used to # distinguish between grouping attributes and data attributes -arg_offsets, udf = read_single_udf( -pickleSer, infile, eval_type, runner_conf, udf_index=0) -udfs['f'] = udf +arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) parsed_offsets = extract_key_value_indexes(arg_offsets) -keys = ["a[%d]" % (o,) for o in parsed_offsets[0][0]] -vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]] -mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys), ", ".join(vals)) + +# Create function like this: +# mapper a: f([a[0]], [a[0], a[1]]) +def mapper(a): +keys = [a[o] for o in parsed_offsets[0][0]] +vals = [a[o] for o in parsed_offsets[0][1]] +return f(keys, vals) elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: # We assume there is only one UDF here because cogrouped map doesn't # support combining multiple UDFs. assert num_udfs == 1 -arg_offsets, udf = read_single_udf( -pickleSer, infile, eval_type, runner_conf, udf_index=0) -udfs['f'] = udf +arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0) + parsed_offsets = extract_key_value_indexes(arg_offsets) -df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]] -df1_vals = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][1]] -df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]] -df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]] -mapper_s
[spark] branch master updated: [SPARK-29417][CORE] Resource Scheduling - add TaskContext.resource java api
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a42d894 [SPARK-29417][CORE] Resource Scheduling - add TaskContext.resource java api a42d894 is described below commit a42d894a4090c97a90ce23b0989163909ebf548d Author: Thomas Graves AuthorDate: Mon Oct 14 13:27:34 2019 -0700 [SPARK-29417][CORE] Resource Scheduling - add TaskContext.resource java api ### What changes were proposed in this pull request? We added a TaskContext.resources() api, but I realized this is returning a scala Map which is not ideal for access from Java. Here I add a resourcesJMap function which returns a java.util.Map to make it easily accessible from Java. ### Why are the changes needed? Java API access ### Does this PR introduce any user-facing change? Yes, new TaskContext function to access from Java ### How was this patch tested? new unit test Closes #26083 from tgravescs/SPARK-29417. 
Lead-authored-by: Thomas Graves Co-authored-by: Thomas Graves Co-authored-by: Thomas Graves Signed-off-by: Xiangrui Meng --- core/src/main/scala/org/apache/spark/BarrierTaskContext.scala | 5 + core/src/main/scala/org/apache/spark/TaskContext.scala| 8 core/src/main/scala/org/apache/spark/TaskContextImpl.scala| 5 + .../java/test/org/apache/spark/JavaTaskContextCompileCheck.java | 5 + project/MimaExcludes.scala| 3 +++ 5 files changed, 26 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 5afd8a5..3d36980 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.util.{Properties, Timer, TimerTask} +import scala.collection.JavaConverters._ import scala.concurrent.TimeoutException import scala.concurrent.duration._ @@ -211,6 +212,10 @@ class BarrierTaskContext private[spark] ( override def resources(): Map[String, ResourceInformation] = taskContext.resources() + override def resourcesJMap(): java.util.Map[String, ResourceInformation] = { +resources().asJava + } + override private[spark] def killTaskIfInterrupted(): Unit = taskContext.killTaskIfInterrupted() override private[spark] def getKillReason(): Option[String] = taskContext.getKillReason() diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 2299c54..fd41fac 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -185,6 +185,14 @@ abstract class TaskContext extends Serializable { @Evolving def resources(): Map[String, ResourceInformation] + /** + * (java-specific) Resources allocated to the task. The key is the resource name and the value + * is information about the resource. 
Please refer to + * [[org.apache.spark.resource.ResourceInformation]] for specifics. + */ + @Evolving + def resourcesJMap(): java.util.Map[String, ResourceInformation] + @DeveloperApi def taskMetrics(): TaskMetrics diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 516fb95..08a58a0 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -20,6 +20,7 @@ package org.apache.spark import java.util.Properties import javax.annotation.concurrent.GuardedBy +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.executor.TaskMetrics @@ -101,6 +102,10 @@ private[spark] class TaskContextImpl( this } + override def resourcesJMap(): java.util.Map[String, ResourceInformation] = { +resources.asJava + } + @GuardedBy("this") private[spark] override def markTaskFailed(error: Throwable): Unit = synchronized { if (failed) return diff --git a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java index 62a0b85..5ce7937 100644 --- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java +++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java @@ -17,7 +17,10 @@ package test.org.apache.spark; +import java.util.Map; + import org.apache.spark.TaskContext; +import org.apache.spark.resource.ResourceInformation; import org.apache.spark.util.TaskCompletionListene
[spark] branch master updated: [SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API documentation
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new fe75ff8 [SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API documentation fe75ff8 is described below commit fe75ff8bea3330a10aba1a61f3aba42e541195a8 Author: HyukjinKwon AuthorDate: Fri Jul 5 10:08:22 2019 -0700 [SPARK-28206][PYTHON] Remove the legacy Epydoc in PySpark API documentation ## What changes were proposed in this pull request? It seems we have been generating the PySpark API documentation with Epydoc almost from the very beginning (see https://github.com/apache/spark/commit/85b8f2c64f0fc4be5645d8736629fc082cb3587b). This fixes an actual issue: Before: ![Screen Shot 2019-07-05 at 8 20 01 PM](https://user-images.githubusercontent.com/6477701/60720491-e9879180-9f65-11e9-9562-100830a456cd.png) After: ![Screen Shot 2019-07-05 at 8 20 05 PM](https://user-images.githubusercontent.com/6477701/60720495-ec828200-9f65-11e9-8277-8f689e292cb0.png) It is apparently a bug within the `epytext` plugin during the conversion between `param` and `:param` syntax. See also [Epydoc syntax](http://epydoc.sourceforge.net/manual-epytext.html). Actually, Epydoc syntax violates [PEP-257](https://www.python.org/dev/peps/pep-0257/) IIRC and blocks us from enabling some rules for the doctest linter as well. We should remove this legacy and I guess Spark 3 is good timing to do it. ## How was this patch tested? Manually built the doc and checked each. I had to manually find the Epydoc syntax by `git grep -r "{L"`, for instance. Closes #25060 from HyukjinKwon/SPARK-28206. 
Authored-by: HyukjinKwon Signed-off-by: Xiangrui Meng --- python/docs/conf.py | 1 - python/docs/epytext.py | 30 python/pyspark/accumulators.py | 14 ++-- python/pyspark/broadcast.py | 6 +- python/pyspark/conf.py | 8 +-- python/pyspark/context.py| 56 +++ python/pyspark/files.py | 7 +- python/pyspark/ml/feature.py | 2 +- python/pyspark/ml/linalg/__init__.py | 8 +-- python/pyspark/mllib/classification.py | 4 +- python/pyspark/mllib/clustering.py | 6 +- python/pyspark/mllib/linalg/__init__.py | 8 +-- python/pyspark/mllib/random.py | 6 +- python/pyspark/mllib/stat/_statistics.py | 4 +- python/pyspark/mllib/util.py | 4 +- python/pyspark/rdd.py| 114 +++ python/pyspark/serializers.py| 12 ++-- python/pyspark/sql/dataframe.py | 10 +-- python/pyspark/sql/types.py | 2 +- python/pyspark/streaming/context.py | 42 ++-- python/pyspark/streaming/dstream.py | 50 +++--- python/pyspark/taskcontext.py| 2 +- python/pyspark/testing/streamingutils.py | 6 +- 23 files changed, 185 insertions(+), 217 deletions(-) diff --git a/python/docs/conf.py b/python/docs/conf.py index f507ee3..9e7afb7 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -31,7 +31,6 @@ needs_sphinx = '1.2' extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', -'epytext', 'sphinx.ext.mathjax', ] diff --git a/python/docs/epytext.py b/python/docs/epytext.py deleted file mode 100644 index 4bbbf65..000 --- a/python/docs/epytext.py +++ /dev/null @@ -1,30 +0,0 @@ -import re - -RULES = ( -(r"<(!BLANKLINE)[\w.]+>", r""), -(r"L{([\w.()]+)}", r":class:`\1`"), -(r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), -(r"C{([\w.()]+)}", r":class:`\1`"), -(r"[IBCM]{([^}]+)}", r"`\1`"), -('pyspark.rdd.RDD', 'RDD'), -) - - -def _convert_epytext(line): -""" ->>> _convert_epytext("L{A}") -:class:`A` -""" -line = line.replace('@', ':') -for p, sub in RULES: -line = re.sub(p, sub, line) -return line - - -def _process_docstring(app, what, name, obj, options, lines): -for i in range(len(lines)): -lines[i] = _convert_epytext(lines[i]) - - -def 
setup(app): -app.connect("autodoc-process-docstring", _process_docstring) diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index 00ec094..a5d5132 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -123,13 +123,13 @@ class Accumulator(object): """ A shared variable that can be accumulated, i.e., has a commutative and associative "add" -operation. Worker tasks on a Spark clus
[spark] branch master updated: [SPARK-28115][CORE][TEST] Fix flaky test: SparkContextSuite.test resource scheduling under local-cluster mode
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d98a5ce [SPARK-28115][CORE][TEST] Fix flaky test: SparkContextSuite.test resource scheduling under local-cluster mode d98a5ce is described below commit d98a5ce34d6b4b098d30c26c89a9d65d931f930d Author: Xingbo Jiang AuthorDate: Thu Jun 20 13:23:29 2019 -0700 [SPARK-28115][CORE][TEST] Fix flaky test: SparkContextSuite.test resource scheduling under local-cluster mode ## What changes were proposed in this pull request? The test `SparkContextSuite.test resource scheduling under local-cluster mode` has been flaky, because it expects the size of `sc.statusTracker.getExecutorInfos` be the same as the number of executors, while the returned list contains both the driver and executors. ## How was this patch tested? Updated existing tests. Closes #24917 from jiangxb1987/getExecutorInfos. Authored-by: Xingbo Jiang Signed-off-by: Xiangrui Meng --- .../src/main/scala/org/apache/spark/SparkStatusTracker.scala | 1 + core/src/test/scala/org/apache/spark/SparkContextSuite.scala | 12 +++- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 815237e..555c085 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -99,6 +99,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext, store: AppStatusStore /** * Returns information of all known executors, including host, port, cacheSize, numRunningTasks * and memory metrics. + * Note this include information for both the driver and executors. 
*/ def getExecutorInfos: Array[SparkExecutorInfo] = { store.executorList(true).map { exec => diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index fa2c4bd..628ac60 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -750,9 +750,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) // Ensure all executors has started - eventually(timeout(10.seconds)) { -assert(sc.statusTracker.getExecutorInfos.size == 1) - } + TestUtils.waitUntilExecutorsUp(sc, 1, 1) assert(sc.resources.size === 1) assert(sc.resources.get(GPU).get.addresses === Array("5", "6")) assert(sc.resources.get(GPU).get.name === "gpu") @@ -780,9 +778,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) // Ensure all executors has started - eventually(timeout(10.seconds)) { -assert(sc.statusTracker.getExecutorInfos.size == 1) - } + TestUtils.waitUntilExecutorsUp(sc, 1, 1) // driver gpu resources file should take precedence over the script assert(sc.resources.size === 1) assert(sc.resources.get(GPU).get.addresses === Array("0", "1", "8")) @@ -855,9 +851,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) // Ensure all executors has started - eventually(timeout(60.seconds)) { -assert(sc.statusTracker.getExecutorInfos.size == 3) - } + TestUtils.waitUntilExecutorsUp(sc, 3, 6) val rdd = sc.makeRDD(1 to 10, 9).mapPartitions { it => val context = TaskContext.get() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1b2448b [SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF 1b2448b is described below commit 1b2448bc10a8ee732d08fa1abae6d64ae25e3a14 Author: Xiangrui Meng AuthorDate: Mon Jun 17 20:51:36 2019 -0700 [SPARK-28056][PYTHON] add doc for SCALAR_ITER Pandas UDF ## What changes were proposed in this pull request? Add docs for `SCALAR_ITER` Pandas UDF. cc: WeichenXu123 HyukjinKwon ## How was this patch tested? Tested example code manually. Closes #24897 from mengxr/SPARK-28056. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- docs/sql-pyspark-pandas-with-arrow.md | 17 +++ examples/src/main/python/sql/arrow.py | 86 +++ 2 files changed, 103 insertions(+) diff --git a/docs/sql-pyspark-pandas-with-arrow.md b/docs/sql-pyspark-pandas-with-arrow.md index 6cf280c..9cab4be 100644 --- a/docs/sql-pyspark-pandas-with-arrow.md +++ b/docs/sql-pyspark-pandas-with-arrow.md @@ -86,6 +86,23 @@ The following example shows how to create a scalar Pandas UDF that computes the +### Scalar Iterator + +Scalar iterator (`SCALAR_ITER`) Pandas UDF is the same as scalar Pandas UDF above except that the +underlying Python function takes an iterator of batches as input instead of a single batch and, +instead of returning a single output batch, it yields output batches or returns an iterator of +output batches. +It is useful when the UDF execution requires initializing some states, e.g., loading an machine +learning model file to apply inference to every input batch. + +The following example shows how to create scalar iterator Pandas UDFs: + + + +{% include_example scalar_iter_pandas_udf python/sql/arrow.py %} + + + ### Grouped Map Grouped map Pandas UDFs are used with `groupBy().apply()` which implements the "split-apply-combine" pattern. 
Split-apply-combine consists of three steps: diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index c1e2d29..ede121b 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -86,6 +86,92 @@ def scalar_pandas_udf_example(spark): # $example off:scalar_pandas_udf$ +def scalar_iter_pandas_udf_example(spark): +# $example on:scalar_iter_pandas_udf$ +import pandas as pd + +from pyspark.sql.functions import col, pandas_udf, struct, PandasUDFType + +pdf = pd.DataFrame([1, 2, 3], columns=["x"]) +df = spark.createDataFrame(pdf) + +# When the UDF is called with a single column that is not StructType, +# the input to the underlying function is an iterator of pd.Series. +@pandas_udf("long", PandasUDFType.SCALAR_ITER) +def plus_one(batch_iter): +for x in batch_iter: +yield x + 1 + +df.select(plus_one(col("x"))).show() +# +---+ +# |plus_one(x)| +# +---+ +# | 2| +# | 3| +# | 4| +# +---+ + +# When the UDF is called with more than one columns, +# the input to the underlying function is an iterator of pd.Series tuple. +@pandas_udf("long", PandasUDFType.SCALAR_ITER) +def multiply_two_cols(batch_iter): +for a, b in batch_iter: +yield a * b + +df.select(multiply_two_cols(col("x"), col("x"))).show() +# +---+ +# |multiply_two_cols(x, x)| +# +---+ +# | 1| +# | 4| +# | 9| +# +---+ + +# When the UDF is called with a single column that is StructType, +# the input to the underlying function is an iterator of pd.DataFrame. +@pandas_udf("long", PandasUDFType.SCALAR_ITER) +def multiply_two_nested_cols(pdf_iter): +for pdf in pdf_iter: +yield pdf["a"] * pdf["b"] + +df.select( +multiply_two_nested_cols( +struct(col("x").alias("a"), col("x").alias("b")) +).alias("y") +).show() +# +---+ +# | y| +# +---+ +# | 1| +# | 4| +# | 9| +# +---+ + +# In the UDF, you can initialize some states before processing batches. +# Wrap your code with try/finally or use context managers to ensure +# the release of resources at the end. 
+y_bc = spark.sparkContext.broadcast(1) + +@pandas_udf("long", PandasUDFType.SCALAR_ITER) +def plus_y(batch_iter): +y = y_bc.value # initialize states +try: +for x in batch_iter: +
[spark] branch master updated: [SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6d441dc [SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series 6d441dc is described below commit 6d441dcdc68dae886e375794a55658f70cd18d9d Author: WeichenXu AuthorDate: Sat Jun 15 08:29:20 2019 -0700 [SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series ## What changes were proposed in this pull request? Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series. Note the UDF input args will always be one iterator: * if the udf takes only one column as input, the iterator's element will be pd.Series (corresponding to the column values batch) * if the udf takes multiple columns as inputs, the iterator's element will be a tuple composed of multiple `pd.Series`s, each one corresponding to one of the input columns (keeping the same order). For example: ``` pandas_udf("int", PandasUDFType.SCALAR_ITER) def the_udf(iterator): for col1_batch, col2_batch in iterator: yield col1_batch + col2_batch df.select(the_udf("col1", "col2")) ``` The udf above will add col1 and col2. I haven't added unit tests, but manual tests show it works fine. So it is ready for first pass review. 
We can test several typical cases: ``` from pyspark.sql import SparkSession from pyspark.sql.functions import pandas_udf, PandasUDFType from pyspark.sql.functions import udf from pyspark.taskcontext import TaskContext df = spark.createDataFrame([(1, 20), (3, 40)], ["a", "b"]) pandas_udf("int", PandasUDFType.SCALAR_ITER) def fi1(it): pid = TaskContext.get().partitionId() print("DBG: fi1: do init stuff, partitionId=" + str(pid)) for batch in it: yield batch + 100 print("DBG: fi1: do close stuff, partitionId=" + str(pid)) pandas_udf("int", PandasUDFType.SCALAR_ITER) def fi2(it): pid = TaskContext.get().partitionId() print("DBG: fi2: do init stuff, partitionId=" + str(pid)) for batch in it: yield batch + 1 print("DBG: fi2: do close stuff, partitionId=" + str(pid)) pandas_udf("int", PandasUDFType.SCALAR_ITER) def fi3(it): pid = TaskContext.get().partitionId() print("DBG: fi3: do init stuff, partitionId=" + str(pid)) for x, y in it: yield x + y * 10 + 10 print("DBG: fi3: do close stuff, partitionId=" + str(pid)) pandas_udf("int", PandasUDFType.SCALAR) def fp1(x): return x + 1000 udf("int") def fu1(x): return x + 10 # test select "pandas iter udf/pandas udf/sql udf" expressions at the same time. # Note this case the `fi1("a"), fi2("b"), fi3("a", "b")` will generate only one plan, # and `fu1("a")`, `fp1("a")` will generate another two separate plans. 
df.select(fi1("a"), fi2("b"), fi3("a", "b"), fu1("a"), fp1("a")).show() # test chain two pandas iter udf together # Note this case `fi2(fi1("a"))` will generate only one plan # Also note the init stuff/close stuff call order will be like: # (debug output following) # DBG: fi2: do init stuff, partitionId=0 # DBG: fi1: do init stuff, partitionId=0 # DBG: fi1: do close stuff, partitionId=0 # DBG: fi2: do close stuff, partitionId=0 df.select(fi2(fi1("a"))).show() # test more complex chain # Note this case `fi1("a"), fi2("a")` will generate one plan, # and `fi3(fi1_output, fi2_output)` will generate another plan df.select(fi3(fi1("a"), fi2("a"))).show() ``` ## How was this patch tested? To be added. Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24643 from WeichenXu123/pandas_udf_iter. Lead-authored-by: WeichenXu Co-authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- .../org/apache/spark/api/python/PythonRunner.scala | 2 + python/pyspark/rdd.py | 1 + python/pyspark/sql/functions.py| 3 + python/pyspark/sql/tests/test_pandas_udf_scalar.py | 882 ++--- python/pyspark/sql/udf.py | 13 +- python/pyspark/worker.py
[spark] branch master updated: [SPARK-28030][SQL] convert filePath to URI in binary file data source
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4f4829b [SPARK-28030][SQL] convert filePath to URI in binary file data source 4f4829b is described below commit 4f4829b4ae261a9fd656fbf1928e6440d31f8d8c Author: Xiangrui Meng AuthorDate: Wed Jun 12 13:24:02 2019 -0700 [SPARK-28030][SQL] convert filePath to URI in binary file data source ## What changes were proposed in this pull request? Convert `PartitionedFile.filePath` to URI first in binary file data source. Otherwise Spark will throw a FileNotFound exception because we create `Path` with URL encoded string, instead of wrapping it with URI. ## How was this patch tested? Unit test. Closes #24855 from mengxr/SPARK-28030. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- .../spark/sql/execution/datasources/FileScanRDD.scala | 2 +- .../datasources/binaryfile/BinaryFileFormat.scala | 3 ++- .../datasources/binaryfile/BinaryFileFormatSuite.scala | 14 ++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index d92ea2e..9e98b0b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -38,7 +38,7 @@ import org.apache.spark.util.NextIterator * that need to be prepended to each row. * * @param partitionValues value of partition columns to be prepended to each row. - * @param filePath path of the file to read + * @param filePath URI of the file to read * @param start the beginning offset (in bytes) of the block. * @param length number of bytes to read. 
* @param locations locality information (list of nodes that have the data). diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala index cdc7cd5..fda4e14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.binaryfile +import java.net.URI import java.sql.Timestamp import com.google.common.io.{ByteStreams, Closeables} @@ -100,7 +101,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister { val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH) file: PartitionedFile => { - val path = new Path(file.filePath) + val path = new Path(new URI(file.filePath)) val fs = path.getFileSystem(broadcastedHadoopConf.value.value) val status = fs.getFileStatus(path) if (filterFuncs.forall(_.apply(status))) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala index 01dc96c..9e2969b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala @@ -368,4 +368,18 @@ class BinaryFileFormatSuite extends QueryTest with SharedSQLContext with SQLTest assert(caught.getMessage.contains("exceeds the max length allowed")) } } + + test("SPARK-28030: support chars in file names that require URL encoding") { +withTempDir { dir => + val file = new File(dir, "test space.txt") + val content = "123".getBytes + Files.write(file.toPath, content, 
StandardOpenOption.CREATE, StandardOpenOption.WRITE) + val df = spark.read.format(BINARY_FILE).load(dir.getPath) + df.select(col(PATH), col(CONTENT)).first() match { +case Row(p: String, c: Array[Byte]) => + assert(p.endsWith(file.getAbsolutePath), "should support space in file name") + assert(c === content, "should read file with space in file name") + } +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly read the first row
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4d770db [SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly read the first row 4d770db is described below commit 4d770db0eb252c56072f093eae318bad3d20b8d7 Author: Xiangrui Meng AuthorDate: Thu Jun 6 15:45:44 2019 -0700 [SPARK-27968] ArrowEvalPythonExec.evaluate shouldn't eagerly read the first row ## What changes were proposed in this pull request? Issue fixed in https://github.com/apache/spark/pull/24734 but that PR might take longer to merge. ## How was this patch tested? It should pass existing unit tests. Closes #24816 from mengxr/SPARK-27968. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- .../sql/execution/python/ArrowEvalPythonExec.scala | 27 -- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index 000ae97..73a43af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -86,28 +86,11 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute] sessionLocalTimeZone, pythonRunnerConf).compute(batchIter, context.partitionId(), context) -new Iterator[InternalRow] { - - private var currentIter = if (columnarBatchIter.hasNext) { -val batch = columnarBatchIter.next() -val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) -assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + - s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") -batch.rowIterator.asScala - } else { 
-Iterator.empty - } - - override def hasNext: Boolean = currentIter.hasNext || { -if (columnarBatchIter.hasNext) { - currentIter = columnarBatchIter.next().rowIterator.asScala - hasNext -} else { - false -} - } - - override def next(): InternalRow = currentIter.next() +columnarBatchIter.flatMap { batch => + val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) + assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + +s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") + batch.rowIterator.asScala } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-27366][CORE] Support GPU Resources in Spark job scheduling
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ac808e2 [SPARK-27366][CORE] Support GPU Resources in Spark job scheduling ac808e2 is described below commit ac808e2a02d67ed6210986704b84c8079791b123 Author: Xingbo Jiang AuthorDate: Tue Jun 4 16:57:47 2019 -0700 [SPARK-27366][CORE] Support GPU Resources in Spark job scheduling ## What changes were proposed in this pull request? This PR adds support to schedule tasks with extra resource requirements (e.g. GPUs) on executors with available resources. It also introduces a new method `TaskContext.resources()` so tasks can access available resource addresses allocated to them. ## How was this patch tested? * Added new end-to-end test cases in `SparkContextSuite`; * Added new test case in `CoarseGrainedSchedulerBackendSuite`; * Added new test case in `CoarseGrainedExecutorBackendSuite`; * Added new test case in `TaskSchedulerImplSuite`; * Added new test case in `TaskSetManagerSuite`; * Updated existing tests. Closes #24374 from jiangxb1987/gpu. 
Authored-by: Xingbo Jiang Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierTaskContext.scala | 2 + .../main/scala/org/apache/spark/SparkConf.scala| 33 ++--- .../main/scala/org/apache/spark/SparkContext.scala | 84 ++--- .../main/scala/org/apache/spark/TaskContext.scala | 9 +- .../scala/org/apache/spark/TaskContextImpl.scala | 3 +- .../main/scala/org/apache/spark/TestUtils.scala| 11 ++ .../executor/CoarseGrainedExecutorBackend.scala| 14 ++- .../scala/org/apache/spark/executor/Executor.scala | 3 +- .../spark/scheduler/ExecutorResourceInfo.scala | 101 .../scala/org/apache/spark/scheduler/Task.scala| 7 +- .../apache/spark/scheduler/TaskDescription.scala | 51 +++- .../apache/spark/scheduler/TaskSchedulerImpl.scala | 33 - .../apache/spark/scheduler/TaskSetManager.scala| 14 ++- .../org/apache/spark/scheduler/WorkerOffer.scala | 5 +- .../cluster/CoarseGrainedClusterMessage.scala | 15 ++- .../cluster/CoarseGrainedSchedulerBackend.scala| 34 +- .../spark/scheduler/cluster/ExecutorData.scala | 5 +- .../scheduler/local/LocalSchedulerBackend.scala| 1 + .../apache/spark/JavaTaskContextCompileCheck.java | 4 + ...g.apache.spark.scheduler.ExternalClusterManager | 1 + .../scala/org/apache/spark/ResourceName.scala} | 18 +-- .../scala/org/apache/spark/SparkConfSuite.scala| 24 .../scala/org/apache/spark/SparkContextSuite.scala | 79 ++-- .../CoarseGrainedExecutorBackendSuite.scala| 63 +- .../org/apache/spark/executor/ExecutorSuite.scala | 2 + .../CoarseGrainedSchedulerBackendSuite.scala | 133 - .../scheduler/ExecutorResourceInfoSuite.scala | 91 ++ .../apache/spark/scheduler/TaskContextSuite.scala | 4 +- .../spark/scheduler/TaskDescriptionSuite.scala | 17 +++ .../spark/scheduler/TaskSchedulerImplSuite.scala | 40 ++- .../spark/scheduler/TaskSetManagerSuite.scala | 21 project/MimaExcludes.scala | 3 + .../MesosFineGrainedSchedulerBackendSuite.scala| 6 +- 33 files changed, 825 insertions(+), 106 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala 
b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index a354f44..cf957ff 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -185,6 +185,8 @@ class BarrierTaskContext private[spark] ( taskContext.getMetricsSources(sourceName) } + override def resources(): Map[String, ResourceInformation] = taskContext.resources() + override private[spark] def killTaskIfInterrupted(): Unit = taskContext.killTaskIfInterrupted() override private[spark] def getKillReason(): Option[String] = taskContext.getKillReason() diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 15f1730..227f4a5 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -508,6 +508,15 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria } /** + * Get task resource requirements. + */ + private[spark] def getTaskResourceRequirements(): Map[String, Int] = { +getAllWithPrefix(SPARK_TASK_RESOURCE_PREFIX) + .withFilter { case (k, v) => k.endsWith(SPARK_RESOURCE_COUNT_SUFFIX)} + .map { case (k, v) => (k.dropRight(SPARK_RESOURCE_COUNT_SUFFIX.length), v.toInt)}.toMap + } + + /** * Checks for i
[spark] branch master updated: [SPARK-27488][CORE] Driver interface to support GPU resources
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 74e5e41 [SPARK-27488][CORE] Driver interface to support GPU resources 74e5e41 is described below commit 74e5e41eebf9ed596b48e6db52a2a9c642e5cbc3 Author: Thomas Graves AuthorDate: Thu May 23 11:46:13 2019 -0700 [SPARK-27488][CORE] Driver interface to support GPU resources ## What changes were proposed in this pull request? Added the driver functionality to get the resources. The user interface is: SparkContext.resources - I called it this to match the TaskContext.resources api proposed in the other PR. Originally it was going to be called SparkContext.getResources but changed to be consistent; if people have strong feelings I can change it. There are 2 ways the driver can discover what resources it has. 1) user specifies a discoveryScript; this is similar to the executors and is meant for yarn and k8s where they don't tell you what you were allocated but you are running in an isolated environment. 2) read the config spark.driver.resource.resourceName.addresses. The config is meant to be used with standalone mode where the Worker will have to assign what GPU addresses the Driver is allowed to use by setting that config. When the user runs a spark application, if they want the driver to have GPUs they would specify the conf spark.driver.resource.gpu.count=X where X is the number they want. If they are running on yarn or k8s they will also have to specify the discoveryScript as specified above; if they are on standalone mode and the cluster is set up properly they wouldn't have to specify anything else. We could potentially get rid of the spark.driver.resource.gpu.addresses config which is really meant [...] - This PR also has changes to be consistent about using resourceName everywhere. 
- change the config names from POSTFIX to SUFFIX to be more consistent with other areas in Spark - Moved the config checks around a bit since now used by both executor and driver. Note those might overlap a bit with https://github.com/apache/spark/pull/24374 so we will have to figure out which one should go in first. ## How was this patch tested? Unit tests and manually test the interface. Closes #24615 from tgravescs/SPARK-27488. Authored-by: Thomas Graves Signed-off-by: Xiangrui Meng --- .../org/apache/spark/ResourceDiscoverer.scala | 88 ++- .../main/scala/org/apache/spark/SparkConf.scala| 60 + .../main/scala/org/apache/spark/SparkContext.scala | 47 +++ .../executor/CoarseGrainedExecutorBackend.scala| 64 +++--- .../org/apache/spark/internal/config/package.scala | 5 +- .../org/apache/spark/ResourceDiscovererSuite.scala | 83 ++ .../scala/org/apache/spark/SparkConfSuite.scala| 43 ++ .../scala/org/apache/spark/SparkContextSuite.scala | 98 ++ .../CoarseGrainedExecutorBackendSuite.scala| 61 -- docs/configuration.md | 29 +-- 10 files changed, 436 insertions(+), 142 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala b/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala index 1963942..d3b3860 100644 --- a/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala +++ b/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala @@ -29,10 +29,10 @@ import org.apache.spark.internal.config._ import org.apache.spark.util.Utils.executeAndGetOutput /** - * Discovers resources (GPUs/FPGAs/etc). It currently only supports resources that have - * addresses. + * Discovers information about resources (GPUs/FPGAs/etc). It currently only supports + * resources that have addresses. * This class finds resources by running and parsing the output of the user specified script - * from the config spark.{driver/executor}.resource.{resourceType}.discoveryScript. 
+ * from the config spark.{driver/executor}.resource.{resourceName}.discoveryScript. * The output of the script it runs is expected to be JSON in the format of the * ResourceInformation class. * @@ -42,28 +42,41 @@ private[spark] object ResourceDiscoverer extends Logging { private implicit val formats = DefaultFormats - def findResources(sparkConf: SparkConf, isDriver: Boolean): Map[String, ResourceInformation] = { -val prefix = if (isDriver) { - SPARK_DRIVER_RESOURCE_PREFIX -} else { - SPARK_EXECUTOR_RESOURCE_PREFIX -} -// get unique resource types by grabbing first part config with multiple periods, -// ie resourceType.count, grab resourceType part -val resourceNames = sparkConf.getAllWithPrefix(prefix).map { case (k
[spark] branch master updated: [SPARK-27588] Binary file data source fails fast and doesn't attempt to read very large files
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 618d6bf [SPARK-27588] Binary file data source fails fast and doesn't attempt to read very large files 618d6bf is described below commit 618d6bff71073c8c93501ab7392c3cc579730f0b Author: Xiangrui Meng AuthorDate: Mon Apr 29 16:24:49 2019 -0700 [SPARK-27588] Binary file data source fails fast and doesn't attempt to read very large files ## What changes were proposed in this pull request? If a file is too big (>2GB), we should fail fast and do not try to read the file. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24483 from mengxr/SPARK-27588. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- .../org/apache/spark/sql/internal/SQLConf.scala| 8 ++ .../datasources/binaryfile/BinaryFileFormat.scala | 8 ++ .../binaryfile/BinaryFileFormatSuite.scala | 31 +- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 96d3f5c..87bce1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1744,6 +1744,14 @@ object SQLConf { "and from_utc_timestamp() functions.") .booleanConf .createWithDefault(false) + + val SOURCES_BINARY_FILE_MAX_LENGTH = buildConf("spark.sql.sources.binaryFile.maxLength") +.doc("The max length of a file that can be read by the binary file data source. 
" + + "Spark will fail fast and not attempt to read the file if its length exceeds this value. " + + "The theoretical max is Int.MaxValue, though VMs might implement a smaller max.") +.internal() +.intConf +.createWithDefault(Int.MaxValue) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala index db93268..2637784 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -24,11 +24,13 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path} import org.apache.hadoop.mapreduce.Job +import org.apache.spark.SparkException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} +import org.apache.spark.sql.internal.SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -99,6 +101,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister { val binaryFileSourceOptions = new BinaryFileSourceOptions(options) val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter val filterFuncs = filters.map(filter => createFilterFunction(filter)) +val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH) file: PartitionedFile => { val path = new Path(file.filePath) @@ -115,6 +118,11 @@ 
class BinaryFileFormat extends FileFormat with DataSourceRegister { case (MODIFICATION_TIME, i) => writer.write(i, DateTimeUtils.fromMillis(status.getModificationTime)) case (CONTENT, i) => + if (status.getLen > maxLength) { +throw new SparkException( + s"The length of ${status.getPath} is ${status.getLen}, " + +s"which exceeds the max length allowed: ${maxLength}.") + } val stream = fs.open(status.getPath) try { writer.write(i, ByteStreams.toByteArray(stream)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/exec
[spark] branch master updated: [SPARK-27472] add user guide for binary file data source
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new fbc7942 [SPARK-27472] add user guide for binary file data source fbc7942 is described below commit fbc794268340bec868a0abcae3516e4ae3714286 Author: Xiangrui Meng AuthorDate: Mon Apr 29 08:58:56 2019 -0700 [SPARK-27472] add user guide for binary file data source ## What changes were proposed in this pull request? Add user guide for binary file data source. https://user-images.githubusercontent.com/829644/56877594-0488d300-6a04-11e9-9064-5047dfedd913.png ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24484 from mengxr/SPARK-27472. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- docs/sql-data-sources-binaryFile.md | 80 + docs/sql-data-sources.md| 1 + 2 files changed, 81 insertions(+) diff --git a/docs/sql-data-sources-binaryFile.md b/docs/sql-data-sources-binaryFile.md new file mode 100644 index 000..d861a24 --- /dev/null +++ b/docs/sql-data-sources-binaryFile.md @@ -0,0 +1,80 @@ +--- +layout: global +title: Binary File Data Source +displayTitle: Binary File Data Source +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +Since Spark 3.0, Spark supports binary file data source, +which reads binary files and converts each file into a single record that contains the raw content +and metadata of the file. +It produces a DataFrame with the following columns and possibly partition columns: +* `path`: StringType +* `modificationTime`: TimestampType +* `length`: LongType +* `content`: BinaryType + +It supports the following read option: + + Property NameDefaultMeaning + +pathGlobFilter +none (accepts all) + +An optional glob pattern to only include files with paths matching the pattern. +The syntax follows org.apache.hadoop.fs.GlobFilter. +It does not change the behavior of partition discovery. + + + + +To read whole binary files, you need to specify the data source `format` as `binaryFile`. +For example, the following code reads all PNG files from the input directory: + + + +{% highlight scala %} + +spark.read.format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data") + +{% endhighlight %} + + + +{% highlight java %} + +spark.read().format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data"); + +{% endhighlight %} + + +{% highlight python %} + +spark.read.format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data") + +{% endhighlight %} + + +{% highlight r %} + +read.df("/path/to/data", source = "binaryFile", pathGlobFilter = "*.png") + +{% endhighlight %} + + + +Binary file data source does not support writing a DataFrame back to the original files. 
diff --git a/docs/sql-data-sources.md b/docs/sql-data-sources.md index d908aac..079c540 100644 --- a/docs/sql-data-sources.md +++ b/docs/sql-data-sources.md @@ -54,4 +54,5 @@ goes into specific options that are available for the built-in data sources. * [Compatibility with Databricks spark-avro](sql-data-sources-avro.html#compatibility-with-databricks-spark-avro) * [Supported types for Avro -> Spark SQL conversion](sql-data-sources-avro.html#supported-types-for-avro---spark-sql-conversion) * [Supported types for Spark SQL -> Avro conversion](sql-data-sources-avro.html#supported-types-for-spark-sql---avro-conversion) +* [Whole Binary Files](sql-data-sources-binaryFile.html) * [Troubleshooting](sql-data-sources-troubleshooting.html) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-27534][SQL] Do not load `content` column in binary data source if it is not selected
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 20a3ef7 [SPARK-27534][SQL] Do not load `content` column in binary data source if it is not selected 20a3ef7 is described below commit 20a3ef7259490e0c9f6348f13db1e99da5f0df83 Author: Xiangrui Meng AuthorDate: Sun Apr 28 07:57:03 2019 -0700 [SPARK-27534][SQL] Do not load `content` column in binary data source if it is not selected ## What changes were proposed in this pull request? A follow-up task from SPARK-25348. To save I/O cost, Spark shouldn't attempt to read the file if users didn't request the `content` column. For example: ``` spark.read.format("binaryFile").load(path).filter($"length" < 100).count() ``` ## How was this patch tested? Unit test added. Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24473 from WeichenXu123/SPARK-27534. 
Lead-authored-by: Xiangrui Meng Co-authored-by: WeichenXu Signed-off-by: Xiangrui Meng --- .../datasources/binaryfile/BinaryFileFormat.scala | 74 +- .../binaryfile/BinaryFileFormatSuite.scala | 63 -- 2 files changed, 89 insertions(+), 48 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala index 8617ae3..db93268 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -26,12 +26,10 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.AttributeReference -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} -import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, - GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or} +import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.SerializableConfiguration @@ -80,7 +78,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister { false } - override def shortName(): String = "binaryFile" + override def shortName(): String = BINARY_FILE override protected def buildReader( sparkSession: SparkSession, @@ -90,54 +88,43 @@ class 
BinaryFileFormat extends FileFormat with DataSourceRegister { filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { +require(dataSchema.sameType(schema), + s""" + |Binary file data source expects dataSchema: $schema, + |but got: $dataSchema. +""".stripMargin) val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) - val binaryFileSourceOptions = new BinaryFileSourceOptions(options) - val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter - val filterFuncs = filters.map(filter => createFilterFunction(filter)) file: PartitionedFile => { - val path = file.filePath - val fsPath = new Path(path) - + val path = new Path(file.filePath) // TODO: Improve performance here: each file will recompile the glob pattern here. - if (pathGlobPattern.forall(new GlobFilter(_).accept(fsPath))) { -val fs = fsPath.getFileSystem(broadcastedHadoopConf.value.value) -val fileStatus = fs.getFileStatus(fsPath) -val length = fileStatus.getLen -val modificationTime = fileStatus.getModificationTime - -if (filterFuncs.forall(_.apply(fileStatus))) { - val stream = fs.open(fsPath) - val content = try { -ByteStreams.toByteArray(stream) - } finally { -Closeables.close(stream, true) - } - - val fullOutput = dataSchema.map { f => -AttributeReference(f.name, f.dataType, f.nullable, f.metadata)() - } - val requiredOutput
[spark] branch master updated: [SPARK-27473][SQL] Support filter push down for status fields in binary file data source
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9793d9e [SPARK-27473][SQL] Support filter push down for status fields in binary file data source 9793d9e is described below commit 9793d9ec22ff7d9778554e4fa3f03ef4f93d473d Author: WeichenXu AuthorDate: Sun Apr 21 12:45:59 2019 -0700 [SPARK-27473][SQL] Support filter push down for status fields in binary file data source ## What changes were proposed in this pull request? Support 4 kinds of filters: - LessThan - LessThanOrEqual - GreaterThan - GreaterThanOrEqual Support filters applied on 2 columns: - modificationTime - length Note: In order to support datasource filter push-down, I flatten schema to be: ``` val schema = StructType( StructField("path", StringType, false) :: StructField("modificationTime", TimestampType, false) :: StructField("length", LongType, false) :: StructField("content", BinaryType, true) :: Nil) ``` ## How was this patch tested? To be added. Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24387 from WeichenXu123/binary_ds_filter. 
Lead-authored-by: WeichenXu Co-authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- .../datasources/binaryfile/BinaryFileFormat.scala | 134 ++- .../binaryfile/BinaryFileFormatSuite.scala | 188 ++--- 2 files changed, 256 insertions(+), 66 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala index ad9292a..8617ae3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.binaryfile +import java.sql.Timestamp + import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path} @@ -28,7 +30,8 @@ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} -import org.apache.spark.sql.sources.{DataSourceRegister, Filter} +import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, + GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.SerializableConfiguration @@ -55,10 +58,12 @@ import org.apache.spark.util.SerializableConfiguration */ class BinaryFileFormat extends FileFormat with DataSourceRegister { + import BinaryFileFormat._ + override def inferSchema( sparkSession: SparkSession, options: Map[String, String], - files: Seq[FileStatus]): Option[StructType] = 
Some(BinaryFileFormat.schema) + files: Seq[FileStatus]): Option[StructType] = Some(schema) override def prepareWrite( sparkSession: SparkSession, @@ -84,7 +89,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister { requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], - hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) @@ -93,46 +98,49 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister { val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter -(file: PartitionedFile) => { +val filterFuncs = filters.map(filter => createFilterFunction(filter)) + +file: PartitionedFile => { val path = file.filePath val fsPath = new Path(path) // TODO: Improve performance here: each file will recompile the glob pattern here. - val globFilter = pathGlobPattern.map(new GlobFilter(_)) - if (!globFilter.isDefined || globFilter.get.accept(fsPath)) { + if (pathGlobPattern.forall(new GlobFilter(_).accept(fsPath))) { val fs = fsPath.getFileSystem(broadcastedHadoopConf.value.value) val fileStatus = fs.getFil
[spark] branch master updated: [SPARK-25348][SQL] Data source for binary files
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1bb0c8e [SPARK-25348][SQL] Data source for binary files 1bb0c8e is described below commit 1bb0c8e407e0fcd1283f0eb2f742ba2567eda87e Author: WeichenXu AuthorDate: Tue Apr 16 15:41:32 2019 -0700 [SPARK-25348][SQL] Data source for binary files ## What changes were proposed in this pull request? Implement binary file data source in Spark. Format name: "binaryFile" (case-insensitive) Schema: - content: BinaryType - status: StructType - path: StringType - modificationTime: TimestampType - length: LongType Options: * pathGlobFilter (instead of pathFilterRegex) to rely on GlobFilter behavior * maxBytesPerPartition is not implemented since it is controlled by two SQL confs: maxPartitionBytes and openCostInBytes. ## How was this patch tested? Unit test added. Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24354 from WeichenXu123/binary_file_datasource. 
Lead-authored-by: WeichenXu Co-authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng --- ...org.apache.spark.sql.sources.DataSourceRegister | 1 + .../datasources/binaryfile/BinaryFileFormat.scala | 177 + .../binaryfile/BinaryFileFormatSuite.scala | 143 + 3 files changed, 321 insertions(+) diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index be9cb81..d988287 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -8,3 +8,4 @@ org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2 org.apache.spark.sql.execution.streaming.ConsoleSinkProvider org.apache.spark.sql.execution.streaming.sources.RateStreamProvider org.apache.spark.sql.execution.streaming.sources.TextSocketSourceProvider +org.apache.spark.sql.execution.datasources.binaryfile.BinaryFileFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala new file mode 100644 index 000..ad9292a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.binaryfile + +import com.google.common.io.{ByteStreams, Closeables} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path} +import org.apache.hadoop.mapreduce.Job + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} +import org.apache.spark.sql.sources.{DataSourceRegister, Filter} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.SerializableConfiguration + + +/** + * The binary file data source. + * + * It reads binary files and converts each file into a single record that contains the raw content + * and metadata of the file. + * + * Example: + * {{{ + * // Scala + * val df = spark.read.format("binaryFile") + * .option("pathGlobFilter", "*.png") + * .load("/path/to/fileDir") + * + * // Jav
[spark] branch master updated: [SPARK-27454][ML][SQL] Spark image datasource fail when encounter some illegal images
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d35e81f [SPARK-27454][ML][SQL] Spark image datasource fail when encounter some illegal images d35e81f is described below commit d35e81f4bc561598676a508319ec872f7361b069 Author: WeichenXu AuthorDate: Mon Apr 15 11:55:51 2019 -0700 [SPARK-27454][ML][SQL] Spark image datasource fail when encounter some illegal images ## What changes were proposed in this pull request? Fix the Spark image datasource failing when it encounters some illegal images. This is related to bugs inside `ImageIO.read`, so in the Spark code I add exception handling for it. ## How was this patch tested? N/A Please review http://spark.apache.org/contributing.html before opening a pull request. Closes #24362 from WeichenXu123/fix_image_ds_bug. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng --- mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index 0b13eef..a7ddf2f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -133,7 +133,13 @@ object ImageSchema { */ private[spark] def decode(origin: String, bytes: Array[Byte]): Option[Row] = { -val img = ImageIO.read(new ByteArrayInputStream(bytes)) +val img = try { + ImageIO.read(new ByteArrayInputStream(bytes)) +} catch { + // Catch runtime exception because `ImageIO` may throw unexpected `RuntimeException`. 
+ // But do not catch the declared `IOException` (regarded as FileSystem failure) + case _: RuntimeException => null +} if (img == null) { None - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-25970][ML] Add Instrumentation to PrefixSpan
This is an automated email from the ASF dual-hosted git repository. meng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new aa0d4ca [SPARK-25970][ML] Add Instrumentation to PrefixSpan aa0d4ca is described below commit aa0d4ca8bab08a467645080a5b8a28bf6dd8a042 Author: zhengruifeng AuthorDate: Thu Dec 20 11:22:49 2018 -0800 [SPARK-25970][ML] Add Instrumentation to PrefixSpan ## What changes were proposed in this pull request? Add Instrumentation to PrefixSpan ## How was this patch tested? existing tests Closes #22971 from zhengruifeng/log_PrefixSpan. Authored-by: zhengruifeng Signed-off-by: Xiangrui Meng --- mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala index 2a34135..b0006a8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.fpm import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util.Instrumentation.instrumented import org.apache.spark.mllib.fpm.{PrefixSpan => mllibPrefixSpan} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.col @@ -135,7 +136,10 @@ final class PrefixSpan(@Since("2.4.0") override val uid: String) extends Params * - `freq: Long` */ @Since("2.4.0") - def findFrequentSequentialPatterns(dataset: Dataset[_]): DataFrame = { + def findFrequentSequentialPatterns(dataset: Dataset[_]): DataFrame = instrumented { instr => +instr.logDataset(dataset) +instr.logParams(this, params: _*) + val sequenceColParam = $(sequenceCol) val inputType = 
dataset.schema(sequenceColParam).dataType require(inputType.isInstanceOf[ArrayType] && - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25321][ML] Fix local LDA model constructor
Repository: spark Updated Branches: refs/heads/branch-2.4 138a63165 -> 1303eb5c8 [SPARK-25321][ML] Fix local LDA model constructor ## What changes were proposed in this pull request? change back the constructor to: ``` class LocalLDAModel private[ml] ( uid: String, vocabSize: Int, private[clustering] val oldLocalModel : OldLocalLDAModel, sparkSession: SparkSession) ``` Although it is marked `private[ml]`, it is used in `mleap` and the master change breaks `mleap` building. See mleap code [here](https://github.com/combust/mleap/blob/c7860af328d519cf56441b4a7cd8e6ec9d9fee59/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/clustering/LDAModelOp.scala#L57) ## How was this patch tested? Manual. Closes #22510 from WeichenXu123/LDA_fix. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng (cherry picked from commit 40edab209bdefe793b59b650099cea026c244484) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1303eb5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1303eb5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1303eb5c Branch: refs/heads/branch-2.4 Commit: 1303eb5c8d976748ba3da23b66abb8eb6512ea5d Parents: 138a631 Author: WeichenXu Authored: Fri Sep 21 13:08:01 2018 -0700 Committer: Xiangrui Meng Committed: Fri Sep 21 13:08:11 2018 -0700 -- mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1303eb5c/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 50867f7..84e73dc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -570,13 +570,11 @@ abstract class LDAModel private[ml] ( class 
LocalLDAModel private[ml] ( uid: String, vocabSize: Int, -private[clustering] val oldLocalModel_ : OldLocalLDAModel, +private[clustering] val oldLocalModel : OldLocalLDAModel, sparkSession: SparkSession) extends LDAModel(uid, vocabSize, sparkSession) { - override private[clustering] def oldLocalModel: OldLocalLDAModel = { -oldLocalModel_.setSeed(getSeed) - } + oldLocalModel.setSeed(getSeed) @Since("1.6.0") override def copy(extra: ParamMap): LocalLDAModel = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25321][ML] Fix local LDA model constructor
Repository: spark Updated Branches: refs/heads/master 4a1120953 -> 40edab209 [SPARK-25321][ML] Fix local LDA model constructor ## What changes were proposed in this pull request? change back the constructor to: ``` class LocalLDAModel private[ml] ( uid: String, vocabSize: Int, private[clustering] val oldLocalModel : OldLocalLDAModel, sparkSession: SparkSession) ``` Although it is marked `private[ml]`, it is used in `mleap` and the master change breaks `mleap` building. See mleap code [here](https://github.com/combust/mleap/blob/c7860af328d519cf56441b4a7cd8e6ec9d9fee59/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/clustering/LDAModelOp.scala#L57) ## How was this patch tested? Manual. Closes #22510 from WeichenXu123/LDA_fix. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/40edab20 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/40edab20 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/40edab20 Branch: refs/heads/master Commit: 40edab209bdefe793b59b650099cea026c244484 Parents: 4a11209 Author: WeichenXu Authored: Fri Sep 21 13:08:01 2018 -0700 Committer: Xiangrui Meng Committed: Fri Sep 21 13:08:01 2018 -0700 -- mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/40edab20/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 50867f7..84e73dc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -570,13 +570,11 @@ abstract class LDAModel private[ml] ( class LocalLDAModel private[ml] ( uid: String, vocabSize: Int, -private[clustering] val oldLocalModel_ : 
OldLocalLDAModel, +private[clustering] val oldLocalModel : OldLocalLDAModel, sparkSession: SparkSession) extends LDAModel(uid, vocabSize, sparkSession) { - override private[clustering] def oldLocalModel: OldLocalLDAModel = { -oldLocalModel_.setSeed(getSeed) - } + oldLocalModel.setSeed(getSeed) @Since("1.6.0") override def copy(extra: ParamMap): LocalLDAModel = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change
Repository: spark Updated Branches: refs/heads/branch-2.4 ce6636112 -> 138a63165 [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change ## What changes were proposed in this pull request? Revert SPARK-14681 to avoid API breaking change. PR [SPARK-14681] will break mleap. ## How was this patch tested? N/A Closes #22492 from WeichenXu123/revert_tree_change. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/138a6316 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/138a6316 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/138a6316 Branch: refs/heads/branch-2.4 Commit: 138a63165ce90f8400e0a5c7503894662ead03c5 Parents: ce66361 Author: WeichenXu Authored: Fri Sep 21 13:05:24 2018 -0700 Committer: Xiangrui Meng Committed: Fri Sep 21 13:05:24 2018 -0700 -- .../classification/DecisionTreeClassifier.scala | 14 +- .../spark/ml/classification/GBTClassifier.scala | 6 +- .../classification/RandomForestClassifier.scala | 6 +- .../ml/regression/DecisionTreeRegressor.scala | 13 +- .../spark/ml/regression/GBTRegressor.scala | 6 +- .../ml/regression/RandomForestRegressor.scala | 6 +- .../scala/org/apache/spark/ml/tree/Node.scala | 247 --- .../spark/ml/tree/impl/RandomForest.scala | 10 +- .../org/apache/spark/ml/tree/treeModels.scala | 36 +-- .../DecisionTreeClassifierSuite.scala | 31 +-- .../ml/classification/GBTClassifierSuite.scala | 4 +- .../RandomForestClassifierSuite.scala | 5 +- .../regression/DecisionTreeRegressorSuite.scala | 14 -- .../spark/ml/tree/impl/RandomForestSuite.scala | 22 +- .../apache/spark/ml/tree/impl/TreeTests.scala | 12 +- project/MimaExcludes.scala | 7 - 16 files changed, 107 insertions(+), 332 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/138a6316/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 8a57bfc..6648e78 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -168,7 +168,7 @@ object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifi @Since("1.4.0") class DecisionTreeClassificationModel private[ml] ( @Since("1.4.0")override val uid: String, -@Since("1.4.0")override val rootNode: ClassificationNode, +@Since("1.4.0")override val rootNode: Node, @Since("1.6.0")override val numFeatures: Int, @Since("1.5.0")override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] @@ -181,7 +181,7 @@ class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. 
*/ - private[ml] def this(rootNode: ClassificationNode, numFeatures: Int, numClasses: Int) = + private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses) override def predict(features: Vector): Double = { @@ -279,9 +279,8 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] - val root = loadTreeNodes(path, metadata, sparkSession, isClassification = true) - val model = new DecisionTreeClassificationModel(metadata.uid, -root.asInstanceOf[ClassificationNode], numFeatures, numClasses) + val root = loadTreeNodes(path, metadata, sparkSession) + val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses) metadata.getAndSetParams(model) model } @@ -296,10 +295,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica require(oldModel.algo == OldAlgo.Classification, s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}") -val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures, isClassif
spark git commit: [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation
Repository: spark Updated Branches: refs/heads/master cb1b55cf7 -> 6f681d429 [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation ## What changes were proposed in this pull request? Improve testcase "image datasource test: read non image" to tolerate different schema representation. Because file:/path and file:///path are both valid URI-ifications so in some environment the testcase will fail. ## How was this patch tested? Manual. Closes #22449 from WeichenXu123/image_url. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f681d42 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f681d42 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f681d42 Branch: refs/heads/master Commit: 6f681d42964884d19bf22deb614550d712223117 Parents: cb1b55c Author: WeichenXu Authored: Wed Sep 19 15:16:20 2018 -0700 Committer: Xiangrui Meng Committed: Wed Sep 19 15:16:20 2018 -0700 -- .../spark/ml/source/image/ImageFileFormatSuite.scala | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f681d42/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala index 1a6a8d6..38e2513 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.source.image +import java.net.URI import java.nio.file.Paths import org.apache.spark.SparkFunSuite @@ -58,8 +59,14 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext { .load(filePath) assert(df2.count() === 1) val result = df2.head() 
-assert(result === invalidImageRow( - Paths.get(filePath).toAbsolutePath().normalize().toUri().toString)) + +val resultOrigin = result.getStruct(0).getString(0) +// convert `origin` to `java.net.URI` object and then compare. +// because `file:/path` and `file:///path` are both valid URI-ifications +assert(new URI(resultOrigin) === Paths.get(filePath).toAbsolutePath().normalize().toUri()) + +// Compare other columns in the row to be the same with the `invalidImageRow` +assert(result === invalidImageRow(resultOrigin)) } test("image datasource partition test") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation
Repository: spark Updated Branches: refs/heads/branch-2.4 9fefb47fe -> 83a75a83c [SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different schema representation ## What changes were proposed in this pull request? Improve testcase "image datasource test: read non image" to tolerate different schema representation. Because file:/path and file:///path are both valid URI-ifications so in some environment the testcase will fail. ## How was this patch tested? Manual. Closes #22449 from WeichenXu123/image_url. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng (cherry picked from commit 6f681d42964884d19bf22deb614550d712223117) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83a75a83 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83a75a83 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83a75a83 Branch: refs/heads/branch-2.4 Commit: 83a75a83cb24d20d4c2df5389bb8db34ad0335d9 Parents: 9fefb47 Author: WeichenXu Authored: Wed Sep 19 15:16:20 2018 -0700 Committer: Xiangrui Meng Committed: Wed Sep 19 15:16:30 2018 -0700 -- .../spark/ml/source/image/ImageFileFormatSuite.scala | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/83a75a83/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala index 1a6a8d6..38e2513 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.source.image +import java.net.URI import java.nio.file.Paths import org.apache.spark.SparkFunSuite @@ -58,8 +59,14 @@ class ImageFileFormatSuite extends 
SparkFunSuite with MLlibTestSparkContext { .load(filePath) assert(df2.count() === 1) val result = df2.head() -assert(result === invalidImageRow( - Paths.get(filePath).toAbsolutePath().normalize().toUri().toString)) + +val resultOrigin = result.getStruct(0).getString(0) +// convert `origin` to `java.net.URI` object and then compare. +// because `file:/path` and `file:///path` are both valid URI-ifications +assert(new URI(resultOrigin) === Paths.get(filePath).toAbsolutePath().normalize().toUri()) + +// Compare other columns in the row to be the same with the `invalidImageRow` +assert(result === invalidImageRow(resultOrigin)) } test("image datasource partition test") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25345][ML] Deprecate public APIs from ImageSchema
Repository: spark Updated Branches: refs/heads/master 01c3dfab1 -> 08c02e637 [SPARK-25345][ML] Deprecate public APIs from ImageSchema ## What changes were proposed in this pull request? Deprecate public APIs from ImageSchema. ## How was this patch tested? N/A Closes #22349 from WeichenXu123/image_api_deprecate. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08c02e63 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08c02e63 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08c02e63 Branch: refs/heads/master Commit: 08c02e637ac601df2fe890b8b5a7a049bdb4541b Parents: 01c3dfa Author: WeichenXu Authored: Sat Sep 8 09:09:14 2018 -0700 Committer: Xiangrui Meng Committed: Sat Sep 8 09:09:14 2018 -0700 -- .../main/scala/org/apache/spark/ml/image/ImageSchema.scala | 4 python/pyspark/ml/image.py | 8 +++- 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08c02e63/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index dcc40b6..0b13eef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -198,6 +198,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and this `readImages` will be " + +"removed in 3.0.0.", "2.4.0") def readImages(path: String): DataFrame = readImages(path, null, false, -1, false, 1.0, 0) /** @@ -218,6 +220,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and 
this `readImages` will be " + +"removed in 3.0.0.", "2.4.0") def readImages( path: String, sparkSession: SparkSession, http://git-wip-us.apache.org/repos/asf/spark/blob/08c02e63/python/pyspark/ml/image.py -- diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index ef6785b..edb90a3 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,8 +25,10 @@ """ import sys +import warnings import numpy as np + from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string from pyspark.sql import DataFrame, SparkSession @@ -207,6 +209,9 @@ class _ImageSchema(object): .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but potentially non-deterministic. +.. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and +this `readImages` will be removed in 3.0.0. + :param str path: Path to the image directory. :param bool recursive: Recursive search flag. :param int numPartitions: Number of DataFrame partitions. @@ -222,7 +227,8 @@ class _ImageSchema(object): .. versionadded:: 2.3.0 """ - +warnings.warn("`ImageSchema.readImage` is deprecated. " + + "Use `spark.read.format(\"image\").load(path)` instead.", DeprecationWarning) spark = SparkSession.builder.getOrCreate() image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema jsession = spark._jsparkSession - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25345][ML] Deprecate public APIs from ImageSchema
Repository: spark Updated Branches: refs/heads/branch-2.4 80567fad4 -> 904192ad1 [SPARK-25345][ML] Deprecate public APIs from ImageSchema ## What changes were proposed in this pull request? Deprecate public APIs from ImageSchema. ## How was this patch tested? N/A Closes #22349 from WeichenXu123/image_api_deprecate. Authored-by: WeichenXu Signed-off-by: Xiangrui Meng (cherry picked from commit 08c02e637ac601df2fe890b8b5a7a049bdb4541b) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/904192ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/904192ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/904192ad Branch: refs/heads/branch-2.4 Commit: 904192ad18ff09cc5874e09b03447dd5f7754963 Parents: 80567fa Author: WeichenXu Authored: Sat Sep 8 09:09:14 2018 -0700 Committer: Xiangrui Meng Committed: Sat Sep 8 09:09:33 2018 -0700 -- .../main/scala/org/apache/spark/ml/image/ImageSchema.scala | 4 python/pyspark/ml/image.py | 8 +++- 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/904192ad/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index dcc40b6..0b13eef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -198,6 +198,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and this `readImages` will be " + +"removed in 3.0.0.", "2.4.0") def readImages(path: String): DataFrame = readImages(path, null, false, -1, false, 1.0, 0) /** @@ -218,6 +220,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of 
images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and this `readImages` will be " + +"removed in 3.0.0.", "2.4.0") def readImages( path: String, sparkSession: SparkSession, http://git-wip-us.apache.org/repos/asf/spark/blob/904192ad/python/pyspark/ml/image.py -- diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index ef6785b..edb90a3 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,8 +25,10 @@ """ import sys +import warnings import numpy as np + from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string from pyspark.sql import DataFrame, SparkSession @@ -207,6 +209,9 @@ class _ImageSchema(object): .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but potentially non-deterministic. +.. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and +this `readImages` will be removed in 3.0.0. + :param str path: Path to the image directory. :param bool recursive: Recursive search flag. :param int numPartitions: Number of DataFrame partitions. @@ -222,7 +227,8 @@ class _ImageSchema(object): .. versionadded:: 2.3.0 """ - +warnings.warn("`ImageSchema.readImage` is deprecated. " + + "Use `spark.read.format(\"image\").load(path)` instead.", DeprecationWarning) spark = SparkSession.builder.getOrCreate() image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema jsession = spark._jsparkSession - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-22666][ML][SQL] Spark datasource for image format
Repository: spark Updated Branches: refs/heads/master c66eef844 -> 925449283 [SPARK-22666][ML][SQL] Spark datasource for image format ## What changes were proposed in this pull request? Implement an image schema datasource. This image datasource supports: - partition discovery (loading partitioned images) - dropImageFailures (the same behavior as `ImageSchema.readImage`) - path wildcard matching (the same behavior as `ImageSchema.readImage`) - loading recursively from a directory (different from `ImageSchema.readImage`; use a path such as `/path/to/dir/**`) This datasource does **NOT** support: - specifying `numPartitions` (it will be determined by the datasource automatically) - sampling (you can use `df.sample` later, but the sampling operator won't be pushed down to the datasource) ## How was this patch tested? Unit tests. ## Benchmark I benchmarked and compared the time cost of the old `ImageSchema.read` API and my image datasource. **cluster**: 4 nodes, each with 64GB memory, 8 cores CPU **test dataset**: Flickr8k_Dataset (about 8091 images) **time cost**: - My image datasource time (automatically generate 258 partitions): 38.04s - `ImageSchema.read` time (set 16 partitions): 68.4s - `ImageSchema.read` time (set 258 partitions): 90.6s **time cost when doubling the number of images (clone Flickr8k_Dataset and load twice as many images)**: - My image datasource time (automatically generate 515 partitions): 95.4s - `ImageSchema.read` (set 32 partitions): 109s - `ImageSchema.read` (set 515 partitions): 105s So we can see that my image datasource implementation (this PR) brings some performance improvement compared with the old `ImageSchema.read` API. Closes #22328 from WeichenXu123/image_datasource. 
Authored-by: WeichenXu Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92544928 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92544928 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92544928 Branch: refs/heads/master Commit: 925449283dcaef80e0f77e60aea6ef988bd697b4 Parents: c66eef8 Author: WeichenXu Authored: Wed Sep 5 11:59:00 2018 -0700 Committer: Xiangrui Meng Committed: Wed Sep 5 11:59:00 2018 -0700 -- .../images/kittens/29.5.a_b_EGDP022204.jpg | Bin 27295 -> 0 bytes data/mllib/images/kittens/54893.jpg | Bin 35914 -> 0 bytes data/mllib/images/kittens/DP153539.jpg | Bin 26354 -> 0 bytes data/mllib/images/kittens/DP802813.jpg | Bin 30432 -> 0 bytes data/mllib/images/kittens/not-image.txt | 1 - data/mllib/images/multi-channel/BGRA.png| Bin 683 -> 0 bytes .../images/multi-channel/BGRA_alpha_60.png | Bin 747 -> 0 bytes data/mllib/images/multi-channel/chr30.4.184.jpg | Bin 59472 -> 0 bytes data/mllib/images/multi-channel/grayscale.jpg | Bin 36728 -> 0 bytes .../origin/kittens/29.5.a_b_EGDP022204.jpg | Bin 0 -> 27295 bytes data/mllib/images/origin/kittens/54893.jpg | Bin 0 -> 35914 bytes data/mllib/images/origin/kittens/DP153539.jpg | Bin 0 -> 26354 bytes data/mllib/images/origin/kittens/DP802813.jpg | Bin 0 -> 30432 bytes data/mllib/images/origin/kittens/not-image.txt | 1 + data/mllib/images/origin/license.txt| 13 ++ data/mllib/images/origin/multi-channel/BGRA.png | Bin 0 -> 683 bytes .../origin/multi-channel/BGRA_alpha_60.png | Bin 0 -> 747 bytes .../images/origin/multi-channel/chr30.4.184.jpg | Bin 0 -> 59472 bytes .../images/origin/multi-channel/grayscale.jpg | Bin 0 -> 36728 bytes .../date=2018-01/29.5.a_b_EGDP022204.jpg| Bin 0 -> 27295 bytes .../cls=kittens/date=2018-01/not-image.txt | 1 + .../cls=kittens/date=2018-02/54893.jpg | Bin 0 -> 35914 bytes .../cls=kittens/date=2018-02/DP153539.jpg | Bin 0 -> 26354 bytes 
.../cls=kittens/date=2018-02/DP802813.jpg | Bin 0 -> 30432 bytes .../cls=multichannel/date=2018-01/BGRA.png | Bin 0 -> 683 bytes .../date=2018-01/BGRA_alpha_60.png | Bin 0 -> 747 bytes .../date=2018-02/chr30.4.184.jpg| Bin 0 -> 59472 bytes .../cls=multichannel/date=2018-02/grayscale.jpg | Bin 0 -> 36728 bytes apache.spark.sql.sources.DataSourceRegister | 1 + .../spark/ml/source/image/ImageDataSource.scala | 53 + .../spark/ml/source/image/ImageFileFormat.scala | 100 .../spark/ml/source/image/ImageOptions.scala| 32 + .../spark/ml/image/ImageSchemaSuite.scala | 2 +- .../ml/source/image/ImageFileFormatSuite.scala | 119 +++ python/pyspark/ml/image.py | 2 +- python/pyspark/ml/tests.py | 4 +- 36 files changed, 324 insertions(+), 5 deletions(-) ---
spark git commit: [SPARK-25248][CORE] Audit barrier Scala APIs for 2.4
Repository: spark Updated Branches: refs/heads/master 3aa60282c -> 061bb01d9 [SPARK-25248][CORE] Audit barrier Scala APIs for 2.4 ## What changes were proposed in this pull request? I made one pass over barrier APIs added to Spark 2.4 and updated some scopes and docs. I will update Python docs once the Scala doc is reviewed. One major issue is that `BarrierTaskContext` extends `TaskContextImpl`, which exposes some public methods. And internally there were several direct references to `TaskContextImpl` methods instead of `TaskContext`. This PR moved some methods from `TaskContextImpl` to `TaskContext`, remaining package private, and used delegate methods to avoid inheriting `TaskContextImpl` and exposing unnecessary APIs. TODOs: - [x] scala doc - [x] python doc (#22261). Closes #22240 from mengxr/SPARK-25248. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/061bb01d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/061bb01d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/061bb01d Branch: refs/heads/master Commit: 061bb01d9b99911353e66a90abc3164c467fcae1 Parents: 3aa6028 Author: Xiangrui Meng Authored: Tue Sep 4 09:55:53 2018 -0700 Committer: Xiangrui Meng Committed: Tue Sep 4 09:55:53 2018 -0700 -- .../org/apache/spark/BarrierTaskContext.scala | 114 +++ .../org/apache/spark/BarrierTaskInfo.scala | 2 +- .../scala/org/apache/spark/TaskContext.scala| 14 +++ .../org/apache/spark/TaskContextImpl.scala | 15 +-- .../apache/spark/api/python/PythonRunner.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 10 +- .../scala/org/apache/spark/rdd/RDDBarrier.scala | 22 ++-- .../scala/org/apache/spark/scheduler/Task.scala | 35 +++--- .../scala/org/apache/spark/util/Utils.scala | 2 +- project/MimaExcludes.scala | 7 ++ .../spark/sql/internal/ReadOnlySQLConf.scala| 4 +- 11 files changed, 163 insertions(+), 64 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/061bb01d/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 3901f96..90a5c41 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -24,25 +24,22 @@ import scala.language.postfixOps import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.executor.TaskMetrics +import org.apache.spark.internal.Logging import org.apache.spark.memory.TaskMemoryManager -import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.metrics.source.Source import org.apache.spark.rpc.{RpcEndpointRef, RpcTimeout} -import org.apache.spark.util.{RpcUtils, Utils} - -/** A [[TaskContext]] with extra info and tooling for a barrier stage. */ -class BarrierTaskContext( -override val stageId: Int, -override val stageAttemptNumber: Int, -override val partitionId: Int, -override val taskAttemptId: Long, -override val attemptNumber: Int, -override val taskMemoryManager: TaskMemoryManager, -localProperties: Properties, -@transient private val metricsSystem: MetricsSystem, -// The default value is only used in tests. -override val taskMetrics: TaskMetrics = TaskMetrics.empty) - extends TaskContextImpl(stageId, stageAttemptNumber, partitionId, taskAttemptId, attemptNumber, - taskMemoryManager, localProperties, metricsSystem, taskMetrics) { +import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.util._ + +/** + * :: Experimental :: + * A [[TaskContext]] with extra contextual info and tooling for tasks in a barrier stage. + * Use [[BarrierTaskContext#get]] to obtain the barrier context for a running barrier task. 
+ */ +@Experimental +@Since("2.4.0") +class BarrierTaskContext private[spark] ( +taskContext: TaskContext) extends TaskContext with Logging { // Find the driver side RPCEndpointRef of the coordinator that handles all the barrier() calls. private val barrierCoordinator: RpcEndpointRef = { @@ -68,7 +65,7 @@ class BarrierTaskContext( * * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all * possible code branches. Otherwise, you may get the job hanging or a SparkException after - * timeout. Some examples of misuses listed below: + * timeout. Some examples of '''misuses''' are listed below: * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it
spark git commit: [SPARK-25248][.1][PYSPARK] update barrier Python API
Repository: spark Updated Branches: refs/heads/master 3864480e1 -> 20b7c684c [SPARK-25248][.1][PYSPARK] update barrier Python API ## What changes were proposed in this pull request? I made one pass over the Python APIs for barrier mode and updated them to match the Scala doc in #22240 . Major changes: * export the public classes * expand the docs * add doc for BarrierTaskInfo.addresss cc: jiangxb1987 Closes #22261 from mengxr/SPARK-25248.1. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20b7c684 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20b7c684 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20b7c684 Branch: refs/heads/master Commit: 20b7c684cc4a8136b9a9c56390a4948de04e7c34 Parents: 3864480 Author: Xiangrui Meng Authored: Wed Aug 29 07:22:03 2018 -0700 Committer: Xiangrui Meng Committed: Wed Aug 29 07:22:03 2018 -0700 -- python/pyspark/__init__.py| 12 +--- python/pyspark/rdd.py | 22 ++ python/pyspark/taskcontext.py | 26 +- 3 files changed, 44 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20b7c684/python/pyspark/__init__.py -- diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 5821891..ee153af 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -36,7 +36,12 @@ Public classes: Finer-grained cache persistence levels. - :class:`TaskContext`: Information about the current running task, available on the workers and experimental. - + - :class:`RDDBarrier`: + Wraps an RDD under a barrier stage for barrier execution. + - :class:`BarrierTaskContext`: + A :class:`TaskContext` that provides extra info and tooling for barrier execution. + - :class:`BarrierTaskInfo`: + Information about a barrier task. 
""" from functools import wraps @@ -44,14 +49,14 @@ import types from pyspark.conf import SparkConf from pyspark.context import SparkContext -from pyspark.rdd import RDD +from pyspark.rdd import RDD, RDDBarrier from pyspark.files import SparkFiles from pyspark.storagelevel import StorageLevel from pyspark.accumulators import Accumulator, AccumulatorParam from pyspark.broadcast import Broadcast from pyspark.serializers import MarshalSerializer, PickleSerializer from pyspark.status import * -from pyspark.taskcontext import TaskContext +from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo from pyspark.profiler import Profiler, BasicProfiler from pyspark.version import __version__ from pyspark._globals import _NoValue @@ -113,4 +118,5 @@ __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", "TaskContext", +"RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo", ] http://git-wip-us.apache.org/repos/asf/spark/blob/20b7c684/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 380475e..b317156 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2390,7 +2390,18 @@ class RDD(object): """ .. note:: Experimental -Indicates that Spark must launch the tasks together for the current stage. +Marks the current stage as a barrier stage, where Spark must launch all tasks together. +In case of a task failure, instead of only restarting the failed task, Spark will abort the +entire stage and relaunch all tasks for this stage. +The barrier execution mode feature is experimental and it only handles limited scenarios. +Please read the linked SPIP and design docs to understand the limitations and future plans. + +:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage. + +.. 
seealso:: :class:`BarrierTaskContext` +.. seealso:: `SPIP: Barrier Execution Mode \ +<http://jira.apache.org/jira/browse/SPARK-24374>`_ +.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_ .. versionadded:: 2.4.0 """ @@ -2430,8 +2441,8 @@ class RDDBarrier(object): """ .. note:: Exper
spark git commit: [SPARK-25266][CORE] Fix memory leak in Barrier Execution Mode
Repository: spark Updated Branches: refs/heads/master 1fd59c129 -> 3864480e1 [SPARK-25266][CORE] Fix memory leak in Barrier Execution Mode ## What changes were proposed in this pull request? BarrierCoordinator uses Timer and TimerTask. `TimerTask#cancel()` is invoked in ContextBarrierState#cancelTimerTask but `Timer#purge()` is never invoked. Once a TimerTask is scheduled, the reference to it is not released until `Timer#purge()` is invoked even though `TimerTask#cancel()` is invoked. ## How was this patch tested? I checked the number of instances related to the TimerTask using jmap. Closes #22258 from sarutak/fix-barrierexec-oom. Authored-by: sarutak Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3864480e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3864480e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3864480e Branch: refs/heads/master Commit: 3864480e14a4961720cc1be43635c7c7dec08c09 Parents: 1fd59c1 Author: sarutak Authored: Wed Aug 29 07:13:13 2018 -0700 Committer: Xiangrui Meng Committed: Wed Aug 29 07:13:13 2018 -0700 -- core/src/main/scala/org/apache/spark/BarrierCoordinator.scala | 1 + core/src/main/scala/org/apache/spark/BarrierTaskContext.scala | 1 + 2 files changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3864480e/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala -- diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 5e546c6..6439ca5 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -123,6 +123,7 @@ private[spark] class BarrierCoordinator( private def cancelTimerTask(): Unit = { if (timerTask != null) { timerTask.cancel() +timer.purge() timerTask = null } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/3864480e/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index de82798..3901f96 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -140,6 +140,7 @@ class BarrierTaskContext( throw e } finally { timerTask.cancel() + timer.purge() } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25234][SPARKR] avoid integer overflow in parallelize
Repository: spark Updated Branches: refs/heads/branch-2.3 fcc9bd632 -> 42c1fdd22 [SPARK-25234][SPARKR] avoid integer overflow in parallelize ## What changes were proposed in this pull request? `parallelize` uses integer multiplication to determine the split indices. It might cause integer overflow. ## How was this patch tested? unit test Closes #5 from mengxr/SPARK-25234. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng (cherry picked from commit 9714fa547325ed7b6a8066a88957537936b233dd) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42c1fdd2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42c1fdd2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42c1fdd2 Branch: refs/heads/branch-2.3 Commit: 42c1fdd229b3cf19ff804b7516eae9d36ae50c81 Parents: fcc9bd6 Author: Xiangrui Meng Authored: Fri Aug 24 15:03:00 2018 -0700 Committer: Xiangrui Meng Committed: Fri Aug 24 15:04:11 2018 -0700 -- R/pkg/R/context.R| 9 - R/pkg/tests/fulltests/test_context.R | 7 +++ 2 files changed, 11 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42c1fdd2/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 443c2ff..25e2d15 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -138,11 +138,10 @@ parallelize <- function(sc, coll, numSlices = 1) { sizeLimit <- getMaxAllocationLimit(sc) objectSize <- object.size(coll) + len <- length(coll) # For large objects we make sure the size of each slice is also smaller than sizeLimit - numSerializedSlices <- max(numSlices, ceiling(objectSize / sizeLimit)) - if (numSerializedSlices > length(coll)) -numSerializedSlices <- length(coll) + numSerializedSlices <- min(len, max(numSlices, ceiling(objectSize / sizeLimit))) # Generate the slice ids to put each row # For instance, for numSerializedSlices of 22, length of 50 @@ -153,8 +152,8 @@ parallelize <- function(sc, coll, numSlices 
= 1) { splits <- if (numSerializedSlices > 0) { unlist(lapply(0: (numSerializedSlices - 1), function(x) { # nolint start - start <- trunc((x * length(coll)) / numSerializedSlices) - end <- trunc(((x + 1) * length(coll)) / numSerializedSlices) + start <- trunc((as.numeric(x) * len) / numSerializedSlices) + end <- trunc(((as.numeric(x) + 1) * len) / numSerializedSlices) # nolint end rep(start, end - start) })) http://git-wip-us.apache.org/repos/asf/spark/blob/42c1fdd2/R/pkg/tests/fulltests/test_context.R -- diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index f0d0a51..288a271 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -240,3 +240,10 @@ test_that("add and get file to be downloaded with Spark job on every node", { unlink(path, recursive = TRUE) sparkR.session.stop() }) + +test_that("SPARK-25234: parallelize should not have integer overflow", { + sc <- sparkR.sparkContext(master = sparkRTestMaster) + # 47000 * 47000 exceeds integer range + parallelize(sc, 1:47000, 47000) + sparkR.session.stop() +}) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25234][SPARKR] avoid integer overflow in parallelize
Repository: spark Updated Branches: refs/heads/master f8346d2fc -> 9714fa547 [SPARK-25234][SPARKR] avoid integer overflow in parallelize ## What changes were proposed in this pull request? `parallelize` uses integer multiplication to determine the split indices. It might cause integer overflow. ## How was this patch tested? unit test Closes #5 from mengxr/SPARK-25234. Authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9714fa54 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9714fa54 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9714fa54 Branch: refs/heads/master Commit: 9714fa547325ed7b6a8066a88957537936b233dd Parents: f8346d2 Author: Xiangrui Meng Authored: Fri Aug 24 15:03:00 2018 -0700 Committer: Xiangrui Meng Committed: Fri Aug 24 15:03:00 2018 -0700 -- R/pkg/R/context.R| 9 - R/pkg/tests/fulltests/test_context.R | 7 +++ 2 files changed, 11 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9714fa54/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 7e77ea4..f168ca7 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -138,11 +138,10 @@ parallelize <- function(sc, coll, numSlices = 1) { sizeLimit <- getMaxAllocationLimit(sc) objectSize <- object.size(coll) + len <- length(coll) # For large objects we make sure the size of each slice is also smaller than sizeLimit - numSerializedSlices <- max(numSlices, ceiling(objectSize / sizeLimit)) - if (numSerializedSlices > length(coll)) -numSerializedSlices <- length(coll) + numSerializedSlices <- min(len, max(numSlices, ceiling(objectSize / sizeLimit))) # Generate the slice ids to put each row # For instance, for numSerializedSlices of 22, length of 50 @@ -153,8 +152,8 @@ parallelize <- function(sc, coll, numSlices = 1) { splits <- if (numSerializedSlices > 0) { unlist(lapply(0: (numSerializedSlices - 1), function(x) { 
# nolint start - start <- trunc((x * length(coll)) / numSerializedSlices) - end <- trunc(((x + 1) * length(coll)) / numSerializedSlices) + start <- trunc((as.numeric(x) * len) / numSerializedSlices) + end <- trunc(((as.numeric(x) + 1) * len) / numSerializedSlices) # nolint end rep(start, end - start) })) http://git-wip-us.apache.org/repos/asf/spark/blob/9714fa54/R/pkg/tests/fulltests/test_context.R -- diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index f0d0a51..288a271 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -240,3 +240,10 @@ test_that("add and get file to be downloaded with Spark job on every node", { unlink(path, recursive = TRUE) sparkR.session.stop() }) + +test_that("SPARK-25234: parallelize should not have integer overflow", { + sc <- sparkR.sparkContext(master = sparkRTestMaster) + # 47000 * 47000 exceeds integer range + parallelize(sc, 1:47000, 47000) + sparkR.session.stop() +}) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25095][PYSPARK] Python support for BarrierTaskContext
Repository: spark Updated Branches: refs/heads/master 42035a4fe -> ad45299d0 [SPARK-25095][PYSPARK] Python support for BarrierTaskContext ## What changes were proposed in this pull request? Add method `barrier()` and `getTaskInfos()` in python TaskContext, these two methods are only allowed for barrier tasks. ## How was this patch tested? Add new tests in `tests.py` Closes #22085 from jiangxb1987/python.barrier. Authored-by: Xingbo Jiang Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad45299d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad45299d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad45299d Branch: refs/heads/master Commit: ad45299d047c10472fd3a86103930fe7c54a4cf1 Parents: 42035a4 Author: Xingbo Jiang Authored: Tue Aug 21 15:54:30 2018 -0700 Committer: Xiangrui Meng Committed: Tue Aug 21 15:54:30 2018 -0700 -- .../apache/spark/api/python/PythonRunner.scala | 106 ++ python/pyspark/serializers.py | 7 + python/pyspark/taskcontext.py | 144 +++ python/pyspark/tests.py | 36 - python/pyspark/worker.py| 16 ++- 5 files changed, 305 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad45299d/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 7b31857..f824191 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -20,12 +20,14 @@ package org.apache.spark.api.python import java.io._ import java.net._ import java.nio.charset.StandardCharsets +import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import org.apache.spark._ import org.apache.spark.internal.Logging 
+import org.apache.spark.security.SocketAuthHelper import org.apache.spark.util._ @@ -76,6 +78,12 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( // TODO: support accumulator in multiple UDF protected val accumulator = funcs.head.funcs.head.accumulator + // Expose a ServerSocket to support method calls via socket from Python side. + private[spark] var serverSocket: Option[ServerSocket] = None + + // Authentication helper used when serving method calls via socket from Python side. + private lazy val authHelper = new SocketAuthHelper(SparkEnv.get.conf) + def compute( inputIterator: Iterator[IN], partitionIndex: Int, @@ -180,7 +188,73 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( dataOut.writeInt(partitionIndex) // Python version of driver PythonRDD.writeUTF(pythonVer, dataOut) +// Init a ServerSocket to accept method calls from Python side. +val isBarrier = context.isInstanceOf[BarrierTaskContext] +if (isBarrier) { + serverSocket = Some(new ServerSocket(/* port */ 0, +/* backlog */ 1, +InetAddress.getByName("localhost"))) + // A call to accept() for ServerSocket shall block infinitely. + serverSocket.map(_.setSoTimeout(0)) + new Thread("accept-connections") { +setDaemon(true) + +override def run(): Unit = { + while (!serverSocket.get.isClosed()) { +var sock: Socket = null +try { + sock = serverSocket.get.accept() + // Wait for function call from python side. + sock.setSoTimeout(1) + val input = new DataInputStream(sock.getInputStream()) + input.readInt() match { +case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => + // The barrier() function may wait infinitely, socket shall not timeout + // before the function finishes. 
+ sock.setSoTimeout(0) + barrierAndServe(sock) + +case _ => + val out = new DataOutputStream(new BufferedOutputStream( +sock.getOutputStream)) + writeUTF(BarrierTaskContextMessageProtocol.ERROR_UNRECOGNIZED_FUNCTION, out) + } +} catch { + case e: SocketException if e.getMessage.contains("Socket closed") => +// It is possible that the ServerSo
spark git commit: [SPARK-25161][CORE] Fix several bugs in failure handling of barrier execution mode
Repository: spark Updated Branches: refs/heads/master b8788b3e7 -> 5059255d9 [SPARK-25161][CORE] Fix several bugs in failure handling of barrier execution mode ## What changes were proposed in this pull request? Fix several bugs in failure handling of barrier execution mode: * Mark TaskSet for a barrier stage as zombie when a task attempt fails; * Multiple barrier task failures from a single barrier stage should not trigger multiple stage retries; * Barrier task failure from a previous failed stage attempt should not trigger stage retry; * Fail the job when a task from a barrier ResultStage failed; * RDD.isBarrier() should not rely on `ShuffleDependency`s. ## How was this patch tested? Added corresponding test cases in `DAGSchedulerSuite` and `TaskSchedulerImplSuite`. Closes #22158 from jiangxb1987/failure. Authored-by: Xingbo Jiang Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5059255d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5059255d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5059255d Branch: refs/heads/master Commit: 5059255d91fc7a9810e013eba39e12d30291dd08 Parents: b8788b3 Author: Xingbo Jiang Authored: Tue Aug 21 08:25:02 2018 -0700 Committer: Xiangrui Meng Committed: Tue Aug 21 08:25:02 2018 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 3 +- .../apache/spark/scheduler/DAGScheduler.scala | 125 +++ .../apache/spark/scheduler/TaskSetManager.scala | 4 + .../spark/scheduler/DAGSchedulerSuite.scala | 106 .../scheduler/TaskSchedulerImplSuite.scala | 18 +++ 5 files changed, 200 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5059255d/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index cbc1143..374b846 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ 
b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1863,7 +1863,8 @@ abstract class RDD[T: ClassTag]( // From performance concern, cache the value to avoid repeatedly compute `isBarrier()` on a long // RDD chain. - @transient protected lazy val isBarrier_ : Boolean = dependencies.exists(_.rdd.isBarrier()) + @transient protected lazy val isBarrier_ : Boolean = +dependencies.filter(!_.isInstanceOf[ShuffleDependency[_, _, _]]).exists(_.rdd.isBarrier()) } http://git-wip-us.apache.org/repos/asf/spark/blob/5059255d/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 2b0ca13..6787250 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1478,9 +1478,11 @@ private[spark] class DAGScheduler( mapOutputTracker.unregisterAllMapOutput(failedMapStage.shuffleDep.shuffleId) case failedResultStage: ResultStage => -// Mark all the partitions of the result stage to be not finished, to ensure retry -// all the tasks on resubmitted stage attempt. -failedResultStage.activeJob.map(_.resetAllPartitions()) +// Abort the failed result stage since we may have committed output for some +// partitions. +val reason = "Could not recover from a failed barrier ResultStage. Most recent " + + s"failure reason: $failureMessage" +abortStage(failedResultStage, reason, None) } } @@ -1553,62 +1555,75 @@ private[spark] class DAGScheduler( // Always fail the current stage and retry all the tasks when a barrier task fail. 
val failedStage = stageIdToStage(task.stageId) -logInfo(s"Marking $failedStage (${failedStage.name}) as failed due to a barrier task " + - "failed.") -val message = s"Stage failed because barrier task $task finished unsuccessfully.\n" + - failure.toErrorString -try { - // killAllTaskAttempts will fail if a SchedulerBackend does not implement killTask. - val reason = s"Task $task from barrier stage $failedStage (${failedStage.name}) failed." - taskScheduler.killAllTaskAttempts(stageId, interruptThread = false, reason) -} catch { - case e: UnsupportedOperationExcepti
spark git commit: [SPARK-24819][CORE] Fail fast when no enough slots to launch the barrier stage on job submitted
Repository: spark Updated Branches: refs/heads/master 4d8ae0d1c -> bfb74394a [SPARK-24819][CORE] Fail fast when no enough slots to launch the barrier stage on job submitted ## What changes were proposed in this pull request? We shall check whether the barrier stage requires more slots (to be able to launch all tasks in the barrier stage together) than the total number of active slots currently, and fail fast if trying to submit a barrier stage that requires more slots than current total number. This PR proposes to add a new method `getNumSlots()` to try to get the total number of currently active slots in `SchedulerBackend`, support of this new method has been added to all the first-class scheduler backends except `MesosFineGrainedSchedulerBackend`. ## How was this patch tested? Added new test cases in `BarrierStageOnSubmittedSuite`. Closes #22001 from jiangxb1987/SPARK-24819. Lead-authored-by: Xingbo Jiang Co-authored-by: Xiangrui Meng Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfb74394 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfb74394 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfb74394 Branch: refs/heads/master Commit: bfb74394a5513134ea1da9fcf4a1783b77dd64e4 Parents: 4d8ae0d Author: Xingbo Jiang Authored: Wed Aug 15 13:31:28 2018 -0700 Committer: Xiangrui Meng Committed: Wed Aug 15 13:31:28 2018 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 9 ++ .../apache/spark/internal/config/package.scala | 27 ++ .../scheduler/BarrierJobAllocationFailed.scala | 62 + .../apache/spark/scheduler/DAGScheduler.scala | 88 ++- .../spark/scheduler/SchedulerBackend.scala | 9 ++ .../cluster/CoarseGrainedSchedulerBackend.scala | 6 ++ .../scheduler/local/LocalSchedulerBackend.scala | 2 + .../spark/BarrierStageOnSubmittedSuite.scala| 91 ++-- .../spark/ExecutorAllocationManagerSuite.scala | 2 + .../org/apache/spark/SparkContextSuite.scala| 1 + 
.../CoarseGrainedSchedulerBackendSuite.scala| 89 ++- .../spark/scheduler/DAGSchedulerSuite.scala | 2 +- .../scheduler/ExternalClusterManagerSuite.scala | 1 + .../scheduler/SchedulerIntegrationSuite.scala | 2 + .../scheduler/TaskSchedulerImplSuite.scala | 1 + .../MesosFineGrainedSchedulerBackend.scala | 4 + 16 files changed, 364 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bfb74394/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index a7ffb35..e5b1e0e 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1603,6 +1603,15 @@ class SparkContext(config: SparkConf) extends Logging { } /** + * Get the max number of tasks that can be concurrent launched currently. + * Note that please don't cache the value returned by this method, because the number can change + * due to add/remove executors. + * + * @return The max number of tasks that can be concurrent launched currently. + */ + private[spark] def maxNumConcurrentTasks(): Int = schedulerBackend.maxNumConcurrentTasks() + + /** * Update the cluster manager on our scheduling needs. Three bits of information are included * to help it make decisions. * @param numExecutors The total number of executors we'd like to have. 
The cluster manager http://git-wip-us.apache.org/repos/asf/spark/blob/bfb74394/core/src/main/scala/org/apache/spark/internal/config/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index eb08628..a8aa691 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -577,4 +577,31 @@ package object config { .timeConf(TimeUnit.SECONDS) .checkValue(v => v > 0, "The value should be a positive time value.") .createWithDefaultString("365d") + + private[spark] val BARRIER_MAX_CONCURRENT_TASKS_CHECK_INTERVAL = +ConfigBuilder("spark.scheduler.barrier.maxConcurrentTasksCheck.interval") + .doc("Time in seconds to wait between a max concurrent tasks check failure and the next " + +"check. A max concurrent tasks check ensures the cluster can launch more concurrent " + +
spark git commit: [SPARK-25045][CORE] Make `RDDBarrier.mapPartitions` similar to `RDD.mapPartitions`
Repository: spark Updated Branches: refs/heads/master 66699c5c3 -> d90f1336d [SPARK-25045][CORE] Make `RDDBarrier.mapPartitions` similar to `RDD.mapPartitions` ## What changes were proposed in this pull request? Signature of the function passed to `RDDBarrier.mapPartitions()` is different from that of `RDD.mapPartitions`. The latter doesn’t take a `TaskContext`. We shall make the function signature the same to avoid confusion and misuse. This PR proposes the following API changes: - In `RDDBarrier`, migrate `mapPartitions` from ``` def mapPartitions[S: ClassTag]( f: (Iterator[T], BarrierTaskContext) => Iterator[S], preservesPartitioning: Boolean = false): RDD[S] } ``` to ``` def mapPartitions[S: ClassTag]( f: Iterator[T] => Iterator[S], preservesPartitioning: Boolean = false): RDD[S] } ``` - Add new static method to get a `BarrierTaskContext`: ``` object BarrierTaskContext { def get(): BarrierTaskContext } ``` ## How was this patch tested? Existing test cases. Author: Xingbo Jiang Closes #22026 from jiangxb1987/mapPartitions. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d90f1336 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d90f1336 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d90f1336 Branch: refs/heads/master Commit: d90f1336d87199aac56fe227a0fe14ab0ae3a332 Parents: 66699c5 Author: Xingbo Jiang Authored: Tue Aug 7 17:32:41 2018 -0700 Committer: Xiangrui Meng Committed: Tue Aug 7 17:32:41 2018 -0700 -- .../org/apache/spark/BarrierTaskContext.scala | 14 +++-- .../scala/org/apache/spark/rdd/RDDBarrier.scala | 7 +++ .../spark/BarrierStageOnSubmittedSuite.scala| 22 ++-- .../org/apache/spark/SparkContextSuite.scala| 6 -- .../org/apache/spark/rdd/RDDBarrierSuite.scala | 6 +++--- .../scheduler/BarrierTaskContextSuite.scala | 15 - .../spark/scheduler/DAGSchedulerSuite.scala | 4 ++-- 7 files changed, 45 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d90f1336/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 8e2b155..de82798 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -72,7 +72,8 @@ class BarrierTaskContext( * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it * shall lead to timeout of the function call. * {{{ - * rdd.barrier().mapPartitions { (iter, context) => + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() * if (context.partitionId() == 0) { * // Do nothing. * } else { @@ -85,7 +86,8 @@ class BarrierTaskContext( * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the * second function call. 
* {{{ - * rdd.barrier().mapPartitions { (iter, context) => + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() * try { * // Do something that might throw an Exception. * doSomething() @@ -152,3 +154,11 @@ class BarrierTaskContext( addressesStr.split(",").map(_.trim()).map(new BarrierTaskInfo(_)) } } + +object BarrierTaskContext { + /** + * Return the currently active BarrierTaskContext. This can be called inside of user functions to + * access contextual information about running barrier tasks. + */ + def get(): BarrierTaskContext = TaskContext.get().asInstanceOf[BarrierTaskContext] +} http://git-wip-us.apache.org/repos/asf/spark/blob/d90f1336/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala index 71f38bf..978e7c0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala @@ -28,7 +28,7 @@ class RDDBarrier[T: ClassTag](rdd: RDD[T]) { /** * :: Experimental :: - * Maps partitions together with a provided [[org.apache.spark.BarrierTaskContext]]. + * Generate a new barrier RDD by applying a function to each partitions of the prev RDD. * * `preservesPartitioning` indicates wheth
spark git commit: [SPARK-24954][CORE] Fail fast on job submit if run a barrier stage with dynamic resource allocation enabled
Repository: spark Updated Branches: refs/heads/master c32dbd6bd -> 92b48842b [SPARK-24954][CORE] Fail fast on job submit if run a barrier stage with dynamic resource allocation enabled ## What changes were proposed in this pull request? We don't support run a barrier stage with dynamic resource allocation enabled, it shall lead to some confusing behaviors (eg. with dynamic resource allocation enabled, it may happen that we acquire some executors (but not enough to launch all the tasks in a barrier stage) and later release them due to executor idle time expire, and then acquire again). We perform the check on job submit and fail fast if running a barrier stage with dynamic resource allocation enabled. ## How was this patch tested? Added new test suite `BarrierStageOnSubmittedSuite` to cover all the fail fast cases that submitted a job containing one or more barrier stages. Author: Xingbo Jiang Closes #21915 from jiangxb1987/SPARK-24954. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92b48842 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92b48842 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92b48842 Branch: refs/heads/master Commit: 92b48842b944a3e430472294cdc3c481bad6b804 Parents: c32dbd6 Author: Xingbo Jiang Authored: Fri Aug 3 09:36:56 2018 -0700 Committer: Xiangrui Meng Committed: Fri Aug 3 09:36:56 2018 -0700 -- .../apache/spark/scheduler/DAGScheduler.scala | 25 + .../spark/BarrierStageOnSubmittedSuite.scala| 57 2 files changed, 71 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92b48842/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 3dd0718..cf1fcbc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -364,6 +364,7 @@ class DAGScheduler( */ def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): ShuffleMapStage = { val rdd = shuffleDep.rdd +checkBarrierStageWithDynamicAllocation(rdd) checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions) val numTasks = rdd.partitions.length val parents = getOrCreateParentStages(rdd, jobId) @@ -385,6 +386,23 @@ class DAGScheduler( } /** + * We don't support run a barrier stage with dynamic resource allocation enabled, it shall lead + * to some confusing behaviors (eg. with dynamic resource allocation enabled, it may happen that + * we acquire some executors (but not enough to launch all the tasks in a barrier stage) and + * later release them due to executor idle time expire, and then acquire again). + * + * We perform the check on job submit and fail fast if running a barrier stage with dynamic + * resource allocation enabled. + * + * TODO SPARK-24942 Improve cluster resource management with jobs containing barrier stage + */ + private def checkBarrierStageWithDynamicAllocation(rdd: RDD[_]): Unit = { +if (rdd.isBarrier() && Utils.isDynamicAllocationEnabled(sc.getConf)) { + throw new SparkException(DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION) +} + } + + /** * Create a ResultStage associated with the provided jobId. */ private def createResultStage( @@ -393,6 +411,7 @@ class DAGScheduler( partitions: Array[Int], jobId: Int, callSite: CallSite): ResultStage = { +checkBarrierStageWithDynamicAllocation(rdd) checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size) val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() @@ -2001,4 +2020,10 @@ private[spark] object DAGScheduler { "PartitionPruningRDD). A workaround for first()/take() can be barrierRdd.collect().head " + "(scala) or barrierRdd.collect()[0] (python).\n" + "2. An RDD that depends on multiple barrier RDDs (eg. 
barrierRdd1.zip(barrierRdd2))." + + // Error message when running a barrier stage with dynamic resource allocation enabled. + val ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION = +"[SPARK-24942]: Barrier execution mode does not support dynamic resource allocation for " + + "now. You can disable dynamic resource allocation by setting Spark conf " + + "\"spark.dynamicAllocation.enabled\" to \"false\"." } http://git-wip-us.apache.org/repos/asf/
spark git commit: [SPARK-24795][CORE][FOLLOWUP] Combine BarrierTaskContext with BarrierTaskContextImpl
Repository: spark Updated Branches: refs/heads/master bbdcc3bf6 -> 29077a1d1 [SPARK-24795][CORE][FOLLOWUP] Combine BarrierTaskContext with BarrierTaskContextImpl ## What changes were proposed in this pull request? According to https://github.com/apache/spark/pull/21758#discussion_r206746905 , current declaration of `BarrierTaskContext` didn't extend methods from `TaskContext`. Since `TaskContext` is an abstract class and we don't want to change it to a trait, we have to define class `BarrierTaskContext` directly. ## How was this patch tested? Existing tests. Author: Xingbo Jiang Closes #21972 from jiangxb1987/BarrierTaskContext. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29077a1d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29077a1d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29077a1d Branch: refs/heads/master Commit: 29077a1d15e49dfafe7f2eab963830ba9cc6b29a Parents: bbdcc3b Author: Xingbo Jiang Authored: Thu Aug 2 17:19:42 2018 -0700 Committer: Xiangrui Meng Committed: Thu Aug 2 17:19:42 2018 -0700 -- .../org/apache/spark/BarrierTaskContext.scala | 60 +++- .../apache/spark/BarrierTaskContextImpl.scala | 49 .../scala/org/apache/spark/rdd/RDDBarrier.scala | 2 +- .../scala/org/apache/spark/scheduler/Task.scala | 2 +- 4 files changed, 59 insertions(+), 54 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29077a1d/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 4c35862..ba30368 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -17,20 +17,71 @@ package org.apache.spark +import java.util.Properties + import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.executor.TaskMetrics 
+import org.apache.spark.memory.TaskMemoryManager +import org.apache.spark.metrics.MetricsSystem /** A [[TaskContext]] with extra info and tooling for a barrier stage. */ -trait BarrierTaskContext extends TaskContext { +class BarrierTaskContext( +override val stageId: Int, +override val stageAttemptNumber: Int, +override val partitionId: Int, +override val taskAttemptId: Long, +override val attemptNumber: Int, +override val taskMemoryManager: TaskMemoryManager, +localProperties: Properties, +@transient private val metricsSystem: MetricsSystem, +// The default value is only used in tests. +override val taskMetrics: TaskMetrics = TaskMetrics.empty) + extends TaskContextImpl(stageId, stageAttemptNumber, partitionId, taskAttemptId, attemptNumber, + taskMemoryManager, localProperties, metricsSystem, taskMetrics) { /** * :: Experimental :: * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same * stage have reached this routine. + * + * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all + * possible code branches. Otherwise, you may get the job hanging or a SparkException after + * timeout. Some examples of misuses listed below: + * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it + * shall lead to timeout of the function call. + * {{{ + * rdd.barrier().mapPartitions { (iter, context) => + * if (context.partitionId() == 0) { + * // Do nothing. + * } else { + * context.barrier() + * } + * iter + * } + * }}} + * + * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the + * second function call. + * {{{ + * rdd.barrier().mapPartitions { (iter, context) => + * try { + * // Do something that might throw an Exception. 
+ * doSomething() + * context.barrier() + * } catch { + * case e: Exception => logWarning("...", e) + * } + * context.barrier() + * iter + * } + * }}} */ @Experimental @Since("2.4.0") - def barrier(): Unit + def barrier(): Unit = { +// TODO SPARK-24817 implement global barrier. + } /** * :: Experimental :: @@ -38,5 +89,8 @@ trait BarrierTaskContext extends TaskContext { */ @Experimental @Since("2.4.0") - def getTaskInfos(): Array[BarrierTaskInfo] + def getTaskInfos(): Array[
spark git commit: [SPARK-24820][SPARK-24821][CORE] Fail fast when submitted job contains a barrier stage with unsupported RDD chain pattern
Repository: spark Updated Branches: refs/heads/master ad2e63662 -> 38e4699c9 [SPARK-24820][SPARK-24821][CORE] Fail fast when submitted job contains a barrier stage with unsupported RDD chain pattern ## What changes were proposed in this pull request? Check on job submit to make sure we don't launch a barrier stage with unsupported RDD chain pattern. The following patterns are not supported: - Ancestor RDDs that have different number of partitions from the resulting RDD (eg. union()/coalesce()/first()/PartitionPruningRDD); - An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2)). ## How was this patch tested? Add test cases in `BarrierStageOnSubmittedSuite`. Author: Xingbo Jiang Closes #21927 from jiangxb1987/SPARK-24820. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/38e4699c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/38e4699c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/38e4699c Branch: refs/heads/master Commit: 38e4699c978e56a0f24b8efb94fd3206cdd8b3fe Parents: ad2e636 Author: Xingbo Jiang Authored: Thu Aug 2 09:36:26 2018 -0700 Committer: Xiangrui Meng Committed: Thu Aug 2 09:36:26 2018 -0700 -- .../apache/spark/scheduler/DAGScheduler.scala | 55 ++- .../spark/BarrierStageOnSubmittedSuite.scala| 153 +++ 2 files changed, 207 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/38e4699c/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 4858af7..3dd0718 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -39,7 +39,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.network.util.JavaUtils import 
org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} -import org.apache.spark.rdd.{RDD, RDDCheckpointData} +import org.apache.spark.rdd.{PartitionPruningRDD, RDD, RDDCheckpointData} import org.apache.spark.rpc.RpcTimeout import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat @@ -341,6 +341,22 @@ class DAGScheduler( } /** + * Check to make sure we don't launch a barrier stage with unsupported RDD chain pattern. The + * following patterns are not supported: + * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (eg. + * union()/coalesce()/first()/take()/PartitionPruningRDD); + * 2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2)). + */ + private def checkBarrierStageWithRDDChainPattern(rdd: RDD[_], numTasksInStage: Int): Unit = { +val predicate: RDD[_] => Boolean = (r => + r.getNumPartitions == numTasksInStage && r.dependencies.filter(_.rdd.isBarrier()).size <= 1) +if (rdd.isBarrier() && !traverseParentRDDsWithinStage(rdd, predicate)) { + throw new SparkException( + DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN) +} + } + + /** * Creates a ShuffleMapStage that generates the given shuffle dependency's partitions. 
If a * previously run stage generated the same shuffle data, this function will copy the output * locations that are still available from the previous shuffle to avoid unnecessarily @@ -348,6 +364,7 @@ class DAGScheduler( */ def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): ShuffleMapStage = { val rdd = shuffleDep.rdd +checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions) val numTasks = rdd.partitions.length val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() @@ -376,6 +393,7 @@ class DAGScheduler( partitions: Array[Int], jobId: Int, callSite: CallSite): ResultStage = { +checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size) val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite) @@ -451,6 +469,32 @@ class DAGScheduler( parents } + /** + * Traverses the given RDD and its ancestors within the same stage and checks whether all of the + * RDDs satisfy a given predicate. + */ + private def traverseParentRDDsWithinStage(rdd: RDD[_], predicate: RDD[_] => Boolean): Boolean = { +val visited = new HashSet[RDD[_]]
spark git commit: [SPARK-24557][ML] ClusteringEvaluator support array input
Repository: spark Updated Branches: refs/heads/master 166f34618 -> 57d994994 [SPARK-24557][ML] ClusteringEvaluator support array input ## What changes were proposed in this pull request? ClusteringEvaluator support array input ## How was this patch tested? added tests Author: zhengruifeng Closes #21563 from zhengruifeng/clu_eval_support_array. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57d99499 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57d99499 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57d99499 Branch: refs/heads/master Commit: 57d994994d27154f57f2724924c42beb2ab2e0e7 Parents: 166f346 Author: zhengruifeng Authored: Wed Aug 1 23:46:01 2018 -0700 Committer: Xiangrui Meng Committed: Wed Aug 1 23:46:01 2018 -0700 -- .../spark/ml/evaluation/ClusteringEvaluator.scala| 15 +-- .../ml/evaluation/ClusteringEvaluatorSuite.scala | 15 ++- 2 files changed, 23 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57d99499/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 4353c46..a6d6b4e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -21,11 +21,10 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.attribute.AttributeGroup -import org.apache.spark.ml.linalg.{BLAS, DenseVector, SparseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.linalg.{BLAS, DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import 
org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} -import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, - SchemaUtils} +import org.apache.spark.ml.util._ import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.{avg, col, udf} import org.apache.spark.sql.types.DoubleType @@ -107,15 +106,19 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str @Since("2.3.0") override def evaluate(dataset: Dataset[_]): Double = { -SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.validateVectorCompatibleColumn(dataset.schema, $(featuresCol)) SchemaUtils.checkNumericType(dataset.schema, $(predictionCol)) +val vectorCol = DatasetUtils.columnToVector(dataset, $(featuresCol)) +val df = dataset.select(col($(predictionCol)), + vectorCol.as($(featuresCol), dataset.schema($(featuresCol)).metadata)) + ($(metricName), $(distanceMeasure)) match { case ("silhouette", "squaredEuclidean") => SquaredEuclideanSilhouette.computeSilhouetteScore( - dataset, $(predictionCol), $(featuresCol)) + df, $(predictionCol), $(featuresCol)) case ("silhouette", "cosine") => -CosineSilhouette.computeSilhouetteScore(dataset, $(predictionCol), $(featuresCol)) +CosineSilhouette.computeSilhouetteScore(df, $(predictionCol), $(featuresCol)) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/57d99499/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 2c175ff..e2d7756 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.{SparkException, SparkFunSuite} import 
org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset @@ -33,10 +33,17 @@ class ClusteringEvaluatorSuite import testImplicits._ @tran
spark git commit: [SPARK-15064][ML] Locale support in StopWordsRemover
Repository: spark Updated Branches: refs/heads/master 1d7db65e9 -> 5d6a53d98 [SPARK-15064][ML] Locale support in StopWordsRemover ## What changes were proposed in this pull request? Add locale support for `StopWordsRemover`. ## How was this patch tested? [Scala|Python] unit tests. Author: Lee Dongjin Closes #21501 from dongjinleekr/feature/SPARK-15064. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d6a53d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d6a53d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d6a53d9 Branch: refs/heads/master Commit: 5d6a53d9831cc1e2115560db5cebe0eea2565dcd Parents: 1d7db65 Author: Lee Dongjin Authored: Tue Jun 12 08:16:37 2018 -0700 Committer: Xiangrui Meng Committed: Tue Jun 12 08:16:37 2018 -0700 -- .../spark/ml/feature/StopWordsRemover.scala | 30 ++-- .../ml/feature/StopWordsRemoverSuite.scala | 51 python/pyspark/ml/feature.py| 30 ++-- python/pyspark/ml/tests.py | 7 +++ 4 files changed, 109 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5d6a53d9/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 3fcd84c..0f946dd 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -17,9 +17,11 @@ package org.apache.spark.ml.feature +import java.util.Locale + import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} +import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -84,7 +86,27 @@ class 
StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String @Since("1.5.0") def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), caseSensitive -> false) + /** + * Locale of the input for case insensitive matching. Ignored when [[caseSensitive]] + * is true. + * Default: Locale.getDefault.toString + * @group param + */ + @Since("2.4.0") + val locale: Param[String] = new Param[String](this, "locale", +"Locale of the input for case insensitive matching. Ignored when caseSensitive is true.", + ParamValidators.inArray[String](Locale.getAvailableLocales.map(_.toString))) + + /** @group setParam */ + @Since("2.4.0") + def setLocale(value: String): this.type = set(locale, value) + + /** @group getParam */ + @Since("2.4.0") + def getLocale: String = $(locale) + + setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), +caseSensitive -> false, locale -> Locale.getDefault.toString) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { @@ -95,8 +117,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String terms.filter(s => !stopWordsSet.contains(s)) } } else { - // TODO: support user locale (SPARK-15064) - val toLower = (s: String) => if (s != null) s.toLowerCase else s + val lc = new Locale($(locale)) + val toLower = (s: String) => if (s != null) s.toLowerCase(lc) else s val lowerStopWords = $(stopWords).map(toLower(_)).toSet udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) http://git-wip-us.apache.org/repos/asf/spark/blob/5d6a53d9/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 21259a5..20972d1 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala 
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -65,6 +65,57 @@ class StopWordsRemoverSuite extends MLTest with DefaultReadWriteTest { testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with localed input (case insensitive)") { +
spark git commit: [SPARK-19826][ML][PYTHON] add spark.ml Python API for PIC
Repository: spark Updated Branches: refs/heads/master 3e5b4ae63 -> a99d284c1 [SPARK-19826][ML][PYTHON] add spark.ml Python API for PIC ## What changes were proposed in this pull request? add spark.ml Python API for PIC ## How was this patch tested? add doctest Author: Huaxin Gao Closes #21513 from huaxingao/spark--19826. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a99d284c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a99d284c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a99d284c Branch: refs/heads/master Commit: a99d284c16cc4e00ce7c83ecdc3db6facd467552 Parents: 3e5b4ae Author: Huaxin Gao Authored: Mon Jun 11 12:15:14 2018 -0700 Committer: Xiangrui Meng Committed: Mon Jun 11 12:15:14 2018 -0700 -- python/pyspark/ml/clustering.py | 184 ++- 1 file changed, 179 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a99d284c/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index b3d5fb1..4aa1cf8 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -19,14 +19,15 @@ import sys from pyspark import since, keyword_only from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaWrapper from pyspark.ml.param.shared import * from pyspark.ml.common import inherit_doc +from pyspark.sql import DataFrame __all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary', 'KMeans', 'KMeansModel', 'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary', - 'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel'] + 'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel', 'PowerIterationClustering'] class ClusteringSummary(JavaWrapper): @@ -836,7 +837,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, 
HasCheckpointInter Terminology: - - "term" = "word": an el + - "term" = "word": an element of the vocabulary - "token": instance of a term appearing in a document - "topic": multinomial distribution over terms representing some concept - "document": one piece of text, corresponding to one row in the input data @@ -938,7 +939,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\ subsamplingRate=0.05, optimizeDocConcentration=True,\ docConcentration=None, topicConcentration=None,\ - topicDistributionCol="topicDistribution", keepLastCheckpoint=True): + topicDistributionCol="topicDistribution", keepLastCheckpoint=True) """ super(LDA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid) @@ -967,7 +968,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\ subsamplingRate=0.05, optimizeDocConcentration=True,\ docConcentration=None, topicConcentration=None,\ - topicDistributionCol="topicDistribution", keepLastCheckpoint=True): + topicDistributionCol="topicDistribution", keepLastCheckpoint=True) Sets params for LDA. """ @@ -1156,6 +1157,179 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter return self.getOrDefault(self.keepLastCheckpoint) +@inherit_doc +class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReadable, + JavaMLWritable): +""" +.. note:: Experimental + +Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by +http://www.icml2010.org/papers/387.pdf>Lin and Cohen. From the abstract: +PIC finds a very low-dimensional embedding of a dataset using truncated power +iteration on a normalized pair-wise similarity matrix of the data. 
+ +This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method +to run the PowerIterationClustering algorithm. + +.. seealso:: `Wikipedia on Spectral clustering \ +<http://en.wikipedia.org/wiki/Spectral_clustering>
spark git commit: [SPARK-24477][SPARK-24454][ML][PYTHON] Imports submodule in ml/__init__.py and add ImageSchema into __all__
Repository: spark Updated Branches: refs/heads/master a5d775a1f -> 173fe450d [SPARK-24477][SPARK-24454][ML][PYTHON] Imports submodule in ml/__init__.py and add ImageSchema into __all__ ## What changes were proposed in this pull request? This PR attaches submodules to ml's `__init__.py` module. It also adds `ImageSchema` to `__all__` in `image.py` explicitly. ## How was this patch tested? Before: ```python >>> from pyspark import ml >>> ml.image Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: 'module' object has no attribute 'image' >>> ml.image.ImageSchema Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: 'module' object has no attribute 'image' ``` ```python >>> "image" in globals() False >>> from pyspark.ml import * >>> "image" in globals() False >>> image Traceback (most recent call last): File "<stdin>", line 1, in <module> NameError: name 'image' is not defined ``` After: ```python >>> from pyspark import ml >>> ml.image <module 'pyspark.ml.image' from '...'> >>> ml.image.ImageSchema <pyspark.ml.image._ImageSchema object at 0x...> ``` ```python >>> "image" in globals() False >>> from pyspark.ml import * >>> "image" in globals() True >>> image <module 'pyspark.ml.image' from '...'> ``` Author: hyukjinkwon Closes #21483 from HyukjinKwon/SPARK-24454.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/173fe450 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/173fe450 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/173fe450 Branch: refs/heads/master Commit: 173fe450df203b262b58f7e71c6b52a79db95ee0 Parents: a5d775a Author: hyukjinkwon Authored: Fri Jun 8 09:32:11 2018 -0700 Committer: Xiangrui Meng Committed: Fri Jun 8 09:32:11 2018 -0700 -- python/pyspark/ml/__init__.py | 8 +++- python/pyspark/ml/image.py| 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/173fe450/python/pyspark/ml/__init__.py -- diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py index 129d7d6..d99a253 100644 --- a/python/pyspark/ml/__init__.py +++ b/python/pyspark/ml/__init__.py @@ -21,5 +21,11 @@ machine learning pipelines. """ from pyspark.ml.base import Estimator, Model, Transformer, UnaryTransformer from pyspark.ml.pipeline import Pipeline, PipelineModel +from pyspark.ml import classification, clustering, evaluation, feature, fpm, \ +image, pipeline, recommendation, regression, stat, tuning, util, linalg, param -__all__ = ["Transformer", "UnaryTransformer", "Estimator", "Model", "Pipeline", "PipelineModel"] +__all__ = [ +"Transformer", "UnaryTransformer", "Estimator", "Model", "Pipeline", "PipelineModel", +"classification", "clustering", "evaluation", "feature", "fpm", "image", +"recommendation", "regression", "stat", "tuning", "util", "linalg", "param", +] http://git-wip-us.apache.org/repos/asf/spark/blob/173fe450/python/pyspark/ml/image.py -- diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 96d702f..5f0c57e 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -31,6 +31,8 @@ from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string from pyspark.sql import DataFrame, SparkSession 
+__all__ = ["ImageSchema"] + class _ImageSchema(object): """ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-24300][ML] change the way to set seed in ml.cluster.LDASuite.generateLDAData
Repository: spark Updated Branches: refs/heads/master b24d3dba6 -> ff0501b0c [SPARK-24300][ML] change the way to set seed in ml.cluster.LDASuite.generateLDAData ## What changes were proposed in this pull request? Using different RNG in all different partitions. ## How was this patch tested? manually Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Lu WANG Closes #21492 from ludatabricks/SPARK-24300. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff0501b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff0501b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff0501b0 Branch: refs/heads/master Commit: ff0501b0c27dc8149bd5fb38a19d9b0056698766 Parents: b24d3db Author: Lu WANG Authored: Mon Jun 4 16:08:27 2018 -0700 Committer: Xiangrui Meng Committed: Mon Jun 4 16:08:27 2018 -0700 -- .../src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ff0501b0/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala index 096b541..db92132 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala @@ -34,9 +34,8 @@ object LDASuite { vocabSize: Int): DataFrame = { val avgWC = 1 // average instances of each word in a doc val sc = spark.sparkContext -val rng = new java.util.Random() -rng.setSeed(1) val rdd = sc.parallelize(1 to rows).map { i => + val rng = new java.util.Random(i) Vectors.dense(Array.fill(vocabSize)(rng.nextInt(2 * avgWC).toDouble)) }.map(v => new TestRow(v)) spark.createDataFrame(rdd) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-24290][ML] add support for Array input for instrumentation.logNamedValue
Repository: spark Updated Branches: refs/heads/master 7297ae04d -> b24d3dba6 [SPARK-24290][ML] add support for Array input for instrumentation.logNamedValue ## What changes were proposed in this pull request? Extend instrumentation.logNamedValue to support Array input change the logging for "clusterSizes" to new method ## How was this patch tested? N/A Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Lu WANG Closes #21347 from ludatabricks/SPARK-24290. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b24d3dba Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b24d3dba Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b24d3dba Branch: refs/heads/master Commit: b24d3dba6571fd3c9e2649aceeaadc3f9c6cc90f Parents: 7297ae0 Author: Lu WANG Authored: Mon Jun 4 14:54:31 2018 -0700 Committer: Xiangrui Meng Committed: Mon Jun 4 14:54:31 2018 -0700 -- .../apache/spark/ml/clustering/BisectingKMeans.scala | 3 +-- .../apache/spark/ml/clustering/GaussianMixture.scala | 3 +-- .../scala/org/apache/spark/ml/clustering/KMeans.scala | 3 +-- .../org/apache/spark/ml/util/Instrumentation.scala | 13 + 4 files changed, 16 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 1ad4e09..9c96145 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -276,8 +276,7 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) -// TODO: need to extend 
logNamedValue to support Array -instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) +instr.logNamedValue("clusterSizes", summary.clusterSizes) instr.logSuccess(model) model } http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 3091bb5..64ecc1e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -426,8 +426,7 @@ class GaussianMixture @Since("2.0.0") ( $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) instr.logNamedValue("logLikelihood", logLikelihood) -// TODO: need to extend logNamedValue to support Array -instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) +instr.logNamedValue("clusterSizes", summary.clusterSizes) instr.logSuccess(model) model } http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index e72d7f9..1704412 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -359,8 +359,7 @@ class KMeans @Since("1.5.0") ( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) -// TODO: need to extend logNamedValue to support Array -instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) +instr.logNamedValue("clusterSizes", summary.clusterSizes) instr.logSuccess(model) if (handlePersistence) { 
instances.unpersist() http://git-wip-us.apache.org/repos/asf/spark/blob/b24d3dba/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentatio
spark git commit: [SPARK-20114][ML][FOLLOW-UP] spark.ml parity for sequential pattern mining - PrefixSpan
Repository: spark Updated Branches: refs/heads/master a40ffc656 -> df125062c [SPARK-20114][ML][FOLLOW-UP] spark.ml parity for sequential pattern mining - PrefixSpan ## What changes were proposed in this pull request? Change `PrefixSpan` into a class with param setters/getters. This addresses the issues mentioned here: https://github.com/apache/spark/pull/20973#discussion_r186931806 ## How was this patch tested? UT. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: WeichenXu <weichen...@databricks.com> Closes #21393 from WeichenXu123/fix_prefix_span. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df125062 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df125062 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df125062 Branch: refs/heads/master Commit: df125062c8dac9fee3328d67dd438a456b7a3b74 Parents: a40ffc6 Author: WeichenXu <weichen...@databricks.com> Authored: Wed May 23 11:00:23 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed May 23 11:00:23 2018 -0700 -- .../org/apache/spark/ml/fpm/PrefixSpan.scala| 127 +++ .../apache/spark/ml/fpm/PrefixSpanSuite.scala | 28 ++-- 2 files changed, 119 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df125062/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala index 02168fe..41716c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/PrefixSpan.scala @@ -18,6 +18,8 @@ package org.apache.spark.ml.fpm import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.fpm.{PrefixSpan => mllibPrefixSpan} import 
org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.col @@ -29,13 +31,97 @@ import org.apache.spark.sql.types.{ArrayType, LongType, StructField, StructType} * The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns * Efficiently by Prefix-Projected Pattern Growth * (see http://doi.org/10.1109/ICDE.2001.914830;>here). + * This class is not yet an Estimator/Transformer, use `findFrequentSequentialPatterns` method to + * run the PrefixSpan algorithm. * * @see https://en.wikipedia.org/wiki/Sequential_Pattern_Mining;>Sequential Pattern Mining * (Wikipedia) */ @Since("2.4.0") @Experimental -object PrefixSpan { +final class PrefixSpan(@Since("2.4.0") override val uid: String) extends Params { + + @Since("2.4.0") + def this() = this(Identifiable.randomUID("prefixSpan")) + + /** + * Param for the minimal support level (default: `0.1`). + * Sequential patterns that appear more than (minSupport * size-of-the-dataset) times are + * identified as frequent sequential patterns. + * @group param + */ + @Since("2.4.0") + val minSupport = new DoubleParam(this, "minSupport", "The minimal support level of the " + +"sequential pattern. Sequential pattern that appears more than " + +"(minSupport * size-of-the-dataset)." + +"times will be output.", ParamValidators.gtEq(0.0)) + + /** @group getParam */ + @Since("2.4.0") + def getMinSupport: Double = $(minSupport) + + /** @group setParam */ + @Since("2.4.0") + def setMinSupport(value: Double): this.type = set(minSupport, value) + + /** + * Param for the maximal pattern length (default: `10`). 
+ * @group param + */ + @Since("2.4.0") + val maxPatternLength = new IntParam(this, "maxPatternLength", +"The maximal length of the sequential pattern.", +ParamValidators.gt(0)) + + /** @group getParam */ + @Since("2.4.0") + def getMaxPatternLength: Int = $(maxPatternLength) + + /** @group setParam */ + @Since("2.4.0") + def setMaxPatternLength(value: Int): this.type = set(maxPatternLength, value) + + /** + * Param for the maximum number of items (including delimiters used in the internal storage + * format) allowed in a projected database before local processing (default: `3200`). + * If a projected database exceeds this size, another iteration of distributed prefix growth + * is run. + * @group param + */ + @Since("2.4.0") + val maxLocalProjDBSize = new LongParam(this, "maxLocalProjDBSize", +"T
spark git commit: [SPARK-22884][ML] ML tests for StructuredStreaming: spark.ml.clustering
Repository: spark Updated Branches: refs/heads/master 439c69511 -> d4a0895c6 [SPARK-22884][ML] ML tests for StructuredStreaming: spark.ml.clustering ## What changes were proposed in this pull request? Converting clustering tests to also check code with structured streaming, using the ML testing infrastructure implemented in SPARK-22882. This PR is a new version of https://github.com/apache/spark/pull/20319 Author: Sandor Murakozi <smurak...@gmail.com> Author: Joseph K. Bradley <jos...@databricks.com> Closes #21358 from jkbradley/smurakozi-SPARK-22884. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4a0895c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4a0895c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4a0895c Branch: refs/heads/master Commit: d4a0895c628ca854895c3c35c46ed990af36ec61 Parents: 439c695 Author: Sandor Murakozi <smurak...@gmail.com> Authored: Thu May 17 16:33:06 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu May 17 16:33:06 2018 -0700 -- .../ml/clustering/BisectingKMeansSuite.scala| 41 ++-- .../ml/clustering/GaussianMixtureSuite.scala| 22 --- .../spark/ml/clustering/KMeansSuite.scala | 31 +++ .../apache/spark/ml/clustering/LDASuite.scala | 21 -- 4 files changed, 50 insertions(+), 65 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4a0895c/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index f3ff2af..81842af 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -19,17 +19,18 @@ package org.apache.spark.ml.clustering import scala.language.existentials -import 
org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.clustering.DistanceMeasure -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.Dataset -class BisectingKMeansSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + +class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { + + import testImplicits._ final val k = 5 @transient var dataset: Dataset[_] = _ @@ -68,10 +69,13 @@ class BisectingKMeansSuite // Verify fit does not fail on very sparse data val model = bkm.fit(sparseDataset) -val result = model.transform(sparseDataset) -val numClusters = result.select("prediction").distinct().collect().length -// Verify we hit the edge case -assert(numClusters < k && numClusters > 1) + +testTransformerByGlobalCheckFunc[Tuple1[Vector]](sparseDataset.toDF(), model, "prediction") { + rows => +val numClusters = rows.distinct.length +// Verify we hit the edge case +assert(numClusters < k && numClusters > 1) +} } test("setter/getter") { @@ -104,19 +108,16 @@ class BisectingKMeansSuite val bkm = new BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1) val model = bkm.fit(dataset) assert(model.clusterCenters.length === k) - -val transformed = model.transform(dataset) -val expectedColumns = Array("features", predictionColName) -expectedColumns.foreach { column => - assert(transformed.columns.contains(column)) -} -val clusters = - transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet -assert(clusters.size === k) -assert(clusters === Set(0, 1, 2, 3, 4)) 
assert(model.computeCost(dataset) < 0.1) assert(model.hasParent) +testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataset.toDF(), model, + "features", predictionColName) { rows => + val clusters = rows.map(_.getAs[Int](predictionColName)).toSet + assert(clusters.size === k) + assert(clusters === Set(0, 1, 2, 3, 4)) +} + // Check validity of model summary val numRows = dataset.count() assert(model.hasSum
spark git commit: [SPARK-24115] Have logging pass through instrumentation class.
Repository: spark Updated Branches: refs/heads/master 8a837bf4f -> a7a9b1837 [SPARK-24115] Have logging pass through instrumentation class. ## What changes were proposed in this pull request? Fixes to tuning instrumentation. ## How was this patch tested? Existing tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Bago Amirbekian <b...@databricks.com> Closes #21340 from MrBago/tunning-instrumentation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7a9b183 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7a9b183 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7a9b183 Branch: refs/heads/master Commit: a7a9b1837808b281f47643490abcf054f6de7b50 Parents: 8a837bf Author: Bago Amirbekian <b...@databricks.com> Authored: Thu May 17 11:13:16 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu May 17 11:13:16 2018 -0700 -- .../scala/org/apache/spark/ml/tuning/CrossValidator.scala | 10 +- .../org/apache/spark/ml/tuning/TrainValidationSplit.scala | 10 +- .../scala/org/apache/spark/ml/util/Instrumentation.scala | 7 +++ 3 files changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a7a9b183/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 5e916cc..f327f37 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -144,7 +144,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) => val trainingDataset = sparkSession.createDataFrame(training, schema).cache() val validationDataset = 
sparkSession.createDataFrame(validation, schema).cache() - logDebug(s"Train split $splitIndex with multiple sets of parameters.") + instr.logDebug(s"Train split $splitIndex with multiple sets of parameters.") // Fit models in a Future for training in parallel val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => @@ -155,7 +155,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) } // TODO: duplicate evaluator to take extra params from input val metric = eval.evaluate(model.transform(validationDataset, paramMap)) - logDebug(s"Got metric $metric for model trained with $paramMap.") + instr.logDebug(s"Got metric $metric for model trained with $paramMap.") metric } (executionContext) } @@ -169,12 +169,12 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) foldMetrics }.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all splits -logInfo(s"Average cross-validation metrics: ${metrics.toSeq}") +instr.logInfo(s"Average cross-validation metrics: ${metrics.toSeq}") val (bestMetric, bestIndex) = if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) else metrics.zipWithIndex.minBy(_._1) -logInfo(s"Best set of parameters:\n${epm(bestIndex)}") -logInfo(s"Best cross-validation metric: $bestMetric.") +instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}") +instr.logInfo(s"Best cross-validation metric: $bestMetric.") val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]] instr.logSuccess(bestModel) copyValues(new CrossValidatorModel(uid, bestModel, metrics) http://git-wip-us.apache.org/repos/asf/spark/blob/a7a9b183/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index 13369c4..14d6a69 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -143,7 +143,7 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St } else None // Fit models in a Future for training in parallel -
spark git commit: [SPARK-24155][ML] Instrumentation improvements for clustering
Repository: spark Updated Branches: refs/heads/master c26f67325 -> 075d678c8 [SPARK-24155][ML] Instrumentation improvements for clustering ## What changes were proposed in this pull request? changed the instrument for all of the clustering methods ## How was this patch tested? N/A Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Lu WANG <lu.w...@databricks.com> Closes #21218 from ludatabricks/SPARK-23686-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/075d678c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/075d678c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/075d678c Branch: refs/heads/master Commit: 075d678c8844614910b50abca07282bde31ef7e0 Parents: c26f673 Author: Lu WANG <lu.w...@databricks.com> Authored: Mon May 14 13:35:54 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon May 14 13:35:54 2018 -0700 -- .../org/apache/spark/ml/clustering/BisectingKMeans.scala | 7 +-- .../org/apache/spark/ml/clustering/GaussianMixture.scala | 5 - .../main/scala/org/apache/spark/ml/clustering/KMeans.scala| 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 438e53b..1ad4e09 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -261,8 +261,9 @@ class BisectingKMeans @Since("2.0.0") ( transformSchema(dataset.schema, logging = true) val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol) -val instr = Instrumentation.create(this, rdd) -instr.logParams(featuresCol, predictionCol, k, maxIter, 
seed, minDivisibleClusterSize) +val instr = Instrumentation.create(this, dataset) +instr.logParams(featuresCol, predictionCol, k, maxIter, seed, + minDivisibleClusterSize, distanceMeasure) val bkm = new MLlibBisectingKMeans() .setK($(k)) @@ -275,6 +276,8 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) +// TODO: need to extend logNamedValue to support Array +instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) instr.logSuccess(model) model } http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 88d618c..3091bb5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -352,7 +352,7 @@ class GaussianMixture @Since("2.0.0") ( s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of the covariance" + s" matrix is quadratic in the number of features.") -val instr = Instrumentation.create(this, instances) +val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, seed, tol) instr.logNumFeatures(numFeatures) @@ -425,6 +425,9 @@ class GaussianMixture @Since("2.0.0") ( val summary = new GaussianMixtureSummary(model.transform(dataset), $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) +instr.logNamedValue("logLikelihood", logLikelihood) +// TODO: need to extend logNamedValue to support Array +instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) instr.logSuccess(model) model } 
http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 97f246f..e7
spark git commit: [SPARK-24132][ML] Instrumentation improvement for classification
Repository: spark Updated Branches: refs/heads/master 9498e528d -> 7e7350285 [SPARK-24132][ML] Instrumentation improvement for classification ## What changes were proposed in this pull request? - Add OptionalInstrumentation as argument for getNumClasses in ml.classification.Classifier - Change the function call for getNumClasses in train() in ml.classification.DecisionTreeClassifier, ml.classification.RandomForestClassifier, and ml.classification.NaiveBayes - Modify the instrumentation creation in ml.classification.LinearSVC - Change the log call in ml.classification.OneVsRest and ml.classification.LinearSVC ## How was this patch tested? Manual. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Lu WANG <lu.w...@databricks.com> Closes #21204 from ludatabricks/SPARK-23686. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e735028 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e735028 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e735028 Branch: refs/heads/master Commit: 7e7350285dc22764f599671d874617c0eea093e5 Parents: 9498e52 Author: Lu WANG <lu.w...@databricks.com> Authored: Tue May 8 21:20:58 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue May 8 21:20:58 2018 -0700 -- .../spark/ml/classification/DecisionTreeClassifier.scala| 9 ++--- .../org/apache/spark/ml/classification/LinearSVC.scala | 9 ++--- .../org/apache/spark/ml/classification/NaiveBayes.scala | 3 ++- .../org/apache/spark/ml/classification/OneVsRest.scala | 4 ++-- .../spark/ml/classification/RandomForestClassifier.scala| 4 +++- 5 files changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7e735028/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 57797d1..c9786f1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -97,9 +97,11 @@ class DecisionTreeClassifier @Since("1.4.0") ( override def setSeed(value: Long): this.type = set(seed, value) override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = { +val instr = Instrumentation.create(this, dataset) val categoricalFeatures: Map[Int, Int] = MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol))) val numClasses: Int = getNumClasses(dataset) +instr.logNumClasses(numClasses) if (isDefined(thresholds)) { require($(thresholds).length == numClasses, this.getClass.getSimpleName + @@ -110,8 +112,8 @@ class DecisionTreeClassifier @Since("1.4.0") ( val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses) val strategy = getOldStrategy(categoricalFeatures, numClasses) -val instr = Instrumentation.create(this, oldDataset) -instr.logParams(params: _*) +instr.logParams(maxDepth, maxBins, minInstancesPerNode, minInfoGain, maxMemoryInMB, + cacheNodeIds, checkpointInterval, impurity, seed) val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all", seed = $(seed), instr = Some(instr), parentUID = Some(uid)) @@ -125,7 +127,8 @@ class DecisionTreeClassifier @Since("1.4.0") ( private[ml] def train(data: RDD[LabeledPoint], oldStrategy: OldStrategy): DecisionTreeClassificationModel = { val instr = Instrumentation.create(this, data) -instr.logParams(params: _*) +instr.logParams(maxDepth, maxBins, minInstancesPerNode, minInfoGain, maxMemoryInMB, + cacheNodeIds, checkpointInterval, impurity, seed) val trees = RandomForest.run(data, oldStrategy, numTrees = 1, featureSubsetStrategy = "all", seed = 0L, instr = Some(instr), parentUID = Some(uid)) 
http://git-wip-us.apache.org/repos/asf/spark/blob/7e735028/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 80c537e..38eb045 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -170,7 +170,7
spark git commit: [SPARK-23975][ML] Add support of array input for all clustering methods
Repository: spark Updated Branches: refs/heads/master 76ecd0950 -> 0d63eb888 [SPARK-23975][ML] Add support of array input for all clustering methods ## What changes were proposed in this pull request? Add support for all of the clustering methods ## How was this patch tested? unit tests added Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Lu WANG <lu.w...@databricks.com> Closes #21195 from ludatabricks/SPARK-23975-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d63eb88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d63eb88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d63eb88 Branch: refs/heads/master Commit: 0d63ebd17df747fb41d7ba254718bb7af3ae Parents: 76ecd09 Author: Lu WANG <lu.w...@databricks.com> Authored: Mon May 7 20:08:41 2018 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon May 7 20:08:41 2018 -0700 -- .../spark/ml/clustering/BisectingKMeans.scala | 21 - .../spark/ml/clustering/GaussianMixture.scala | 12 +++-- .../org/apache/spark/ml/clustering/KMeans.scala | 31 +++-- .../org/apache/spark/ml/clustering/LDA.scala| 9 ++-- .../org/apache/spark/ml/util/DatasetUtils.scala | 13 +- .../org/apache/spark/ml/util/SchemaUtils.scala | 16 ++- .../ml/clustering/BisectingKMeansSuite.scala| 21 - .../ml/clustering/GaussianMixtureSuite.scala| 21 - .../spark/ml/clustering/KMeansSuite.scala | 48 ++-- .../apache/spark/ml/clustering/LDASuite.scala | 20 +++- .../apache/spark/ml/util/MLTestingUtils.scala | 23 +- 11 files changed, 147 insertions(+), 88 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d63eb88/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index addc12ac..438e53b 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -22,17 +22,15 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.clustering.{BisectingKMeans => MLlibBisectingKMeans, BisectingKMeansModel => MLlibBisectingKMeansModel} -import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.linalg.VectorImplicits._ -import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{IntegerType, StructType} @@ -75,7 +73,7 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter * @return output schema */ protected def validateAndTransformSchema(schema: StructType): StructType = { -SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) +SchemaUtils.validateVectorCompatibleColumn(schema, getFeaturesCol) SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType) } } @@ -113,7 +111,8 @@ class BisectingKMeansModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val predictUDF = udf((vector: Vector) => predict(vector)) -dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol +dataset.withColumn($(predictionCol), + predictUDF(DatasetUtils.columnToVector(dataset, getFeaturesCol))) } @Since("2.0.0") @@ -132,9 +131,9 @@ class BisectingKMeansModel private[ml] ( */ @Since("2.0.0") def computeCost(dataset: 
Dataset[_]): Double = { -SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) -val data = dataset.select(col($(featuresCol))).rdd.map { case Row(point: Vector) => point } -parentModel.computeCost(data.map(OldVectors.fromML)) +SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol) +val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol) +parentModel.computeCost(data) } @Since("2.0.0") @@ -260,9 +259,7 @@ class BisectingKMeans @Sin
spark git commit: [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.
Repository: spark Updated Branches: refs/heads/branch-2.3 29ed71873 -> f8f522c01 [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples. ## What changes were proposed in this pull request? Added documentation for new transformer. Author: Bago Amirbekian <b...@databricks.com> Closes #20285 from MrBago/sizeHintDocs. (cherry picked from commit 05839d164836e544af79c13de25802552eadd636) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8f522c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8f522c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8f522c0 Branch: refs/heads/branch-2.3 Commit: f8f522c01025e78eca1724c909c749374f855039 Parents: 29ed718 Author: Bago Amirbekian <b...@databricks.com> Authored: Tue Jan 23 14:11:23 2018 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Jan 23 14:11:48 2018 -0800 -- docs/ml-features.md | 51 + .../examples/ml/JavaVectorSizeHintExample.java | 79 .../main/python/ml/vector_size_hint_example.py | 57 ++ .../examples/ml/VectorSizeHintExample.scala | 63 4 files changed, 250 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f8f522c0/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 466a8fb..3370eb3 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1291,6 +1291,57 @@ for more details on the API. +## VectorSizeHint + +It can sometimes be useful to explicitly specify the size of the vectors for a column of +`VectorType`. For example, `VectorAssembler` uses size information from its input columns to +produce size information and metadata for its output column. While in some cases this information +can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are +not available until the stream is started. 
`VectorSizeHint` allows a user to explicitly specify the +vector size for a column so that `VectorAssembler`, or other transformers that might +need to know vector size, can use that column as an input. + +To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this +transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying +the vector size. Downstream operations on the resulting dataframe can get this size using the +meatadata. + +`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its +behaviour when the vector column contains nulls or vectors of the wrong size. By default +`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can +also be set to "skip", indicating that rows containing invalid values should be filtered out from +the resulting dataframe, or "optimistic", indicating that the column should not be checked for +invalid values and all rows should be kept. Note that the use of "optimistic" can cause the +resulting dataframe to be in an inconsistent state, me:aning the metadata for the column +`VectorSizeHint` was applied to does not match the contents of that column. Users should take care +to avoid this kind of inconsistent state. + + + + +Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint) +for more details on the API. + +{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %} + + + + +Refer to the [VectorSizeHint Java docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html) +for more details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %} + + + + +Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint) +for more details on the API. 
+ +{% include_example python/ml/vector_size_hint_example.py %} + + + ## QuantileDiscretizer `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned http://git-wip-us.apache.org/repos/asf/spark/blob/f8f522c0/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java new file mode 100644 index 000..d649a2c --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java @@ -0,0 +1,79 @@
spark git commit: [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples.
Repository: spark Updated Branches: refs/heads/master dc4761fd8 -> 05839d164 [SPARK-22735][ML][DOC] Added VectorSizeHint docs and examples. ## What changes were proposed in this pull request? Added documentation for new transformer. Author: Bago Amirbekian <b...@databricks.com> Closes #20285 from MrBago/sizeHintDocs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05839d16 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05839d16 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05839d16 Branch: refs/heads/master Commit: 05839d164836e544af79c13de25802552eadd636 Parents: dc4761f Author: Bago Amirbekian <b...@databricks.com> Authored: Tue Jan 23 14:11:23 2018 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Jan 23 14:11:23 2018 -0800 -- docs/ml-features.md | 51 + .../examples/ml/JavaVectorSizeHintExample.java | 79 .../main/python/ml/vector_size_hint_example.py | 57 ++ .../examples/ml/VectorSizeHintExample.scala | 63 4 files changed, 250 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/05839d16/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 466a8fb..3370eb3 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1291,6 +1291,57 @@ for more details on the API. +## VectorSizeHint + +It can sometimes be useful to explicitly specify the size of the vectors for a column of +`VectorType`. For example, `VectorAssembler` uses size information from its input columns to +produce size information and metadata for its output column. While in some cases this information +can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are +not available until the stream is started. `VectorSizeHint` allows a user to explicitly specify the +vector size for a column so that `VectorAssembler`, or other transformers that might +need to know vector size, can use that column as an input. 
+ +To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this +transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying +the vector size. Downstream operations on the resulting dataframe can get this size using the +meatadata. + +`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its +behaviour when the vector column contains nulls or vectors of the wrong size. By default +`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can +also be set to "skip", indicating that rows containing invalid values should be filtered out from +the resulting dataframe, or "optimistic", indicating that the column should not be checked for +invalid values and all rows should be kept. Note that the use of "optimistic" can cause the +resulting dataframe to be in an inconsistent state, me:aning the metadata for the column +`VectorSizeHint` was applied to does not match the contents of that column. Users should take care +to avoid this kind of inconsistent state. + + + + +Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint) +for more details on the API. + +{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %} + + + + +Refer to the [VectorSizeHint Java docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html) +for more details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %} + + + + +Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint) +for more details on the API. 
+ +{% include_example python/ml/vector_size_hint_example.py %} + + + ## QuantileDiscretizer `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned http://git-wip-us.apache.org/repos/asf/spark/blob/05839d16/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java new file mode 100644 index 000..d649a2c --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distri
spark git commit: [SPARK-20088] Do not create new SparkContext in SparkR createSparkContext
Repository: spark Updated Branches: refs/heads/master 890493458 -> 0588dc7c0 [SPARK-20088] Do not create new SparkContext in SparkR createSparkContext ## What changes were proposed in this pull request? Instead of creating new `JavaSparkContext` we use `SparkContext.getOrCreate`. ## How was this patch tested? Existing tests Author: Hossein <hoss...@databricks.com> Closes #17423 from falaki/SPARK-20088. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0588dc7c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0588dc7c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0588dc7c Branch: refs/heads/master Commit: 0588dc7c0a9f3180dddae0dc202a6d41eb43464f Parents: 8904934 Author: Hossein <hoss...@databricks.com> Authored: Mon Mar 27 08:53:45 2017 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Mar 27 08:53:45 2017 -0700 -- core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0588dc7c/core/src/main/scala/org/apache/spark/api/r/RRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 72ae034..295355c 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -136,7 +136,7 @@ private[r] object RRDD { .mkString(File.separator)) } -val jsc = new JavaSparkContext(sparkConf) +val jsc = new JavaSparkContext(SparkContext.getOrCreate(sparkConf)) jars.foreach { jar => jsc.addJar(jar) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes
Repository: spark Updated Branches: refs/heads/branch-2.1 25b97589e -> 5693ac8e5 [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes ## What changes were proposed in this pull request? Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content minimal since users can type `?spark.randomForest` to see the full doc. cc: jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #16264 from mengxr/SPARK-18793. (cherry picked from commit 594b14f1ebd0b3db9f630e504be92228f11b4d9f) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5693ac8e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5693ac8e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5693ac8e Branch: refs/heads/branch-2.1 Commit: 5693ac8e5bd5df8aca1b0d6df0be072a45abcfbd Parents: 25b9758 Author: Xiangrui Meng <m...@databricks.com> Authored: Tue Dec 13 16:59:09 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Dec 13 16:59:15 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++ 1 file changed, 32 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5693ac8e/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 625b759..334daa5 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -449,6 +449,10 @@ SparkR supports the following machine learning models and algorithms. 
* Generalized Linear Model (GLM) +* Random Forest + +* Gradient-Boosted Trees (GBT) + * Naive Bayes Model * $k$-means Clustering @@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` + Random Forest + +`spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +In the following example, we use the `longley` dataset to train a random forest and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +summary(rfModel) +predictions <- predict(rfModel, df) +``` + + Gradient-Boosted Trees + +`spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +summary(gbtModel) +predictions <- predict(gbtModel, df) +``` + Naive Bayes Model Naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. 
- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes
Repository: spark Updated Branches: refs/heads/master c68fb426d -> 594b14f1e [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes ## What changes were proposed in this pull request? Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content minimal since users can type `?spark.randomForest` to see the full doc. cc: jkbradley Author: Xiangrui Meng <m...@databricks.com> Closes #16264 from mengxr/SPARK-18793. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/594b14f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/594b14f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/594b14f1 Branch: refs/heads/master Commit: 594b14f1ebd0b3db9f630e504be92228f11b4d9f Parents: c68fb42 Author: Xiangrui Meng <m...@databricks.com> Authored: Tue Dec 13 16:59:09 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Dec 13 16:59:09 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++ 1 file changed, 32 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/594b14f1/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 625b759..334daa5 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -449,6 +449,10 @@ SparkR supports the following machine learning models and algorithms. * Generalized Linear Model (GLM) +* Random Forest + +* Gradient-Boosted Trees (GBT) + * Naive Bayes Model * $k$-means Clustering @@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` + Random Forest + +`spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. 
+Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +In the following example, we use the `longley` dataset to train a random forest and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +summary(rfModel) +predictions <- predict(rfModel, df) +``` + + Gradient-Boosted Trees + +`spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +summary(gbtModel) +predictions <- predict(gbtModel, df) +``` + Naive Bayes Model Naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes
Repository: spark Updated Branches: refs/heads/branch-2.1 9dc5fa5f7 -> 9f0e3be62 [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes ## What changes were proposed in this pull request? spark.logit is added in 2.1. We need to update spark-vignettes to reflect the changes. This is part of SparkR QA work. ## How was this patch tested? Manual build html. Please see attached image for the result. ![test](https://cloud.githubusercontent.com/assets/5033592/21032237/01b565fe-bd5d-11e6-8b59-4de4b6ef611d.jpeg) Author: wm...@hotmail.com <wm...@hotmail.com> Closes #16222 from wangmiao1981/veg. (cherry picked from commit 2aa16d03db79a642cbe21f387441c34fc51a8236) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f0e3be6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f0e3be6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f0e3be6 Branch: refs/heads/branch-2.1 Commit: 9f0e3be622c77f7a677ce2c930b6dba2f652df00 Parents: 9dc5fa5 Author: wm...@hotmail.com <wm...@hotmail.com> Authored: Mon Dec 12 22:41:11 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 12 22:41:20 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 45 ++- 1 file changed, 38 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f0e3be6/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index a36f8fc..625b759 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -565,7 +565,7 @@ head(aftPredictions) Gaussian Mixture Model -(Coming in 2.1.0) +(Added in 2.1.0) `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. 
[Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. @@ -584,7 +584,7 @@ head(select(gmmFitted, "V1", "V2", "prediction")) Latent Dirichlet Allocation -(Coming in 2.1.0) +(Added in 2.1.0) `spark.lda` fits a [Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on a `SparkDataFrame`. It is often used in topic modeling in which topics are inferred from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: @@ -657,7 +657,7 @@ perplexity Multilayer Perceptron -(Coming in 2.1.0) +(Added in 2.1.0) Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: $$ @@ -694,7 +694,7 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu Collaborative Filtering -(Coming in 2.1.0) +(Added in 2.1.0) `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). @@ -725,7 +725,7 @@ head(predicted) Isotonic Regression Model -(Coming in 2.1.0) +(Added in 2.1.0) `spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. 
Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize $$ @@ -768,8 +768,39 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) head(predict(isoregModel, newDF)) ``` - What's More? -We also expect Decision Tree, Random Forest, Kolmogorov-Smirnov Test coming in the next version 2.1.0. +### Logistic Regression Model + +(Added in 2.1.0) + +[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) is a widely-used model when the response is categorical. It can be seen as
spark git commit: [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes
Repository: spark Updated Branches: refs/heads/master 417e45c58 -> 2aa16d03d [SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes ## What changes were proposed in this pull request? spark.logit is added in 2.1. We need to update spark-vignettes to reflect the changes. This is part of SparkR QA work. ## How was this patch tested? Manual build html. Please see attached image for the result. ![test](https://cloud.githubusercontent.com/assets/5033592/21032237/01b565fe-bd5d-11e6-8b59-4de4b6ef611d.jpeg) Author: wm...@hotmail.com <wm...@hotmail.com> Closes #16222 from wangmiao1981/veg. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2aa16d03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2aa16d03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2aa16d03 Branch: refs/heads/master Commit: 2aa16d03db79a642cbe21f387441c34fc51a8236 Parents: 417e45c Author: wm...@hotmail.com <wm...@hotmail.com> Authored: Mon Dec 12 22:41:11 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Dec 12 22:41:11 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 45 ++- 1 file changed, 38 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2aa16d03/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index a36f8fc..625b759 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -565,7 +565,7 @@ head(aftPredictions) Gaussian Mixture Model -(Coming in 2.1.0) +(Added in 2.1.0) `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. 
@@ -584,7 +584,7 @@ head(select(gmmFitted, "V1", "V2", "prediction")) Latent Dirichlet Allocation -(Coming in 2.1.0) +(Added in 2.1.0) `spark.lda` fits a [Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on a `SparkDataFrame`. It is often used in topic modeling in which topics are inferred from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: @@ -657,7 +657,7 @@ perplexity Multilayer Perceptron -(Coming in 2.1.0) +(Added in 2.1.0) Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: $$ @@ -694,7 +694,7 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu Collaborative Filtering -(Coming in 2.1.0) +(Added in 2.1.0) `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). @@ -725,7 +725,7 @@ head(predicted) Isotonic Regression Model -(Coming in 2.1.0) +(Added in 2.1.0) `spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. 
Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize $$ @@ -768,8 +768,39 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) head(predict(isoregModel, newDF)) ``` - What's More? -We also expect Decision Tree, Random Forest, Kolmogorov-Smirnov Test coming in the next version 2.1.0. +### Logistic Regression Model + +(Added in 2.1.0) + +[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) is a widely-used model when the response is categorical. It can be seen as a special case of the [Generalized Linear Predictive Model](https://en.wikipedia.org/wiki/Generalized_linear_model). +We provide `
spark git commit: [SPARK-18812][MLLIB] explain "Spark ML"
Repository: spark Updated Branches: refs/heads/branch-2.1 562507ef0 -> e45345d91 [SPARK-18812][MLLIB] explain "Spark ML" ## What changes were proposed in this pull request? There has been some confusion around "Spark ML" vs. "MLlib". This PR adds some FAQ-like entries to the MLlib user guide to explain "Spark ML" and reduce the confusion. I checked the [Spark FAQ page](http://spark.apache.org/faq.html), which seems too high-level for the content here. So I added it to the MLlib user guide instead. cc: mateiz Author: Xiangrui Meng <m...@databricks.com> Closes #16241 from mengxr/SPARK-18812. (cherry picked from commit d2493a203e852adf63dde4e1fc993e8d11efec3d) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e45345d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e45345d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e45345d9 Branch: refs/heads/branch-2.1 Commit: e45345d91e333e0b5f9219e857affeda461863c6 Parents: 562507e Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Dec 9 17:34:52 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Dec 9 17:34:58 2016 -0800 -- docs/ml-guide.md | 12 1 file changed, 12 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e45345d9/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index ddf81be..9717619 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -35,6 +35,18 @@ The primary Machine Learning API for Spark is now the [DataFrame](sql-programmin * The DataFrame-based API for MLlib provides a uniform API across ML algorithms and across multiple languages. * DataFrames facilitate practical ML Pipelines, particularly feature transformations. See the [Pipelines guide](ml-pipeline.html) for details. +*What is "Spark ML"?* + +* "Spark ML" is not an official name but occasionally used to refer to the MLlib DataFrame-based API. 
+ This is majorly due to the `org.apache.spark.ml` Scala package name used by the DataFrame-based API, + and the "Spark ML Pipelines" term we used initially to emphasize the pipeline concept. + +*Is MLlib deprecated?* + +* No. MLlib includes both the RDD-based API and the DataFrame-based API. + The RDD-based API is now in maintenance mode. + But neither API is deprecated, nor MLlib as a whole. + # Dependencies MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18812][MLLIB] explain "Spark ML"
Repository: spark Updated Branches: refs/heads/master cf33a8628 -> d2493a203 [SPARK-18812][MLLIB] explain "Spark ML" ## What changes were proposed in this pull request? There has been some confusion around "Spark ML" vs. "MLlib". This PR adds some FAQ-like entries to the MLlib user guide to explain "Spark ML" and reduce the confusion. I checked the [Spark FAQ page](http://spark.apache.org/faq.html), which seems too high-level for the content here. So I added it to the MLlib user guide instead. cc: mateiz Author: Xiangrui Meng <m...@databricks.com> Closes #16241 from mengxr/SPARK-18812. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2493a20 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2493a20 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2493a20 Branch: refs/heads/master Commit: d2493a203e852adf63dde4e1fc993e8d11efec3d Parents: cf33a86 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Dec 9 17:34:52 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Dec 9 17:34:52 2016 -0800 -- docs/ml-guide.md | 12 1 file changed, 12 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d2493a20/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index ddf81be..9717619 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -35,6 +35,18 @@ The primary Machine Learning API for Spark is now the [DataFrame](sql-programmin * The DataFrame-based API for MLlib provides a uniform API across ML algorithms and across multiple languages. * DataFrames facilitate practical ML Pipelines, particularly feature transformations. See the [Pipelines guide](ml-pipeline.html) for details. +*What is "Spark ML"?* + +* "Spark ML" is not an official name but occasionally used to refer to the MLlib DataFrame-based API. 
+ This is majorly due to the `org.apache.spark.ml` Scala package name used by the DataFrame-based API, + and the "Spark ML Pipelines" term we used initially to emphasize the pipeline concept. + +*Is MLlib deprecated?* + +* No. MLlib includes both the RDD-based API and the DataFrame-based API. + The RDD-based API is now in maintenance mode. + But neither API is deprecated, nor MLlib as a whole. + # Dependencies MLlib uses the linear algebra package [Breeze](http://www.scalanlp.org/), which depends on - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend
Repository: spark Updated Branches: refs/heads/branch-2.0 44df6d2ce -> 65b4b0561 [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend ## What changes were proposed in this pull request? * This PR changes `JVMObjectTracker` from `object` to `class` and associates an instance with each `RBackend`, so that we can manage the lifecycle of JVM objects when there are multiple `RBackend` sessions. `RBackend.close` will clear the object tracker explicitly. * I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, which could be wrong. * Small refactor of `SerDe.sqlSerDe` to increase readability. ## How was this patch tested? * Added unit tests for `JVMObjectTracker`. * Wait for Jenkins to run full tests. Author: Xiangrui Meng <m...@databricks.com> Closes #16154 from mengxr/SPARK-17822. (cherry picked from commit fd48d80a6145ea94f03e7fc6e4d724a0fbccac58) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65b4b056 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65b4b056 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65b4b056 Branch: refs/heads/branch-2.0 Commit: 65b4b05616bf8f5cf70a618cc15d379634e9b42d Parents: 44df6d2 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Dec 9 07:51:46 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Dec 9 07:55:58 2016 -0800 -- .../apache/spark/api/r/JVMObjectTracker.scala | 87 ++ .../scala/org/apache/spark/api/r/RBackend.scala | 4 + .../apache/spark/api/r/RBackendHandler.scala| 54 ++-- .../scala/org/apache/spark/api/r/RRunner.scala | 2 +- .../scala/org/apache/spark/api/r/SerDe.scala| 92 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 .../org/apache/spark/api/r/RBackendSuite.scala | 31 +++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 12 +-- 8 files changed, 264 insertions(+), 91 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/65b4b056/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala new file mode 100644 index 000..3432700 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.util.concurrent.atomic.AtomicInteger +import java.util.concurrent.ConcurrentHashMap + +/** JVM object ID wrapper */ +private[r] case class JVMObjectId(id: String) { + require(id != null, "Object ID cannot be null.") +} + +/** + * Counter that tracks JVM objects returned to R. + * This is useful for referencing these objects in RPC calls. + */ +private[r] class JVMObjectTracker { + + private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]() + private[this] val objCounter = new AtomicInteger() + + /** + * Returns the JVM object associated with the input key or None if not found. 
+ */ + final def get(id: JVMObjectId): Option[Object] = this.synchronized { +if (objMap.containsKey(id)) { + Some(objMap.get(id)) +} else { + None +} + } + + /** + * Returns the JVM object associated with the input key or throws an exception if not found. + */ + @throws[NoSuchElementException]("if key does not exist.") + final def apply(id: JVMObjectId): Object = { +get(id).getOrElse( + throw new NoSuchElementException(s"$id does not exist.") +) + } + + /** + * Adds a JVM object to track and returns assigned ID, which is unique within this tracker. + */ + final def addAndGetId(obj: Object): JVMObjectId = { +val id = JVMObjectId(objCounter.getAndIncrement().toString) +obj
spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend
Repository: spark Updated Branches: refs/heads/branch-2.1 b226f10e3 -> 0c6415aec [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend ## What changes were proposed in this pull request? * This PR changes `JVMObjectTracker` from `object` to `class` and associates an instance with each `RBackend`, so that we can manage the lifecycle of JVM objects when there are multiple `RBackend` sessions. `RBackend.close` will clear the object tracker explicitly. * I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, which could be wrong. * Small refactor of `SerDe.sqlSerDe` to increase readability. ## How was this patch tested? * Added unit tests for `JVMObjectTracker`. * Wait for Jenkins to run full tests. Author: Xiangrui Meng <m...@databricks.com> Closes #16154 from mengxr/SPARK-17822. (cherry picked from commit fd48d80a6145ea94f03e7fc6e4d724a0fbccac58) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c6415ae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c6415ae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c6415ae Branch: refs/heads/branch-2.1 Commit: 0c6415aeca7a5c2fc5462c483c60d770f0236efe Parents: b226f10 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Dec 9 07:51:46 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Dec 9 07:51:58 2016 -0800 -- .../apache/spark/api/r/JVMObjectTracker.scala | 87 ++ .../scala/org/apache/spark/api/r/RBackend.scala | 6 +- .../apache/spark/api/r/RBackendHandler.scala| 54 ++-- .../scala/org/apache/spark/api/r/RRunner.scala | 2 +- .../scala/org/apache/spark/api/r/SerDe.scala| 92 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 .../org/apache/spark/api/r/RBackendSuite.scala | 31 +++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 12 +-- 8 files changed, 265 insertions(+), 92 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/0c6415ae/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala new file mode 100644 index 000..3432700 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.util.concurrent.atomic.AtomicInteger +import java.util.concurrent.ConcurrentHashMap + +/** JVM object ID wrapper */ +private[r] case class JVMObjectId(id: String) { + require(id != null, "Object ID cannot be null.") +} + +/** + * Counter that tracks JVM objects returned to R. + * This is useful for referencing these objects in RPC calls. + */ +private[r] class JVMObjectTracker { + + private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]() + private[this] val objCounter = new AtomicInteger() + + /** + * Returns the JVM object associated with the input key or None if not found. 
+ */ + final def get(id: JVMObjectId): Option[Object] = this.synchronized { +if (objMap.containsKey(id)) { + Some(objMap.get(id)) +} else { + None +} + } + + /** + * Returns the JVM object associated with the input key or throws an exception if not found. + */ + @throws[NoSuchElementException]("if key does not exist.") + final def apply(id: JVMObjectId): Object = { +get(id).getOrElse( + throw new NoSuchElementException(s"$id does not exist.") +) + } + + /** + * Adds a JVM object to track and returns assigned ID, which is unique within this tracker. + */ + final def addAndGetId(obj: Object): JVMObjectId = { +val id = JVMObjectId(objCounter.getAndIncrement().toString) +obj
spark git commit: [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend
Repository: spark Updated Branches: refs/heads/master b162cc0c2 -> fd48d80a6 [SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend ## What changes were proposed in this pull request? * This PR changes `JVMObjectTracker` from `object` to `class` and associates an instance with each `RBackend`, so that we can manage the lifecycle of JVM objects when there are multiple `RBackend` sessions. `RBackend.close` will clear the object tracker explicitly. * I assume that `SQLUtils` and `RRunner` do not need to track JVM instances, which could be wrong. * Small refactor of `SerDe.sqlSerDe` to increase readability. ## How was this patch tested? * Added unit tests for `JVMObjectTracker`. * Wait for Jenkins to run full tests. Author: Xiangrui Meng <m...@databricks.com> Closes #16154 from mengxr/SPARK-17822. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd48d80a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd48d80a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd48d80a Branch: refs/heads/master Commit: fd48d80a6145ea94f03e7fc6e4d724a0fbccac58 Parents: b162cc0 Author: Xiangrui Meng <m...@databricks.com> Authored: Fri Dec 9 07:51:46 2016 -0800 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Dec 9 07:51:46 2016 -0800 -- .../apache/spark/api/r/JVMObjectTracker.scala | 87 ++ .../scala/org/apache/spark/api/r/RBackend.scala | 6 +- .../apache/spark/api/r/RBackendHandler.scala| 54 ++-- .../scala/org/apache/spark/api/r/RRunner.scala | 2 +- .../scala/org/apache/spark/api/r/SerDe.scala| 92 .../spark/api/r/JVMObjectTrackerSuite.scala | 73 .../org/apache/spark/api/r/RBackendSuite.scala | 31 +++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 12 +-- 8 files changed, 265 insertions(+), 92 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fd48d80a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala -- diff --git 
a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala new file mode 100644 index 000..3432700 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.util.concurrent.atomic.AtomicInteger +import java.util.concurrent.ConcurrentHashMap + +/** JVM object ID wrapper */ +private[r] case class JVMObjectId(id: String) { + require(id != null, "Object ID cannot be null.") +} + +/** + * Counter that tracks JVM objects returned to R. + * This is useful for referencing these objects in RPC calls. + */ +private[r] class JVMObjectTracker { + + private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]() + private[this] val objCounter = new AtomicInteger() + + /** + * Returns the JVM object associated with the input key or None if not found. + */ + final def get(id: JVMObjectId): Option[Object] = this.synchronized { +if (objMap.containsKey(id)) { + Some(objMap.get(id)) +} else { + None +} + } + + /** + * Returns the JVM object associated with the input key or throws an exception if not found. 
+ */ + @throws[NoSuchElementException]("if key does not exist.") + final def apply(id: JVMObjectId): Object = { +get(id).getOrElse( + throw new NoSuchElementException(s"$id does not exist.") +) + } + + /** + * Adds a JVM object to track and returns assigned ID, which is unique within this tracker. + */ + final def addAndGetId(obj: Object): JVMObjectId = { +val id = JVMObjectId(objCounter.getAndIncrement().toString) +objMap.put(id, obj) +id + } + + /** + * Removes and returns a JVM object with the specific ID from the tracker, or None if not found. + */ +
spark git commit: [SPARKR][MINOR] Fix LDA doc
Repository: spark Updated Branches: refs/heads/master 08913ce00 -> 6a0fda2c0 [SPARKR][MINOR] Fix LDA doc ## What changes were proposed in this pull request? This PR tries to fix the name of the `SparkDataFrame` used in the example. Also, it gives a reference url of an example data file so that users can play with. ## How was this patch tested? Manual test. Author: Junyang Qian <junya...@databricks.com> Closes #14853 from junyangq/SPARKR-FixLDADoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a0fda2c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a0fda2c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a0fda2c Branch: refs/heads/master Commit: 6a0fda2c0590b455e8713da79cd5f2413e5d0f28 Parents: 08913ce Author: Junyang Qian <junya...@databricks.com> Authored: Mon Aug 29 10:23:10 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Aug 29 10:23:10 2016 -0700 -- R/pkg/R/mllib.R | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a0fda2c/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 6808aae..64d19fa 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -994,18 +994,22 @@ setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula #' @export #' @examples #' \dontrun{ -#' text <- read.df("path/to/data", source = "libsvm") +#' # nolint start +#' # An example "path/to/file" can be +#' # paste0(Sys.getenv("SPARK_HOME"), "/data/mllib/sample_lda_libsvm_data.txt") +#' # nolint end +#' text <- read.df("path/to/file", source = "libsvm") #' model <- spark.lda(data = text, optimizer = "em") #' #' # get a summary of the model #' summary(model) #' #' # compute posterior probabilities -#' posterior <- spark.posterior(model, df) +#' posterior <- spark.posterior(model, text) #' showDF(posterior) #' #' # compute perplexity -#' perplexity <- spark.perplexity(model, df) +#' 
perplexity <- spark.perplexity(model, text) #' #' # save and load the model #' path <- "path/to/model" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Update R DESCRIPTION file
Repository: spark Updated Branches: refs/heads/branch-2.0 eaea1c86b -> d16f9a0b7 [SPARKR][MINOR] Update R DESCRIPTION file ## What changes were proposed in this pull request? Update DESCRIPTION ## How was this patch tested? Run install and CRAN tests Author: Felix Cheung <felixcheun...@hotmail.com> Closes #14764 from felixcheung/rpackagedescription. (cherry picked from commit d2b3d3e63e1a9217de6ef507c350308017664a62) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d16f9a0b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d16f9a0b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d16f9a0b Branch: refs/heads/branch-2.0 Commit: d16f9a0b7c464728d7b11899740908e23820a797 Parents: eaea1c8 Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Mon Aug 22 20:15:03 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Aug 22 20:15:14 2016 -0700 -- R/pkg/DESCRIPTION | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d16f9a0b/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index d81f1a3..e5afed2 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -3,10 +3,15 @@ Type: Package Title: R Frontend for Apache Spark Version: 2.0.0 Date: 2016-07-07 -Author: The Apache Software Foundation -Maintainer: Shivaram Venkataraman <shiva...@cs.berkeley.edu> -Xiangrui Meng <m...@databricks.com> -Felix Cheung <felixcheun...@hotmail.com> +Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), + email = "shiva...@cs.berkeley.edu"), + person("Xiangrui", "Meng", role = "aut", +email = "m...@databricks.com"), + person("Felix", "Cheung", role = "aut", +email = "felixche...@apache.org"), + person(family = "The Apache Software Foundation", role = c("aut", "cph"))) +URL: http://www.apache.org/ http://spark.apache.org/ +BugReports: 
https://issues.apache.org/jira/secure/CreateIssueDetails!init.jspa?pid=12315420=12325400=4 Depends: R (>= 3.0), methods - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore
Repository: spark Updated Branches: refs/heads/branch-2.0 faff9297d -> 26d5a8b0d [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore ## What changes were proposed in this pull request? Ignore temp files generated by `check-cran.sh`. Author: Xiangrui Meng <m...@databricks.com> Closes #14740 from mengxr/R-gitignore. (cherry picked from commit ab7143463daf2056736c85e3a943c826b5992623) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26d5a8b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26d5a8b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26d5a8b0 Branch: refs/heads/branch-2.0 Commit: 26d5a8b0dab10310ec76b91465b3b4ff465e9746 Parents: faff929 Author: Xiangrui Meng <m...@databricks.com> Authored: Sun Aug 21 10:31:25 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Aug 21 10:31:32 2016 -0700 -- R/.gitignore | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26d5a8b0/R/.gitignore -- diff --git a/R/.gitignore b/R/.gitignore index 9a5889b..c98504a 100644 --- a/R/.gitignore +++ b/R/.gitignore @@ -4,3 +4,5 @@ lib pkg/man pkg/html +SparkR.Rcheck/ +SparkR_*.tar.gz - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore
Repository: spark Updated Branches: refs/heads/master e328f577e -> ab7143463 [MINOR][R] add SparkR.Rcheck/ and SparkR_*.tar.gz to R/.gitignore ## What changes were proposed in this pull request? Ignore temp files generated by `check-cran.sh`. Author: Xiangrui Meng <m...@databricks.com> Closes #14740 from mengxr/R-gitignore. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab714346 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab714346 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab714346 Branch: refs/heads/master Commit: ab7143463daf2056736c85e3a943c826b5992623 Parents: e328f57 Author: Xiangrui Meng <m...@databricks.com> Authored: Sun Aug 21 10:31:25 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sun Aug 21 10:31:25 2016 -0700 -- R/.gitignore | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab714346/R/.gitignore -- diff --git a/R/.gitignore b/R/.gitignore index 9a5889b..c98504a 100644 --- a/R/.gitignore +++ b/R/.gitignore @@ -4,3 +4,5 @@ lib pkg/man pkg/html +SparkR.Rcheck/ +SparkR_*.tar.gz - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16443][SPARKR] Alternating Least Squares (ALS) wrapper
Repository: spark Updated Branches: refs/heads/master cf0cce903 -> acac7a508 [SPARK-16443][SPARKR] Alternating Least Squares (ALS) wrapper ## What changes were proposed in this pull request? Add Alternating Least Squares wrapper in SparkR. Unit tests have been updated. ## How was this patch tested? SparkR unit tests. (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![screen shot 2016-07-27 at 3 50 31 pm](https://cloud.githubusercontent.com/assets/15318264/17195347/f7a6352a-5411-11e6-8e21-61a48070192a.png) ![screen shot 2016-07-27 at 3 50 46 pm](https://cloud.githubusercontent.com/assets/15318264/17195348/f7a7d452-5411-11e6-845f-6d292283bc28.png) Author: Junyang Qian <junya...@databricks.com> Closes #14384 from junyangq/SPARK-16443. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acac7a50 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acac7a50 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acac7a50 Branch: refs/heads/master Commit: acac7a508a29d0f75d86ee2e4ca83ebf01a36cf8 Parents: cf0cce9 Author: Junyang Qian <junya...@databricks.com> Authored: Fri Aug 19 14:24:09 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Aug 19 14:24:09 2016 -0700 -- R/pkg/NAMESPACE | 3 +- R/pkg/R/generics.R | 4 + R/pkg/R/mllib.R | 159 ++- R/pkg/inst/tests/testthat/test_mllib.R | 40 + .../org/apache/spark/ml/r/ALSWrapper.scala | 119 ++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 6 files changed, 322 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 4404cff..e1b87b2 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -29,7 +29,8 @@ exportMethods("glm", "spark.posterior", "spark.perplexity", "spark.isoreg", - "spark.gaussianMixture") + "spark.gaussianMixture", + "spark.als") # Job group lifecycle management methods 
export("setJobGroup", http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index fe04bcf..693aa31 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1332,3 +1332,7 @@ setGeneric("spark.gaussianMixture", #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) + +#' @rdname spark.als +#' @export +setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") }) http://git-wip-us.apache.org/repos/asf/spark/blob/acac7a50/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b952741..36f38fc 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -74,6 +74,13 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @note GaussianMixtureModel since 2.1.0 setClass("GaussianMixtureModel", representation(jobj = "jobj")) +#' S4 class that represents an ALSModel +#' +#' @param jobj a Java object reference to the backing Scala ALSWrapper +#' @export +#' @note ALSModel since 2.1.0 +setClass("ALSModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. 
For more information, see the specific @@ -82,8 +89,8 @@ setClass("GaussianMixtureModel", representation(jobj = "jobj")) #' @name write.ml #' @export #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} -#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}, \link{spark.lda} -#' @seealso \link{spark.isoreg} +#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.lda}, \link{spark.naiveBayes} +#' @seealso \link{spark.survreg}, \link{spark.isoreg} #' @seealso \link{read.ml} NULL @@ -95,10 +102,11 @@ NULL #' @name predict #' @export #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} -#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} #' @seealso \link{spark.isoreg} NULL + #
spark git commit: [SPARK-16446][SPARKR][ML] Gaussian Mixture Model wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master e3fec51fa -> 4d92af310 [SPARK-16446][SPARKR][ML] Gaussian Mixture Model wrapper in SparkR ## What changes were proposed in this pull request? Gaussian Mixture Model wrapper in SparkR, similarly to R's ```mvnormalmixEM```. ## How was this patch tested? Unit test. Author: Yanbo Liang <yblia...@gmail.com> Closes #14392 from yanboliang/spark-16446. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4d92af31 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4d92af31 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4d92af31 Branch: refs/heads/master Commit: 4d92af310ad29ade039e4130f91f2a3d9180deef Parents: e3fec51 Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Aug 17 11:18:33 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Aug 17 11:18:33 2016 -0700 -- R/pkg/NAMESPACE | 3 +- R/pkg/R/generics.R | 7 + R/pkg/R/mllib.R | 139 ++- R/pkg/inst/tests/testthat/test_mllib.R | 62 + .../spark/ml/r/GaussianMixtureWrapper.scala | 128 + .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 6 files changed, 338 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1e23b23..c71eec5 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -25,7 +25,8 @@ exportMethods("glm", "fitted", "spark.naiveBayes", "spark.survreg", - "spark.isoreg") + "spark.isoreg", + "spark.gaussianMixture") # Job group lifecycle management methods export("setJobGroup", http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ebacc11..06bb25d 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1308,6 +1308,13 @@ setGeneric("spark.survreg", function(data, formula, ...) 
{ standardGeneric("spar #' @export setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") }) +#' @rdname spark.gaussianMixture +#' @export +setGeneric("spark.gaussianMixture", + function(data, formula, ...) { + standardGeneric("spark.gaussianMixture") + }) + #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/4d92af31/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 0dcc54d..db74046 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -60,6 +60,13 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' @note IsotonicRegressionModel since 2.1.0 setClass("IsotonicRegressionModel", representation(jobj = "jobj")) +#' S4 class that represents a GaussianMixtureModel +#' +#' @param jobj a Java object reference to the backing Scala GaussianMixtureModel +#' @export +#' @note GaussianMixtureModel since 2.1.0 +setClass("GaussianMixtureModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. 
For more information, see the specific @@ -67,7 +74,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @rdname write.ml #' @name write.ml #' @export -#' @seealso \link{spark.glm}, \link{glm} +#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} #' @seealso \link{spark.isoreg} #' @seealso \link{read.ml} @@ -80,7 +87,7 @@ NULL #' @rdname predict #' @name predict #' @export -#' @seealso \link{spark.glm}, \link{glm} +#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} #' @seealso \link{spark.isoreg} NULL @@ -649,6 +656,25 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char invisible(callJMethod(writer, "save", path)) }) +# Save fitted MLlib model to the input path + +#' @param path the directory
spark git commit: [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin
Repository: spark Updated Branches: refs/heads/branch-2.0 b52bd8070 -> a54852350 [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin ## What changes were proposed in this pull request? This PR adds labelling support for the `include_example` Jekyll plugin, so that we may split a single source file into multiple line blocks with different labels, and include them in multiple code snippets in the generated HTML page. ## How was this patch tested? Manually tested. ![screenshot](https://cloud.githubusercontent.com/assets/230655/16451099/66a76db2-3e33-11e6-84fb-63104c2f0688.png) Author: Cheng Lian <l...@databricks.com> Closes #13972 from liancheng/include-example-with-labels. (cherry picked from commit bde1d6a61593aeb62370f526542cead94919b0c0) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5485235 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5485235 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5485235 Branch: refs/heads/branch-2.0 Commit: a54852350346cacae61d851d796bc3a7abd3a048 Parents: b52bd80 Author: Cheng Lian <l...@databricks.com> Authored: Wed Jun 29 22:50:53 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 29 22:51:04 2016 -0700 -- docs/_plugins/include_example.rb| 25 +--- docs/sql-programming-guide.md | 41 +++- .../apache/spark/examples/sql/JavaSparkSQL.java | 5 +++ examples/src/main/python/sql.py | 5 +++ .../apache/spark/examples/sql/RDDRelation.scala | 10 - 5 files changed, 43 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a5485235/docs/_plugins/include_example.rb -- diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index f748582..306 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -32,8 +32,18 @@ module Jekyll @code_dir = File.join(site.source, config_dir) clean_markup = 
@markup.strip - @file = File.join(@code_dir, clean_markup) - @lang = clean_markup.split('.').last + + parts = clean_markup.strip.split(' ') + if parts.length > 1 then +@snippet_label = ':' + parts[0] +snippet_file = parts[1] + else +@snippet_label = '' +snippet_file = parts[0] + end + + @file = File.join(@code_dir, snippet_file) + @lang = snippet_file.split('.').last code = File.open(@file).read.encode("UTF-8") code = select_lines(code) @@ -41,7 +51,7 @@ module Jekyll rendered_code = Pygments.highlight(code, :lexer => @lang) hint = "Find full example code at " \ -"\"examples/src/main/#{clean_markup}\" in the Spark repo." +"\"examples/src/main/#{snippet_file}\" in the Spark repo." rendered_code + hint end @@ -66,13 +76,13 @@ module Jekyll # Select the array of start labels from code. startIndices = lines .each_with_index -.select { |l, i| l.include? "$example on$" } +.select { |l, i| l.include? "$example on#{@snippet_label}$" } .map { |l, i| i } # Select the array of end labels from code. endIndices = lines .each_with_index -.select { |l, i| l.include? "$example off$" } +.select { |l, i| l.include? "$example off#{@snippet_label}$" } .map { |l, i| i } raise "Start indices amount is not equal to end indices amount, see #{@file}." \ @@ -92,7 +102,10 @@ module Jekyll if start == endline lastIndex = endline range = Range.new(start + 1, endline - 1) -result += trim_codeblock(lines[range]).join +trimmed = trim_codeblock(lines[range]) +# Filter out possible example tags of overlapped labels. +taggs_filtered = trimmed.select { |l| !l.include? 
'$example ' } +result += taggs_filtered.join result += "\n" end result http://git-wip-us.apache.org/repos/asf/spark/blob/a5485235/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 6c6bc8d..68419e1 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -63,52 +63,23 @@ Throughout this document, we will often refer to Scala/Java Datasets of `Row`s a -The entry point into all functionality in Spark is the [`SparkSession`](api/sca
spark git commit: [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin
Repository: spark Updated Branches: refs/heads/master d3af6731f -> bde1d6a61 [SPARK-16294][SQL] Labelling support for the include_example Jekyll plugin ## What changes were proposed in this pull request? This PR adds labelling support for the `include_example` Jekyll plugin, so that we may split a single source file into multiple line blocks with different labels, and include them in multiple code snippets in the generated HTML page. ## How was this patch tested? Manually tested. ![screenshot](https://cloud.githubusercontent.com/assets/230655/16451099/66a76db2-3e33-11e6-84fb-63104c2f0688.png) Author: Cheng Lian <l...@databricks.com> Closes #13972 from liancheng/include-example-with-labels. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bde1d6a6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bde1d6a6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bde1d6a6 Branch: refs/heads/master Commit: bde1d6a61593aeb62370f526542cead94919b0c0 Parents: d3af673 Author: Cheng Lian <l...@databricks.com> Authored: Wed Jun 29 22:50:53 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 29 22:50:53 2016 -0700 -- docs/_plugins/include_example.rb| 25 +--- docs/sql-programming-guide.md | 41 +++- .../apache/spark/examples/sql/JavaSparkSQL.java | 5 +++ examples/src/main/python/sql.py | 5 +++ .../apache/spark/examples/sql/RDDRelation.scala | 10 - 5 files changed, 43 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bde1d6a6/docs/_plugins/include_example.rb -- diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index f748582..306 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -32,8 +32,18 @@ module Jekyll @code_dir = File.join(site.source, config_dir) clean_markup = @markup.strip - @file = File.join(@code_dir, clean_markup) - @lang = clean_markup.split('.').last + + parts = 
clean_markup.strip.split(' ') + if parts.length > 1 then +@snippet_label = ':' + parts[0] +snippet_file = parts[1] + else +@snippet_label = '' +snippet_file = parts[0] + end + + @file = File.join(@code_dir, snippet_file) + @lang = snippet_file.split('.').last code = File.open(@file).read.encode("UTF-8") code = select_lines(code) @@ -41,7 +51,7 @@ module Jekyll rendered_code = Pygments.highlight(code, :lexer => @lang) hint = "Find full example code at " \ -"\"examples/src/main/#{clean_markup}\" in the Spark repo." +"\"examples/src/main/#{snippet_file}\" in the Spark repo." rendered_code + hint end @@ -66,13 +76,13 @@ module Jekyll # Select the array of start labels from code. startIndices = lines .each_with_index -.select { |l, i| l.include? "$example on$" } +.select { |l, i| l.include? "$example on#{@snippet_label}$" } .map { |l, i| i } # Select the array of end labels from code. endIndices = lines .each_with_index -.select { |l, i| l.include? "$example off$" } +.select { |l, i| l.include? "$example off#{@snippet_label}$" } .map { |l, i| i } raise "Start indices amount is not equal to end indices amount, see #{@file}." \ @@ -92,7 +102,10 @@ module Jekyll if start == endline lastIndex = endline range = Range.new(start + 1, endline - 1) -result += trim_codeblock(lines[range]).join +trimmed = trim_codeblock(lines[range]) +# Filter out possible example tags of overlapped labels. +taggs_filtered = trimmed.select { |l| !l.include? 
'$example ' } +result += taggs_filtered.join result += "\n" end result http://git-wip-us.apache.org/repos/asf/spark/blob/bde1d6a6/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 6c6bc8d..68419e1 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -63,52 +63,23 @@ Throughout this document, we will often refer to Scala/Java Datasets of `Row`s a -The entry point into all functionality in Spark is the [`SparkSession`](api/scala/index.html#org.apache.spark.sql.SparkSession) class. To create a basic `SparkSession`, just use `SparkSession.build()`: - -{% hig
spark git commit: [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc
Repository: spark Updated Branches: refs/heads/branch-2.0 d96e8c2dd -> 1cde325e2 [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc https://issues.apache.org/jira/browse/SPARK-16140 ## What changes were proposed in this pull request? Group the R doc of spark.kmeans, predict(KM), summary(KM), read/write.ml(KM) under Rd spark.kmeans. The example code was updated. ## How was this patch tested? Tested on my local machine. On my laptop, `jekyll build` fails to build the API docs, so here I can only show the HTML I manually generated from the Rd files, with no CSS applied, but the doc content should be there. ![screenshotkmeans](https://cloud.githubusercontent.com/assets/3925641/16403203/c2c9ca1e-3ca7-11e6-9e29-f2164aee75fc.png) Author: Xin Ren <iamsh...@126.com> Closes #13921 from keypointt/SPARK-16140. (cherry picked from commit 8c9cd0a7a719ce4286f77f35bb787e2b626a472e) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cde325e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cde325e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cde325e Branch: refs/heads/branch-2.0 Commit: 1cde325e29286a8c6631b0b32351994aad7db567 Parents: d96e8c2 Author: Xin Ren <iamsh...@126.com> Authored: Wed Jun 29 11:25:00 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 29 11:25:07 2016 -0700 -- R/pkg/R/generics.R | 2 ++ R/pkg/R/mllib.R| 72 +++-- 2 files changed, 35 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1cde325e/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 27dfd67..0e4350f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1247,6 +1247,7 @@ setGeneric("spark.glm", function(data, formula, ...) 
{ standardGeneric("spark.gl #' @export setGeneric("glm") +#' predict #' @rdname predict #' @export setGeneric("predict", function(object, ...) { standardGeneric("predict") }) @@ -1271,6 +1272,7 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) +#' write.ml #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/1cde325e/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 897a376..4fe7367 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -267,9 +267,10 @@ setMethod("summary", signature(object = "NaiveBayesModel"), return(list(apriori = apriori, tables = tables)) }) -#' Fit a k-means model +#' K-Means Clustering Model #' -#' Fit a k-means model, similarly to R's kmeans(). +#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). +#' Users can print, make predictions on the produced model and save the model to the input path. #' #' @param data SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. 
Currently only a few formula @@ -278,14 +279,32 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' @param k Number of centers #' @param maxIter Maximum iteration number #' @param initMode The initialization algorithm choosen to fit the model -#' @return A fitted k-means model +#' @return \code{spark.kmeans} returns a fitted k-means model #' @rdname spark.kmeans +#' @name spark.kmeans #' @export #' @examples #' \dontrun{ -#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random") +#' sparkR.session() +#' data(iris) +#' df <- createDataFrame(iris) +#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") +#' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.kmeans since 2.0.0 +#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula&qu
spark git commit: [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc
Repository: spark Updated Branches: refs/heads/master c6a220d75 -> 8c9cd0a7a [SPARK-16140][MLLIB][SPARKR][DOCS] Group k-means method in generated R doc https://issues.apache.org/jira/browse/SPARK-16140 ## What changes were proposed in this pull request? Group the R doc of spark.kmeans, predict(KM), summary(KM), read/write.ml(KM) under Rd spark.kmeans. The example code was updated. ## How was this patch tested? Tested on my local machine. On my laptop, `jekyll build` fails to build the API docs, so here I can only show the HTML I manually generated from the Rd files, with no CSS applied, but the doc content should be there. ![screenshotkmeans](https://cloud.githubusercontent.com/assets/3925641/16403203/c2c9ca1e-3ca7-11e6-9e29-f2164aee75fc.png) Author: Xin Ren <iamsh...@126.com> Closes #13921 from keypointt/SPARK-16140. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c9cd0a7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c9cd0a7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c9cd0a7 Branch: refs/heads/master Commit: 8c9cd0a7a719ce4286f77f35bb787e2b626a472e Parents: c6a220d Author: Xin Ren <iamsh...@126.com> Authored: Wed Jun 29 11:25:00 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 29 11:25:00 2016 -0700 -- R/pkg/R/generics.R | 2 ++ R/pkg/R/mllib.R| 72 +++-- 2 files changed, 35 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c9cd0a7/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 27dfd67..0e4350f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1247,6 +1247,7 @@ setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.gl #' @export setGeneric("glm") +#' predict #' @rdname predict #' @export setGeneric("predict", function(object, ...) 
{ standardGeneric("predict") }) @@ -1271,6 +1272,7 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) +#' write.ml #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/8c9cd0a7/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 897a376..4fe7367 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -267,9 +267,10 @@ setMethod("summary", signature(object = "NaiveBayesModel"), return(list(apriori = apriori, tables = tables)) }) -#' Fit a k-means model +#' K-Means Clustering Model #' -#' Fit a k-means model, similarly to R's kmeans(). +#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). +#' Users can print, make predictions on the produced model and save the model to the input path. #' #' @param data SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. 
Currently only a few formula @@ -278,14 +279,32 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' @param k Number of centers #' @param maxIter Maximum iteration number #' @param initMode The initialization algorithm choosen to fit the model -#' @return A fitted k-means model +#' @return \code{spark.kmeans} returns a fitted k-means model #' @rdname spark.kmeans +#' @name spark.kmeans #' @export #' @examples #' \dontrun{ -#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random") +#' sparkR.session() +#' data(iris) +#' df <- createDataFrame(iris) +#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") +#' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.kmeans since 2.0.0 +#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) {
spark git commit: [MINOR][SPARKR] Fix arguments of survreg in SparkR
Repository: spark Updated Branches: refs/heads/branch-2.0 ba71cf451 -> d96e8c2dd [MINOR][SPARKR] Fix arguments of survreg in SparkR ## What changes were proposed in this pull request? Fix wrong arguments description of ```survreg``` in SparkR. ## How was this patch tested? ```Arguments``` section of ```survreg``` doc before this PR (with wrong description for ```path``` and missing ```overwrite```): ![image](https://cloud.githubusercontent.com/assets/1962026/16447548/fe7a5ed4-3da1-11e6-8b96-b5bf2083b07e.png) After this PR: ![image](https://cloud.githubusercontent.com/assets/1962026/16447617/368e0b18-3da2-11e6-8277-45640fb11859.png) Author: Yanbo Liang <yblia...@gmail.com> Closes #13970 from yanboliang/spark-16143-followup. (cherry picked from commit c6a220d756f23ee89a0d1366b20259890c9d67c9) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d96e8c2d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d96e8c2d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d96e8c2d Branch: refs/heads/branch-2.0 Commit: d96e8c2dd0a9949751d3074b6ab61eee12f5d622 Parents: ba71cf4 Author: Yanbo Liang <yblia...@gmail.com> Authored: Wed Jun 29 11:20:35 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 29 11:20:41 2016 -0700 -- R/pkg/R/mllib.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d96e8c2d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 8e6c2dd..897a376 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -442,11 +442,11 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), # Saves the AFT survival regression model to the input path. 
-#' @param path The directory where the model is savedist containing the model's coefficien +#' @param path The directory where the model is saved +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' #' @rdname spark.survreg -#' @name write.ml #' @export #' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0 #' @seealso \link{read.ml} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA
Repository: spark Updated Branches: refs/heads/branch-2.0 dd70a115c -> 22b4072e7 [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA ## What changes were proposed in this pull request? model loading backward compatibility for ml.feature.PCA. ## How was this patch tested? existing ut and manual test for loading models saved by Spark 1.6. Author: Yanbo Liang <yblia...@gmail.com> Closes #13937 from yanboliang/spark-16245. (cherry picked from commit 0df5ce1bc1387a58b33cd185008f4022bd3dcc69) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/22b4072e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/22b4072e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/22b4072e Branch: refs/heads/branch-2.0 Commit: 22b4072e704f9a68a605e9a4cebf54d2122fe448 Parents: dd70a11 Author: Yanbo Liang <yblia...@gmail.com> Authored: Tue Jun 28 19:53:07 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Jun 28 19:53:16 2016 -0700 -- .../scala/org/apache/spark/ml/feature/PCA.scala | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/22b4072e/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 72167b5..ef8b085 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -206,24 +206,22 @@ object PCAModel extends MLReadable[PCAModel] { override def load(path: String): PCAModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - // explainedVariance field is not present in Spark <= 1.6 - val versionRegex = "([0-9]+)\\.([0-9]+).*".r - val hasExplainedVariance = metadata.sparkVersion match { -case versionRegex(major, minor) 
=> - major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6) -case _ => false - } + val versionRegex = "([0-9]+)\\.(.+)".r + val versionRegex(major, _) = metadata.sparkVersion val dataPath = new Path(path, "data").toString - val model = if (hasExplainedVariance) { + val model = if (major.toInt >= 2) { val Row(pc: DenseMatrix, explainedVariance: DenseVector) = sparkSession.read.parquet(dataPath) .select("pc", "explainedVariance") .head() new PCAModel(metadata.uid, pc, explainedVariance) } else { -val Row(pc: DenseMatrix) = sparkSession.read.parquet(dataPath).select("pc").head() -new PCAModel(metadata.uid, pc, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]) +// pc field is the old matrix format in Spark <= 1.6 +// explainedVariance field is not present in Spark <= 1.6 +val Row(pc: OldDenseMatrix) = sparkSession.read.parquet(dataPath).select("pc").head() +new PCAModel(metadata.uid, pc.asML, + Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]) } DefaultParamsReader.getAndSetParams(model, metadata) model - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA
Repository: spark Updated Branches: refs/heads/master 363bcedee -> 0df5ce1bc [SPARK-16245][ML] model loading backward compatibility for ml.feature.PCA ## What changes were proposed in this pull request? model loading backward compatibility for ml.feature.PCA. ## How was this patch tested? existing ut and manual test for loading models saved by Spark 1.6. Author: Yanbo Liang <yblia...@gmail.com> Closes #13937 from yanboliang/spark-16245. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0df5ce1b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0df5ce1b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0df5ce1b Branch: refs/heads/master Commit: 0df5ce1bc1387a58b33cd185008f4022bd3dcc69 Parents: 363bced Author: Yanbo Liang <yblia...@gmail.com> Authored: Tue Jun 28 19:53:07 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Tue Jun 28 19:53:07 2016 -0700 -- .../scala/org/apache/spark/ml/feature/PCA.scala | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0df5ce1b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 72167b5..ef8b085 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -206,24 +206,22 @@ object PCAModel extends MLReadable[PCAModel] { override def load(path: String): PCAModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - // explainedVariance field is not present in Spark <= 1.6 - val versionRegex = "([0-9]+)\\.([0-9]+).*".r - val hasExplainedVariance = metadata.sparkVersion match { -case versionRegex(major, minor) => - major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6) -case _ => false - } + val versionRegex = "([0-9]+)\\.(.+)".r + 
val versionRegex(major, _) = metadata.sparkVersion val dataPath = new Path(path, "data").toString - val model = if (hasExplainedVariance) { + val model = if (major.toInt >= 2) { val Row(pc: DenseMatrix, explainedVariance: DenseVector) = sparkSession.read.parquet(dataPath) .select("pc", "explainedVariance") .head() new PCAModel(metadata.uid, pc, explainedVariance) } else { -val Row(pc: DenseMatrix) = sparkSession.read.parquet(dataPath).select("pc").head() -new PCAModel(metadata.uid, pc, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]) +// pc field is the old matrix format in Spark <= 1.6 +// explainedVariance field is not present in Spark <= 1.6 +val Row(pc: OldDenseMatrix) = sparkSession.read.parquet(dataPath).select("pc").head() +new PCAModel(metadata.uid, pc.asML, + Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]) } DefaultParamsReader.getAndSetParams(model, metadata) model - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16143][R] group AFT survival regression methods docs in a single Rd
Repository: spark Updated Branches: refs/heads/branch-2.0 c7704099d -> 4c4f7775c [SPARK-16143][R] group AFT survival regression methods docs in a single Rd ## What changes were proposed in this pull request? This PR groups `spark.survreg`, `summary(AFT)`, `predict(AFT)`, `write.ml(AFT)` for survival regression into a single Rd. ## How was this patch tested? Manually checked generated HTML doc. See attached screenshots. ![screen shot 2016-06-27 at 10 28 20 am](https://cloud.githubusercontent.com/assets/15318264/16392008/a14cf472-3c5e-11e6-9ce5-490ed1a52249.png) ![screen shot 2016-06-27 at 10 28 35 am](https://cloud.githubusercontent.com/assets/15318264/16392009/a14e333c-3c5e-11e6-8bd7-c2e9ba71f8e2.png) Author: Junyang Qian <junya...@databricks.com> Closes #13927 from junyangq/SPARK-16143. (cherry picked from commit 1b7fc5817203db5a56489b289fb1a0dd44b2e26b) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c4f7775 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c4f7775 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c4f7775 Branch: refs/heads/branch-2.0 Commit: 4c4f7775cbf5dd69e688350ee59a9319bcaa56fe Parents: c770409 Author: Junyang Qian <junya...@databricks.com> Authored: Mon Jun 27 20:32:27 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jun 27 20:32:35 2016 -0700 -- R/pkg/R/mllib.R | 88 +--- 1 file changed, 42 insertions(+), 46 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c4f7775/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 853cfce..8e6c2dd 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -233,9 +233,10 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), # Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), # similarly to R package e1071's predict. 
-#' @rdname spark.naiveBayes +#' @param newData A SparkDataFrame for testing #' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named #' "prediction" +#' @rdname spark.naiveBayes #' @export #' @note predict(NaiveBayesModel) since 2.0.0 setMethod("predict", signature(object = "NaiveBayesModel"), @@ -439,25 +440,16 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), invisible(callJMethod(writer, "save", path)) }) -#' Save fitted MLlib model to the input path -#' -#' Save the AFT survival regression model to the input path. -#' -#' @param object A fitted AFT survival regression model -#' @param path The directory where the model is saved -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +# Saves the AFT survival regression model to the input path. + +#' @param path The directory where the model is saved +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' -#' @rdname write.ml +#' @rdname spark.survreg #' @name write.ml #' @export -#' @examples -#' \dontrun{ -#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx) -#' path <- "path/to/model" -#' write.ml(model, path) -#' } #' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0 +#' @seealso \link{read.ml} setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"), function(object, path, overwrite = FALSE) { writer <- callJMethod(object@jobj, "write") @@ -542,15 +534,18 @@ read.ml <- function(path) { } } -#' Fit an accelerated failure time (AFT) survival regression model. +#' Accelerated Failure Time (AFT) Survival Regression Model #' -#' Fit an accelerated failure time (AFT) survival regression model on a Spark DataFrame. +#' \code{spark.survreg} fits an accelerated failure time (AFT) survival regression model on +#' a SparkDataFrame.
Users can call \code{summary} to get a summary of the fitted AFT model, +#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to +#' save/load fitted models. #' -#' @param data SparkDataFrame for training. +#' @param data A SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #'operators are supported, including '~', ':', '+', and '-'. -#'Note that operator '.' is not supported currently. -#' @return a
spark git commit: [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors
Repository: spark Updated Branches: refs/heads/branch-2.0 e4bb31fb3 -> 27f3462d0 [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors ## What changes were proposed in this pull request? Need to convert ML Vectors to the old MLlib style before doing Statistics.colStats operations on the DataFrame ## How was this patch tested? Ran example, local tests Author: Bryan Cutler <cutl...@gmail.com> Closes #13928 from BryanCutler/pyspark-ml-example-vector-conv-SPARK-16231. (cherry picked from commit 1aa191e58e905f470f73663fc1c35f36e05e929a) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27f3462d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27f3462d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27f3462d Branch: refs/heads/branch-2.0 Commit: 27f3462d0e11b4768140e452f02ab043438b8e86 Parents: e4bb31f Author: Bryan Cutler <cutl...@gmail.com> Authored: Mon Jun 27 12:58:39 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jun 27 14:12:31 2016 -0700 -- examples/src/main/python/ml/dataframe_example.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27f3462d/examples/src/main/python/ml/dataframe_example.py -- diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index a7d8b90..c1818d7 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -28,6 +28,7 @@ import shutil from pyspark.sql import SparkSession from pyspark.mllib.stat import Statistics +from pyspark.mllib.util import MLUtils if __name__ == "__main__": if len(sys.argv) > 2: @@ -55,7 +56,8 @@ if __name__ == "__main__": labelSummary.show() # Convert features column to an RDD of vectors. 
-features = df.select("features").rdd.map(lambda r: r.features) +features = MLUtils.convertVectorColumnsFromML(df, "features") \ +.select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors
Repository: spark Updated Branches: refs/heads/master c17b1abff -> 1aa191e58 [SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors ## What changes were proposed in this pull request? Need to convert ML Vectors to the old MLlib style before doing Statistics.colStats operations on the DataFrame ## How was this patch tested? Ran example, local tests Author: Bryan Cutler <cutl...@gmail.com> Closes #13928 from BryanCutler/pyspark-ml-example-vector-conv-SPARK-16231. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1aa191e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1aa191e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1aa191e5 Branch: refs/heads/master Commit: 1aa191e58e905f470f73663fc1c35f36e05e929a Parents: c17b1ab Author: Bryan Cutler <cutl...@gmail.com> Authored: Mon Jun 27 12:58:39 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jun 27 12:58:39 2016 -0700 -- examples/src/main/python/ml/dataframe_example.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1aa191e5/examples/src/main/python/ml/dataframe_example.py -- diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index a7d8b90..c1818d7 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -28,6 +28,7 @@ import shutil from pyspark.sql import SparkSession from pyspark.mllib.stat import Statistics +from pyspark.mllib.util import MLUtils if __name__ == "__main__": if len(sys.argv) > 2: @@ -55,7 +56,8 @@ if __name__ == "__main__": labelSummary.show() # Convert features column to an RDD of vectors. 
-features = df.select("features").rdd.map(lambda r: r.features) +features = MLUtils.convertVectorColumnsFromML(df, "features") \ +.select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java
Repository: spark Updated Branches: refs/heads/branch-2.0 f2017c59b -> e4bb31fb3 [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16187 This is to provide conversion utils between old/new vector columns in a DataFrame. So users can use it to migrate their datasets and pipelines manually. ## How was this patch tested? java and scala ut Author: Yuhao Yang <yuhao.y...@intel.com> Closes #13888 from hhbyyh/matComp. (cherry picked from commit c17b1abff8f8c6d24cb0cf4ff4f8c14a780c64b0) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4bb31fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4bb31fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4bb31fb Branch: refs/heads/branch-2.0 Commit: e4bb31fb3afeaf6b6ddc1af4c9c07f1f7001b7cc Parents: f2017c5 Author: Yuhao Yang <yuhao.y...@intel.com> Authored: Mon Jun 27 12:27:39 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jun 27 12:27:47 2016 -0700 -- .../org/apache/spark/ml/linalg/MatrixUDT.scala | 2 +- .../org/apache/spark/mllib/util/MLUtils.scala | 107 ++- .../spark/mllib/util/JavaMLUtilsSuite.java | 29 - .../apache/spark/mllib/util/MLUtilsSuite.scala | 56 +- 4 files changed, 187 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4bb31fb/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala index 521a216..a1e5366 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types._ * User-defined type for [[Matrix]] in 
[[mllib-local]] which allows easy interaction with SQL * via [[org.apache.spark.sql.Dataset]]. */ -private[ml] class MatrixUDT extends UserDefinedType[Matrix] { +private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense http://git-wip-us.apache.org/repos/asf/spark/blob/e4bb31fb/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7d5bdff..e96c2bc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -23,7 +23,7 @@ import scala.reflect.ClassTag import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{VectorUDT => MLVectorUDT} +import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => MLVectorUDT} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.regression.LabeledPoint @@ -309,8 +309,8 @@ object MLUtils extends Logging { } /** - * Converts vector columns in an input Dataset to the [[org.apache.spark.ml.linalg.Vector]] type - * from the new [[org.apache.spark.mllib.linalg.Vector]] type under the `spark.ml` package. + * Converts vector columns in an input Dataset to the [[org.apache.spark.mllib.linalg.Vector]] + * type from the new [[org.apache.spark.ml.linalg.Vector]] type under the `spark.ml` package. * @param dataset input dataset * @param cols a list of vector columns to be converted. Old vector columns will be ignored. If * unspecified, all new vector columns will be converted except nested ones. 
@@ -361,6 +361,107 @@ object MLUtils extends Logging { } /** + * Converts Matrix columns in an input Dataset from the [[org.apache.spark.mllib.linalg.Matrix]] + * type to the new [[org.apache.spark.ml.linalg.Matrix]] type under the `spark.ml` package. + * @param dataset input dataset + * @param cols a list of matrix columns to be converted. New matrix columns will be ignored. If + * unspecified, all old matrix columns will be converted except nested ones. + * @return the input [[DataFrame]] with old matrix columns converted to the new matrix type + */ + @Since("2.0.0") + @varargs + def convertMatrixColumnsToML(dataset: Dataset[_], cols: String*): DataFrame = { +
spark git commit: [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java
Repository: spark Updated Branches: refs/heads/master c48c8ebc0 -> c17b1abff [SPARK-16187][ML] Implement util method for ML Matrix conversion in scala/java ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16187 This is to provide conversion utils between old/new vector columns in a DataFrame. So users can use it to migrate their datasets and pipelines manually. ## How was this patch tested? java and scala ut Author: Yuhao Yang <yuhao.y...@intel.com> Closes #13888 from hhbyyh/matComp. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c17b1abf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c17b1abf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c17b1abf Branch: refs/heads/master Commit: c17b1abff8f8c6d24cb0cf4ff4f8c14a780c64b0 Parents: c48c8eb Author: Yuhao Yang <yuhao.y...@intel.com> Authored: Mon Jun 27 12:27:39 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Mon Jun 27 12:27:39 2016 -0700 -- .../org/apache/spark/ml/linalg/MatrixUDT.scala | 2 +- .../org/apache/spark/mllib/util/MLUtils.scala | 107 ++- .../spark/mllib/util/JavaMLUtilsSuite.java | 29 - .../apache/spark/mllib/util/MLUtilsSuite.scala | 56 +- 4 files changed, 187 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c17b1abf/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala index 521a216..a1e5366 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/MatrixUDT.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types._ * User-defined type for [[Matrix]] in [[mllib-local]] which allows easy interaction with SQL * via [[org.apache.spark.sql.Dataset]]. 
*/ -private[ml] class MatrixUDT extends UserDefinedType[Matrix] { +private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense http://git-wip-us.apache.org/repos/asf/spark/blob/c17b1abf/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7d5bdff..e96c2bc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -23,7 +23,7 @@ import scala.reflect.ClassTag import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.{VectorUDT => MLVectorUDT} +import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => MLVectorUDT} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.regression.LabeledPoint @@ -309,8 +309,8 @@ object MLUtils extends Logging { } /** - * Converts vector columns in an input Dataset to the [[org.apache.spark.ml.linalg.Vector]] type - * from the new [[org.apache.spark.mllib.linalg.Vector]] type under the `spark.ml` package. + * Converts vector columns in an input Dataset to the [[org.apache.spark.mllib.linalg.Vector]] + * type from the new [[org.apache.spark.ml.linalg.Vector]] type under the `spark.ml` package. * @param dataset input dataset * @param cols a list of vector columns to be converted. Old vector columns will be ignored. If * unspecified, all new vector columns will be converted except nested ones. @@ -361,6 +361,107 @@ object MLUtils extends Logging { } /** + * Converts Matrix columns in an input Dataset from the [[org.apache.spark.mllib.linalg.Matrix]] + * type to the new [[org.apache.spark.ml.linalg.Matrix]] type under the `spark.ml` package. 
+ * @param dataset input dataset + * @param cols a list of matrix columns to be converted. New matrix columns will be ignored. If + * unspecified, all old matrix columns will be converted except nested ones. + * @return the input [[DataFrame]] with old matrix columns converted to the new matrix type + */ + @Since("2.0.0") + @varargs + def convertMatrixColumnsToML(dataset: Dataset[_], cols: String*): DataFrame = { +val schema = dataset.schema +val colSet = if (cols.nonEmpty) { + cols.flatMap { c => +val dataType = schema(c).dataTyp
spark git commit: [SPARK-16133][ML] model loading backward compatibility for ml.feature
Repository: spark Updated Branches: refs/heads/branch-2.0 557eee5b6 -> 3d8d95644 [SPARK-16133][ML] model loading backward compatibility for ml.feature ## What changes were proposed in this pull request? model loading backward compatibility for ml.feature, ## How was this patch tested? existing ut and manual test for loading 1.6 models. Author: Yuhao Yang <yuhao.y...@intel.com> Author: Yuhao Yang <hhb...@gmail.com> Closes #13844 from hhbyyh/featureComp. (cherry picked from commit cc6778ee0bf4fa7a78abd30542c4a6f80ea371c5) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d8d9564 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d8d9564 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d8d9564 Branch: refs/heads/branch-2.0 Commit: 3d8d956448fd3b7ae8d380e655bfa245b11c4ea0 Parents: 557eee5 Author: Yuhao Yang <yuhao.y...@intel.com> Authored: Thu Jun 23 21:50:25 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 21:50:32 2016 -0700 -- mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala | 3 ++- .../scala/org/apache/spark/ml/feature/MinMaxScaler.scala| 9 ++--- .../scala/org/apache/spark/ml/feature/StandardScaler.scala | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala index 02d4e6a..5d6287f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala @@ -27,6 +27,7 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import 
org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ @@ -180,9 +181,9 @@ object IDFModel extends MLReadable[IDFModel] { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath) + val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf") .select("idf") .head() - val idf = data.getAs[Vector](0) val model = new IDFModel(metadata.uid, new feature.IDFModel(OldVectors.fromML(idf))) DefaultParamsReader.getAndSetParams(model, metadata) model http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index 562b3f3..d5ad5ab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -28,6 +28,7 @@ import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ @@ -232,9 +233,11 @@ object MinMaxScalerModel extends MLReadable[MinMaxScalerModel] { override def load(path: String): MinMaxScalerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(originalMin: Vector, originalMax: Vector) = sparkSession.read.parquet(dataPath) -.select("originalMin", "originalMax") -.head() + val data = sparkSession.read.parquet(dataPath) + val Row(originalMin: Vector, originalMax: Vector) = 
+MLUtils.convertVectorColumnsToML(data, "originalMin", "originalMax") + .select("originalMin", "originalMax") + .head() val model = new MinMaxScalerModel(metadata.uid, originalMin, originalMax) DefaultParamsReader.getAndSetParams(model, metadata) model http://git-wip-us.apache.org/repos/asf/spark/blob/3d8d9564/mllib/src/main/scala/org/apache/spark/ml/feature/StandardS
spark git commit: [SPARK-16142][R] group naiveBayes method docs in a single Rd
Repository: spark Updated Branches: refs/heads/master 14bc5a7f3 -> 4a40d43bb [SPARK-16142][R] group naiveBayes method docs in a single Rd ## What changes were proposed in this pull request? This PR groups `spark.naiveBayes`, `summary(NB)`, `predict(NB)`, and `write.ml(NB)` into a single Rd. ## How was this patch tested? Manually checked generated HTML doc. See attached screenshots. ![screen shot 2016-06-23 at 2 11 00 pm](https://cloud.githubusercontent.com/assets/829644/16320452/a5885e92-394c-11e6-994f-2ab5cddad86f.png) ![screen shot 2016-06-23 at 2 11 15 pm](https://cloud.githubusercontent.com/assets/829644/16320455/aad1f6d8-394c-11e6-8ef4-13bee989f52f.png) Author: Xiangrui Meng <m...@databricks.com> Closes #13877 from mengxr/SPARK-16142. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a40d43b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a40d43b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a40d43b Branch: refs/heads/master Commit: 4a40d43bb29704734b8128bf2a3f27802ae34e17 Parents: 14bc5a7 Author: Xiangrui Meng <m...@databricks.com> Authored: Thu Jun 23 21:43:13 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 21:43:13 2016 -0700 -- R/pkg/R/mllib.R | 90 1 file changed, 42 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4a40d43b/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index dbff1b9..853cfce 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -218,9 +218,10 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { # Makes predictions from a generalized linear model produced by glm() or spark.glm(), # similarly to R's predict(). 
-#' + #' @param newData SparkDataFrame for testing -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named "prediction" +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction" #' @rdname spark.glm #' @export #' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 @@ -229,41 +230,26 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Predicted values based on model -#' -#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), -#' similarly to R package e1071's predict. -#' -#' @param object A fitted naive Bayes model -#' @rdname predict +# Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), +# similarly to R package e1071's predict. + +#' @rdname spark.naiveBayes +#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named +#' "prediction" #' @export -#' @examples -#' \dontrun{ -#' model <- spark.naiveBayes(trainingData, y ~ x) -#' predicted <- predict(model, testData) -#' showDF(predicted) -#'} #' @note predict(NaiveBayesModel) since 2.0.0 setMethod("predict", signature(object = "NaiveBayesModel"), function(object, newData) { return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Get the summary of a naive Bayes model -#' -#' Returns the summary of a naive Bayes model produced by spark.naiveBayes(), -#' similarly to R's summary(). 
-#' -#' @param object A fitted MLlib model -#' @return a list containing 'apriori', the label distribution, and 'tables', conditional -# probabilities given the target label -#' @rdname summary +# Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} + +#' @param object A naive Bayes model fitted by \code{spark.naiveBayes} +#' @return \code{summary} returns a list containing \code{apriori}, the label distribution, and +#' \code{tables}, conditional probabilities given the target label +#' @rdname spark.naiveBayes #' @export -#' @examples -#' \dontrun{ -#' model <- spark.naiveBayes(trainingData, y ~ x) -#' summary(model) -#'} #' @note summary(NaiveBayesModel) since 2.0.0 setMethod("summary", signature(object = "NaiveBayesModel"), function(object, ...) { @@ -390,23 +376,41 @@ setMethod("predict", signature(object = "KMeansModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Fit a Bernoulli naive Bayes model +#' Naive Bayes Models #' -#' Fit a Bernoulli naive Bayes model o
spark git commit: [SPARK-16142][R] group naiveBayes method docs in a single Rd
Repository: spark Updated Branches: refs/heads/branch-2.0 ea0cf93d3 -> 557eee5b6 [SPARK-16142][R] group naiveBayes method docs in a single Rd ## What changes were proposed in this pull request? This PR groups `spark.naiveBayes`, `summary(NB)`, `predict(NB)`, and `write.ml(NB)` into a single Rd. ## How was this patch tested? Manually checked generated HTML doc. See attached screenshots. ![screen shot 2016-06-23 at 2 11 00 pm](https://cloud.githubusercontent.com/assets/829644/16320452/a5885e92-394c-11e6-994f-2ab5cddad86f.png) ![screen shot 2016-06-23 at 2 11 15 pm](https://cloud.githubusercontent.com/assets/829644/16320455/aad1f6d8-394c-11e6-8ef4-13bee989f52f.png) Author: Xiangrui Meng <m...@databricks.com> Closes #13877 from mengxr/SPARK-16142. (cherry picked from commit 4a40d43bb29704734b8128bf2a3f27802ae34e17) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/557eee5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/557eee5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/557eee5b Branch: refs/heads/branch-2.0 Commit: 557eee5b6d07f8a17257cd9aae5d7830b4de4690 Parents: ea0cf93 Author: Xiangrui Meng <m...@databricks.com> Authored: Thu Jun 23 21:43:13 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 21:43:21 2016 -0700 -- R/pkg/R/mllib.R | 90 1 file changed, 42 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/557eee5b/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index dbff1b9..853cfce 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -218,9 +218,10 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { # Makes predictions from a generalized linear model produced by glm() or spark.glm(), # similarly to R's predict(). 
-#' + #' @param newData SparkDataFrame for testing -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named "prediction" +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction" #' @rdname spark.glm #' @export #' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 @@ -229,41 +230,26 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Predicted values based on model -#' -#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), -#' similarly to R package e1071's predict. -#' -#' @param object A fitted naive Bayes model -#' @rdname predict +# Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), +# similarly to R package e1071's predict. + +#' @rdname spark.naiveBayes +#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named +#' "prediction" #' @export -#' @examples -#' \dontrun{ -#' model <- spark.naiveBayes(trainingData, y ~ x) -#' predicted <- predict(model, testData) -#' showDF(predicted) -#'} #' @note predict(NaiveBayesModel) since 2.0.0 setMethod("predict", signature(object = "NaiveBayesModel"), function(object, newData) { return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Get the summary of a naive Bayes model -#' -#' Returns the summary of a naive Bayes model produced by spark.naiveBayes(), -#' similarly to R's summary(). 
-#' -#' @param object A fitted MLlib model -#' @return a list containing 'apriori', the label distribution, and 'tables', conditional -# probabilities given the target label -#' @rdname summary +# Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} + +#' @param object A naive Bayes model fitted by \code{spark.naiveBayes} +#' @return \code{summary} returns a list containing \code{apriori}, the label distribution, and +#' \code{tables}, conditional probabilities given the target label +#' @rdname spark.naiveBayes #' @export -#' @examples -#' \dontrun{ -#' model <- spark.naiveBayes(trainingData, y ~ x) -#' summary(model) -#'} #' @note summary(NaiveBayesModel) since 2.0.0 setMethod("summary", signature(object = "NaiveBayesModel"), function(object, ...) { @@ -390,23 +376,41 @@ setMethod("predict", signature(object = "KMeansModel"), return(dataFrame(callJMethod(object@jobj, "transform",
spark git commit: [SPARK-16177][ML] model loading backward compatibility for ml.regression
Repository: spark Updated Branches: refs/heads/branch-2.0 a6edec2c5 -> ea0cf93d3 [SPARK-16177][ML] model loading backward compatibility for ml.regression ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16177 model loading backward compatibility for ml.regression ## How was this patch tested? existing ut and manual test for loading 1.6 models. Author: Yuhao Yang <hhb...@gmail.com> Closes #13879 from hhbyyh/regreComp. (cherry picked from commit 14bc5a7f36bed19cd714a4c725a83feaccac3468) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea0cf93d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea0cf93d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea0cf93d Branch: refs/heads/branch-2.0 Commit: ea0cf93d3969845e9df8305c0ce54326cdfb2bbd Parents: a6edec2 Author: Yuhao Yang <hhb...@gmail.com> Authored: Thu Jun 23 20:43:19 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 20:43:29 2016 -0700 -- .../apache/spark/ml/regression/AFTSurvivalRegression.scala | 9 + .../org/apache/spark/ml/regression/LinearRegression.scala | 8 +--- 2 files changed, 10 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea0cf93d/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 2dbac49..7c51845 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -33,6 +33,7 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.VectorImplicits._ import 
org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -389,10 +390,10 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel] val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath) -.select("coefficients", "intercept", "scale").head() - val coefficients = data.getAs[Vector](0) - val intercept = data.getDouble(1) - val scale = data.getDouble(2) + val Row(coefficients: Vector, intercept: Double, scale: Double) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("coefficients", "intercept", "scale") + .head() val model = new AFTSurvivalRegressionModel(metadata.uid, coefficients, intercept, scale) DefaultParamsReader.getAndSetParams(model, metadata) http://git-wip-us.apache.org/repos/asf/spark/blob/ea0cf93d/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 2723f74..0a4d98c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -39,6 +39,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -500,9 +501,10 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] { val dataPath = new Path(path, "data").toString val data = 
sparkSession.read.format("parquet").load(dataPath) -.select("intercept", "coefficients").head() - val intercept = data.getDouble(0) - val coefficients = data.getAs[Vector](1) + val Row(intercept: Double, coefficients: Vector) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("intercept", "coef
spark git commit: [SPARK-16177][ML] model loading backward compatibility for ml.regression
Repository: spark Updated Branches: refs/heads/master 6a3c6276f -> 14bc5a7f3 [SPARK-16177][ML] model loading backward compatibility for ml.regression ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16177 model loading backward compatibility for ml.regression ## How was this patch tested? existing ut and manual test for loading 1.6 models. Author: Yuhao Yang <hhb...@gmail.com> Closes #13879 from hhbyyh/regreComp. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14bc5a7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14bc5a7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14bc5a7f Branch: refs/heads/master Commit: 14bc5a7f36bed19cd714a4c725a83feaccac3468 Parents: 6a3c627 Author: Yuhao Yang <hhb...@gmail.com> Authored: Thu Jun 23 20:43:19 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 20:43:19 2016 -0700 -- .../apache/spark/ml/regression/AFTSurvivalRegression.scala | 9 + .../org/apache/spark/ml/regression/LinearRegression.scala | 8 +--- 2 files changed, 10 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/14bc5a7f/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 2dbac49..7c51845 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -33,6 +33,7 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -389,10 +390,10 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel] val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath) -.select("coefficients", "intercept", "scale").head() - val coefficients = data.getAs[Vector](0) - val intercept = data.getDouble(1) - val scale = data.getDouble(2) + val Row(coefficients: Vector, intercept: Double, scale: Double) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("coefficients", "intercept", "scale") + .head() val model = new AFTSurvivalRegressionModel(metadata.uid, coefficients, intercept, scale) DefaultParamsReader.getAndSetParams(model, metadata) http://git-wip-us.apache.org/repos/asf/spark/blob/14bc5a7f/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 2723f74..0a4d98c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -39,6 +39,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -500,9 +501,10 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] { val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) -.select("intercept", "coefficients").head() - val intercept = data.getDouble(0) - val coefficients = 
data.getAs[Vector](1) + val Row(intercept: Double, coefficients: Vector) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("intercept", "coefficients") + .head() val model = new LinearRegressionModel(metadata.uid, coefficients, intercept) DefaultParamsReader.getAndSetParams(model, metadata) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first
Repository: spark Updated Branches: refs/heads/master 738f134bf -> 91b1ef28d [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first ## What changes were proposed in this pull request? This PR changes `CombineFilters` to compose the final predicate condition by using (`child predicate` AND `parent predicate`) instead of (`parent predicate` AND `child predicate`). This is a best effort approach. Some other optimization rules may destroy this order by reorganizing conjunctive predicates. **Reported Error Scenario** Chris McCubbin reported a bug when he used StringIndexer in an ML pipeline with additional filters. It seems that during filter pushdown, we changed the ordering in the logical plan. ```scala import org.apache.spark.ml.feature._ val df1 = (0 until 3).map(_.toString).toDF val indexer = new StringIndexer() .setInputCol("value") .setOutputCol("idx") .setHandleInvalid("skip") .fit(df1) val df2 = (0 until 5).map(_.toString).toDF val predictions = indexer.transform(df2) predictions.show() // this is okay predictions.where('idx > 2).show() // this will throw an exception ``` Please see the notebook at https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1233855/2159162931615821/588180/latest.html for error messages. ## How was this patch tested? Pass the Jenkins tests (including a new testcase). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13872 from dongjoon-hyun/SPARK-16164. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91b1ef28 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91b1ef28 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91b1ef28 Branch: refs/heads/master Commit: 91b1ef28d134313d7b6faaffa1c390f3ca4455d0 Parents: 738f134 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Thu Jun 23 15:27:43 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 15:27:43 2016 -0700 -- .../spark/sql/catalyst/optimizer/Optimizer.scala | 2 +- .../catalyst/optimizer/FilterPushdownSuite.scala | 18 ++ 2 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91b1ef28/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 6e78ad0..2bca31d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1002,7 +1002,7 @@ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper { (ExpressionSet(splitConjunctivePredicates(fc)) -- ExpressionSet(splitConjunctivePredicates(nc))).reduceOption(And) match { case Some(ac) => - Filter(And(ac, nc), grandChild) + Filter(And(nc, ac), grandChild) case None => nf } http://git-wip-us.apache.org/repos/asf/spark/blob/91b1ef28/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index b8f28e8..9cb49e7 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -94,6 +94,24 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("SPARK-16164: Filter pushdown should keep the ordering in the logical plan") { +val originalQuery = + testRelation +.where('a === 1) +.select('a, 'b) +.where('b === 1) + +val optimized = Optimize.execute(originalQuery.analyze) +val correctAnswer = + testRelation +.where('a === 1 && 'b === 1) +.select('a, 'b) +.analyze + +// We can not use comparePlans here because it normalized the plan. +assert(optimized == correctAnswer) + } + test("can't push without rewrite") { val originalQuery = testRelation - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first
Repository: spark Updated Branches: refs/heads/branch-2.0 2ce240cfe -> 6cb24de99 [SPARK-16164][SQL] Update `CombineFilters` to try to construct predicates with child predicate first ## What changes were proposed in this pull request? This PR changes `CombineFilters` to compose the final predicate condition by using (`child predicate` AND `parent predicate`) instead of (`parent predicate` AND `child predicate`). This is a best effort approach. Some other optimization rules may destroy this order by reorganizing conjunctive predicates. **Reported Error Scenario** Chris McCubbin reported a bug when he used StringIndexer in an ML pipeline with additional filters. It seems that during filter pushdown, we changed the ordering in the logical plan. ```scala import org.apache.spark.ml.feature._ val df1 = (0 until 3).map(_.toString).toDF val indexer = new StringIndexer() .setInputCol("value") .setOutputCol("idx") .setHandleInvalid("skip") .fit(df1) val df2 = (0 until 5).map(_.toString).toDF val predictions = indexer.transform(df2) predictions.show() // this is okay predictions.where('idx > 2).show() // this will throw an exception ``` Please see the notebook at https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1233855/2159162931615821/588180/latest.html for error messages. ## How was this patch tested? Pass the Jenkins tests (including a new testcase). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13872 from dongjoon-hyun/SPARK-16164. 
(cherry picked from commit 91b1ef28d134313d7b6faaffa1c390f3ca4455d0) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6cb24de9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6cb24de9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6cb24de9 Branch: refs/heads/branch-2.0 Commit: 6cb24de99e011ce97fb7d3513a2760b0d1a85a45 Parents: 2ce240c Author: Dongjoon Hyun <dongj...@apache.org> Authored: Thu Jun 23 15:27:43 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 15:27:50 2016 -0700 -- .../spark/sql/catalyst/optimizer/Optimizer.scala | 2 +- .../catalyst/optimizer/FilterPushdownSuite.scala | 18 ++ 2 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6cb24de9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 6190f7a..6b10484 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -963,7 +963,7 @@ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper { (ExpressionSet(splitConjunctivePredicates(fc)) -- ExpressionSet(splitConjunctivePredicates(nc))).reduceOption(And) match { case Some(ac) => - Filter(And(ac, nc), grandChild) + Filter(And(nc, ac), grandChild) case None => nf } http://git-wip-us.apache.org/repos/asf/spark/blob/6cb24de9/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index b8f28e8..9cb49e7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -94,6 +94,24 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("SPARK-16164: Filter pushdown should keep the ordering in the logical plan") { +val originalQuery = + testRelation +.where('a === 1) +.select('a, 'b) +.where('b === 1) + +val optimized = Optimize.execute(originalQuery.analyze) +val correctAnswer = + testRelation +.where('a === 1 && 'b === 1) +.select('a, 'b) +.analyze + +// We can not use comparePlans here because it normalized the plan. +assert(optimized == correctAnswer) + } + test("can't push without rewrite") { val originalQuery = testRelation ---
spark git commit: [SPARK-16130][ML] model loading backward compatibility for ml.classification.LogisticRegression
Repository: spark Updated Branches: refs/heads/master d85bb10ce -> 60398dabc [SPARK-16130][ML] model loading backward compatibility for ml.classfication.LogisticRegression ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16130 model loading backward compatibility for ml.classfication.LogisticRegression ## How was this patch tested? existing ut and manual test for loading old models. Author: Yuhao Yang <hhb...@gmail.com> Closes #13841 from hhbyyh/lrcomp. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60398dab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60398dab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60398dab Branch: refs/heads/master Commit: 60398dabc50d402bbab4190fbe94ebed6d3a48dc Parents: d85bb10 Author: Yuhao Yang <hhb...@gmail.com> Authored: Thu Jun 23 11:00:00 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 11:00:00 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60398dab/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index be69d46..9c9f5ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -674,12 +674,12 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) -.select("numClasses", "numFeatures", "intercept", "coefficients").head() + // We will need numClasses, numFeatures in the 
future for multinomial logreg support. - // val numClasses = data.getInt(0) - // val numFeatures = data.getInt(1) - val intercept = data.getDouble(2) - val coefficients = data.getAs[Vector](3) + val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("numClasses", "numFeatures", "intercept", "coefficients") + .head() val model = new LogisticRegressionModel(metadata.uid, coefficients, intercept) DefaultParamsReader.getAndSetParams(model, metadata) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16130][ML] model loading backward compatibility for ml.classification.LogisticRegression
Repository: spark Updated Branches: refs/heads/branch-2.0 63fd3301c -> dff3d75db [SPARK-16130][ML] model loading backward compatibility for ml.classfication.LogisticRegression ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16130 model loading backward compatibility for ml.classfication.LogisticRegression ## How was this patch tested? existing ut and manual test for loading old models. Author: Yuhao Yang <hhb...@gmail.com> Closes #13841 from hhbyyh/lrcomp. (cherry picked from commit 60398dabc50d402bbab4190fbe94ebed6d3a48dc) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dff3d75d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dff3d75d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dff3d75d Branch: refs/heads/branch-2.0 Commit: dff3d75db4c2848a43ed8a3084c75f38c93138af Parents: 63fd330 Author: Yuhao Yang <hhb...@gmail.com> Authored: Thu Jun 23 11:00:00 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 11:00:06 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dff3d75d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index be69d46..9c9f5ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -674,12 +674,12 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { val dataPath = new Path(path, "data").toString val data = 
sparkSession.read.format("parquet").load(dataPath) -.select("numClasses", "numFeatures", "intercept", "coefficients").head() + // We will need numClasses, numFeatures in the future for multinomial logreg support. - // val numClasses = data.getInt(0) - // val numFeatures = data.getInt(1) - val intercept = data.getDouble(2) - val coefficients = data.getAs[Vector](3) + val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) = +MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("numClasses", "numFeatures", "intercept", "coefficients") + .head() val model = new LogisticRegressionModel(metadata.uid, coefficients, intercept) DefaultParamsReader.getAndSetParams(model, metadata) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs
Repository: spark Updated Branches: refs/heads/master 5bf2889bf -> 65d1f0f71 [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs ## What changes were proposed in this pull request? Since we decided to switch spark.mllib package into maintenance mode in 2.0, it would be nice to update the package docs to reflect this change. ## How was this patch tested? Manually checked generated APIs. Author: Xiangrui Meng <m...@databricks.com> Closes #13859 from mengxr/SPARK-16154. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65d1f0f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65d1f0f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65d1f0f7 Branch: refs/heads/master Commit: 65d1f0f716f50dd14b5dfe1e7fac772f1b4d2be0 Parents: 5bf2889 Author: Xiangrui Meng <m...@databricks.com> Authored: Thu Jun 23 08:26:17 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 08:26:17 2016 -0700 -- .../scala/org/apache/spark/ml/package-info.java | 7 ++--- .../scala/org/apache/spark/ml/package.scala | 4 +-- .../org/apache/spark/mllib/JavaPackage.java | 31 .../org/apache/spark/mllib/package-info.java| 22 +- .../scala/org/apache/spark/mllib/package.scala | 17 ++- 5 files changed, 72 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/ml/package-info.java -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/package-info.java index 9a40f5dd..cb97382 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/package-info.java +++ b/mllib/src/main/scala/org/apache/spark/ml/package-info.java @@ -16,10 +16,7 @@ */ /** - * Spark ML is a component that adds a new set of machine learning APIs to let users quickly - * assemble and configure practical machine learning pipelines. 
+ * DataFrame-based machine learning APIs to let users quickly assemble and configure practical + * machine learning pipelines. */ -@Experimental package org.apache.spark.ml; - -import org.apache.spark.annotation.Experimental; http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/ml/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala b/mllib/src/main/scala/org/apache/spark/ml/package.scala index 5cc328b..a445c67 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala @@ -18,8 +18,8 @@ package org.apache.spark /** - * Spark ML is a component that adds a new set of machine learning APIs to let users quickly - * assemble and configure practical machine learning pipelines. + * DataFrame-based machine learning APIs to let users quickly assemble and configure practical + * machine learning pipelines. * * @groupname param Parameters * @groupdesc param A list of (hyper-)parameter keys this algorithm can take. Users can set and get http://git-wip-us.apache.org/repos/asf/spark/blob/65d1f0f7/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java new file mode 100644 index 000..22e3452 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib; + +import org.apache.spark.annotation.AlphaComponent; + +/** + * A dummy class as a workaround to show the package doc of spark.mllib in generated + * Java API docs. + * @see http://bugs.java.com/bugdatabase/view_bug.do?bug_id=449265
spark git commit: [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs
Repository: spark Updated Branches: refs/heads/branch-2.0 4ad731ed6 -> 567093596 [SPARK-16154][MLLIB] Update spark.ml and spark.mllib package docs ## What changes were proposed in this pull request? Since we decided to switch spark.mllib package into maintenance mode in 2.0, it would be nice to update the package docs to reflect this change. ## How was this patch tested? Manually checked generated APIs. Author: Xiangrui Meng <m...@databricks.com> Closes #13859 from mengxr/SPARK-16154. (cherry picked from commit 65d1f0f716f50dd14b5dfe1e7fac772f1b4d2be0) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56709359 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56709359 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56709359 Branch: refs/heads/branch-2.0 Commit: 567093596057eb77d940d53c88b82da128acfd9b Parents: 4ad731e Author: Xiangrui Meng <m...@databricks.com> Authored: Thu Jun 23 08:26:17 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jun 23 08:26:25 2016 -0700 -- .../scala/org/apache/spark/ml/package-info.java | 7 ++--- .../scala/org/apache/spark/ml/package.scala | 4 +-- .../org/apache/spark/mllib/JavaPackage.java | 31 .../org/apache/spark/mllib/package-info.java| 22 +- .../scala/org/apache/spark/mllib/package.scala | 17 ++- 5 files changed, 72 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/ml/package-info.java -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/package-info.java index 9a40f5dd..cb97382 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/package-info.java +++ b/mllib/src/main/scala/org/apache/spark/ml/package-info.java @@ -16,10 +16,7 @@ */ /** - * Spark ML is a component that adds a new set of machine learning APIs to let users quickly - * 
assemble and configure practical machine learning pipelines. + * DataFrame-based machine learning APIs to let users quickly assemble and configure practical + * machine learning pipelines. */ -@Experimental package org.apache.spark.ml; - -import org.apache.spark.annotation.Experimental; http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/ml/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala b/mllib/src/main/scala/org/apache/spark/ml/package.scala index 5cc328b..a445c67 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala @@ -18,8 +18,8 @@ package org.apache.spark /** - * Spark ML is a component that adds a new set of machine learning APIs to let users quickly - * assemble and configure practical machine learning pipelines. + * DataFrame-based machine learning APIs to let users quickly assemble and configure practical + * machine learning pipelines. * * @groupname param Parameters * @groupdesc param A list of (hyper-)parameter keys this algorithm can take. Users can set and get http://git-wip-us.apache.org/repos/asf/spark/blob/56709359/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java new file mode 100644 index 000..22e3452 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/JavaPackage.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib; + +import org.apache.spark.annotation.AlphaComponent; + +/** + * A dummy class as a workaround to sh
spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs
Repository: spark Updated Branches: refs/heads/branch-2.0 02435acf3 -> 1d3c56e77 [SPARK-16155][DOC] remove package grouping in Java docs ## What changes were proposed in this pull request? In 1.4 and earlier releases, we have package grouping in the generated Java API docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html. Rather than fixing it, I'd suggest removing grouping. Because it might take some time to fix and it is a manual process to update the grouping in `SparkBuild.scala`. I didn't find anyone complaining about missing groups since 1.5.0 on Google. Manually checked the generated Java API docs and confirmed that they are the same as in master. Author: Xiangrui Meng <m...@databricks.com> Closes #13856 from mengxr/SPARK-16155. (cherry picked from commit 857ecff1d8268b28bb287e47cda370c87afe9d41) Signed-off-by: Xiangrui Meng <m...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3c56e7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3c56e7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3c56e7 Branch: refs/heads/branch-2.0 Commit: 1d3c56e778b28ad4587d07765896814bfc1201f4 Parents: 02435ac Author: Xiangrui Meng <m...@databricks.com> Authored: Wed Jun 22 15:52:37 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jun 22 15:52:47 2016 -0700 -- project/SparkBuild.scala | 20 1 file changed, 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d3c56e7/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bce7f1d..4b44469 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -684,11 +684,6 @@ object Unidoc { import sbtunidoc.Plugin._ import UnidocKeys._ - // for easier specification of JavaDoc package groups - private def packageList(names: String*): String = 
{ -names.map(s => "org.apache.spark." + s).mkString(":") - } - private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -731,21 +726,6 @@ object Unidoc { javacOptions in doc := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-group", "Core Java API", packageList("api.java", "api.java.function"), - "-group", "Spark Streaming", packageList( -"streaming.api.java", "streaming.flume", "streaming.kafka", "streaming.kinesis" - ), - "-group", "MLlib", packageList( -"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", "mllib.linalg", -"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", "mllib.recommendation", -"mllib.regression", "mllib.stat", "mllib.tree", "mllib.tree.configuration", -"mllib.tree.impurity", "mllib.tree.model", "mllib.util", -"mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", -"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", -"ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature", -"ml.param", "ml.recommendation", "ml.regression", "ml.tuning" - ), - "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang" ), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org