svn commit: r29928 - in /dev/spark/3.0.0-SNAPSHOT-2018_10_07_12_02-ebd899b-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Sun Oct  7 19:17:10 2018
New Revision: 29928

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_10_07_12_02-ebd899b docs

[This commit notification would consist of 1481 parts, which exceeds the
limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change
Repository: spark
Updated Branches:
  refs/heads/master 669ade3a8 -> ebd899b8a

[SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change

## What changes were proposed in this pull request?

This is the same as #22492, but for the master branch: revert SPARK-14681 to avoid API-breaking changes.

cc: WeichenXu123

## How was this patch tested?

Existing unit tests.

Closes #22618 from mengxr/SPARK-25321.master.

Authored-by: WeichenXu
Signed-off-by: Dongjoon Hyun

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebd899b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebd899b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebd899b8

Branch: refs/heads/master
Commit: ebd899b8a865395e6f1137163cb508086696879b
Parents: 669ade3
Author: WeichenXu
Authored: Sun Oct 7 10:06:44 2018 -0700
Committer: Dongjoon Hyun
Committed: Sun Oct 7 10:06:44 2018 -0700

----------------------------------------------------------------------
 .../classification/DecisionTreeClassifier.scala |  14 +-
 .../spark/ml/classification/GBTClassifier.scala |   6 +-
 .../classification/RandomForestClassifier.scala |   6 +-
 .../ml/regression/DecisionTreeRegressor.scala   |  13 +-
 .../spark/ml/regression/GBTRegressor.scala      |   6 +-
 .../ml/regression/RandomForestRegressor.scala   |   6 +-
 .../scala/org/apache/spark/ml/tree/Node.scala   | 247 ---
 .../spark/ml/tree/impl/RandomForest.scala       |  10 +-
 .../org/apache/spark/ml/tree/treeModels.scala   |  36 +--
 .../DecisionTreeClassifierSuite.scala           |  31 +--
 .../ml/classification/GBTClassifierSuite.scala  |   4 +-
 .../RandomForestClassifierSuite.scala           |   5 +-
 .../regression/DecisionTreeRegressorSuite.scala |  14 -
 .../spark/ml/tree/impl/RandomForestSuite.scala  |  22 +-
 .../apache/spark/ml/tree/impl/TreeTests.scala   |  12 +-
 project/MimaExcludes.scala                      |   7 -
 16 files changed, 107 insertions(+), 332 deletions(-)

----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/ebd899b8/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 8a57bfc..6648e78 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -168,7 +168,7 @@ object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifi
 @Since("1.4.0")
 class DecisionTreeClassificationModel private[ml] (
     @Since("1.4.0")override val uid: String,
-    @Since("1.4.0")override val rootNode: ClassificationNode,
+    @Since("1.4.0")override val rootNode: Node,
     @Since("1.6.0")override val numFeatures: Int,
     @Since("1.5.0")override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel]
@@ -181,7 +181,7 @@ class DecisionTreeClassificationModel private[ml] (
    * Construct a decision tree classification model.
    * @param rootNode  Root node of tree, with other nodes attached.
    */
-  private[ml] def this(rootNode: ClassificationNode, numFeatures: Int, numClasses: Int) =
+  private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) =
     this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses)

   override def predict(features: Vector): Double = {
@@ -279,9 +279,8 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
       val numFeatures = (metadata.metadata \ "numFeatures").extract[Int]
       val numClasses = (metadata.metadata \ "numClasses").extract[Int]
-      val root = loadTreeNodes(path, metadata, sparkSession, isClassification = true)
-      val model = new DecisionTreeClassificationModel(metadata.uid,
-        root.asInstanceOf[ClassificationNode], numFeatures, numClasses)
+      val root = loadTreeNodes(path, metadata, sparkSession)
+      val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses)
       metadata.getAndSetParams(model)
       model
     }
@@ -296,10 +295,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica
     require(oldModel.algo == OldAlgo.Classification,
       s"Cannot convert non-classification DecisionTreeModel (old API) to" +
       s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}")
-    val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures, isClassification = true)
+    val rootNode = Node.fromOld(oldModel.topNode,
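For context, below is a minimal, hypothetical sketch (not part of this commit) of the kind of downstream code this revert keeps source-compatible: with `rootNode` typed as `Node` again, callers can keep traversing trees through the Spark 2.x `Node`/`InternalNode`/`LeafNode` hierarchy instead of the `ClassificationNode` type introduced by SPARK-14681.

```scala
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.tree.{InternalNode, LeafNode, Node}

object TreeWalkSketch {
  // Count leaves by walking the tree from the root; this compiles because
  // rootNode is again exposed as the general Node type.
  def countLeaves(node: Node): Int = node match {
    case _: LeafNode     => 1
    case n: InternalNode => countLeaves(n.leftChild) + countLeaves(n.rightChild)
  }

  def leavesOf(model: DecisionTreeClassificationModel): Int =
    countLeaves(model.rootNode)
}
```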
spark git commit: [SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method
Repository: spark
Updated Branches:
  refs/heads/master b1328cc58 -> 669ade3a8

[SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method

## What changes were proposed in this pull request?

Refactor `HashBenchmark` to use a main method.

1. Use `spark-submit`:
```console
bin/spark-submit --class org.apache.spark.sql.HashBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar
```

2. Generate the benchmark result file:
```console
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashBenchmark"
```

## How was this patch tested?

Manual tests.

Closes #22651 from wangyum/SPARK-25657.

Lead-authored-by: Yuming Wang
Co-authored-by: Yuming Wang
Co-authored-by: Dongjoon Hyun
Signed-off-by: Dongjoon Hyun

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/669ade3a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/669ade3a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/669ade3a

Branch: refs/heads/master
Commit: 669ade3a8eed0016b5ece57d776cea0616417088
Parents: b1328cc
Author: Yuming Wang
Authored: Sun Oct 7 09:49:37 2018 -0700
Committer: Dongjoon Hyun
Committed: Sun Oct 7 09:49:37 2018 -0700

----------------------------------------------------------------------
 .../benchmarks/HashBenchmark-results.txt     |  70 +
 .../org/apache/spark/sql/HashBenchmark.scala | 152 +++
 2 files changed, 129 insertions(+), 93 deletions(-)

----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/669ade3a/sql/catalyst/benchmarks/HashBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt
new file mode 100644
index 000..2459b35
--- /dev/null
+++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt
@@ -0,0 +1,70 @@
+================================================================================
+single ints
+================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash For single ints:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+interpreted version                        5615 / 5616         95.6          10.5       1.0X
+codegen version                            8400 / 8407         63.9          15.6       0.7X
+codegen version 64-bit                     8139 / 8145         66.0          15.2       0.7X
+codegen HiveHash version                   7213 / 7348         74.4          13.4       0.8X
+
+
+================================================================================
+single longs
+================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash For single longs:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+interpreted version                        6053 / 6054         88.7          11.3       1.0X
+codegen version                            9367 / 9369         57.3          17.4       0.6X
+codegen version 64-bit                     8041 / 8051         66.8          15.0       0.8X
+codegen HiveHash version                   7546 / 7575         71.1          14.1       0.8X
+
+
+================================================================================
+normal
+================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash For normal:                     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+interpreted version                        3181 / 3182          0.7        1517.0       1.0X
+codegen version                            2403 / 2403          0.9        1145.7       1.3X
+codegen version 64-bit                      915 /  916          2.3         436.2       3.5X
+codegen HiveHash version                   4505 / 4527          0.5        2148.3       0.7X
+
+
+================================================================================
+array
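For readers unfamiliar with the pattern, here is a schematic sketch of what "use main method" means for these benchmarks: a plain `main` entry point that prints results by default and writes a results file when `SPARK_GENERATE_BENCHMARK_FILES=1` is set, which is what makes both invocations above work. This is an illustrative outline, not Spark's actual benchmark harness; the object name and output path are hypothetical.

```scala
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

object MainMethodBenchmarkSketch {
  // Stand-in for the real timing logic: returns a formatted results table.
  private def runBenchmarks(): String =
    "Hash For single ints: ...\n"

  def main(args: Array[String]): Unit = {
    val output = runBenchmarks()
    // Regenerate the checked-in results file only when explicitly requested,
    // mirroring the env-var convention shown in the PR description.
    if (sys.env.get("SPARK_GENERATE_BENCHMARK_FILES").contains("1")) {
      Files.write(
        Paths.get("benchmarks/HashBenchmark-results.txt"), // hypothetical path
        output.getBytes(StandardCharsets.UTF_8))
    } else {
      print(output)
    }
  }
}
```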
spark git commit: [SPARK-25658][SQL][TEST] Refactor HashByteArrayBenchmark to use main method
Repository: spark
Updated Branches:
  refs/heads/master 3eb842969 -> b1328cc58

[SPARK-25658][SQL][TEST] Refactor HashByteArrayBenchmark to use main method

## What changes were proposed in this pull request?

Refactor `HashByteArrayBenchmark` to use a main method.

1. Use `spark-submit`:
```console
bin/spark-submit --class org.apache.spark.sql.HashByteArrayBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar
```

2. Generate the benchmark result file:
```console
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashByteArrayBenchmark"
```

## How was this patch tested?

Manual tests.

Closes #22652 from wangyum/SPARK-25658.

Lead-authored-by: Yuming Wang
Co-authored-by: Yuming Wang
Co-authored-by: Dongjoon Hyun
Signed-off-by: Dongjoon Hyun

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1328cc5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1328cc5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1328cc5

Branch: refs/heads/master
Commit: b1328cc58ebb73bc191de5546735cffe0c68255e
Parents: 3eb8429
Author: Yuming Wang
Authored: Sun Oct 7 09:44:01 2018 -0700
Committer: Dongjoon Hyun
Committed: Sun Oct 7 09:44:01 2018 -0700

----------------------------------------------------------------------
 .../HashByteArrayBenchmark-results.txt     |  77 
 .../spark/sql/HashByteArrayBenchmark.scala | 120 ---
 2 files changed, 102 insertions(+), 95 deletions(-)

----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/b1328cc5/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt
new file mode 100644
index 000..a4304ee
--- /dev/null
+++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt
@@ -0,0 +1,77 @@
+================================================================================
+Benchmark for MurMurHash 3 and xxHash64
+================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash byte arrays with length 8:      Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+Murmur3_x86_32                               16 /   16        127.7           7.8       1.0X
+xxHash 64-bit                                23 /   23         90.7          11.0       0.7X
+HiveHasher                                   16 /   16        134.8           7.4       1.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash byte arrays with length 16:     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+Murmur3_x86_32                               26 /   26         79.5          12.6       1.0X
+xxHash 64-bit                                26 /   27         79.3          12.6       1.0X
+HiveHasher                                   30 /   30         70.1          14.3       0.9X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash byte arrays with length 24:     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+Murmur3_x86_32                               36 /   36         58.1          17.2       1.0X
+xxHash 64-bit                                30 /   30         70.2          14.2       1.2X
+HiveHasher                                   45 /   45         46.4          21.5       0.8X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash byte arrays with length 31:     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------
+Murmur3_x86_32                               50 /   50         41.8          23.9       1.0X
+xxHash 64-bit                                43 /   43         49.3          20.3       1.2X
+HiveHasher                                   58 /   58         35.9          27.8       0.9X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Hash byte arrays with length 95:
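As background, each row above times repeated calls like the following hedged sketch, which hashes a raw byte array with Spark's `Murmur3_x86_32` from the `spark-unsafe` module (one of the three hashers compared); the array length matches one of the benchmarked cases and the seed value is arbitrary.

```scala
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.hash.Murmur3_x86_32

object HashByteArraySketch {
  def main(args: Array[String]): Unit = {
    // A byte array of one of the benchmarked lengths (31 bytes here).
    val bytes = Array.tabulate[Byte](31)(_.toByte)
    // Hash the raw bytes directly, as one benchmark iteration does.
    val hash = Murmur3_x86_32.hashUnsafeBytes(
      bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, 42)
    println(s"Murmur3_x86_32 hash = $hash")
  }
}
```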
spark git commit: [SPARK-25461][PYSPARK][SQL] Add document for mismatch between return type of Pandas.Series and return type of pandas udf
Repository: spark
Updated Branches:
  refs/heads/master fba722e31 -> 3eb842969

[SPARK-25461][PYSPARK][SQL] Add document for mismatch between return type of Pandas.Series and return type of pandas udf

## What changes were proposed in this pull request?

For Pandas UDFs, we derive the Arrow type from the declared Catalyst return data type of the UDF and use that Arrow type to serialize the data. If the declared return data type doesn't match the actual type of the `pandas.Series` the UDF returns, there is a risk of returning incorrect data from the Python side. We currently have no reliable way to check whether the conversion is safe, so for now we document this for users. Once a PyArrow upgrade that can check this becomes available, we should add an option to perform the check.

## How was this patch tested?

Only a documentation change.

Closes #22610 from viirya/SPARK-25461.

Authored-by: Liang-Chi Hsieh
Signed-off-by: hyukjinkwon

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3eb84296
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3eb84296
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3eb84296

Branch: refs/heads/master
Commit: 3eb842969906d6e81a137af6dc4339881df0a315
Parents: fba722e
Author: Liang-Chi Hsieh
Authored: Sun Oct 7 23:18:46 2018 +0800
Committer: hyukjinkwon
Committed: Sun Oct 7 23:18:46 2018 +0800

----------------------------------------------------------------------
 python/pyspark/sql/functions.py | 6 ++
 1 file changed, 6 insertions(+)

----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3eb84296/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 7685264..be089ee 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2948,6 +2948,12 @@ def pandas_udf(f=None, returnType=None, functionType=None):
         can fail on special rows, the workaround is to incorporate the condition into the functions.

     .. note:: The user-defined functions do not take keyword arguments on the calling side.
+
+    .. note:: The data type of returned `pandas.Series` from the user-defined functions should be
+        matched with defined returnType (see :meth:`types.to_arrow_type` and
+        :meth:`types.from_arrow_type`). When there is mismatch between them, Spark might do
+        conversion on returned data. The conversion is not guaranteed to be correct and results
+        should be checked for accuracy by users.
     """
     # decorator @pandas_udf(returnType, functionType)
     is_decorator = f is None or isinstance(f, (str, DataType))
svn commit: r29927 - in /dev/spark/3.0.0-SNAPSHOT-2018_10_07_08_02-fba722e-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Sun Oct  7 15:17:22 2018
New Revision: 29927

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_10_07_08_02-fba722e docs

[This commit notification would consist of 1485 parts, which exceeds the
limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-25539][BUILD] Upgrade lz4-java to 1.5.0 to get speed improvement
Repository: spark
Updated Branches:
  refs/heads/master 8bb242902 -> fba722e31

[SPARK-25539][BUILD] Upgrade lz4-java to 1.5.0 to get speed improvement

## What changes were proposed in this pull request?

This PR upgrades `lz4-java` to 1.5.0 to get a speed improvement.

**General speed improvements**

LZ4 decompression speed has always been a strong point. In v1.8.2 this gets even better, as decompression speed improves by about 10%, thanks in large part to a suggestion from svpv. For example, on a Mac OS X laptop with an Intel Core i7-5557U CPU @ 3.10GHz, running `lz4 -b silesia.tar` compiled with the default compiler, llvm v9.1.0:

Version | v1.8.1 | v1.8.2 | Improvement
-- | -- | -- | --
Decompression speed | 2490 MB/s | 2770 MB/s | +11%

Compression speeds also receive a welcome boost, though the improvement is not evenly distributed, with higher levels benefiting quite a lot more:

Version | v1.8.1 | v1.8.2 | Improvement
-- | -- | -- | --
lz4 -1 | 504 MB/s | 516 MB/s | +2%
lz4 -9 | 23.2 MB/s | 25.6 MB/s | +10%
lz4 -12 | 3.5 MB/s | 9.5 MB/s | +170%

More details: https://github.com/lz4/lz4/releases/tag/v1.8.3

**Below is my benchmark result**

Set `spark.sql.parquet.compression.codec` to `lz4`, disable the ORC benchmark, then run `FilterPushdownBenchmark`.

lz4-java 1.5.0:
```
[success] Total time: 5585 s, completed Sep 26, 2018 5:22:16 PM
```

lz4-java 1.4.0:
```
[success] Total time: 5591 s, completed Sep 26, 2018 5:22:24 PM
```

Some benchmark results:
```
lz4-java 1.5.0
Select 1 row with 500 filters:        Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
---------------------------------------------------------------------------------------------
Parquet Vectorized                         1953 / 1980          0.0  1952502908.0       1.0X
Parquet Vectorized (Pushdown)              2541 / 2585          0.0  2541019869.0       0.8X

lz4-java 1.4.0
Select 1 row with 500 filters:        Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
---------------------------------------------------------------------------------------------
Parquet Vectorized                         1979 / 2103          0.0  1979328144.0       1.0X
Parquet Vectorized (Pushdown)              2596 / 2909          0.0  2596222118.0       0.8X
```

Complete benchmark results:
https://issues.apache.org/jira/secure/attachment/12941360/FilterPushdownBenchmark-lz4-java-140-results.txt
https://issues.apache.org/jira/secure/attachment/12941361/FilterPushdownBenchmark-lz4-java-150-results.txt

## How was this patch tested?

Manual tests.

Closes #22551 from wangyum/SPARK-25539.
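To reproduce the setup described above, here is a minimal sketch (assuming a local Spark build with this change on the classpath) that writes and reads Parquet through the lz4 codec, the path the `FilterPushdownBenchmark` numbers exercise; the object name and output path are arbitrary:

```scala
import org.apache.spark.sql.SparkSession

object Lz4ParquetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Lz4ParquetSketch")
      .master("local[*]")
      // Route Parquet writes through lz4, so reads exercise lz4-java on decompression.
      .config("spark.sql.parquet.compression.codec", "lz4")
      .getOrCreate()

    val path = "/tmp/lz4-parquet-sketch" // scratch location, adjust as needed
    spark.range(0, 1000000).write.mode("overwrite").parquet(path)
    println(spark.read.parquet(path).count())
    spark.stop()
  }
}
```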
Authored-by: Yuming Wang
Signed-off-by: Sean Owen

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fba722e3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fba722e3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fba722e3

Branch: refs/heads/master
Commit: fba722e319e356113a69c54f59e23150017634ae
Parents: 8bb2429
Author: Yuming Wang
Authored: Sun Oct 7 09:51:33 2018 -0500
Committer: Sean Owen
Committed: Sun Oct 7 09:51:33 2018 -0500

----------------------------------------------------------------------
 dev/deps/spark-deps-hadoop-2.6 | 2 +-
 dev/deps/spark-deps-hadoop-2.7 | 2 +-
 dev/deps/spark-deps-hadoop-3.1 | 2 +-
 pom.xml                        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/fba722e3/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 22e86ef..e0e3e0a 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -138,7 +138,7 @@ libfb303-0.9.3.jar
 libthrift-0.9.3.jar
 log4j-1.2.17.jar
 logging-interceptor-3.8.1.jar
-lz4-java-1.4.0.jar
+lz4-java-1.5.0.jar
 machinist_2.11-0.6.1.jar
 macro-compat_2.11-1.1.1.jar
 mesos-1.4.0-shaded-protobuf.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/fba722e3/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 19dd786..3b17f88 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -139,7 +139,7 @@ libfb303-0.9.3.jar
 libthrift-0.9.3.jar
 log4j-1.2.17.jar
 logging-interceptor-3.8.1.jar
-lz4-java-1.4.0.jar
+lz4-java-1.5.0.jar
 machinist_2.11-0.6.1.jar
 macro-compat_2.11-1.1.1.jar
 mesos-1.4.0-shaded-protobuf.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/fba722e3/dev/deps/spark-deps-hadoop-3.1
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index ea0f487..c818b2c 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@