spark git commit: [SPARK-25425][SQL] Extra options should override session options in DataSource V2
Repository: spark Updated Branches: refs/heads/master bb2f069cf -> e06da95cd [SPARK-25425][SQL] Extra options should override session options in DataSource V2 ## What changes were proposed in this pull request? In the PR, I propose overriding session options by extra options in DataSource V2. Extra options are more specific and set via `.option()`, and should overwrite more generic session options. Entries from the second map overwrite entries with the same key from the first map, for example: ```Scala scala> Map("option" -> false) ++ Map("option" -> true) res0: scala.collection.immutable.Map[String,Boolean] = Map(option -> true) ``` ## How was this patch tested? Added a test for checking which option is propagated to a data source in `load()`. Closes #22413 from MaxGekk/session-options. Lead-authored-by: Maxim Gekk Co-authored-by: Dongjoon Hyun Co-authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e06da95c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e06da95c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e06da95c Branch: refs/heads/master Commit: e06da95cd9423f55cdb154a2778b0bddf7be984c Parents: bb2f069 Author: Maxim Gekk Authored: Sat Sep 15 17:24:11 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Sep 15 17:24:11 2018 -0700 -- .../org/apache/spark/sql/DataFrameReader.scala | 2 +- .../org/apache/spark/sql/DataFrameWriter.scala | 8 +++-- .../sql/sources/v2/DataSourceV2Suite.scala | 35 +++- .../sources/v2/SimpleWritableDataSource.scala | 6 +++- 4 files changed, 45 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e06da95c/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e6c2cba..fe69f25 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -202,7 +202,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { DataSourceOptions.PATHS_KEY -> objectMapper.writeValueAsString(paths.toArray) } Dataset.ofRows(sparkSession, DataSourceV2Relation.create( - ds, extraOptions.toMap ++ sessionOptions + pathsOption, + ds, sessionOptions ++ extraOptions.toMap + pathsOption, userSpecifiedSchema = userSpecifiedSchema)) } else { loadV1Source(paths: _*) http://git-wip-us.apache.org/repos/asf/spark/blob/e06da95c/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index dfb8c47..188fce7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -241,10 +241,12 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val source = cls.newInstance().asInstanceOf[DataSourceV2] source match { case provider: BatchWriteSupportProvider => - val options = extraOptions ++ - DataSourceV2Utils.extractSessionConfigs(source, df.sparkSession.sessionState.conf) + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( +source, +df.sparkSession.sessionState.conf) + val options = sessionOptions ++ extraOptions - val relation = DataSourceV2Relation.create(source, options.toMap) + val relation = DataSourceV2Relation.create(source, options) if (mode == SaveMode.Append) { runCommand(df.sparkSession, "save") { AppendData.byName(relation, df.logicalPlan) http://git-wip-us.apache.org/repos/asf/spark/blob/e06da95c/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index f6c3e0c..7cc8abc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -17,6 +17,8 @@ package org.apache.sp
[2/2] spark git commit: [SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption
[SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption ## What changes were proposed in this pull request? This PR aims to fix three things in `FilterPushdownBenchmark`. **1. Use the same memory assumption.** The following configurations are used in ORC and Parquet. - Memory buffer for writing - parquet.block.size (default: 128MB) - orc.stripe.size (default: 64MB) - Compression chunk size - parquet.page.size (default: 1MB) - orc.compress.size (default: 256KB) SPARK-24692 used 1MB, the default value of `parquet.page.size`, for `parquet.block.size` and `orc.stripe.size`. But, it failed to match `orc.compress.size`. So, the current benchmark shows the result from ORC with 256KB memory for compression and Parquet with 1MB. To compare correctly, we need to be consistent. **2. Dictionary encoding should not be enforced for all cases.** SPARK-24206 enforced dictionary encoding for all test cases. This PR recovers the default behavior in general and enforces dictionary encoding only in case of `prepareStringDictTable`. **3. Generate test result on AWS r3.xlarge** SPARK-24206 generated the result on AWS in order to reproduce and compare easily. This PR also aims to update the result on the same machine again for the same reason. Specifically, AWS r3.xlarge with Instance Store is used. ## How was this patch tested? Manual. Enable the test cases and run `FilterPushdownBenchmark` on `AWS r3.xlarge`. It takes about 4 hours 15 minutes. Closes #22427 from dongjoon-hyun/SPARK-25438. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fefaa3c3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fefaa3c3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fefaa3c3 Branch: refs/heads/master Commit: fefaa3c30df2c56046370081cb51bfe68d26976b Parents: e06da95 Author: Dongjoon Hyun Authored: Sat Sep 15 17:48:39 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Sep 15 17:48:39 2018 -0700 -- .../FilterPushdownBenchmark-results.txt | 912 +-- .../benchmark/FilterPushdownBenchmark.scala | 11 +- 2 files changed, 428 insertions(+), 495 deletions(-) -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[2/2] spark git commit: [SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption
[SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption ## What changes were proposed in this pull request? This PR aims to fix three things in `FilterPushdownBenchmark`. **1. Use the same memory assumption.** The following configurations are used in ORC and Parquet. - Memory buffer for writing - parquet.block.size (default: 128MB) - orc.stripe.size (default: 64MB) - Compression chunk size - parquet.page.size (default: 1MB) - orc.compress.size (default: 256KB) SPARK-24692 used 1MB, the default value of `parquet.page.size`, for `parquet.block.size` and `orc.stripe.size`. But, it missed to match `orc.compress.size`. So, the current benchmark shows the result from ORC with 256KB memory for compression and Parquet with 1MB. To compare correctly, we need to be consistent. **2. Dictionary encoding should not be enforced for all cases.** SPARK-24206 enforced dictionary encoding for all test cases. This PR recovers the default behavior in general and enforces dictionary encoding only in case of `prepareStringDictTable`. **3. Generate test result on AWS r3.xlarge** SPARK-24206 generated the result on AWS in order to reproduce and compare easily. This PR also aims to update the result on the same machine again in the same reason. Specifically, AWS r3.xlarge with Instance Store is used. ## How was this patch tested? Manual. Enable the test cases and run `FilterPushdownBenchmark` on `AWS r3.xlarge`. It takes about 4 hours 15 minutes. Closes #22427 from dongjoon-hyun/SPARK-25438. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit fefaa3c30df2c56046370081cb51bfe68d26976b) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b40e5fee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b40e5fee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b40e5fee Branch: refs/heads/branch-2.4 Commit: b40e5feec2660891590e21807133a508cbd004d3 Parents: ae2ca0e Author: Dongjoon Hyun Authored: Sat Sep 15 17:48:39 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Sep 15 17:48:53 2018 -0700 -- .../FilterPushdownBenchmark-results.txt | 912 +-- .../benchmark/FilterPushdownBenchmark.scala | 11 +- 2 files changed, 428 insertions(+), 495 deletions(-) -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[1/2] spark git commit: [SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption
Repository: spark Updated Branches: refs/heads/master e06da95cd -> fefaa3c30 http://git-wip-us.apache.org/repos/asf/spark/blob/fefaa3c3/sql/core/benchmarks/FilterPushdownBenchmark-results.txt -- diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index a75a15c..e680ddf 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,737 +2,669 @@ Pushdown for many distinct value case -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row (value IS NULL): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized8970 / 9122 1.8 570.3 1.0X -Parquet Vectorized (Pushdown) 471 / 491 33.4 30.0 19.0X -Native ORC Vectorized 7661 / 7853 2.1 487.0 1.2X -Native ORC Vectorized (Pushdown) 1134 / 1161 13.9 72.1 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11405 / 11485 1.4 725.1 1.0X +Parquet Vectorized (Pushdown) 675 / 690 23.3 42.9 16.9X +Native ORC Vectorized 7127 / 7170 2.2 453.1 1.6X +Native ORC Vectorized (Pushdown) 519 / 541 30.3 33.0 22.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row ('7864320' < value < '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -Parquet Vectorized9246 / 9297 1.7 587.8 1.0X -Parquet Vectorized (Pushdown) 480 / 488 32.8 30.5 19.3X -Native ORC Vectorized 7838 / 7850 2.0 498.3 1.2X -Native ORC Vectorized (Pushdown) 1054 / 1118 14.9 67.0 8.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11457 / 11473 1.4 728.4 1.0X 
+Parquet Vectorized (Pushdown) 656 / 686 24.0 41.7 17.5X +Native ORC Vectorized 7328 / 7342 2.1 465.9 1.6X +Native ORC Vectorized (Pushdown) 539 / 565 29.2 34.2 21.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value = '7864320'): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized8989 / 9100 1.7 571.5 1.0X -Parquet Vectorized (Pushdown) 448 / 467 35.1 28.5 20.1X -Native ORC Vectorized 7680 / 7768 2.0 488.3 1.2X -Native ORC Vectorized (Pushdown) 1067 / 1118 14.7 67.8 8.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11878 / 11888 1.3 755.2 1.0X +Parquet Vectorized (Pushdown) 630 / 654 25.0 40.1 18.9X +Native ORC Vectorized 7342 / 7362 2.1 466.8 1.6X +Native ORC Vectorized (Pushdown) 519 / 537 30.3 33.0 22.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value <=> '7864320'): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized9115 / 9266 1.7 579.5 1.0X -Parquet Vectorized (Pushdown) 466 / 492 33.7 29.7 19.5X -N
[1/2] spark git commit: [SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same memory assumption
Repository: spark Updated Branches: refs/heads/branch-2.4 ae2ca0e5d -> b40e5feec http://git-wip-us.apache.org/repos/asf/spark/blob/b40e5fee/sql/core/benchmarks/FilterPushdownBenchmark-results.txt -- diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index a75a15c..e680ddf 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,737 +2,669 @@ Pushdown for many distinct value case -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row (value IS NULL): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized8970 / 9122 1.8 570.3 1.0X -Parquet Vectorized (Pushdown) 471 / 491 33.4 30.0 19.0X -Native ORC Vectorized 7661 / 7853 2.1 487.0 1.2X -Native ORC Vectorized (Pushdown) 1134 / 1161 13.9 72.1 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11405 / 11485 1.4 725.1 1.0X +Parquet Vectorized (Pushdown) 675 / 690 23.3 42.9 16.9X +Native ORC Vectorized 7127 / 7170 2.2 453.1 1.6X +Native ORC Vectorized (Pushdown) 519 / 541 30.3 33.0 22.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row ('7864320' < value < '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -Parquet Vectorized9246 / 9297 1.7 587.8 1.0X -Parquet Vectorized (Pushdown) 480 / 488 32.8 30.5 19.3X -Native ORC Vectorized 7838 / 7850 2.0 498.3 1.2X -Native ORC Vectorized (Pushdown) 1054 / 1118 14.9 67.0 8.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11457 / 11473 1.4 728.4 1.0X 
+Parquet Vectorized (Pushdown) 656 / 686 24.0 41.7 17.5X +Native ORC Vectorized 7328 / 7342 2.1 465.9 1.6X +Native ORC Vectorized (Pushdown) 539 / 565 29.2 34.2 21.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value = '7864320'): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized8989 / 9100 1.7 571.5 1.0X -Parquet Vectorized (Pushdown) 448 / 467 35.1 28.5 20.1X -Native ORC Vectorized 7680 / 7768 2.0 488.3 1.2X -Native ORC Vectorized (Pushdown) 1067 / 1118 14.7 67.8 8.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11878 / 11888 1.3 755.2 1.0X +Parquet Vectorized (Pushdown) 630 / 654 25.0 40.1 18.9X +Native ORC Vectorized 7342 / 7362 2.1 466.8 1.6X +Native ORC Vectorized (Pushdown) 519 / 537 30.3 33.0 22.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value <=> '7864320'): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -Parquet Vectorized9115 / 9266 1.7 579.5 1.0X -Parquet Vectorized (Pushdown) 466 / 492 33.7 29.7 19.5
spark git commit: [SPARK-25423][SQL] Output "dataFilters" in DataSourceScanExec.metadata
Repository: spark Updated Branches: refs/heads/master 30aa37fca -> 4b9542e3a [SPARK-25423][SQL] Output "dataFilters" in DataSourceScanExec.metadata ## What changes were proposed in this pull request? Output `dataFilters` in `DataSourceScanExec.metadata`. ## How was this patch tested? unit tests Closes #22435 from wangyum/SPARK-25423. Authored-by: Yuming Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b9542e3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b9542e3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b9542e3 Branch: refs/heads/master Commit: 4b9542e3a3d0c493a05061be5a9f8d278c0ac980 Parents: 30aa37f Author: Yuming Wang Authored: Mon Sep 17 11:26:08 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 17 11:26:08 2018 -0700 -- .../spark/sql/execution/DataSourceScanExec.scala| 1 + .../DataSourceScanExecRedactionSuite.scala | 16 2 files changed, 17 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b9542e3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 36ed016..738c066 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -284,6 +284,7 @@ case class FileSourceScanExec( "Batched" -> supportsBatch.toString, "PartitionFilters" -> seqToString(partitionFilters), "PushedFilters" -> seqToString(pushedDownFilters), +"DataFilters" -> seqToString(dataFilters), "Location" -> locationDesc) val withOptPartitionCount = relation.partitionSchemaOption.map { _ => 
http://git-wip-us.apache.org/repos/asf/spark/blob/4b9542e3/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala index c8d045a..11a1c9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala @@ -83,4 +83,20 @@ class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext { } } + test("FileSourceScanExec metadata") { +withTempPath { path => + val dir = path.getCanonicalPath + spark.range(0, 10).write.parquet(dir) + val df = spark.read.parquet(dir) + + assert(isIncluded(df.queryExecution, "Format")) + assert(isIncluded(df.queryExecution, "ReadSchema")) + assert(isIncluded(df.queryExecution, "Batched")) + assert(isIncluded(df.queryExecution, "PartitionFilters")) + assert(isIncluded(df.queryExecution, "PushedFilters")) + assert(isIncluded(df.queryExecution, "DataFilters")) + assert(isIncluded(df.queryExecution, "Location")) +} + } + } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16323][SQL] Add IntegralDivide expression
Repository: spark Updated Branches: refs/heads/master 4b9542e3a -> 553af22f2 [SPARK-16323][SQL] Add IntegralDivide expression ## What changes were proposed in this pull request? The PR takes over #14036 and it introduces a new expression `IntegralDivide` in order to avoid the several unneeded casts added previously. In order to prove the performance gain, the following benchmark has been run: ``` test("Benchmark IntegralDivide") { val r = new scala.util.Random(91) val nData = 100 val testDataInt = (1 to nData).map(_ => (r.nextInt(), r.nextInt())) val testDataLong = (1 to nData).map(_ => (r.nextLong(), r.nextLong())) val testDataShort = (1 to nData).map(_ => (r.nextInt().toShort, r.nextInt().toShort)) // old code val oldExprsInt = testDataInt.map(x => Cast(Divide(Cast(Literal(x._1), DoubleType), Cast(Literal(x._2), DoubleType)), LongType)) val oldExprsLong = testDataLong.map(x => Cast(Divide(Cast(Literal(x._1), DoubleType), Cast(Literal(x._2), DoubleType)), LongType)) val oldExprsShort = testDataShort.map(x => Cast(Divide(Cast(Literal(x._1), DoubleType), Cast(Literal(x._2), DoubleType)), LongType)) // new code val newExprsInt = testDataInt.map(x => IntegralDivide(x._1, x._2)) val newExprsLong = testDataLong.map(x => IntegralDivide(x._1, x._2)) val newExprsShort = testDataShort.map(x => IntegralDivide(x._1, x._2)) Seq(("Long", "old", oldExprsLong), ("Long", "new", newExprsLong), ("Int", "old", oldExprsInt), ("Int", "new", newExprsShort), ("Short", "old", oldExprsShort), ("Short", "new", oldExprsShort)).foreach { case (dt, t, ds) => val start = System.nanoTime() ds.foreach(e => e.eval(EmptyRow)) val endNoCodegen = System.nanoTime() println(s"Running $nData op with $t code on $dt (no-codegen): ${(endNoCodegen - start) / 100} ms") } } ``` The results on my laptop are: ``` Running 100 op with old code on Long (no-codegen): 600 ms Running 100 op with new code on Long (no-codegen): 112 ms Running 100 op with old code on Int (no-codegen): 560 ms Running 100 op with new code 
on Int (no-codegen): 135 ms Running 100 op with old code on Short (no-codegen): 317 ms Running 100 op with new code on Short (no-codegen): 153 ms ``` Showing a 2-5X improvement. The benchmark doesn't include code generation as it is pretty hard to test the performance there as for such simple operations the most of the time is spent in the code generation/compilation process. ## How was this patch tested? added UTs Closes #22395 from mgaido91/SPARK-16323. Authored-by: Marco Gaido Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/553af22f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/553af22f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/553af22f Branch: refs/heads/master Commit: 553af22f2c8ecdc039c8d06431564b1432e60d2d Parents: 4b9542e Author: Marco Gaido Authored: Mon Sep 17 11:33:50 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 17 11:33:50 2018 -0700 -- .../catalyst/analysis/FunctionRegistry.scala| 1 + .../apache/spark/sql/catalyst/dsl/package.scala | 1 + .../sql/catalyst/expressions/arithmetic.scala | 28 .../spark/sql/catalyst/parser/AstBuilder.scala | 2 +- .../expressions/ArithmeticExpressionSuite.scala | 18 ++--- .../catalyst/parser/ExpressionParserSuite.scala | 4 +-- .../sql-tests/results/operators.sql.out | 8 +++--- 7 files changed, 45 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/553af22f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 77860e1..8b69a47 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ 
-267,6 +267,7 @@ object FunctionRegistry { expression[Subtract]("-"), expression[Multiply]("*"), expression[Divide]("/"), +expression[IntegralDivide]("div"), expression[Remainder]("%"), // aggregate functions http://git-wip-us.apache.org/repos/asf/spark/blob/
spark git commit: Revert "[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema"
Repository: spark Updated Branches: refs/heads/master a71f6a175 -> cb1b55cf7 Revert "[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema" This reverts commit 6c7db7fd1ced1d143b1389d09990a620fc16be46. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb1b55cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb1b55cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb1b55cf Branch: refs/heads/master Commit: cb1b55cf771018f1560f6b173cdd7c6ca8061bc7 Parents: a71f6a1 Author: Dongjoon Hyun Authored: Wed Sep 19 14:33:40 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 19 14:33:40 2018 -0700 -- .../sql/catalyst/expressions/jsonExpressions.scala | 4 ++-- .../org/apache/spark/sql/internal/SQLConf.scala | 16 2 files changed, 10 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cb1b55cf/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index ade10ab..bd9090a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -517,12 +517,12 @@ case class JsonToStructs( timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes { - val forceNullableSchema: Boolean = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) + val forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) // The JSON input data might be missing certain fields. We force the nullability // of the user-provided schema to avoid data corruptions. 
In particular, the parquet-mr encoder // can generate incorrect files if values are missing in columns declared as non-nullable. - val nullableSchema: DataType = if (forceNullableSchema) schema.asNullable else schema + val nullableSchema = if (forceNullableSchema) schema.asNullable else schema override def nullable: Boolean = true http://git-wip-us.apache.org/repos/asf/spark/blob/cb1b55cf/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4499a35..b1e9b17 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -608,6 +608,14 @@ object SQLConf { .stringConf .createWithDefault("_corrupt_record") + val FROM_JSON_FORCE_NULLABLE_SCHEMA = buildConf("spark.sql.fromJsonForceNullableSchema") +.internal() +.doc("When true, force the output schema of the from_json() function to be nullable " + + "(including all the fields). Otherwise, the schema might not be compatible with" + + "actual data, which leads to curruptions.") +.booleanConf +.createWithDefault(true) + val BROADCAST_TIMEOUT = buildConf("spark.sql.broadcastTimeout") .doc("Timeout in seconds for the broadcast wait time in broadcast joins.") .timeConf(TimeUnit.SECONDS) @@ -1354,14 +1362,6 @@ object SQLConf { "When this conf is not set, the value from `spark.redaction.string.regex` is used.") .fallbackConf(org.apache.spark.internal.config.STRING_REDACTION_PATTERN) - val FROM_JSON_FORCE_NULLABLE_SCHEMA = buildConf("spark.sql.function.fromJson.forceNullable") -.internal() -.doc("When true, force the output schema of the from_json() function to be nullable " + - "(including all the fields). 
Otherwise, the schema might not be compatible with" + - "actual data, which leads to corruptions.") -.booleanConf -.createWithDefault(true) - val CONCAT_BINARY_AS_STRING = buildConf("spark.sql.function.concatBinaryAsString") .doc("When this option is set to false and all inputs are binary, `functions.concat` returns " + "an output as binary. Otherwise, it returns as a string. ") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema"
Repository: spark Updated Branches: refs/heads/branch-2.4 538ae62e0 -> 9fefb47fe Revert "[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema" This reverts commit 6c7db7fd1ced1d143b1389d09990a620fc16be46. (cherry picked from commit cb1b55cf771018f1560f6b173cdd7c6ca8061bc7) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9fefb47f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9fefb47f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9fefb47f Branch: refs/heads/branch-2.4 Commit: 9fefb47feab14b865978bdb8e6155a976de72416 Parents: 538ae62 Author: Dongjoon Hyun Authored: Wed Sep 19 14:33:40 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 19 14:38:21 2018 -0700 -- .../sql/catalyst/expressions/jsonExpressions.scala | 4 ++-- .../org/apache/spark/sql/internal/SQLConf.scala | 16 2 files changed, 10 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9fefb47f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index ade10ab..bd9090a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -517,12 +517,12 @@ case class JsonToStructs( timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes { - val forceNullableSchema: Boolean = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) + val forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA) // The JSON input data might be missing certain fields. 
We force the nullability // of the user-provided schema to avoid data corruptions. In particular, the parquet-mr encoder // can generate incorrect files if values are missing in columns declared as non-nullable. - val nullableSchema: DataType = if (forceNullableSchema) schema.asNullable else schema + val nullableSchema = if (forceNullableSchema) schema.asNullable else schema override def nullable: Boolean = true http://git-wip-us.apache.org/repos/asf/spark/blob/9fefb47f/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5e2ac02..3e9cde4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -588,6 +588,14 @@ object SQLConf { .stringConf .createWithDefault("_corrupt_record") + val FROM_JSON_FORCE_NULLABLE_SCHEMA = buildConf("spark.sql.fromJsonForceNullableSchema") +.internal() +.doc("When true, force the output schema of the from_json() function to be nullable " + + "(including all the fields). Otherwise, the schema might not be compatible with" + + "actual data, which leads to curruptions.") +.booleanConf +.createWithDefault(true) + val BROADCAST_TIMEOUT = buildConf("spark.sql.broadcastTimeout") .doc("Timeout in seconds for the broadcast wait time in broadcast joins.") .timeConf(TimeUnit.SECONDS) @@ -1334,14 +1342,6 @@ object SQLConf { "When this conf is not set, the value from `spark.redaction.string.regex` is used.") .fallbackConf(org.apache.spark.internal.config.STRING_REDACTION_PATTERN) - val FROM_JSON_FORCE_NULLABLE_SCHEMA = buildConf("spark.sql.function.fromJson.forceNullable") -.internal() -.doc("When true, force the output schema of the from_json() function to be nullable " + - "(including all the fields). 
Otherwise, the schema might not be compatible with" + - "actual data, which leads to corruptions.") -.booleanConf -.createWithDefault(true) - val CONCAT_BINARY_AS_STRING = buildConf("spark.sql.function.concatBinaryAsString") .doc("When this option is set to false and all inputs are binary, `functions.concat` returns " + "an output as binary. Otherwise, it returns as a string. ") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25425][SQL][BACKPORT-2.4] Extra options should override session options in DataSource V2
Repository: spark Updated Branches: refs/heads/branch-2.4 9031c7848 -> a9a8d3a4b [SPARK-25425][SQL][BACKPORT-2.4] Extra options should override session options in DataSource V2 ## What changes were proposed in this pull request? In the PR, I propose overriding session options by extra options in DataSource V2. Extra options are more specific and set via `.option()`, and should overwrite more generic session options. ## How was this patch tested? Added tests for read and write paths. Closes #22474 from MaxGekk/session-options-2.4. Authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a9a8d3a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a9a8d3a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a9a8d3a4 Branch: refs/heads/branch-2.4 Commit: a9a8d3a4b92be89defd82d5f2eeb3f9af45c687d Parents: 9031c78 Author: Maxim Gekk Authored: Wed Sep 19 16:53:26 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 19 16:53:26 2018 -0700 -- .../org/apache/spark/sql/DataFrameReader.scala | 2 +- .../org/apache/spark/sql/DataFrameWriter.scala | 8 +++-- .../sql/sources/v2/DataSourceV2Suite.scala | 33 .../sources/v2/SimpleWritableDataSource.scala | 7 - 4 files changed, 45 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a9a8d3a4/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 371ec70..27a1af2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -202,7 +202,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { DataSourceOptions.PATHS_KEY -> objectMapper.writeValueAsString(paths.toArray) } 
Dataset.ofRows(sparkSession, DataSourceV2Relation.create( - ds, extraOptions.toMap ++ sessionOptions + pathsOption, + ds, sessionOptions ++ extraOptions.toMap + pathsOption, userSpecifiedSchema = userSpecifiedSchema)) } else { loadV1Source(paths: _*) http://git-wip-us.apache.org/repos/asf/spark/blob/a9a8d3a4/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 4aeddfd..80ade7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -241,10 +241,12 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val source = cls.newInstance().asInstanceOf[DataSourceV2] source match { case ws: WriteSupport => - val options = extraOptions ++ - DataSourceV2Utils.extractSessionConfigs(source, df.sparkSession.sessionState.conf) + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( +source, +df.sparkSession.sessionState.conf) + val options = sessionOptions ++ extraOptions + val relation = DataSourceV2Relation.create(source, options) - val relation = DataSourceV2Relation.create(source, options.toMap) if (mode == SaveMode.Append) { runCommand(df.sparkSession, "save") { AppendData.byName(relation, df.logicalPlan) http://git-wip-us.apache.org/repos/asf/spark/blob/a9a8d3a4/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 12beca2..bafde50 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2 +import 
java.io.File import java.util.{ArrayList, List => JList} import test.org.apache.spark.sql.sources.v2._ @@ -322,6 +323,38 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { checkCanonicalizedOutput(df, 2, 2) checkCanonicalizedOutput(df.select('i), 2, 1) } + + test("SPARK-25425: extra options should override sessions opti
spark git commit: [SPARK-25489][ML][TEST] Refactor UDTSerializationBenchmark
Repository: spark Updated Branches: refs/heads/master a72d118cd -> 9bf04d854 [SPARK-25489][ML][TEST] Refactor UDTSerializationBenchmark ## What changes were proposed in this pull request? Refactor `UDTSerializationBenchmark` to use main method and print the output as a separate file. Run blow command to generate benchmark results: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "mllib/test:runMain org.apache.spark.mllib.linalg.UDTSerializationBenchmark" ``` ## How was this patch tested? Manual tests. Closes #22499 from seancxmao/SPARK-25489. Authored-by: seancxmao Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bf04d85 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bf04d85 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bf04d85 Branch: refs/heads/master Commit: 9bf04d8543d70ba8e55c970f2a8e2df872cf74f6 Parents: a72d118 Author: seancxmao Authored: Sun Sep 23 13:34:06 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Sep 23 13:34:06 2018 -0700 -- .../UDTSerializationBenchmark-results.txt | 13 .../linalg/UDTSerializationBenchmark.scala | 70 ++-- 2 files changed, 49 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9bf04d85/mllib/benchmarks/UDTSerializationBenchmark-results.txt -- diff --git a/mllib/benchmarks/UDTSerializationBenchmark-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-results.txt new file mode 100644 index 000..169f4c6 --- /dev/null +++ b/mllib/benchmarks/UDTSerializationBenchmark-results.txt @@ -0,0 +1,13 @@ + +VectorUDT de/serialization + + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_131-b11 on Mac OS X 10.13.6 +Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + +VectorUDT de/serialization: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +serialize 144 / 206 0.0 143979.7 1.0X +deserialize114 / 135 0.0 113802.6 1.3X + + 
http://git-wip-us.apache.org/repos/asf/spark/blob/9bf04d85/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala index e2976e1..1a2216e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala @@ -17,53 +17,55 @@ package org.apache.spark.mllib.linalg -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder /** * Serialization benchmark for VectorUDT. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "mllib/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "mllib/test:runMain " + *Results will be written to "benchmarks/UDTSerializationBenchmark-results.txt". 
+ * }}} */ -object UDTSerializationBenchmark { +object UDTSerializationBenchmark extends BenchmarkBase { - def main(args: Array[String]): Unit = { -val iters = 1e2.toInt -val numRows = 1e3.toInt + override def benchmark(): Unit = { -val encoder = ExpressionEncoder[Vector].resolveAndBind() +runBenchmark("VectorUDT de/serialization") { + val iters = 1e2.toInt + val numRows = 1e3.toInt -val vectors = (1 to numRows).map { i => - Vectors.dense(Array.fill(1e5.toInt)(1.0 * i)) -}.toArray -val rows = vectors.map(encoder.toRow) + val encoder = ExpressionEncoder[Vector].resolveAndBind() -val benchmark = new Benchmark("VectorUDT de/serialization", numRows, iters) + val vectors = (1 to numRows).map { i => +Vectors.dense(Array.fill(1e5.toInt)(1.0 * i)) + }.toArray + val rows = vectors.map(encoder.toRow) -benchmark.addCase("serialize") { _ => - var sum = 0 - var i = 0 - while (i < numRows) { -sum += encoder.toRow(vectors(i)).numFields -i += 1 + val benchmark = new Benchmark("
spark git commit: [SPARK-25478][SQL][TEST] Refactor CompressionSchemeBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master d522a563a -> c79072aaf [SPARK-25478][SQL][TEST] Refactor CompressionSchemeBenchmark to use main method ## What changes were proposed in this pull request? Refactor `CompressionSchemeBenchmark` to use main method. Generate benchmark result: ```sh SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.columnar.compression.CompressionSchemeBenchmark" ``` ## How was this patch tested? manual tests Closes #22486 from wangyum/SPARK-25478. Lead-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c79072aa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c79072aa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c79072aa Branch: refs/heads/master Commit: c79072aafa2f406c342e393e0c61bb5cb3e89a7f Parents: d522a56 Author: Yuming Wang Authored: Sun Sep 23 20:46:40 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Sep 23 20:46:40 2018 -0700 -- .../CompressionSchemeBenchmark-results.txt | 137 ++ .../CompressionSchemeBenchmark.scala| 138 +++ 2 files changed, 156 insertions(+), 119 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c79072aa/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt -- diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt new file mode 100644 index 000..caa9378 --- /dev/null +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt @@ -0,0 +1,137 @@ + +Compression Scheme Benchmark + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +BOOLEAN Encode: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +PassThrough(1.000) 4 /4 17998.9 0.1 1.0X +RunLengthEncoding(2.501) 680 / 680 98.7 10.1 0.0X +BooleanBitSet(0.125) 365 / 365183.9 5.4 
0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +BOOLEAN Decode: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +PassThrough144 / 144466.5 2.1 1.0X +RunLengthEncoding 679 / 679 98.9 10.1 0.2X +BooleanBitSet 1425 / 1431 47.1 21.2 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Encode (Lower Skew): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +PassThrough(1.000) 7 /7 10115.0 0.1 1.0X +RunLengthEncoding(1.494) 1671 / 1672 40.2 24.9 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Decode (Lower Skew): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +PassThrough 1128 / 1128 59.5 16.8 1.0X +RunLengthEncoding 1630 / 1633 41.2 24.3 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Encode (Higher Skew): Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +PassThrough(1.000) 7 /7 10164.2 0.1 1.0X +RunLengthEncoding(1.989) 1562 / 1563 43.0 23.3 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x8
spark git commit: [SPARK-25460][BRANCH-2.4][SS] DataSourceV2: SS sources do not respect SessionConfigSupport
Repository: spark Updated Branches: refs/heads/branch-2.4 51d5378f8 -> ec384284e [SPARK-25460][BRANCH-2.4][SS] DataSourceV2: SS sources do not respect SessionConfigSupport ## What changes were proposed in this pull request? This PR proposes to backport SPARK-25460 to branch-2.4: This PR proposes to respect `SessionConfigSupport` in SS datasources as well. Currently these are only respected in batch sources: https://github.com/apache/spark/blob/e06da95cd9423f55cdb154a2778b0bddf7be984c/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L198-L203 https://github.com/apache/spark/blob/e06da95cd9423f55cdb154a2778b0bddf7be984c/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala#L244-L249 If a developer makes a datasource V2 that supports both structured streaming and batch jobs, batch jobs respect a specific configuration, let's say, URL to connect and fetch data (which end users might not be aware of); however, structured streaming ends up with not supporting this (and should explicitly be set into options). ## How was this patch tested? Unit tests were added. Closes #22529 from HyukjinKwon/SPARK-25460-backport. 
Authored-by: hyukjinkwon Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec384284 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec384284 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec384284 Branch: refs/heads/branch-2.4 Commit: ec384284eb427d7573bd94c70e23e4137971 Parents: 51d5378 Author: hyukjinkwon Authored: Mon Sep 24 08:49:19 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 24 08:49:19 2018 -0700 -- .../spark/sql/streaming/DataStreamReader.scala | 18 ++- .../spark/sql/streaming/DataStreamWriter.scala | 16 ++- .../sources/StreamingDataSourceV2Suite.scala| 118 --- 3 files changed, 125 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec384284/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 7eb5db5..a9cb5e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -26,6 +26,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceOptions, MicroBatchReadSupport} @@ -158,7 +159,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } val ds = DataSource.lookupDataSource(source, 
sparkSession.sqlContext.conf).newInstance() -val options = new DataSourceOptions(extraOptions.asJava) // We need to generate the V1 data source so we can pass it to the V2 relation as a shim. // We can't be sure at this point whether we'll actually want to use V2, since we don't know the // writer or whether the query is continuous. @@ -173,12 +173,16 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } ds match { case s: MicroBatchReadSupport => +val sessionOptions = DataSourceV2Utils.extractSessionConfigs( + ds = s, conf = sparkSession.sessionState.conf) +val options = sessionOptions ++ extraOptions +val dataSourceOptions = new DataSourceOptions(options.asJava) var tempReader: MicroBatchReader = null val schema = try { tempReader = s.createMicroBatchReader( Optional.ofNullable(userSpecifiedSchema.orNull), Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, -options) +dataSourceOptions) tempReader.readSchema() } finally { // Stop tempReader to avoid side-effect thing @@ -190,17 +194,21 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo Dataset.ofRows( sparkSession, StreamingRelationV2( -s, source, extraOptions.toMap, +s, source, options, schema.toAttributes, v1Re
spark git commit: [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous
Repository: spark Updated Branches: refs/heads/master 2c9ffda1b -> 615792da4 [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous ## What changes were proposed in this pull request? Test steps : 1) bin/spark-shell --conf spark.ui.retainedTasks=10 2) val rdd = sc.parallelize(1 to 1000, 1000) 3) rdd.count Stage page tab in the UI will display 10 tasks, but display message is wrong. It should reverse. **Before fix :** ![webui_1](https://user-images.githubusercontent.com/23054875/45917921-8926d800-be9c-11e8-8da5-3998d07e3ccc.jpg) **After fix** ![spark_web_ui2](https://user-images.githubusercontent.com/23054875/45917935-b4112c00-be9c-11e8-9d10-4fcc8e88568f.jpg) ## How was this patch tested? Manually tested Closes #22525 from shahidki31/SparkUI. Authored-by: Shahid Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/615792da Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/615792da Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/615792da Branch: refs/heads/master Commit: 615792da42b3ee3c5f623c869fada17a3aa92884 Parents: 2c9ffda Author: Shahid Authored: Mon Sep 24 20:03:52 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 24 20:03:52 2018 -0700 -- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/615792da/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index fd6a298..7428bbe 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -133,7 +133,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val totalTasksNumStr = if (totalTasks == storedTasks) { 
s"$totalTasks" } else { - s"$storedTasks, showing ${totalTasks}" + s"$totalTasks, showing $storedTasks" } val summary = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous
Repository: spark Updated Branches: refs/heads/branch-2.4 ffc081c8f -> e4c03e822 [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous ## What changes were proposed in this pull request? Test steps : 1) bin/spark-shell --conf spark.ui.retainedTasks=10 2) val rdd = sc.parallelize(1 to 1000, 1000) 3) rdd.count Stage page tab in the UI will display 10 tasks, but display message is wrong. It should reverse. **Before fix :** ![webui_1](https://user-images.githubusercontent.com/23054875/45917921-8926d800-be9c-11e8-8da5-3998d07e3ccc.jpg) **After fix** ![spark_web_ui2](https://user-images.githubusercontent.com/23054875/45917935-b4112c00-be9c-11e8-9d10-4fcc8e88568f.jpg) ## How was this patch tested? Manually tested Closes #22525 from shahidki31/SparkUI. Authored-by: Shahid Signed-off-by: Dongjoon Hyun (cherry picked from commit 615792da42b3ee3c5f623c869fada17a3aa92884) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4c03e82 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4c03e82 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4c03e82 Branch: refs/heads/branch-2.4 Commit: e4c03e82278791fcc725600dc5b1f31741340139 Parents: ffc081c Author: Shahid Authored: Mon Sep 24 20:03:52 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 24 20:04:26 2018 -0700 -- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4c03e82/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index fd6a298..7428bbe 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -133,7 +133,7 @@ private[ui] class StagePage(parent: 
StagesTab, store: AppStatusStore) extends We val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$storedTasks, showing ${totalTasks}" + s"$totalTasks, showing $storedTasks" } val summary = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous
Repository: spark Updated Branches: refs/heads/branch-2.3 12717ba0e -> 9674d083e [SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous ## What changes were proposed in this pull request? Test steps : 1) bin/spark-shell --conf spark.ui.retainedTasks=10 2) val rdd = sc.parallelize(1 to 1000, 1000) 3) rdd.count Stage page tab in the UI will display 10 tasks, but display message is wrong. It should reverse. **Before fix :** ![webui_1](https://user-images.githubusercontent.com/23054875/45917921-8926d800-be9c-11e8-8da5-3998d07e3ccc.jpg) **After fix** ![spark_web_ui2](https://user-images.githubusercontent.com/23054875/45917935-b4112c00-be9c-11e8-9d10-4fcc8e88568f.jpg) ## How was this patch tested? Manually tested Closes #22525 from shahidki31/SparkUI. Authored-by: Shahid Signed-off-by: Dongjoon Hyun (cherry picked from commit 615792da42b3ee3c5f623c869fada17a3aa92884) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9674d083 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9674d083 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9674d083 Branch: refs/heads/branch-2.3 Commit: 9674d083eca9e1fe0f5e8ea63640f25c907017ec Parents: 12717ba Author: Shahid Authored: Mon Sep 24 20:03:52 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Sep 24 20:04:52 2018 -0700 -- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9674d083/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 365a974..3ba7d4a 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -133,7 +133,7 @@ private[ui] class StagePage(parent: 
StagesTab, store: AppStatusStore) extends We val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$storedTasks, showing ${totalTasks}" + s"$totalTasks, showing $storedTasks" } val summary = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25486][TEST] Refactor SortBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 9cbd001e2 -> 04db03537 [SPARK-25486][TEST] Refactor SortBenchmark to use main method ## What changes were proposed in this pull request? Refactor SortBenchmark to use main method. Generate benchmark result: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.SortBenchmark" ``` ## How was this patch tested? manual tests Closes #22495 from yucai/SPARK-25486. Authored-by: yucai Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/04db0353 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/04db0353 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/04db0353 Branch: refs/heads/master Commit: 04db035378012907c93f6e5b4faa6ec11f1fc67b Parents: 9cbd001 Author: yucai Authored: Tue Sep 25 11:13:05 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Sep 25 11:13:05 2018 -0700 -- sql/core/benchmarks/SortBenchmark-results.txt | 17 + .../sql/execution/benchmark/SortBenchmark.scala | 38 +--- 2 files changed, 33 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/04db0353/sql/core/benchmarks/SortBenchmark-results.txt -- diff --git a/sql/core/benchmarks/SortBenchmark-results.txt b/sql/core/benchmarks/SortBenchmark-results.txt new file mode 100644 index 000..0d00a0c --- /dev/null +++ b/sql/core/benchmarks/SortBenchmark-results.txt @@ -0,0 +1,17 @@ + +radix sort + + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +radix sort 2500: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +reference TimSort key prefix array 11770 / 11960 2.1 470.8 1.0X +reference Arrays.sort 2106 / 2128 11.9 84.3 5.6X +radix sort one byte 93 / 100269.7 3.7 126.9X +radix sort two bytes 171 / 179146.0 6.9 68.7X +radix sort eight bytes 659 / 664 37.9 26.4 17.9X +radix sort key prefix array 1024 / 1053 24.4 
41.0 11.5X + + http://git-wip-us.apache.org/repos/asf/spark/blob/04db0353/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala index 17619ec..958a064 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.benchmark import java.util.{Arrays, Comparator} -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.unsafe.array.LongArray import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.util.collection.Sorter @@ -28,12 +28,15 @@ import org.apache.spark.util.random.XORShiftRandom /** * Benchmark to measure performance for aggregate primitives. - * To run this: - * build/sbt "sql/test-only *benchmark.SortBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/-results.txt". + * }}} */ -class SortBenchmark extends BenchmarkWithCodegen { +object SortBenchmark extends BenchmarkBase { private def referenceKeyPrefixSort(buf: LongArray, lo: Int, hi: Int, refCmp: PrefixComparator) { val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) @@ -54,10 +57,10 @@ class SortBenchmark extends BenchmarkWithCodegen { new LongArray(MemoryBlock.fromLongArray(extended))) } - ignore("sort") { + def sortBenchmark(): Unit = { val size = 2500
spark git commit: [SPARK-25534][SQL] Make `SQLHelper` trait
Repository: spark Updated Branches: refs/heads/master 473d0d862 -> 81cbcca60 [SPARK-25534][SQL] Make `SQLHelper` trait ## What changes were proposed in this pull request? Currently, Spark has 7 `withTempPath` and 6 `withSQLConf` functions. This PR aims to remove duplicated and inconsistent code and reduce them to the following meaningful implementations. **withTempPath** - `SQLHelper.withTempPath`: The one which was used in `SQLTestUtils`. **withSQLConf** - `SQLHelper.withSQLConf`: The one which was used in `PlanTest`. - `ExecutorSideSQLConfSuite.withSQLConf`: The one which doesn't throw `AnalysisException` on StaticConf changes. - `SQLTestUtils.withSQLConf`: The one which overrides intentionally to change the active session. ```scala protected override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { SparkSession.setActiveSession(spark) super.withSQLConf(pairs: _*)(f) } ``` ## How was this patch tested? Pass the Jenkins with the existing tests. Closes #22548 from dongjoon-hyun/SPARK-25534. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81cbcca6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81cbcca6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81cbcca6 Branch: refs/heads/master Commit: 81cbcca60099fd267492769b465d01e90d7deeac Parents: 473d0d8 Author: Dongjoon Hyun Authored: Tue Sep 25 23:03:54 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Sep 25 23:03:54 2018 -0700 -- .../spark/sql/catalyst/plans/PlanTest.scala | 31 +- .../spark/sql/catalyst/plans/SQLHelper.scala| 64 .../benchmark/DataSourceReadBenchmark.scala | 23 +-- .../benchmark/FilterPushdownBenchmark.scala | 24 +--- .../datasources/csv/CSVBenchmarks.scala | 12 +--- .../datasources/json/JsonBenchmarks.scala | 11 +--- .../streaming/CheckpointFileManagerSuite.scala | 10 +-- .../apache/spark/sql/test/SQLTestUtils.scala| 13 .../spark/sql/hive/orc/OrcReadBenchmark.scala | 25 ++-- 9 files changed, 81 insertions(+), 132 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/81cbcca6/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 67740c3..3081ff9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -22,7 +22,6 @@ import org.scalatest.Suite import org.scalatest.Tag import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode @@ -57,7 +56,7 @@ trait CodegenInterpretedPlanTest extends PlanTest { * 
Provides helper methods for comparing plans, but without the overhead of * mandating a FunSuite. */ -trait PlanTestBase extends PredicateHelper { self: Suite => +trait PlanTestBase extends PredicateHelper with SQLHelper { self: Suite => // TODO(gatorsmile): remove this from PlanTest and all the analyzer rules protected def conf = SQLConf.get @@ -174,32 +173,4 @@ trait PlanTestBase extends PredicateHelper { self: Suite => plan1 == plan2 } } - - /** - * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL - * configurations. - */ - protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { -val conf = SQLConf.get -val (keys, values) = pairs.unzip -val currentValues = keys.map { key => - if (conf.contains(key)) { -Some(conf.getConfString(key)) - } else { -None - } -} -(keys, values).zipped.foreach { (k, v) => - if (SQLConf.staticConfKeys.contains(k)) { -throw new AnalysisException(s"Cannot modify the value of a static config: $k") - } - conf.setConfString(k, v) -} -try f finally { - keys.zip(currentValues).foreach { -case (key, Some(value)) => conf.setConfString(key, value) -case (key, None) => conf.unsetConf(key) - } -} - } } http://git-wip-us.apache.org/repos/asf/spark/blob/81cbcca6/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SQLHelper.scala -- diff --git a/sql/catalyst/src/t
spark git commit: [SPARK-25425][SQL][BACKPORT-2.3] Extra options should override session options in DataSource V2
Repository: spark Updated Branches: refs/heads/branch-2.3 9674d083e -> cbb228e48 [SPARK-25425][SQL][BACKPORT-2.3] Extra options should override session options in DataSource V2 ## What changes were proposed in this pull request? In the PR, I propose overriding session options by extra options in DataSource V2. Extra options are more specific and set via `.option()`, and should overwrite more generic session options. ## How was this patch tested? Added tests for read and write paths. Closes #22489 from MaxGekk/session-options-2.3. Authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cbb228e4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cbb228e4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cbb228e4 Branch: refs/heads/branch-2.3 Commit: cbb228e48bb046e7d88d6bf1c9b9e3b252241552 Parents: 9674d08 Author: Maxim Gekk Authored: Tue Sep 25 23:35:57 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Sep 25 23:35:57 2018 -0700 -- .../org/apache/spark/sql/DataFrameReader.scala | 8 ++-- .../org/apache/spark/sql/DataFrameWriter.scala | 8 ++-- .../sql/sources/v2/DataSourceV2Suite.scala | 50 .../sources/v2/SimpleWritableDataSource.scala | 7 ++- 4 files changed, 56 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cbb228e4/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 395e1c9..1d74b35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -190,10 +190,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { val cls = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) if 
(classOf[DataSourceV2].isAssignableFrom(cls)) { val ds = cls.newInstance() - val options = new DataSourceOptions((extraOptions ++ -DataSourceV2Utils.extractSessionConfigs( - ds = ds.asInstanceOf[DataSourceV2], - conf = sparkSession.sessionState.conf)).asJava) + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( +ds = ds.asInstanceOf[DataSourceV2], +conf = sparkSession.sessionState.conf) + val options = new DataSourceOptions((sessionOptions ++ extraOptions).asJava) // Streaming also uses the data source V2 API. So it may be that the data source implements // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading http://git-wip-us.apache.org/repos/asf/spark/blob/cbb228e4/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 6c9fb52..3fcefb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -243,10 +243,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val ds = cls.newInstance() ds match { case ws: WriteSupport => - val options = new DataSourceOptions((extraOptions ++ -DataSourceV2Utils.extractSessionConfigs( - ds = ds.asInstanceOf[DataSourceV2], - conf = df.sparkSession.sessionState.conf)).asJava) + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( +ds = ds.asInstanceOf[DataSourceV2], +conf = df.sparkSession.sessionState.conf) + val options = new DataSourceOptions((sessionOptions ++ extraOptions).asJava) // Using a timestamp and a random UUID to distinguish different writing jobs. This is good // enough as there won't be tons of writing jobs created at the same second. 
val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) http://git-wip-us.apache.org/repos/asf/spark/blob/cbb228e4/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 6ad0e5f..ec81e89 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++
spark git commit: [SPARK-24519][CORE] Compute SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS only once
Repository: spark Updated Branches: refs/heads/master bd2ae857d -> e702fb1d5 [SPARK-24519][CORE] Compute SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS only once ## What changes were proposed in this pull request? Previously SPARK-24519 created a modifiable config SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS. However, the config is being parsed for every creation of MapStatus, which could be very expensive. Another problem with the previous approach is that it created the illusion that this can be changed dynamically at runtime, which was not true. This PR changes it so the config is computed only once. ## How was this patch tested? Removed a test case that's no longer valid. Closes #22521 from rxin/SPARK-24519. Authored-by: Reynold Xin Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e702fb1d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e702fb1d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e702fb1d Branch: refs/heads/master Commit: e702fb1d5218d062fcb8e618b92dad7958eb4062 Parents: bd2ae85 Author: Reynold Xin Authored: Wed Sep 26 10:15:16 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 10:15:16 2018 -0700 -- .../org/apache/spark/scheduler/MapStatus.scala | 12 ++--- .../apache/spark/scheduler/MapStatusSuite.scala | 28 2 files changed, 9 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e702fb1d/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 659694d..0e221ed 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -49,10 +49,16 @@ private[spark] sealed trait MapStatus { private[spark] object MapStatus { + /** + * Min partition number to use 
[[HighlyCompressedMapStatus]]. A bit ugly here because in test + * code we can't assume SparkEnv.get exists. + */ + private lazy val minPartitionsToUseHighlyCompressMapStatus = Option(SparkEnv.get) +.map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) + .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get) + def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = { -if (uncompressedSizes.length > Option(SparkEnv.get) - .map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) - .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get)) { +if (uncompressedSizes.length > minPartitionsToUseHighlyCompressMapStatus) { HighlyCompressedMapStatus(loc, uncompressedSizes) } else { new CompressedMapStatus(loc, uncompressedSizes) http://git-wip-us.apache.org/repos/asf/spark/blob/e702fb1d/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index 354e638..2155a0f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -188,32 +188,4 @@ class MapStatusSuite extends SparkFunSuite { assert(count === 3000) } } - - test("SPARK-24519: HighlyCompressedMapStatus has configurable threshold") { -val conf = new SparkConf() -val env = mock(classOf[SparkEnv]) -doReturn(conf).when(env).conf -SparkEnv.set(env) -val sizes = Array.fill[Long](500)(150L) -// Test default value -val status = MapStatus(null, sizes) -assert(status.isInstanceOf[CompressedMapStatus]) -// Test Non-positive values -for (s <- -1 to 0) { - assertThrows[IllegalArgumentException] { -conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) -val status = MapStatus(null, sizes) - } -} -// Test positive values -Seq(1, 100, 499, 500, 501).foreach { s => - 
conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) - val status = MapStatus(null, sizes) - if(sizes.length > s) { -assert(status.isInstanceOf[HighlyCompressedMapStatus]) - } else { -assert(status.isInstanceOf[CompressedMapStatus]) - } -} - } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-24519][CORE] Compute SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS only once
Repository: spark Updated Branches: refs/heads/branch-2.4 dc6047613 -> 8d1720079 [SPARK-24519][CORE] Compute SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS only once ## What changes were proposed in this pull request? Previously SPARK-24519 created a modifiable config SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS. However, the config is being parsed for every creation of MapStatus, which could be very expensive. Another problem with the previous approach is that it created the illusion that this can be changed dynamically at runtime, which was not true. This PR changes it so the config is computed only once. ## How was this patch tested? Removed a test case that's no longer valid. Closes #22521 from rxin/SPARK-24519. Authored-by: Reynold Xin Signed-off-by: Dongjoon Hyun (cherry picked from commit e702fb1d5218d062fcb8e618b92dad7958eb4062) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d172007 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d172007 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d172007 Branch: refs/heads/branch-2.4 Commit: 8d172007968dc4f1d4a091ccb9e16cd785c0a363 Parents: dc60476 Author: Reynold Xin Authored: Wed Sep 26 10:15:16 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 10:22:50 2018 -0700 -- .../org/apache/spark/scheduler/MapStatus.scala | 12 ++--- .../apache/spark/scheduler/MapStatusSuite.scala | 28 2 files changed, 9 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8d172007/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 659694d..0e221ed 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -49,10 +49,16 @@ private[spark] sealed trait 
MapStatus { private[spark] object MapStatus { + /** + * Min partition number to use [[HighlyCompressedMapStatus]]. A bit ugly here because in test + * code we can't assume SparkEnv.get exists. + */ + private lazy val minPartitionsToUseHighlyCompressMapStatus = Option(SparkEnv.get) +.map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) + .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get) + def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = { -if (uncompressedSizes.length > Option(SparkEnv.get) - .map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) - .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get)) { +if (uncompressedSizes.length > minPartitionsToUseHighlyCompressMapStatus) { HighlyCompressedMapStatus(loc, uncompressedSizes) } else { new CompressedMapStatus(loc, uncompressedSizes) http://git-wip-us.apache.org/repos/asf/spark/blob/8d172007/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index 354e638..2155a0f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -188,32 +188,4 @@ class MapStatusSuite extends SparkFunSuite { assert(count === 3000) } } - - test("SPARK-24519: HighlyCompressedMapStatus has configurable threshold") { -val conf = new SparkConf() -val env = mock(classOf[SparkEnv]) -doReturn(conf).when(env).conf -SparkEnv.set(env) -val sizes = Array.fill[Long](500)(150L) -// Test default value -val status = MapStatus(null, sizes) -assert(status.isInstanceOf[CompressedMapStatus]) -// Test Non-positive values -for (s <- -1 to 0) { - assertThrows[IllegalArgumentException] { -conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) -val status = MapStatus(null, sizes) - } -} -// Test 
positive values -Seq(1, 100, 499, 500, 501).foreach { s => - conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) - val status = MapStatus(null, sizes) - if(sizes.length > s) { -assert(status.isInstanceOf[HighlyCompressedMapStatus]) - } else { -assert(status.isInstanceOf[CompressedMapStatus]) - } -} - } } - To unsubscribe, e-mail: commits-unsub
spark git commit: [SPARK-25481][SQL][TEST] Refactor ColumnarBatchBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master c3c45cbd7 -> 9063b17f3 [SPARK-25481][SQL][TEST] Refactor ColumnarBatchBenchmark to use main method ## What changes were proposed in this pull request? Refactor `ColumnarBatchBenchmark` to use main method. Generate benchmark result: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.vectorized.ColumnarBatchBenchmark" ``` ## How was this patch tested? manual tests Closes #22490 from yucai/SPARK-25481. Lead-authored-by: yucai Co-authored-by: Yucai Yu Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9063b17f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9063b17f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9063b17f Branch: refs/heads/master Commit: 9063b17f3d0f22b8e4142200259190a20f832a29 Parents: c3c45cb Author: yucai Authored: Wed Sep 26 20:40:10 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 20:40:10 2018 -0700 -- .../ColumnarBatchBenchmark-results.txt | 59 ++ .../vectorized/ColumnarBatchBenchmark.scala | 84 ++-- 2 files changed, 85 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9063b17f/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt -- diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt new file mode 100644 index 000..5963716 --- /dev/null +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt @@ -0,0 +1,59 @@ + +Int Read/Write + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int Read/Write: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Java Array 244 / 244 1342.3 0.7 1.0X +ByteBuffer Unsafe 445 / 445736.5 1.4 0.5X +ByteBuffer API2124 / 2125154.3 6.5 0.1X +DirectByteBuffer 750 / 750437.2 2.3 0.3X +Unsafe Buffer 
234 / 236 1401.3 0.7 1.0X +Column(on heap)245 / 245 1335.6 0.7 1.0X +Column(off heap) 489 / 489670.3 1.5 0.5X +Column(off heap direct)236 / 236 1388.1 0.7 1.0X +UnsafeRow (on heap)532 / 534616.0 1.6 0.5X +UnsafeRow (off heap) 564 / 565580.7 1.7 0.4X +Column On Heap Append 489 / 489670.6 1.5 0.5X + + + +Boolean Read/Write + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Boolean Read/Write: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Bitset 879 / 879381.9 2.6 1.0X +Byte Array 794 / 794422.6 2.4 1.1X + + + +String Read/Write + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String Read/Write: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +On Heap449 / 449 36.5 27.4 1.0X +Off Heap 679 / 679 24.1 41
spark git commit: [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect
Repository: spark Updated Branches: refs/heads/master 9063b17f3 -> 5def10e61 [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect ## What changes were proposed in this pull request? changed metric value of METRIC_OUTPUT_RECORDS_WRITTEN from 'task.metrics.inputMetrics.recordsRead' to 'task.metrics.outputMetrics.recordsWritten'. This bug was introduced in SPARK-22190. https://github.com/apache/spark/pull/19426 ## How was this patch tested? Existing tests Closes #22555 from shahidki31/SPARK-25536. Authored-by: Shahid Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5def10e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5def10e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5def10e6 Branch: refs/heads/master Commit: 5def10e61e49dba85f4d8b39c92bda15137990a2 Parents: 9063b17 Author: Shahid Authored: Wed Sep 26 21:10:39 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 21:10:39 2018 -0700 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5def10e6/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 6d7d656..eba708d 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -504,7 +504,7 @@ private[spark] class Executor( executorSource.METRIC_OUTPUT_BYTES_WRITTEN .inc(task.metrics.outputMetrics.bytesWritten) executorSource.METRIC_OUTPUT_RECORDS_WRITTEN - .inc(task.metrics.inputMetrics.recordsRead) + .inc(task.metrics.outputMetrics.recordsWritten) executorSource.METRIC_RESULT_SIZE.inc(task.metrics.resultSize) 
executorSource.METRIC_DISK_BYTES_SPILLED.inc(task.metrics.diskBytesSpilled) executorSource.METRIC_MEMORY_BYTES_SPILLED.inc(task.metrics.memoryBytesSpilled) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect
Repository: spark Updated Branches: refs/heads/branch-2.4 7656358ad -> f12769e73 [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect ## What changes were proposed in this pull request? changed metric value of METRIC_OUTPUT_RECORDS_WRITTEN from 'task.metrics.inputMetrics.recordsRead' to 'task.metrics.outputMetrics.recordsWritten'. This bug was introduced in SPARK-22190. https://github.com/apache/spark/pull/19426 ## How was this patch tested? Existing tests Closes #22555 from shahidki31/SPARK-25536. Authored-by: Shahid Signed-off-by: Dongjoon Hyun (cherry picked from commit 5def10e61e49dba85f4d8b39c92bda15137990a2) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f12769e7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f12769e7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f12769e7 Branch: refs/heads/branch-2.4 Commit: f12769e73a81f5a333c4ff91813ad698ffb16eec Parents: 7656358 Author: Shahid Authored: Wed Sep 26 21:10:39 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 21:14:13 2018 -0700 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f12769e7/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 27f7ec8..97dfcc4 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -499,7 +499,7 @@ private[spark] class Executor( executorSource.METRIC_OUTPUT_BYTES_WRITTEN .inc(task.metrics.outputMetrics.bytesWritten) executorSource.METRIC_OUTPUT_RECORDS_WRITTEN - .inc(task.metrics.inputMetrics.recordsRead) + .inc(task.metrics.outputMetrics.recordsWritten) 
executorSource.METRIC_RESULT_SIZE.inc(task.metrics.resultSize) executorSource.METRIC_DISK_BYTES_SPILLED.inc(task.metrics.diskBytesSpilled) executorSource.METRIC_MEMORY_BYTES_SPILLED.inc(task.metrics.memoryBytesSpilled) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect
Repository: spark Updated Branches: refs/heads/branch-2.3 26d893a4f -> f40e4c71c [SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is incorrect ## What changes were proposed in this pull request? changed metric value of METRIC_OUTPUT_RECORDS_WRITTEN from 'task.metrics.inputMetrics.recordsRead' to 'task.metrics.outputMetrics.recordsWritten'. This bug was introduced in SPARK-22190. https://github.com/apache/spark/pull/19426 ## How was this patch tested? Existing tests Closes #22555 from shahidki31/SPARK-25536. Authored-by: Shahid Signed-off-by: Dongjoon Hyun (cherry picked from commit 5def10e61e49dba85f4d8b39c92bda15137990a2) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f40e4c71 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f40e4c71 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f40e4c71 Branch: refs/heads/branch-2.3 Commit: f40e4c71cdb46392648c35a2f2cb0de140f3c5a8 Parents: 26d893a Author: Shahid Authored: Wed Sep 26 21:10:39 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 21:15:37 2018 -0700 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f40e4c71/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index a9c31c7..e62293f 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -442,7 +442,7 @@ private[spark] class Executor( executorSource.METRIC_OUTPUT_BYTES_WRITTEN .inc(task.metrics.outputMetrics.bytesWritten) executorSource.METRIC_OUTPUT_RECORDS_WRITTEN - .inc(task.metrics.inputMetrics.recordsRead) + .inc(task.metrics.outputMetrics.recordsWritten) 
executorSource.METRIC_RESULT_SIZE.inc(task.metrics.resultSize) executorSource.METRIC_DISK_BYTES_SPILLED.inc(task.metrics.diskBytesSpilled) executorSource.METRIC_MEMORY_BYTES_SPILLED.inc(task.metrics.memoryBytesSpilled) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25485][SQL][TEST] Refactor UnsafeProjectionBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 8b727994e -> f309b28bd [SPARK-25485][SQL][TEST] Refactor UnsafeProjectionBenchmark to use main method ## What changes were proposed in this pull request? Refactor `UnsafeProjectionBenchmark` to use main method. Generate benchmark result: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.UnsafeProjectionBenchmark" ``` ## How was this patch tested? manual test Closes #22493 from yucai/SPARK-25485. Lead-authored-by: yucai Co-authored-by: Yucai Yu Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f309b28b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f309b28b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f309b28b Branch: refs/heads/master Commit: f309b28bd9271719ca36fcf334f016ed6165a79b Parents: 8b72799 Author: yucai Authored: Wed Sep 26 23:27:45 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Sep 26 23:27:45 2018 -0700 -- .../UnsafeProjectionBenchmark-results.txt | 14 ++ .../spark/sql/UnsafeProjectionBenchmark.scala | 172 +-- 2 files changed, 98 insertions(+), 88 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f309b28b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt -- diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt new file mode 100644 index 000..43156dc --- /dev/null +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt @@ -0,0 +1,14 @@ + +unsafe projection + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +unsafe projection: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +single long 2867 / 2868 93.6 10.7 1.0X +single nullable long 3915 / 3949 68.6 14.6 0.7X +7 primitive types 8166 / 8167 32.9 30.4 0.4X +7 nullable 
primitive types 12767 / 12767 21.0 47.6 0.2X + + http://git-wip-us.apache.org/repos/asf/spark/blob/f309b28b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala index faff681..cbe723f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.UnsafeProjection @@ -25,8 +25,15 @@ import org.apache.spark.sql.types._ /** * Benchmark `UnsafeProjection` for fixed-length/primitive-type fields. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/UnsafeProjectionBenchmark-results.txt". 
+ * }}} */ -object UnsafeProjectionBenchmark { +object UnsafeProjectionBenchmark extends BenchmarkBase { def generateRows(schema: StructType, numRows: Int): Array[InternalRow] = { val generator = RandomDataGenerator.forType(schema, nullable = false).get @@ -34,103 +41,92 @@ object UnsafeProjectionBenchmark { (1 to numRows).map(_ => encoder.toRow(generator().asInstanceOf[Row]).copy()).toArray } - def main(args: Array[String]) { -val iters = 1024 * 16 -val numRows = 1024 * 16 - -val benchmark = new Benchmark("unsafe projection", iters * numRows.toLong) - - -val schema1 = new StructType().add("l", LongType, false) -val attrs1 = schema1.toAttributes -val rows1 = generateRows(schem
spark git commit: [SPARK-25546][CORE] Don't cache value of EVENT_LOG_CALLSITE_LONG_FORM.
Repository: spark Updated Branches: refs/heads/master a1adde540 -> 5fd22d053 [SPARK-25546][CORE] Don't cache value of EVENT_LOG_CALLSITE_LONG_FORM. Caching the value of that config means different instances of SparkEnv will always use whatever was the first value to be read. It also breaks tests that use RDDInfo outside of the scope of a SparkContext. Since this is not a performance sensitive area, there's no advantage in caching the config value. Closes #22558 from vanzin/SPARK-25546. Authored-by: Marcelo Vanzin Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5fd22d05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5fd22d05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5fd22d05 Branch: refs/heads/master Commit: 5fd22d05363dd8c0e1b10f3822ccb71eb42f6db9 Parents: a1adde5 Author: Marcelo Vanzin Authored: Thu Sep 27 09:26:50 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Sep 27 09:26:50 2018 -0700 -- core/src/main/scala/org/apache/spark/storage/RDDInfo.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5fd22d05/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala index 19f8656..917cfab 100644 --- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala +++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala @@ -55,11 +55,13 @@ class RDDInfo( } private[spark] object RDDInfo { - private val callsiteLongForm = SparkEnv.get.conf.get(EVENT_LOG_CALLSITE_LONG_FORM) - def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd)) val parentIds = rdd.dependencies.map(_.rdd.id) +val callsiteLongForm = Option(SparkEnv.get) + .map(_.conf.get(EVENT_LOG_CALLSITE_LONG_FORM)) + .getOrElse(false) + val 
callSite = if (callsiteLongForm) { rdd.creationSite.longForm } else { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25546][CORE] Don't cache value of EVENT_LOG_CALLSITE_LONG_FORM.
Repository: spark Updated Branches: refs/heads/branch-2.4 659ecb54a -> 0256f8a09 [SPARK-25546][CORE] Don't cache value of EVENT_LOG_CALLSITE_LONG_FORM. Caching the value of that config means different instances of SparkEnv will always use whatever was the first value to be read. It also breaks tests that use RDDInfo outside of the scope of a SparkContext. Since this is not a performance sensitive area, there's no advantage in caching the config value. Closes #22558 from vanzin/SPARK-25546. Authored-by: Marcelo Vanzin Signed-off-by: Dongjoon Hyun (cherry picked from commit 5fd22d05363dd8c0e1b10f3822ccb71eb42f6db9) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0256f8a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0256f8a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0256f8a0 Branch: refs/heads/branch-2.4 Commit: 0256f8a0973c2fc8815fa710670dbe68317335b5 Parents: 659ecb5 Author: Marcelo Vanzin Authored: Thu Sep 27 09:26:50 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Sep 27 09:27:05 2018 -0700 -- core/src/main/scala/org/apache/spark/storage/RDDInfo.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0256f8a0/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala index 19f8656..917cfab 100644 --- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala +++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala @@ -55,11 +55,13 @@ class RDDInfo( } private[spark] object RDDInfo { - private val callsiteLongForm = SparkEnv.get.conf.get(EVENT_LOG_CALLSITE_LONG_FORM) - def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd)) val parentIds = rdd.dependencies.map(_.rdd.id) +val 
callsiteLongForm = Option(SparkEnv.get) + .map(_.conf.get(EVENT_LOG_CALLSITE_LONG_FORM)) + .getOrElse(false) + val callSite = if (callsiteLongForm) { rdd.creationSite.longForm } else { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25459][SQL] Add viewOriginalText back to CatalogTable
Repository: spark Updated Branches: refs/heads/master 5fd22d053 -> 3b7395fe0 [SPARK-25459][SQL] Add viewOriginalText back to CatalogTable ## What changes were proposed in this pull request? The `show create table` will show a lot of generated attributes for views that created by older Spark version. This PR will basically revert https://issues.apache.org/jira/browse/SPARK-19272 back, so when you `DESC [FORMATTED|EXTENDED] view` will show the original view DDL text. ## How was this patch tested? Unit test. Closes #22458 from zheyuan28/testbranch. Lead-authored-by: Chris Zhao Co-authored-by: Christopher Zhao Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b7395fe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b7395fe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b7395fe Branch: refs/heads/master Commit: 3b7395fe025a4c9a591835e53ac6ca05be6868f1 Parents: 5fd22d0 Author: Chris Zhao Authored: Thu Sep 27 17:55:08 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Sep 27 17:55:08 2018 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 4 +++- .../spark/sql/execution/command/views.scala | 2 ++ .../sql-tests/results/describe.sql.out | 2 ++ .../spark/sql/hive/client/HiveClientImpl.scala | 9 +--- .../spark/sql/hive/execution/HiveDDLSuite.scala | 22 5 files changed, 35 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b7395fe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 30ded13..817abeb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ 
-244,7 +244,8 @@ case class CatalogTable( unsupportedFeatures: Seq[String] = Seq.empty, tracksPartitionsInCatalog: Boolean = false, schemaPreservesCase: Boolean = true, -ignoredProperties: Map[String, String] = Map.empty) { +ignoredProperties: Map[String, String] = Map.empty, +viewOriginalText: Option[String] = None) { import CatalogTable._ @@ -331,6 +332,7 @@ case class CatalogTable( comment.foreach(map.put("Comment", _)) if (tableType == CatalogTableType.VIEW) { viewText.foreach(map.put("View Text", _)) + viewOriginalText.foreach(map.put("View Original Text", _)) viewDefaultDatabase.foreach(map.put("View Default Database", _)) if (viewQueryColumnNames.nonEmpty) { map.put("View Query Output Columns", viewQueryColumnNames.mkString("[", ", ", "]")) http://git-wip-us.apache.org/repos/asf/spark/blob/3b7395fe/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 5172f32..cd34dfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -242,6 +242,7 @@ case class CreateViewCommand( storage = CatalogStorageFormat.empty, schema = aliasPlan(session, analyzedPlan).schema, properties = newProperties, + viewOriginalText = originalText, viewText = originalText, comment = comment ) @@ -299,6 +300,7 @@ case class AlterViewAsCommand( val updatedViewMeta = viewMeta.copy( schema = analyzedPlan.schema, properties = newProperties, + viewOriginalText = Some(originalText), viewText = Some(originalText)) session.sessionState.catalog.alterTable(updatedViewMeta) http://git-wip-us.apache.org/repos/asf/spark/blob/3b7395fe/sql/core/src/test/resources/sql-tests/results/describe.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out 
b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 79390cb..9c4b70d 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -474,6 +474,7 @@ Last Access [not included in comparison] Created By [not included in comparison] Type
spark git commit: [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag
Repository: spark Updated Branches: refs/heads/master e120a38c0 -> 0b33f0868 [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag ## What changes were proposed in this pull request? This adds a missing markup tag. This should go to `master/branch-2.4`. ## How was this patch tested? Manual via `SKIP_API=1 jekyll build`. Closes #22585 from dongjoon-hyun/SPARK-23285. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b33f086 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b33f086 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b33f086 Branch: refs/heads/master Commit: 0b33f08683a41f6f3a6ec02c327010c0722cc1d1 Parents: e120a38 Author: Dongjoon Hyun Authored: Fri Sep 28 14:10:24 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:10:24 2018 -0700 -- docs/running-on-kubernetes.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b33f086/docs/running-on-kubernetes.md -- diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 4ae7aca..840e306 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -691,6 +691,7 @@ specific to Spark on Kubernetes. Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in [CPU units](https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units). This is distinct from spark.executor.cores: it is only used and takes precedence over spark.executor.cores for specifying the executor pod cpu request if set. Task parallelism, e.g., number of tasks an executor can run concurrently is not affected by this. + spark.kubernetes.executor.limit.cores - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag
Repository: spark Updated Branches: refs/heads/branch-2.4 b2a1e2f8d -> 81391c274 [SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag ## What changes were proposed in this pull request? This adds a missing markup tag. This should go to `master/branch-2.4`. ## How was this patch tested? Manual via `SKIP_API=1 jekyll build`. Closes #22585 from dongjoon-hyun/SPARK-23285. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 0b33f08683a41f6f3a6ec02c327010c0722cc1d1) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81391c27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81391c27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81391c27 Branch: refs/heads/branch-2.4 Commit: 81391c274eb371dbf4cfed0afca47806f6fcfd00 Parents: b2a1e2f Author: Dongjoon Hyun Authored: Fri Sep 28 14:10:24 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:10:47 2018 -0700 -- docs/running-on-kubernetes.md | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/81391c27/docs/running-on-kubernetes.md -- diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index c83dad6..fc7c9a5 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -678,6 +678,7 @@ specific to Spark on Kubernetes. Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in [CPU units](https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units). This is distinct from spark.executor.cores: it is only used and takes precedence over spark.executor.cores for specifying the executor pod cpu request if set. Task parallelism, e.g., number of tasks an executor can run concurrently is not affected by this. 
+ spark.kubernetes.executor.limit.cores - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet
Repository: spark Updated Branches: refs/heads/master 0b33f0868 -> b7d80349b [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet ## What changes were proposed in this pull request? The specified test in OpenHashMapSuite to test large items is somehow flaky to throw OOM. By considering the original work #6763 that added this test, the test can be against OpenHashSetSuite. And by doing this should be to save memory because OpenHashMap allocates two more arrays when growing the map/set. ## How was this patch tested? Existing tests. Closes #22569 from viirya/SPARK-25542. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7d80349 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7d80349 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7d80349 Branch: refs/heads/master Commit: b7d80349b0e367d78cab238e62c2ec353f0f12b3 Parents: 0b33f08 Author: Liang-Chi Hsieh Authored: Fri Sep 28 14:29:56 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:29:56 2018 -0700 -- .../spark/util/collection/OpenHashMapSuite.scala | 10 -- .../spark/util/collection/OpenHashSetSuite.scala | 13 + 2 files changed, 13 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7d80349/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 151235d..68bcc5e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -185,16 +185,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { assert(map.contains(null)) } - test("support for more than 12M items") { 
-val cnt = 1200 // 12M -val map = new OpenHashMap[Int, Int](cnt) -for (i <- 0 until cnt) { - map(i) = 1 -} -val numInvalidValues = map.iterator.count(_._2 == 0) -assertResult(0)(numInvalidValues) - } - test("distinguish between the 0/0.0/0L and null") { val specializedMap1 = new OpenHashMap[String, Long] specializedMap1("a") = null.asInstanceOf[Long] http://git-wip-us.apache.org/repos/asf/spark/blob/b7d80349/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index b887f93..44d2118 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -255,4 +255,17 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Long](0) assert(set.size === 0) } + + test("support for more than 12M items") { +val cnt = 1200 // 12M +val set = new OpenHashSet[Int](cnt) +for (i <- 0 until cnt) { + set.add(i) + assert(set.contains(i)) + + val pos1 = set.getPos(i) + val pos2 = set.addWithoutResize(i) & OpenHashSet.POSITION_MASK + assert(pos1 == pos2) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.4 81391c274 -> 7614313c9 [SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to OpenHashSetSuite and make it against OpenHashSet ## What changes were proposed in this pull request? The specified test in OpenHashMapSuite to test large items is somehow flaky to throw OOM. By considering the original work #6763 that added this test, the test can be against OpenHashSetSuite. And by doing this should be to save memory because OpenHashMap allocates two more arrays when growing the map/set. ## How was this patch tested? Existing tests. Closes #22569 from viirya/SPARK-25542. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun (cherry picked from commit b7d80349b0e367d78cab238e62c2ec353f0f12b3) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7614313c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7614313c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7614313c Branch: refs/heads/branch-2.4 Commit: 7614313c9443712553332962d62dfe5aacc7ed34 Parents: 81391c2 Author: Liang-Chi Hsieh Authored: Fri Sep 28 14:29:56 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 14:30:12 2018 -0700 -- .../spark/util/collection/OpenHashMapSuite.scala | 10 -- .../spark/util/collection/OpenHashSetSuite.scala | 13 + 2 files changed, 13 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7614313c/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 151235d..68bcc5e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -185,16 +185,6 @@ class OpenHashMapSuite 
extends SparkFunSuite with Matchers { assert(map.contains(null)) } - test("support for more than 12M items") { -val cnt = 1200 // 12M -val map = new OpenHashMap[Int, Int](cnt) -for (i <- 0 until cnt) { - map(i) = 1 -} -val numInvalidValues = map.iterator.count(_._2 == 0) -assertResult(0)(numInvalidValues) - } - test("distinguish between the 0/0.0/0L and null") { val specializedMap1 = new OpenHashMap[String, Long] specializedMap1("a") = null.asInstanceOf[Long] http://git-wip-us.apache.org/repos/asf/spark/blob/7614313c/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index b887f93..44d2118 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -255,4 +255,17 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Long](0) assert(set.size === 0) } + + test("support for more than 12M items") { +val cnt = 1200 // 12M +val set = new OpenHashSet[Int](cnt) +for (i <- 0 until cnt) { + set.add(i) + assert(set.contains(i)) + + val pos1 = set.getPos(i) + val pos2 = set.addWithoutResize(i) & OpenHashSet.POSITION_MASK + assert(pos1 == pos2) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25559][SQL] Remove the unsupported predicates in Parquet when possible
Repository: spark Updated Branches: refs/heads/master 9362c5cc2 -> 5d726b865 [SPARK-25559][SQL] Remove the unsupported predicates in Parquet when possible ## What changes were proposed in this pull request? Currently, in `ParquetFilters`, if one of the children predicates is not supported by Parquet, the entire predicates will be thrown away. In fact, if the unsupported predicate is in the top level `And` condition or in the child before hitting `Not` or `Or` condition, it can be safely removed. ## How was this patch tested? Tests are added. Closes #22574 from dbtsai/removeUnsupportedPredicatesInParquet. Lead-authored-by: DB Tsai Co-authored-by: Dongjoon Hyun Co-authored-by: DB Tsai Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d726b86 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d726b86 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d726b86 Branch: refs/heads/master Commit: 5d726b865948f993911fd5b9730b25cfa94e16c7 Parents: 9362c5c Author: DB Tsai Authored: Fri Sep 28 17:46:11 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Sep 28 17:46:11 2018 -0700 -- .../datasources/parquet/ParquetFilters.scala| 38 +++-- .../parquet/ParquetFilterSuite.scala| 147 ++- 2 files changed, 172 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5d726b86/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 0c286de..44a0d20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -394,7 +394,13 @@ private[parquet] class 
ParquetFilters( */ def createFilter(schema: MessageType, predicate: sources.Filter): Option[FilterPredicate] = { val nameToParquetField = getFieldMap(schema) +createFilterHelper(nameToParquetField, predicate, canRemoveOneSideInAnd = true) + } + private def createFilterHelper( + nameToParquetField: Map[String, ParquetField], + predicate: sources.Filter, + canRemoveOneSideInAnd: Boolean): Option[FilterPredicate] = { // Decimal type must make sure that filter value's scale matched the file. // If doesn't matched, which would cause data corruption. def isDecimalMatched(value: Any, decimalMeta: DecimalMetadata): Boolean = value match { @@ -488,26 +494,36 @@ private[parquet] class ParquetFilters( .map(_(nameToParquetField(name).fieldName, value)) case sources.And(lhs, rhs) => -// At here, it is not safe to just convert one side if we do not understand the -// other side. Here is an example used to explain the reason. +// At here, it is not safe to just convert one side and remove the other side +// if we do not understand what the parent filters are. +// +// Here is an example used to explain the reason. // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to // convert b in ('1'). If we only convert a = 2, we will end up with a filter // NOT(a = 2), which will generate wrong results. -// Pushing one side of AND down is only safe to do at the top level. -// You can see ParquetRelation's initializeLocalJobFunc method as an example. -for { - lhsFilter <- createFilter(schema, lhs) - rhsFilter <- createFilter(schema, rhs) -} yield FilterApi.and(lhsFilter, rhsFilter) +// +// Pushing one side of AND down is only safe to do at the top level or in the child +// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate +// can be safely removed. 
+val lhsFilterOption = createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd) +val rhsFilterOption = createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd) + +(lhsFilterOption, rhsFilterOption) match { + case (Some(lhsFilter), Some(rhsFilter)) => Some(FilterApi.and(lhsFilter, rhsFilter)) + case (Some(lhsFilter), None) if canRemoveOneSideInAnd => Some(lhsFilter) + case (None, Some(rhsFilter)) if canRemoveOneSideInAnd => Some(rhsFilter) + case _ => None +} case sources.Or(lhs, rhs) => for { -
spark git commit: [SPARK-25508][SQL][TEST] Refactor OrcReadBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 623c2ec4e -> f246813af [SPARK-25508][SQL][TEST] Refactor OrcReadBenchmark to use main method ## What changes were proposed in this pull request? Refactor OrcReadBenchmark to use main method. Generate benchmark result: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "hive/test:runMain org.apache.spark.sql.hive.orc.OrcReadBenchmark" ``` ## How was this patch tested? manual tests Closes #22580 from yucai/SPARK-25508. Lead-authored-by: yucai Co-authored-by: Yucai Yu Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f246813a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f246813a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f246813a Branch: refs/heads/master Commit: f246813afba16fee4d703f09e6302011b11806f3 Parents: 623c2ec Author: yucai Authored: Sat Sep 29 09:48:03 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Sep 29 09:48:03 2018 -0700 -- .../benchmarks/OrcReadBenchmark-results.txt | 173 .../spark/sql/hive/orc/OrcReadBenchmark.scala | 196 --- 2 files changed, 212 insertions(+), 157 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f246813a/sql/hive/benchmarks/OrcReadBenchmark-results.txt -- diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt new file mode 100644 index 000..c77f966 --- /dev/null +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -0,0 +1,173 @@ + +SQL Single Numeric Column Scan + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Native ORC MR 1630 / 1639 9.7 103.6 1.0X +Native ORC Vectorized 253 / 288 62.2 16.1 6.4X +Native ORC Vectorized with copy227 / 244 69.2 14.5 7.2X +Hive built-in ORC 1980 / 1991 7.9 125.9 0.8X 
+ +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Native ORC MR 1587 / 1589 9.9 100.9 1.0X +Native ORC Vectorized 227 / 242 69.2 14.5 7.0X +Native ORC Vectorized with copy228 / 238 69.0 14.5 7.0X +Hive built-in ORC 2323 / 2332 6.8 147.7 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Native ORC MR 1726 / 1771 9.1 109.7 1.0X +Native ORC Vectorized 309 / 333 50.9 19.7 5.6X +Native ORC Vectorized with copy313 / 321 50.2 19.9 5.5X +Hive built-in ORC 2668 / 2672 5.9 169.6 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Native ORC MR 1722 / 1747 9.1 109.5 1.0X +Native ORC Vectorized 395 / 403 39.8 25.1 4.4X +Native ORC Vectorized with copy399 / 405 39.4 25.4 4.3X +Hive built-in ORC 2767 / 2777 5.7 175.9 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +In
spark git commit: [SPARK-25543][K8S] Print debug message iff execIdsRemovedInThisRound is not empty.
Repository: spark Updated Branches: refs/heads/master 40e6ed894 -> 4da541a5d [SPARK-25543][K8S] Print debug message iff execIdsRemovedInThisRound is not empty. ## What changes were proposed in this pull request? Spurious logs like /sec. 2018-09-26 09:33:57 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:33:58 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:33:59 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:34:00 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. The fix is easy, first check if there are any removed executors, before producing the log message. ## How was this patch tested? Tested by manually deploying to a minikube cluster. Closes #22565 from ScrapCodes/spark-25543/k8s/debug-log-spurious-warning. 
Authored-by: Prashant Sharma Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4da541a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4da541a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4da541a5 Branch: refs/heads/master Commit: 4da541a5d23b039eb549dd849cf121bdc8676e59 Parents: 40e6ed8 Author: Prashant Sharma Authored: Sun Sep 30 14:28:20 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Sep 30 14:28:20 2018 -0700 -- .../scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4da541a5/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala -- diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index e2800cf..cc254b8 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -100,8 +100,11 @@ private[spark] class ExecutorPodsLifecycleManager( } } } -logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + - s" from Spark that were either found to be deleted or non-existent in the cluster.") + +if (execIdsRemovedInThisRound.nonEmpty) { + logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + +s" from Spark that were either found to be deleted or non-existent in the cluster.") +} } private def onFinalNonDeletedState( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25543][K8S] Print debug message iff execIdsRemovedInThisRound is not empty.
Repository: spark Updated Branches: refs/heads/branch-2.4 8e6fb473b -> c886f050b [SPARK-25543][K8S] Print debug message iff execIdsRemovedInThisRound is not empty. ## What changes were proposed in this pull request? Spurious logs like /sec. 2018-09-26 09:33:57 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:33:58 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:33:59 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. 2018-09-26 09:34:00 DEBUG ExecutorPodsLifecycleManager:58 - Removed executors with ids from Spark that were either found to be deleted or non-existent in the cluster. The fix is easy, first check if there are any removed executors, before producing the log message. ## How was this patch tested? Tested by manually deploying to a minikube cluster. Closes #22565 from ScrapCodes/spark-25543/k8s/debug-log-spurious-warning. 
Authored-by: Prashant Sharma Signed-off-by: Dongjoon Hyun (cherry picked from commit 4da541a5d23b039eb549dd849cf121bdc8676e59) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c886f050 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c886f050 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c886f050 Branch: refs/heads/branch-2.4 Commit: c886f050b51862bd8cdb41bc13f4dfedebdbcd31 Parents: 8e6fb47 Author: Prashant Sharma Authored: Sun Sep 30 14:28:20 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Sep 30 14:28:39 2018 -0700 -- .../scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c886f050/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala -- diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index b28d939..1481463 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -99,8 +99,11 @@ private[spark] class ExecutorPodsLifecycleManager( } } } -logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + - s" from Spark that were either found to be deleted or non-existent in the cluster.") + +if (execIdsRemovedInThisRound.nonEmpty) { + logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + +s" from Spark that were either found to be deleted or non-existent in the cluster.") +} } private def 
onFinalNonDeletedState( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25476][SPARK-25510][TEST] Refactor AggregateBenchmark and add a new trait to better support Dataset and DataFrame API
Repository: spark Updated Branches: refs/heads/master 30f5d0f2d -> b96fd44f0 [SPARK-25476][SPARK-25510][TEST] Refactor AggregateBenchmark and add a new trait to better support Dataset and DataFrame API ## What changes were proposed in this pull request? This PR does 2 things: 1. Add a new trait(`SqlBasedBenchmark`) to better support Dataset and DataFrame API. 2. Refactor `AggregateBenchmark` to use main method. Generate benchmark result: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.AggregateBenchmark" ``` ## How was this patch tested? manual tests Closes #22484 from wangyum/SPARK-25476. Lead-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b96fd44f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b96fd44f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b96fd44f Branch: refs/heads/master Commit: b96fd44f0e91751c1ce3a617cb083bdf880701a1 Parents: 30f5d0f Author: Yuming Wang Authored: Mon Oct 1 07:32:40 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Oct 1 07:32:40 2018 -0700 -- .../benchmarks/AggregateBenchmark-results.txt | 143 +++ .../benchmark/AggregateBenchmark.scala | 943 +-- .../execution/benchmark/SqlBasedBenchmark.scala | 60 ++ 3 files changed, 633 insertions(+), 513 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b96fd44f/sql/core/benchmarks/AggregateBenchmark-results.txt -- diff --git a/sql/core/benchmarks/AggregateBenchmark-results.txt b/sql/core/benchmarks/AggregateBenchmark-results.txt new file mode 100644 index 000..19e5247 --- /dev/null +++ b/sql/core/benchmarks/AggregateBenchmark-results.txt @@ -0,0 +1,143 @@ + +aggregate without grouping + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +agg w/o group: Best/Avg Time(ms)Rate(M/s) Per 
Row(ns) Relative + +agg w/o group wholestage off65374 / 70665 32.1 31.2 1.0X +agg w/o group wholestage on 1178 / 1209 1779.8 0.6 55.5X + + + +stat functions + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +stddev: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +stddev wholestage off 8667 / 8851 12.1 82.7 1.0X +stddev wholestage on 1266 / 1273 82.8 12.1 6.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +kurtosis:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +kurtosis wholestage off 41218 / 41231 2.5 393.1 1.0X +kurtosis wholestage on1347 / 1357 77.8 12.8 30.6X + + + +aggregate with linear keys + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w keys:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +codegen = F 9309 / 9389 9.0 111.0 1.0X +codegen = T hashmap = F 4417 / 4435 19.0 52.7 2.1X +codegen = T hashmap = T 1289 / 1298 65.1 15.4 7.2X + + +==
spark-website git commit: Add a section to update test suites
Repository: spark-website Updated Branches: refs/heads/asf-site 8b7444182 -> 4ea5a5d8f Add a section to update test suites Author: Dongjoon Hyun Closes #151 from dongjoon-hyun/test_suite. Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/4ea5a5d8 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/4ea5a5d8 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/4ea5a5d8 Branch: refs/heads/asf-site Commit: 4ea5a5d8fe197802d35e287d12d6e6fcd7582044 Parents: 8b74441 Author: Dongjoon Hyun Authored: Mon Oct 1 22:04:04 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Oct 1 22:04:04 2018 -0700 -- release-process.md| 9 + site/release-process.html | 9 + 2 files changed, 18 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/4ea5a5d8/release-process.md -- diff --git a/release-process.md b/release-process.md index de04539..7097c34 100644 --- a/release-process.md +++ b/release-process.md @@ -313,6 +313,15 @@ $ git shortlog v1.1.1 --grep "$EXPR" > contrib.txt $ git log v1.1.1 --grep "$expr" --shortstat --oneline | grep -B 1 -e "[3-9][0-9][0-9] insert" -e "[1-9][1-9][1-9][1-9] insert" | grep SPARK > large-patches.txt ``` +Update `HiveExternalCatalogVersionsSuite` + +When a new release occurs, `PROCESS_TABLES.testingVersions` in `HiveExternalCatalogVersionsSuite` +must be updated shortly thereafter. This list should contain the latest release in all active +maintenance branches, and no more. +For example, as of this writing, it has value `val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2")`. +"2.4.0" will be added to the list when it's released. "2.1.3" will be removed (and removed from the Spark dist mirrors) +when the branch is no longer maintained. "2.3.2" will become "2.3.3" when "2.3.3" is released. 
+ Create an Announcement Once everything is working (website docs, website changes) create an announcement on the website http://git-wip-us.apache.org/repos/asf/spark-website/blob/4ea5a5d8/site/release-process.html -- diff --git a/site/release-process.html b/site/release-process.html index d9e8fc0..daac07b 100644 --- a/site/release-process.html +++ b/site/release-process.html @@ -505,6 +505,15 @@ $ git shortlog v1.1.1 --grep "$EXPR" > contrib.txt $ git log v1.1.1 --grep "$expr" --shortstat --oneline | grep -B 1 -e "[3-9][0-9][0-9] insert" -e "[1-9][1-9][1-9][1-9] insert" | grep SPARK > large-patches.txt +Update `HiveExternalCatalogVersionsSuite` + +When a new release occurs, PROCESS_TABLES.testingVersions in HiveExternalCatalogVersionsSuite +must be updated shortly thereafter. This list should contain the latest release in all active +maintenance branches, and no more. +For example, as of this writing, it has value val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2"). +“2.4.0” will be added to the list when it’s released. “2.1.3” will be removed (and removed from the Spark dist mirrors) +when the branch is no longer maintained. “2.3.2” will become “2.3.3” when “2.3.3” is released. + Create an Announcement Once everything is working (website docs, website changes) create an announcement on the website - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25583][DOC] Add history-server related configuration in the documentation.
Repository: spark Updated Branches: refs/heads/master 5114db578 -> 71876633f [SPARK-25583][DOC] Add history-server related configuration in the documentation. ## What changes were proposed in this pull request? Add history-server related configuration in the documentation. Some of the history server related configurations were missing in the documentation. Like, 'spark.history.store.maxDiskUsage', 'spark.ui.liveUpdate.period' etc. ## How was this patch tested? ![screenshot from 2018-10-01 20-58-26](https://user-images.githubusercontent.com/23054875/46298568-04833a80-c5bd-11e8-95b8-54c9d6582fd2.png) ![screenshot from 2018-10-01 20-59-31](https://user-images.githubusercontent.com/23054875/46298591-11a02980-c5bd-11e8-93d0-892afdfd4f9a.png) ![screenshot from 2018-10-01 20-59-45](https://user-images.githubusercontent.com/23054875/46298601-1533b080-c5bd-11e8-9689-e9b39882a7b5.png) Closes #22601 from shahidki31/historyConf. Authored-by: Shahid Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71876633 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71876633 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71876633 Branch: refs/heads/master Commit: 71876633f3af706408355b5fb561b58dbc593360 Parents: 5114db5 Author: Shahid Authored: Tue Oct 2 08:05:09 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 2 08:05:09 2018 -0700 -- docs/configuration.md | 16 docs/monitoring.md| 25 + 2 files changed, 41 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71876633/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 782ccff..5577393 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -794,6 +794,13 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.dagGraph.retainedRootRDDs + Int.MaxValue + +How many DAG graph nodes the Spark UI and status APIs remember before
garbage collecting. + + + spark.ui.enabled true @@ -808,6 +815,15 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.liveUpdate.period + 100ms + +How often to update live entities. -1 means "never update" when replaying applications, +meaning only the last write will happen. For live applications, this avoids a few +operations that we can live without when rapidly processing incoming task events. + + + spark.ui.port 4040 http://git-wip-us.apache.org/repos/asf/spark/blob/71876633/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index f6d52ef..69bf308 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -186,6 +186,23 @@ Security options for the Spark History Server are covered more detail in the +spark.history.fs.endEventReparseChunkSize +1m + + How many bytes to parse at the end of log files looking for the end event. + This is used to speed up generation of application listings by skipping unnecessary + parts of event log files. It can be disabled by setting this config to 0. + + + +spark.history.fs.inProgressOptimization.enabled +true + + Enable optimized handling of in-progress logs. This option may leave finished + applications that fail to rename their event logs listed as in-progress. + + + spark.history.fs.numReplayThreads 25% of available cores @@ -193,6 +210,14 @@ Security options for the Spark History Server are covered more detail in the +spark.history.store.maxDiskUsage +10g + + Maximum disk usage for the local directory where the cache application history information + are stored. + + + spark.history.store.path (none) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25583][DOC] Add history-server related configuration in the documentation.
Repository: spark Updated Branches: refs/heads/branch-2.4 ad7b3f6ba -> ea4068a0a [SPARK-25583][DOC] Add history-server related configuration in the documentation. ## What changes were proposed in this pull request? Add history-server related configuration in the documentation. Some of the history server related configurations were missing in the documentation. Like, 'spark.history.store.maxDiskUsage', 'spark.ui.liveUpdate.period' etc. ## How was this patch tested? ![screenshot from 2018-10-01 20-58-26](https://user-images.githubusercontent.com/23054875/46298568-04833a80-c5bd-11e8-95b8-54c9d6582fd2.png) ![screenshot from 2018-10-01 20-59-31](https://user-images.githubusercontent.com/23054875/46298591-11a02980-c5bd-11e8-93d0-892afdfd4f9a.png) ![screenshot from 2018-10-01 20-59-45](https://user-images.githubusercontent.com/23054875/46298601-1533b080-c5bd-11e8-9689-e9b39882a7b5.png) Closes #22601 from shahidki31/historyConf. Authored-by: Shahid Signed-off-by: Dongjoon Hyun (cherry picked from commit 71876633f3af706408355b5fb561b58dbc593360) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea4068a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea4068a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea4068a0 Branch: refs/heads/branch-2.4 Commit: ea4068a0af036f9e266f64eac558aaa984f08a93 Parents: ad7b3f6 Author: Shahid Authored: Tue Oct 2 08:05:09 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 2 08:06:32 2018 -0700 -- docs/configuration.md | 16 docs/monitoring.md| 25 + 2 files changed, 41 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea4068a0/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 782ccff..5577393 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -794,6 +794,13 @@ Apart from these, the following properties are also available, and may be useful + 
spark.ui.dagGraph.retainedRootRDDs + Int.MaxValue + +How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting. + + + spark.ui.enabled true @@ -808,6 +815,15 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.liveUpdate.period + 100ms + +How often to update live entities. -1 means "never update" when replaying applications, +meaning only the last write will happen. For live applications, this avoids a few +operations that we can live without when rapidly processing incoming task events. + + + spark.ui.port 4040 http://git-wip-us.apache.org/repos/asf/spark/blob/ea4068a0/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index f6d52ef..69bf308 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -186,6 +186,23 @@ Security options for the Spark History Server are covered more detail in the +spark.history.fs.endEventReparseChunkSize +1m + + How many bytes to parse at the end of log files looking for the end event. + This is used to speed up generation of application listings by skipping unnecessary + parts of event log files. It can be disabled by setting this config to 0. + + + +spark.history.fs.inProgressOptimization.enabled +true + + Enable optimized handling of in-progress logs. This option may leave finished + applications that fail to rename their event logs listed as in-progress. + + + spark.history.fs.numReplayThreads 25% of available cores @@ -193,6 +210,14 @@ Security options for the Spark History Server are covered more detail in the +spark.history.store.maxDiskUsage +10g + + Maximum disk usage for the local directory where the cache application history information + are stored. + + + spark.history.store.path (none) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25581][SQL] Rename method `benchmark` as `runBenchmarkSuite` in `BenchmarkBase`
Repository: spark Updated Branches: refs/heads/master 9bf397c0e -> 7b4e94f16 [SPARK-25581][SQL] Rename method `benchmark` as `runBenchmarkSuite` in `BenchmarkBase` ## What changes were proposed in this pull request? Rename method `benchmark` in `BenchmarkBase` as `runBenchmarkSuite `. Also add comments. Currently the method name `benchmark` is a bit confusing. Also the name is the same as instances of `Benchmark`: https://github.com/apache/spark/blob/f246813afba16fee4d703f09e6302011b11806f3/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala#L330-L339 ## How was this patch tested? Unit test. Closes #22599 from gengliangwang/renameBenchmarkSuite. Authored-by: Gengliang Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b4e94f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b4e94f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b4e94f1 Branch: refs/heads/master Commit: 7b4e94f16096cd35835450d63620583496e4f978 Parents: 9bf397c Author: Gengliang Wang Authored: Tue Oct 2 10:04:47 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 2 10:04:47 2018 -0700 -- .../scala/org/apache/spark/benchmark/BenchmarkBase.scala| 9 +++-- .../spark/mllib/linalg/UDTSerializationBenchmark.scala | 2 +- .../org/apache/spark/sql/UnsafeProjectionBenchmark.scala| 2 +- .../spark/sql/execution/benchmark/AggregateBenchmark.scala | 2 +- .../sql/execution/benchmark/FilterPushdownBenchmark.scala | 2 +- .../sql/execution/benchmark/PrimitiveArrayBenchmark.scala | 2 +- .../spark/sql/execution/benchmark/SortBenchmark.scala | 2 +- .../columnar/compression/CompressionSchemeBenchmark.scala | 2 +- .../sql/execution/vectorized/ColumnarBatchBenchmark.scala | 2 +- .../org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala| 2 +- 10 files changed, 16 insertions(+), 11 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/7b4e94f1/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala -- diff --git a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala index 9a37e02..89e927e 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala @@ -25,7 +25,12 @@ import java.io.{File, FileOutputStream, OutputStream} abstract class BenchmarkBase { var output: Option[OutputStream] = None - def benchmark(): Unit + /** + * Main process of the whole benchmark. + * Implementations of this method are supposed to use the wrapper method `runBenchmark` + * for each benchmark scenario. + */ + def runBenchmarkSuite(): Unit final def runBenchmark(benchmarkName: String)(func: => Any): Unit = { val separator = "=" * 96 @@ -46,7 +51,7 @@ abstract class BenchmarkBase { output = Some(new FileOutputStream(file)) } -benchmark() +runBenchmarkSuite() output.foreach { o => if (o != null) { http://git-wip-us.apache.org/repos/asf/spark/blob/7b4e94f1/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala index 1a2216e..6c1d580 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder */ object UDTSerializationBenchmark extends BenchmarkBase { - override def benchmark(): Unit = { + override def runBenchmarkSuite(): Unit = { runBenchmark("VectorUDT de/serialization") { val iters = 1e2.toInt 
http://git-wip-us.apache.org/repos/asf/spark/blob/7b4e94f1/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala index cbe723f..e7a9948 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala @@ -41,7 +41,7 @@ object UnsafeProjectionBenchmark extends BenchmarkBase { (1 to numRows).map(
spark git commit: [SPARK-25576][BUILD][BRANCH-2.2] Fix lint failure
Repository: spark Updated Branches: refs/heads/branch-2.2 5dd14f5d9 -> 66c7b4281 [SPARK-25576][BUILD][BRANCH-2.2] Fix lint failure ## What changes were proposed in this pull request? Line length fixes and ## How was this patch tested? Manually verified, but will ensure jenkins lint passes before merging Related Job: https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Compile/job/spark-branch-2.2-lint/913/console Closes #22596 from samdvr/SPARK-25576. Lead-authored-by: Sam Davarnia Co-authored-by: Sam Davarnia <> Co-authored-by: Dongjoon Hyun Co-authored-by: Sam Davarnia Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/66c7b428 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/66c7b428 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/66c7b428 Branch: refs/heads/branch-2.2 Commit: 66c7b42811f9a46726738da8e601d08eb2486b6e Parents: 5dd14f5 Author: Sam Davarnia Authored: Tue Oct 2 10:13:54 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 2 10:13:54 2018 -0700 -- .../spark/unsafe/types/UTF8StringSuite.java | 2 ++ .../unsafe/sort/UnsafeInMemorySorter.java | 9 ++--- .../unsafe/sort/UnsafeExternalSorterSuite.java | 21 .../unsafe/sort/UnsafeInMemorySorterSuite.java | 3 ++- .../sql/execution/UnsafeKVExternalSorter.java | 2 +- 5 files changed, 24 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/66c7b428/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java -- diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 33b9e11..ea79c28 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -63,7 +63,9 @@ public class UTF8StringSuite { checkBasic("hello", 5); 
// 5 * 1 byte chars checkBasic("大 千 世 界", 7); checkBasic("︽﹋％", 3); // 3 * 3 bytes chars +// checkstyle.off: AvoidEscapedUnicodeCharacters checkBasic("\uD83E\uDD19", 1); // 4 bytes char +// checkstyle.on: AvoidEscapedUnicodeCharacters } @Test http://git-wip-us.apache.org/repos/asf/spark/blob/66c7b428/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java -- diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 839b41d..b025811 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -174,9 +174,12 @@ public final class UnsafeInMemorySorter { if (consumer != null) { consumer.freeArray(array); // the call to consumer.allocateArray may trigger a spill - // which in turn access this instance and eventually re-enter this method and try to free the array again. - // by setting the array to null and its length to 0 we effectively make the spill code-path a no-op. - // setting the array to null also indicates that it has already been de-allocated which prevents a double de-allocation in free(). + // which in turn access this instance and eventually re-enter this method + // and try to free the array again. + // By setting the array to null and its length to 0 + // we effectively make the spill code-path a no-op. + // Setting the array to null also indicates that it has already been + // de-allocated which prevents a double de-allocation in free(). 
array = null; usableCapacity = 0; pos = 0; http://git-wip-us.apache.org/repos/asf/spark/blob/66c7b428/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java -- diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index cce01a3..17c4d7e 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -460,7 +460,7 @@ public class UnsafeExternalSorterS
spark git commit: [SPARK-25583][DOC][BRANCH-2.3] Add history-server related configuration in the documentation.
Repository: spark Updated Branches: refs/heads/branch-2.3 8d7723f2e -> 7102aeeb2 [SPARK-25583][DOC][BRANCH-2.3] Add history-server related configuration in the documentation. ## What changes were proposed in this pull request? This is a follow-up PR for the PR, https://github.com/apache/spark/pull/22601. Add history-server related configuration in the documentation for Spark 2.3. Some of the history server related configurations were missing in the documentation. Like, 'spark.history.store.maxDiskUsage', 'spark.ui.liveUpdate.period' etc. ## How was this patch tested? ![screenshot from 2018-10-02 21-00-43](https://user-images.githubusercontent.com/23054875/46359184-4d52f600-c686-11e8-9d55-0ab178c074ed.png) ![screenshot from 2018-10-02 21-00-08](https://user-images.githubusercontent.com/23054875/46359214-5cd23f00-c686-11e8-92a4-0ce90190cfbb.png) Closes #22613 from shahidki31/SPARK-25583. Authored-by: Shahid Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7102aeeb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7102aeeb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7102aeeb Branch: refs/heads/branch-2.3 Commit: 7102aeeb2a01bd3e4cfb5f9e0bb87e1654339721 Parents: 8d7723f Author: Shahid Authored: Wed Oct 3 04:10:59 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 04:10:59 2018 -0700 -- docs/configuration.md | 16 docs/monitoring.md| 8 2 files changed, 24 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7102aeeb/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index ec4f5d4..c85c04f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -791,6 +791,13 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.dagGraph.retainedRootRDDs + Int.MaxValue + +How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting. 
+ + + spark.ui.enabled true @@ -805,6 +812,15 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.liveUpdate.period + 100ms + +How often to update live entities. -1 means "never update" when replaying applications, +meaning only the last write will happen. For live applications, this avoids a few +operations that we can live without when rapidly processing incoming task events. + + + spark.ui.port 4040 http://git-wip-us.apache.org/repos/asf/spark/blob/7102aeeb/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 6f6cfc1..bfdaea6 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -225,6 +225,14 @@ The history server can be configured as follows: +spark.history.store.maxDiskUsage +10g + + Maximum disk usage for the local directory where the cache application history information + are stored. + + + spark.history.store.path (none) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark
Repository: spark Updated Branches: refs/heads/master 928d0739c -> 1a5d83bed [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark ## What changes were proposed in this pull request? This PR aims to add `BloomFilterBenchmark`. For ORC data source, Apache Spark has been supporting for a long time. For Parquet data source, it's expected to be added with next Parquet release update. ## How was this patch tested? Manual. ```scala SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark" ``` Closes #22605 from dongjoon-hyun/SPARK-25589. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a5d83be Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a5d83be Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a5d83be Branch: refs/heads/master Commit: 1a5d83bed8a6df62ef643b08453c7dd8feebf93a Parents: 928d073 Author: Dongjoon Hyun Authored: Wed Oct 3 04:14:07 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 04:14:07 2018 -0700 -- .../benchmarks/BloomFilterBenchmark-results.txt | 24 ++ .../benchmark/BloomFilterBenchmark.scala| 87 2 files changed, 111 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/benchmarks/BloomFilterBenchmark-results.txt -- diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt new file mode 100644 index 000..2eeb26c --- /dev/null +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -0,0 +1,24 @@ + +ORC Write + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Write 100M rows: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Without bloom filter16765 / 17587 6.0 167.7 1.0X +With bloom filter 20060 / 20626 5.0 200.6 0.8X + + + +ORC Read + + +OpenJDK 64-Bit Server VM 
1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Read a row from 100M rows: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Without bloom filter 1857 / 1904 53.9 18.6 1.0X +With bloom filter 1399 / 1437 71.5 14.0 1.3X + + http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala new file mode 100644 index 000..2f3caca --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import scala.util.Random + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark to measure read performance with Bloo
spark git commit: [SPARK-25483][TEST] Refactor UnsafeArrayDataBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 1a5d83bed -> 56741c342 [SPARK-25483][TEST] Refactor UnsafeArrayDataBenchmark to use main method ## What changes were proposed in this pull request? Refactor `UnsafeArrayDataBenchmark` to use main method. Generate benchmark result: ```sh SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.UnsafeArrayDataBenchmark" ``` ## How was this patch tested? manual tests Closes #22491 from wangyum/SPARK-25483. Lead-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56741c34 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56741c34 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56741c34 Branch: refs/heads/master Commit: 56741c342dce87a75b39e52db6de92d7d7bef371 Parents: 1a5d83b Author: Yuming Wang Authored: Wed Oct 3 04:20:02 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 04:20:02 2018 -0700 -- .../UnsafeArrayDataBenchmark-results.txt| 33 + .../benchmark/UnsafeArrayDataBenchmark.scala| 73 ++-- 2 files changed, 56 insertions(+), 50 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/56741c34/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt -- diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt new file mode 100644 index 000..4ecc1f1 --- /dev/null +++ b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt @@ -0,0 +1,33 @@ + +Benchmark UnsafeArrayData + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Read UnsafeArrayData:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Int233 / 234718.6 1.4 1.0X +Double 244 / 244687.0 1.5 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU 
E5-2670 v2 @ 2.50GHz +Write UnsafeArrayData: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Int 32 / 33658.6 1.5 1.0X +Double 73 / 75287.0 3.5 0.4X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Get primitive array from UnsafeArrayData: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Int 70 / 72895.0 1.1 1.0X +Double 141 / 143446.9 2.2 0.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Create UnsafeArrayData from primitive array: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Int 72 / 73874.7 1.1 1.0X +Double 145 / 146433.7 2.3 0.5X + + http://git-wip-us.apache.org/repos/asf/spark/blob/56741c34/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala index 51ab0e1..79eaeab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala @@ -19,20 +19,21 @@ package org.apache.spark.sql.execution.benchmark import scala.util.Random -import org.apache.spark.benchmark.Benc
spark git commit: [SPARK-25538][SQL] Zero-out all bytes when writing decimal
Repository: spark Updated Branches: refs/heads/master 56741c342 -> d7ae36a81 [SPARK-25538][SQL] Zero-out all bytes when writing decimal ## What changes were proposed in this pull request? In #20850 when writing non-null decimals, instead of zero-ing all the 16 allocated bytes, we zero-out only the padding bytes. Since we always allocate 16 bytes, if the number of bytes needed for a decimal is lower than 9, then this means that the bytes between 8 and 16 are not zero-ed. I see 2 solutions here: - we can zero-out all the bytes in advance as it was done before #20850 (safer solution IMHO); - we can allocate only the needed bytes (may be a bit more efficient in terms of memory used, but I have not investigated the feasibility of this option). Hence I propose here the first solution in order to fix the correctness issue. We can eventually switch to the second if we think is more efficient later. ## How was this patch tested? Running the test attached in the JIRA + added UT Closes #22602 from mgaido91/SPARK-25582. 
Authored-by: Marco Gaido Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7ae36a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7ae36a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7ae36a8 Branch: refs/heads/master Commit: d7ae36a810bfcbedfe7360eb2cdbbc3ca970e4d0 Parents: 56741c3 Author: Marco Gaido Authored: Wed Oct 3 07:28:34 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 07:28:34 2018 -0700 -- .../expressions/codegen/UnsafeRowWriter.java| 10 ++-- .../codegen/UnsafeRowWriterSuite.scala | 53 2 files changed, 57 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7ae36a8/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java -- diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java index 71c49d8..3960d6d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java @@ -185,13 +185,13 @@ public final class UnsafeRowWriter extends UnsafeWriter { // grow the global buffer before writing data. holder.grow(16); + // always zero-out the 16-byte buffer + Platform.putLong(getBuffer(), cursor(), 0L); + Platform.putLong(getBuffer(), cursor() + 8, 0L); + // Make sure Decimal object has the same scale as DecimalType. // Note that we may pass in null Decimal object to set null for it. 
if (input == null || !input.changePrecision(precision, scale)) { -// zero-out the bytes -Platform.putLong(getBuffer(), cursor(), 0L); -Platform.putLong(getBuffer(), cursor() + 8, 0L); - BitSetMethods.set(getBuffer(), startingOffset, ordinal); // keep the offset for future update setOffsetAndSize(ordinal, 0); @@ -200,8 +200,6 @@ public final class UnsafeRowWriter extends UnsafeWriter { final int numBytes = bytes.length; assert numBytes <= 16; -zeroOutPaddingBytes(numBytes); - // Write the bytes to the variable length portion. Platform.copyMemory( bytes, Platform.BYTE_ARRAY_OFFSET, getBuffer(), cursor(), numBytes); http://git-wip-us.apache.org/repos/asf/spark/blob/d7ae36a8/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala new file mode 100644 index 000..fb651b7 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in w
spark git commit: [SPARK-25538][SQL] Zero-out all bytes when writing decimal
Repository: spark Updated Branches: refs/heads/branch-2.4 ea4068a0a -> 443d12dbb [SPARK-25538][SQL] Zero-out all bytes when writing decimal ## What changes were proposed in this pull request? In #20850 when writing non-null decimals, instead of zero-ing all the 16 allocated bytes, we zero-out only the padding bytes. Since we always allocate 16 bytes, if the number of bytes needed for a decimal is lower than 9, then this means that the bytes between 8 and 16 are not zero-ed. I see 2 solutions here: - we can zero-out all the bytes in advance as it was done before #20850 (safer solution IMHO); - we can allocate only the needed bytes (may be a bit more efficient in terms of memory used, but I have not investigated the feasibility of this option). Hence I propose here the first solution in order to fix the correctness issue. We can eventually switch to the second if we think is more efficient later. ## How was this patch tested? Running the test attached in the JIRA + added UT Closes #22602 from mgaido91/SPARK-25582. 
Authored-by: Marco Gaido Signed-off-by: Dongjoon Hyun (cherry picked from commit d7ae36a810bfcbedfe7360eb2cdbbc3ca970e4d0) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/443d12db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/443d12db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/443d12db Branch: refs/heads/branch-2.4 Commit: 443d12dbbe40e932978a9a1a811128da8afba89b Parents: ea4068a Author: Marco Gaido Authored: Wed Oct 3 07:28:34 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 3 07:28:48 2018 -0700 -- .../expressions/codegen/UnsafeRowWriter.java| 10 ++-- .../codegen/UnsafeRowWriterSuite.scala | 53 2 files changed, 57 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/443d12db/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java -- diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java index 71c49d8..3960d6d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java @@ -185,13 +185,13 @@ public final class UnsafeRowWriter extends UnsafeWriter { // grow the global buffer before writing data. holder.grow(16); + // always zero-out the 16-byte buffer + Platform.putLong(getBuffer(), cursor(), 0L); + Platform.putLong(getBuffer(), cursor() + 8, 0L); + // Make sure Decimal object has the same scale as DecimalType. // Note that we may pass in null Decimal object to set null for it. 
if (input == null || !input.changePrecision(precision, scale)) { -// zero-out the bytes -Platform.putLong(getBuffer(), cursor(), 0L); -Platform.putLong(getBuffer(), cursor() + 8, 0L); - BitSetMethods.set(getBuffer(), startingOffset, ordinal); // keep the offset for future update setOffsetAndSize(ordinal, 0); @@ -200,8 +200,6 @@ public final class UnsafeRowWriter extends UnsafeWriter { final int numBytes = bytes.length; assert numBytes <= 16; -zeroOutPaddingBytes(numBytes); - // Write the bytes to the variable length portion. Platform.copyMemory( bytes, Platform.BYTE_ARRAY_OFFSET, getBuffer(), cursor(), numBytes); http://git-wip-us.apache.org/repos/asf/spark/blob/443d12db/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala new file mode 100644 index 000..fb651b7 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at
spark git commit: [SPARK-25479][TEST] Refactor DatasetBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 71c24aad3 -> 95ae20946 [SPARK-25479][TEST] Refactor DatasetBenchmark to use main method ## What changes were proposed in this pull request? Refactor `DatasetBenchmark` to use main method. Generate benchmark result: ```sh SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.DatasetBenchmark" ``` ## How was this patch tested? manual tests Closes #22488 from wangyum/SPARK-25479. Lead-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95ae2094 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95ae2094 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95ae2094 Branch: refs/heads/master Commit: 95ae2094618fbbe07008c190105053dc2b85da1a Parents: 71c24aa Author: Yuming Wang Authored: Thu Oct 4 11:58:16 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 4 11:58:16 2018 -0700 -- .../benchmarks/DatasetBenchmark-results.txt | 46 ++ .../org/apache/spark/sql/DatasetBenchmark.scala | 96 +--- 2 files changed, 71 insertions(+), 71 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95ae2094/sql/core/benchmarks/DatasetBenchmark-results.txt -- diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt new file mode 100644 index 000..dcc190e --- /dev/null +++ b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -0,0 +1,46 @@ + +Dataset Benchmark + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back map long: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +RDD 11800 / 12042 8.5 118.0 1.0X +DataFrame 1927 / 2189 51.9 19.3 6.1X +Dataset 2483 / 2605 40.3 24.8 4.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz 
+back-to-back map:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +RDD 16286 / 16301 6.1 162.9 1.0X +DataFrame 8101 / 8104 12.3 81.0 2.0X +Dataset 17445 / 17811 5.7 174.4 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back filter Long:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +RDD 2971 / 3184 33.7 29.7 1.0X +DataFrame 1243 / 1296 80.5 12.4 2.4X +Dataset 3062 / 3091 32.7 30.6 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back filter: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +RDD 5253 / 5269 19.0 52.5 1.0X +DataFrame 211 / 234473.4 2.1 24.9X +Dataset 9550 / 9552 10.5 95.5 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +aggregate: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +RDD sum 5086 / 5108 19.7 50.9 1.0X +DataFrame sum 65 / 73 1548.9 0.6 78.8X +Data
spark git commit: [SPARK-25646][K8S] Fix docker-image-tool.sh on dev build.
Repository: spark Updated Branches: refs/heads/master 2c6f4d61b -> 58287a398 [SPARK-25646][K8S] Fix docker-image-tool.sh on dev build. The docker file was referencing a path that only existed in the distribution tarball; it needs to be parameterized so that the right path can be used in a dev build. Tested on local dev build. Closes #22634 from vanzin/SPARK-25646. Authored-by: Marcelo Vanzin Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58287a39 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58287a39 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58287a39 Branch: refs/heads/master Commit: 58287a39864db463eeef17d1152d664be021d9ef Parents: 2c6f4d6 Author: Marcelo Vanzin Authored: Fri Oct 5 21:15:16 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 5 21:15:16 2018 -0700 -- bin/docker-image-tool.sh | 2 ++ .../kubernetes/docker/src/main/dockerfiles/spark/Dockerfile | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58287a39/bin/docker-image-tool.sh -- diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index d637105..228494d 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -54,6 +54,8 @@ function build { img_path=$IMG_PATH --build-arg spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + --build-arg + k8s_tests=resource-managers/kubernetes/integration-tests/tests ) else # Not passed as an argument to docker, but used to validate the Spark directory. 
http://git-wip-us.apache.org/repos/asf/spark/blob/58287a39/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile -- diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 7ae57bf..1c4dcd5 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -19,6 +19,7 @@ FROM openjdk:8-alpine ARG spark_jars=jars ARG img_path=kubernetes/dockerfiles +ARG k8s_tests=kubernetes/tests # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. @@ -43,7 +44,7 @@ COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY ${img_path}/spark/entrypoint.sh /opt/ COPY examples /opt/spark/examples -COPY kubernetes/tests /opt/spark/tests +COPY ${k8s_tests} /opt/spark/tests COPY data /opt/spark/data ENV SPARK_HOME /opt/spark - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25646][K8S] Fix docker-image-tool.sh on dev build.
Repository: spark Updated Branches: refs/heads/branch-2.4 0a70afdc0 -> a2991d233 [SPARK-25646][K8S] Fix docker-image-tool.sh on dev build. The docker file was referencing a path that only existed in the distribution tarball; it needs to be parameterized so that the right path can be used in a dev build. Tested on local dev build. Closes #22634 from vanzin/SPARK-25646. Authored-by: Marcelo Vanzin Signed-off-by: Dongjoon Hyun (cherry picked from commit 58287a39864db463eeef17d1152d664be021d9ef) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2991d23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2991d23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2991d23 Branch: refs/heads/branch-2.4 Commit: a2991d23348bd1f4ecc33e5c762ccd12bb65f5cd Parents: 0a70afd Author: Marcelo Vanzin Authored: Fri Oct 5 21:15:16 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 5 21:18:12 2018 -0700 -- bin/docker-image-tool.sh | 2 ++ .../kubernetes/docker/src/main/dockerfiles/spark/Dockerfile | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2991d23/bin/docker-image-tool.sh -- diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index d637105..228494d 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -54,6 +54,8 @@ function build { img_path=$IMG_PATH --build-arg spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + --build-arg + k8s_tests=resource-managers/kubernetes/integration-tests/tests ) else # Not passed as an argument to docker, but used to validate the Spark directory. 
http://git-wip-us.apache.org/repos/asf/spark/blob/a2991d23/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile -- diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 7ae57bf..1c4dcd5 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -19,6 +19,7 @@ FROM openjdk:8-alpine ARG spark_jars=jars ARG img_path=kubernetes/dockerfiles +ARG k8s_tests=kubernetes/tests # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. @@ -43,7 +44,7 @@ COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY ${img_path}/spark/entrypoint.sh /opt/ COPY examples /opt/spark/examples -COPY kubernetes/tests /opt/spark/tests +COPY ${k8s_tests} /opt/spark/tests COPY data /opt/spark/data ENV SPARK_HOME /opt/spark - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25488][SQL][TEST] Refactor MiscBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 1ee472eec -> edf428661 [SPARK-25488][SQL][TEST] Refactor MiscBenchmark to use main method ## What changes were proposed in this pull request? Refactor `MiscBenchmark ` to use main method. Generate benchmark result: ```sh SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.MiscBenchmark" ``` ## How was this patch tested? manual tests Closes #22500 from wangyum/SPARK-25488. Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/edf42866 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/edf42866 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/edf42866 Branch: refs/heads/master Commit: edf42866118c8522dedea3fab848b04a7c50e44c Parents: 1ee472e Author: Yuming Wang Authored: Sat Oct 6 08:47:43 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 6 08:47:43 2018 -0700 -- sql/core/benchmarks/MiscBenchmark-results.txt | 120 +++ .../sql/execution/benchmark/MiscBenchmark.scala | 331 +++ 2 files changed, 232 insertions(+), 219 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/edf42866/sql/core/benchmarks/MiscBenchmark-results.txt -- diff --git a/sql/core/benchmarks/MiscBenchmark-results.txt b/sql/core/benchmarks/MiscBenchmark-results.txt new file mode 100644 index 000..85acd57 --- /dev/null +++ b/sql/core/benchmarks/MiscBenchmark-results.txt @@ -0,0 +1,120 @@ + +filter & aggregate without group + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +range/filter/sum:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +range/filter/sum wholestage off 47752 / 48952 43.9 22.8 1.0X +range/filter/sum wholestage on3123 / 3558671.5 1.5 15.3X + + + +range/limit/sum + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on 
Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +range/limit/sum: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +range/limit/sum wholestage off 229 / 236 2288.9 0.4 1.0X +range/limit/sum wholestage on 257 / 267 2041.0 0.5 0.9X + + + +sample + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +sample with replacement: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +sample with replacement wholestage off 12908 / 13076 10.2 98.5 1.0X +sample with replacement wholestage on 7334 / 7346 17.9 56.0 1.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +sample without replacement: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +sample without replacement wholestage off 3082 / 3095 42.5 23.5 1.0X +sample without replacement wholestage on 1125 / 1211116.5 8.6 2.7X + + + +collect + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +collect: Best/Avg Time(ms)Rate(M/s) Per R
spark git commit: [SPARK-25644][SS][FOLLOWUP][BUILD] Fix Scala 2.12 build error due to foreachBatch
Repository: spark Updated Branches: refs/heads/branch-2.4 a2991d233 -> 48e2e6fcc [SPARK-25644][SS][FOLLOWUP][BUILD] Fix Scala 2.12 build error due to foreachBatch ## What changes were proposed in this pull request? This PR fixes the Scala-2.12 build error due to ambiguity in `foreachBatch` test cases. - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/428/console ```scala [error] /home/jenkins/workspace/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala:102: ambiguous reference to overloaded definition, [error] both method foreachBatch in class DataStreamWriter of type (function: org.apache.spark.api.java.function.VoidFunction2[org.apache.spark.sql.Dataset[Int],Long])org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] and method foreachBatch in class DataStreamWriter of type (function: (org.apache.spark.sql.Dataset[Int], Long) => Unit)org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] match argument types ((org.apache.spark.sql.Dataset[Int], Any) => Unit) [error] ds.writeStream.foreachBatch((_, _) => {}).trigger(Trigger.Continuous("1 second")).start() [error] ^ [error] /home/jenkins/workspace/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala:106: ambiguous reference to overloaded definition, [error] both method foreachBatch in class DataStreamWriter of type (function: org.apache.spark.api.java.function.VoidFunction2[org.apache.spark.sql.Dataset[Int],Long])org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] and method foreachBatch in class DataStreamWriter of type (function: (org.apache.spark.sql.Dataset[Int], Long) => Unit)org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] match argument types ((org.apache.spark.sql.Dataset[Int], Any) => 
Unit) [error] ds.writeStream.foreachBatch((_, _) => {}).partitionBy("value").start() [error] ^ ``` ## How was this patch tested? Manual. Since this failure occurs in Scala-2.12 profile and test cases, Jenkins will not test this. We need to build with Scala-2.12 and run the tests. Closes #22649 from dongjoon-hyun/SPARK-SCALA212. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 9cbf105ab1256d65f027115ba5505842ce8fffe3) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/48e2e6fc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/48e2e6fc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/48e2e6fc Branch: refs/heads/branch-2.4 Commit: 48e2e6fcc3617f021b55c2e2be0cda39cad89711 Parents: a2991d2 Author: Dongjoon Hyun Authored: Sat Oct 6 09:40:42 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 6 09:40:54 2018 -0700 -- .../apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala | 4 ++-- .../sql/execution/streaming/sources/ForeachBatchSinkSuite.scala | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/48e2e6fc/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index e0b6d8c..d89e45e 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -33,7 +33,7 @@ import org.apache.kafka.common.TopicPartition import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ -import 
org.apache.spark.sql.{ForeachWriter, SparkSession} +import org.apache.spark.sql.{Dataset, ForeachWriter, SparkSession} import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution @@ -879,7 +879,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { } testUtils.waitUntilOffsetAppears(topicPartition, 5) -val q = ds.writeStream.foreachBatch { (ds, epochId) => +val q = ds.writeStream.foreachBatch { (ds: Dataset[String], epochId: Long) => if (epochId == 0) {
spark git commit: [SPARK-25644][SS][FOLLOWUP][BUILD] Fix Scala 2.12 build error due to foreachBatch
Repository: spark Updated Branches: refs/heads/master 5a617ec4e -> 9cbf105ab [SPARK-25644][SS][FOLLOWUP][BUILD] Fix Scala 2.12 build error due to foreachBatch ## What changes were proposed in this pull request? This PR fixes the Scala-2.12 build error due to ambiguity in `foreachBatch` test cases. - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/428/console ```scala [error] /home/jenkins/workspace/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala:102: ambiguous reference to overloaded definition, [error] both method foreachBatch in class DataStreamWriter of type (function: org.apache.spark.api.java.function.VoidFunction2[org.apache.spark.sql.Dataset[Int],Long])org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] and method foreachBatch in class DataStreamWriter of type (function: (org.apache.spark.sql.Dataset[Int], Long) => Unit)org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] match argument types ((org.apache.spark.sql.Dataset[Int], Any) => Unit) [error] ds.writeStream.foreachBatch((_, _) => {}).trigger(Trigger.Continuous("1 second")).start() [error] ^ [error] /home/jenkins/workspace/spark-master-test-maven-hadoop-2.7-ubuntu-scala-2.12/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala:106: ambiguous reference to overloaded definition, [error] both method foreachBatch in class DataStreamWriter of type (function: org.apache.spark.api.java.function.VoidFunction2[org.apache.spark.sql.Dataset[Int],Long])org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] and method foreachBatch in class DataStreamWriter of type (function: (org.apache.spark.sql.Dataset[Int], Long) => Unit)org.apache.spark.sql.streaming.DataStreamWriter[Int] [error] match argument types ((org.apache.spark.sql.Dataset[Int], Any) => Unit) 
[error] ds.writeStream.foreachBatch((_, _) => {}).partitionBy("value").start() [error] ^ ``` ## How was this patch tested? Manual. Since this failure occurs in Scala-2.12 profile and test cases, Jenkins will not test this. We need to build with Scala-2.12 and run the tests. Closes #22649 from dongjoon-hyun/SPARK-SCALA212. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9cbf105a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9cbf105a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9cbf105a Branch: refs/heads/master Commit: 9cbf105ab1256d65f027115ba5505842ce8fffe3 Parents: 5a617ec Author: Dongjoon Hyun Authored: Sat Oct 6 09:40:42 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 6 09:40:42 2018 -0700 -- .../apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala | 4 ++-- .../sql/execution/streaming/sources/ForeachBatchSinkSuite.scala | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9cbf105a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index 39c2cde..5ee7699 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -33,7 +33,7 @@ import org.apache.kafka.common.TopicPartition import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ -import org.apache.spark.sql.{ForeachWriter, SparkSession} +import org.apache.spark.sql.{Dataset, ForeachWriter, SparkSession} import 
org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.streaming._ @@ -900,7 +900,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { } testUtils.waitUntilOffsetAppears(topicPartition, 5) -val q = ds.writeStream.foreachBatch { (ds, epochId) => +val q = ds.writeStream.foreachBatch { (ds: Dataset[String], epochId: Long) => if (epochId == 0) { // Send more message before the tasks of the current batch start reading the current batch // data, so t
spark git commit: [SPARK-25062][SQL] Clean up BlockLocations in InMemoryFileIndex
Repository: spark Updated Branches: refs/heads/master 9cbf105ab -> b0cee9605 [SPARK-25062][SQL] Clean up BlockLocations in InMemoryFileIndex ## What changes were proposed in this pull request? `InMemoryFileIndex` contains a cache of `LocatedFileStatus` objects. Each `LocatedFileStatus` object can contain several `BlockLocation`s or some subclass of it. Filling up this cache by listing files happens recursively either on the driver or on the executors, depending on the parallel discovery threshold (`spark.sql.sources.parallelPartitionDiscovery.threshold`). If the listing happens on the executors block location objects are converted to simple `BlockLocation` objects to ensure serialization requirements. If it happens on the driver then there is no conversion and depending on the file system a `BlockLocation` object can be a subclass like `HdfsBlockLocation` and consume more memory. This PR adds the conversion to the latter case and decreases memory consumption. ## How was this patch tested? Added unit test. Closes #22603 from peter-toth/SPARK-25062. 
Authored-by: Peter Toth Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b0cee960 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b0cee960 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b0cee960 Branch: refs/heads/master Commit: b0cee9605e7c71cfd020aa917319478f9ac61bdb Parents: 9cbf105 Author: Peter Toth Authored: Sat Oct 6 14:50:03 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 6 14:50:03 2018 -0700 -- .../datasources/InMemoryFileIndex.scala | 9 - .../execution/datasources/FileIndexSuite.scala | 39 +++- 2 files changed, 46 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b0cee960/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index dc5c2ff..fe418e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -315,7 +315,14 @@ object InMemoryFileIndex extends Logging { // which is very slow on some file system (RawLocalFileSystem, which is launch a // subprocess and parse the stdout). 
try { - val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc => +// Store BlockLocation objects to consume less memory +if (loc.getClass == classOf[BlockLocation]) { + loc +} else { + new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength) +} + } val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, f.getModificationTime, 0, null, null, null, null, f.getPath, locations) if (f.isSymlink) { http://git-wip-us.apache.org/repos/asf/spark/blob/b0cee960/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 18bb4bf..49e7af4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -23,7 +23,7 @@ import java.net.URI import scala.collection.mutable import scala.language.reflectiveCalls -import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem} +import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path, RawLocalFileSystem} import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.catalyst.util._ @@ -248,6 +248,26 @@ class FileIndexSuite extends SharedSQLContext { assert(spark.read.parquet(path.getAbsolutePath).schema.exists(_.name == colToUnescape)) } } + + test("SPARK-25062 - InMemoryFileIndex stores BlockLocation objects no matter what subclass " + +"the FS returns") { +withSQLConf("fs.file.impl" -> classOf[SpecialBlockLocationFileSystem].getName) { + withTempDir { dir => +val file = new File(dir, &qu
spark git commit: [SPARK-25658][SQL][TEST] Refactor HashByteArrayBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 3eb842969 -> b1328cc58 [SPARK-25658][SQL][TEST] Refactor HashByteArrayBenchmark to use main method ## What changes were proposed in this pull request? Refactor `HashByteArrayBenchmark` to use main method. 1. use `spark-submit`: ```console bin/spark-submit --class org.apache.spark.sql.HashByteArrayBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ``` 2. Generate benchmark result: ```console SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashByteArrayBenchmark" ``` ## How was this patch tested? manual tests Closes #22652 from wangyum/SPARK-25658. Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1328cc5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1328cc5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1328cc5 Branch: refs/heads/master Commit: b1328cc58ebb73bc191de5546735cffe0c68255e Parents: 3eb8429 Author: Yuming Wang Authored: Sun Oct 7 09:44:01 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Oct 7 09:44:01 2018 -0700 -- .../HashByteArrayBenchmark-results.txt | 77 .../spark/sql/HashByteArrayBenchmark.scala | 120 --- 2 files changed, 102 insertions(+), 95 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b1328cc5/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt -- diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt new file mode 100644 index 000..a4304ee --- /dev/null +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt @@ -0,0 +1,77 @@ + +Benchmark for MurMurHash 3 and xxHash64 + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 8: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Murmur3_x86_32 16 / 16127.7 7.8 1.0X +xxHash 64-bit 23 / 23 90.7 11.0 0.7X +HiveHasher 16 / 16134.8 7.4 1.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 16: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Murmur3_x86_32 26 / 26 79.5 12.6 1.0X +xxHash 64-bit 26 / 27 79.3 12.6 1.0X +HiveHasher 30 / 30 70.1 14.3 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 24: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Murmur3_x86_32 36 / 36 58.1 17.2 1.0X +xxHash 64-bit 30 / 30 70.2 14.2 1.2X +HiveHasher 45 / 45 46.4 21.5 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 31: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Murmur3_x86_32 50 / 50 41.8 23.9 1.0X +xxHash 64-bit 43 / 43 49.3 20.3 1.2X +HiveHasher 58 / 58 35.9 27.8 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays w
spark git commit: [SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master b1328cc58 -> 669ade3a8 [SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method ## What changes were proposed in this pull request? Refactor `HashBenchmark` to use main method. 1. use `spark-submit`: ```console bin/spark-submit --class org.apache.spark.sql.HashBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ``` 2. Generate benchmark result: ```console SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashBenchmark" ``` ## How was this patch tested? manual tests Closes #22651 from wangyum/SPARK-25657. Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/669ade3a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/669ade3a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/669ade3a Branch: refs/heads/master Commit: 669ade3a8eed0016b5ece57d776cea0616417088 Parents: b1328cc Author: Yuming Wang Authored: Sun Oct 7 09:49:37 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Oct 7 09:49:37 2018 -0700 -- .../benchmarks/HashBenchmark-results.txt| 70 + .../org/apache/spark/sql/HashBenchmark.scala| 152 +++ 2 files changed, 129 insertions(+), 93 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/669ade3a/sql/catalyst/benchmarks/HashBenchmark-results.txt -- diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt new file mode 100644 index 000..2459b35 --- /dev/null +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -0,0 +1,70 @@ + +single ints + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single ints:Best/Avg Time(ms)Rate(M/s) Per 
Row(ns) Relative + +interpreted version 5615 / 5616 95.6 10.5 1.0X +codegen version 8400 / 8407 63.9 15.6 0.7X +codegen version 64-bit8139 / 8145 66.0 15.2 0.7X +codegen HiveHash version 7213 / 7348 74.4 13.4 0.8X + + + +single longs + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single longs: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +interpreted version 6053 / 6054 88.7 11.3 1.0X +codegen version 9367 / 9369 57.3 17.4 0.6X +codegen version 64-bit8041 / 8051 66.8 15.0 0.8X +codegen HiveHash version 7546 / 7575 71.1 14.1 0.8X + + + +normal + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For normal: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +interpreted version 3181 / 3182 0.7 1517.0 1.0X +codegen version 2403 / 2403 0.9 1145.7 1.3X +codegen version 64-bit 915 / 916 2.3 436.2 3.5X +codegen HiveHash version 4505 / 4527 0.5 2148.3 0.7X + + +==
spark git commit: [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change
Repository: spark Updated Branches: refs/heads/master 669ade3a8 -> ebd899b8a [SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change ## What changes were proposed in this pull request? This is the same as #22492 but for master branch. Revert SPARK-14681 to avoid API breaking changes. cc: WeichenXu123 ## How was this patch tested? Existing unit tests. Closes #22618 from mengxr/SPARK-25321.master. Authored-by: WeichenXu Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebd899b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebd899b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebd899b8 Branch: refs/heads/master Commit: ebd899b8a865395e6f1137163cb508086696879b Parents: 669ade3 Author: WeichenXu Authored: Sun Oct 7 10:06:44 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Oct 7 10:06:44 2018 -0700 -- .../classification/DecisionTreeClassifier.scala | 14 +- .../spark/ml/classification/GBTClassifier.scala | 6 +- .../classification/RandomForestClassifier.scala | 6 +- .../ml/regression/DecisionTreeRegressor.scala | 13 +- .../spark/ml/regression/GBTRegressor.scala | 6 +- .../ml/regression/RandomForestRegressor.scala | 6 +- .../scala/org/apache/spark/ml/tree/Node.scala | 247 --- .../spark/ml/tree/impl/RandomForest.scala | 10 +- .../org/apache/spark/ml/tree/treeModels.scala | 36 +-- .../DecisionTreeClassifierSuite.scala | 31 +-- .../ml/classification/GBTClassifierSuite.scala | 4 +- .../RandomForestClassifierSuite.scala | 5 +- .../regression/DecisionTreeRegressorSuite.scala | 14 -- .../spark/ml/tree/impl/RandomForestSuite.scala | 22 +- .../apache/spark/ml/tree/impl/TreeTests.scala | 12 +- project/MimaExcludes.scala | 7 - 16 files changed, 107 insertions(+), 332 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ebd899b8/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 8a57bfc..6648e78 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -168,7 +168,7 @@ object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifi @Since("1.4.0") class DecisionTreeClassificationModel private[ml] ( @Since("1.4.0")override val uid: String, -@Since("1.4.0")override val rootNode: ClassificationNode, +@Since("1.4.0")override val rootNode: Node, @Since("1.6.0")override val numFeatures: Int, @Since("1.5.0")override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] @@ -181,7 +181,7 @@ class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. 
*/ - private[ml] def this(rootNode: ClassificationNode, numFeatures: Int, numClasses: Int) = + private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses) override def predict(features: Vector): Double = { @@ -279,9 +279,8 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] - val root = loadTreeNodes(path, metadata, sparkSession, isClassification = true) - val model = new DecisionTreeClassificationModel(metadata.uid, -root.asInstanceOf[ClassificationNode], numFeatures, numClasses) + val root = loadTreeNodes(path, metadata, sparkSession) + val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses) metadata.getAndSetParams(model) model } @@ -296,10 +295,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica require(oldModel.algo == OldAlgo.Classification, s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}") -val rootNode = Node.fromOld(o
spark git commit: [SPARK-25700][SQL] Creates ReadSupport in only Append Mode in Data Source V2 write path
Repository: spark Updated Branches: refs/heads/master 80813e198 -> 83e19d5b8 [SPARK-25700][SQL] Creates ReadSupport in only Append Mode in Data Source V2 write path ## What changes were proposed in this pull request? This PR proposes to avoid to make a readsupport and read schema when it writes in other save modes. https://github.com/apache/spark/commit/5fef6e3513d6023a837c427d183006d153c7102b happened to create a readsupport in write path, which ended up with reading schema from readsupport at write path. This breaks `spark.range(1).format("source").write.save("non-existent-path")` case since there's no way to read the schema from "non-existent-path". See also https://github.com/apache/spark/pull/22009#discussion_r223982672 See also https://github.com/apache/spark/pull/22697 See also http://apache-spark-developers-list.1001551.n3.nabble.com/Possible-bug-in-DatasourceV2-td25343.html ## How was this patch tested? Unit test and manual tests. Closes #22688 from HyukjinKwon/append-revert-2. 
Authored-by: hyukjinkwon Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83e19d5b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83e19d5b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83e19d5b Branch: refs/heads/master Commit: 83e19d5b80fac6ea4b29d8eb561a5ad06835171b Parents: 80813e1 Author: hyukjinkwon Authored: Thu Oct 11 09:35:49 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 11 09:35:49 2018 -0700 -- .../org/apache/spark/sql/DataFrameWriter.scala | 2 +- .../sql/sources/v2/DataSourceV2Suite.scala | 29 .../sources/v2/SimpleWritableDataSource.scala | 5 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/83e19d5b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 188fce7..55e538f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -246,8 +246,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { df.sparkSession.sessionState.conf) val options = sessionOptions ++ extraOptions - val relation = DataSourceV2Relation.create(source, options) if (mode == SaveMode.Append) { +val relation = DataSourceV2Relation.create(source, options) runCommand(df.sparkSession, "save") { AppendData.byName(relation, df.logicalPlan) } http://git-wip-us.apache.org/repos/asf/spark/blob/83e19d5b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 7cc8abc..e8f291a 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -351,6 +351,24 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-25700: do not read schema when writing in other modes except append mode") { +withTempPath { file => + val cls = classOf[SimpleWriteOnlyDataSource] + val path = file.getCanonicalPath + val df = spark.range(5).select('id as 'i, -'id as 'j) + try { +df.write.format(cls.getName).option("path", path).mode("error").save() +df.write.format(cls.getName).option("path", path).mode("overwrite").save() +df.write.format(cls.getName).option("path", path).mode("ignore").save() + } catch { +case e: SchemaReadAttemptException => fail("Schema read was attempted.", e) + } + intercept[SchemaReadAttemptException] { +df.write.format(cls.getName).option("path", path).mode("append").save() + } +} + } } @@ -640,3 +658,14 @@ object SpecificReaderFactory extends PartitionReaderFactory { } } } + +class SchemaReadAttemptException(m: String) extends RuntimeException(m) + +class SimpleWriteOnlyDataSource extends SimpleWritableDataSource { + override def fullSchema(): StructType = { +// This is a bit hacky since this source implements
spark git commit: [SPARK-25664][SQL][TEST] Refactor JoinBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 4e141a416 -> e965fb55a [SPARK-25664][SQL][TEST] Refactor JoinBenchmark to use main method ## What changes were proposed in this pull request? Refactor `JoinBenchmark` to use main method. 1. use `spark-submit`: ```console bin/spark-submit --class org.apache.spark.sql.execution.benchmark.JoinBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar ``` 2. Generate benchmark result: ```console SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.JoinBenchmark" ``` ## How was this patch tested? manual tests Closes #22661 from wangyum/SPARK-25664. Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e965fb55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e965fb55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e965fb55 Branch: refs/heads/master Commit: e965fb55acf714bdd639d13d73f75a7d7b43efca Parents: 4e141a4 Author: Yuming Wang Authored: Fri Oct 12 16:08:12 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 12 16:08:12 2018 -0700 -- .../org/apache/spark/benchmark/Benchmark.scala | 3 +- sql/core/benchmarks/JoinBenchmark-results.txt | 75 +++ .../sql/execution/benchmark/JoinBenchmark.scala | 216 +++ 3 files changed, 152 insertions(+), 142 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e965fb55/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala -- diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 7a36b5f..bb389cd 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -200,11 
+200,12 @@ private[spark] object Benchmark { def getProcessorName(): String = { val cpu = if (SystemUtils.IS_OS_MAC_OSX) { Utils.executeAndGetOutput(Seq("/usr/sbin/sysctl", "-n", "machdep.cpu.brand_string")) +.stripLineEnd } else if (SystemUtils.IS_OS_LINUX) { Try { val grepPath = Utils.executeAndGetOutput(Seq("which", "grep")).stripLineEnd Utils.executeAndGetOutput(Seq(grepPath, "-m", "1", "model name", "/proc/cpuinfo")) -.stripLineEnd.replaceFirst("model name[\\s*]:[\\s*]", "") + .stripLineEnd.replaceFirst("model name[\\s*]:[\\s*]", "") }.getOrElse("Unknown processor") } else { System.getenv("PROCESSOR_IDENTIFIER") http://git-wip-us.apache.org/repos/asf/spark/blob/e965fb55/sql/core/benchmarks/JoinBenchmark-results.txt -- diff --git a/sql/core/benchmarks/JoinBenchmark-results.txt b/sql/core/benchmarks/JoinBenchmark-results.txt new file mode 100644 index 000..8ceb5e7 --- /dev/null +++ b/sql/core/benchmarks/JoinBenchmark-results.txt @@ -0,0 +1,75 @@ + +Join Benchmark + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Join w long: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Join w long wholestage off4464 / 4483 4.7 212.9 1.0X +Join w long wholestage on 289 / 339 72.6 13.8 15.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Join w long duplicated: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Join w long duplicated wholestage off 5662 / 5678 3.7 270.0 1.0X +Join w long duplicated wholestage on 332 / 345 63.1 15.8 17.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Join w 2 ints: Best/Avg Time(ms)Rate(M/s) Per Row
spark git commit: [SPARK-25711][CORE] Improve start-history-server.sh: show usage User-Friendly and remove deprecated options
Repository: spark Updated Branches: refs/heads/master 2eaf05878 -> 26c1b959c [SPARK-25711][CORE] Improve start-history-server.sh: show usage User-Friendly and remove deprecated options ## What changes were proposed in this pull request? Currently, if we try to run ``` ./start-history-server.sh -h ``` We will get such error ``` java.io.FileNotFoundException: File -h does not exist ``` 1. This is not User-Friendly. For option `-h` or `--help`, it should be parsed correctly and show the usage of the class/script. 2. We can remove deprecated options for setting event log directory through command line options. After fix, we can get following output: ``` Usage: ./sbin/start-history-server.sh [options] Options: --properties-file FILE Path to a custom Spark properties file. Default is conf/spark-defaults.conf. Configuration options can be set by setting the corresponding JVM system property. History Server options are always available; additional options depend on the provider. History Server options: spark.history.ui.port Port where server will listen for connections (default 18080) spark.history.acls.enable Whether to enable view acls for all applications (default false) spark.history.provider Name of history provider class (defaults to file system-based provider) spark.history.retainedApplications Max number of application UIs to keep loaded in memory (default 50) FsHistoryProvider options: spark.history.fs.logDirectory Directory where app logs are stored (default: file:/tmp/spark-events) spark.history.fs.updateInterval How often to reload log data from storage (in seconds, default: 10) ``` ## How was this patch tested? Manual test Closes #22699 from gengliangwang/refactorSHSUsage. 
Authored-by: Gengliang Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26c1b959 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26c1b959 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26c1b959 Branch: refs/heads/master Commit: 26c1b959cf29b8552beb715cc5d39288d5298bdc Parents: 2eaf058 Author: Gengliang Wang Authored: Sat Oct 13 13:34:31 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 13:34:31 2018 -0700 -- .../deploy/history/HistoryServerArguments.scala | 34 ++-- .../history/HistoryServerArgumentsSuite.scala | 12 --- sbin/start-history-server.sh| 17 +- 3 files changed, 25 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26c1b959/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala index 080ba12..49f00cb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala @@ -34,35 +34,21 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin @tailrec private def parse(args: List[String]): Unit = { -if (args.length == 1) { - setLogDirectory(args.head) -} else { - args match { -case ("--dir" | "-d") :: value :: tail => - setLogDirectory(value) - parse(tail) +args match { + case ("--help" | "-h") :: tail => +printUsageAndExit(0) -case ("--help" | "-h") :: tail => - printUsageAndExit(0) + case ("--properties-file") :: value :: tail => +propertiesFile = value +parse(tail) -case ("--properties-file") :: value :: tail => - propertiesFile = value - parse(tail) + case Nil => -case Nil => - -case _ => - printUsageAndExit(1) - } + case _ => +printUsageAndExit(1) } } - 
private def setLogDirectory(value: String): Unit = { -logWarning("Setting log directory through the command line is deprecated as of " + - "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") -conf.set("spark.history.fs.logDirectory", value) - } - // This mutates the SparkConf, so all access
spark git commit: [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite
Repository: spark Updated Branches: refs/heads/master b73f76beb -> 6bbceb9fe [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite ## What changes were proposed in this pull request? [SPARK-22479](https://github.com/apache/spark/pull/19708/files#diff-5c22ac5160d3c9d81225c5dd86265d27R31) adds a test case which sometimes fails because the used password string `123` matches `41230802`. This PR aims to fix the flakiness. - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/97343/consoleFull ```scala SaveIntoDataSourceCommandSuite: - simpleString is redacted *** FAILED *** "SaveIntoDataSourceCommand .org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider41230802, Map(password -> *(redacted), url -> *(redacted), driver -> mydriver), ErrorIfExists +- Range (0, 1, step=1, splits=Some(2)) " contained "123" (SaveIntoDataSourceCommandSuite.scala:42) ``` ## How was this patch tested? Pass the Jenkins with the updated test case Closes #22716 from dongjoon-hyun/SPARK-25726. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6bbceb9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6bbceb9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6bbceb9f Branch: refs/heads/master Commit: 6bbceb9fefe815d18001c6dd84f9ea2883d17a88 Parents: b73f76b Author: Dongjoon Hyun Authored: Sat Oct 13 18:01:28 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 18:01:28 2018 -0700 -- .../sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6bbceb9f/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index a1da3ec..8b06b17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -25,7 +25,7 @@ class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" -val PASS = "123" +val PASS = "mypassword" val DRIVER = "mydriver" val dataSource = DataSource( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite
Repository: spark Updated Branches: refs/heads/branch-2.4 c4efcf1dd -> 883ca3f99 [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite ## What changes were proposed in this pull request? [SPARK-22479](https://github.com/apache/spark/pull/19708/files#diff-5c22ac5160d3c9d81225c5dd86265d27R31) adds a test case which sometimes fails because the used password string `123` matches `41230802`. This PR aims to fix the flakiness. - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/97343/consoleFull ```scala SaveIntoDataSourceCommandSuite: - simpleString is redacted *** FAILED *** "SaveIntoDataSourceCommand .org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider41230802, Map(password -> *(redacted), url -> *(redacted), driver -> mydriver), ErrorIfExists +- Range (0, 1, step=1, splits=Some(2)) " contained "123" (SaveIntoDataSourceCommandSuite.scala:42) ``` ## How was this patch tested? Pass the Jenkins with the updated test case Closes #22716 from dongjoon-hyun/SPARK-25726. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 6bbceb9fefe815d18001c6dd84f9ea2883d17a88) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/883ca3f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/883ca3f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/883ca3f9 Branch: refs/heads/branch-2.4 Commit: 883ca3f990e7533938ab5b4d24b1b35f9da0768a Parents: c4efcf1 Author: Dongjoon Hyun Authored: Sat Oct 13 18:01:28 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 18:01:42 2018 -0700 -- .../sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/883ca3f9/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index a1da3ec..8b06b17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -25,7 +25,7 @@ class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" -val PASS = "123" +val PASS = "mypassword" val DRIVER = "mydriver" val dataSource = DataSource( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite
Repository: spark Updated Branches: refs/heads/branch-2.3 b3d1b1bcb -> 1e1599834 [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite ## What changes were proposed in this pull request? [SPARK-22479](https://github.com/apache/spark/pull/19708/files#diff-5c22ac5160d3c9d81225c5dd86265d27R31) adds a test case which sometimes fails because the used password string `123` matches `41230802`. This PR aims to fix the flakiness. - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/97343/consoleFull ```scala SaveIntoDataSourceCommandSuite: - simpleString is redacted *** FAILED *** "SaveIntoDataSourceCommand .org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider41230802, Map(password -> *(redacted), url -> *(redacted), driver -> mydriver), ErrorIfExists +- Range (0, 1, step=1, splits=Some(2)) " contained "123" (SaveIntoDataSourceCommandSuite.scala:42) ``` ## How was this patch tested? Pass the Jenkins with the updated test case Closes #22716 from dongjoon-hyun/SPARK-25726. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 6bbceb9fefe815d18001c6dd84f9ea2883d17a88) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e159983 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e159983 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e159983 Branch: refs/heads/branch-2.3 Commit: 1e15998346882bfb7d08fd838bce7a6141510312 Parents: b3d1b1b Author: Dongjoon Hyun Authored: Sat Oct 13 18:01:28 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 18:01:55 2018 -0700 -- .../sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1e159983/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index a1da3ec..8b06b17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -25,7 +25,7 @@ class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" -val PASS = "123" +val PASS = "mypassword" val DRIVER = "mydriver" val dataSource = DataSource( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite
Repository: spark Updated Branches: refs/heads/branch-2.2 66c7b4281 -> 9a74cb300 [SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandSuite ## What changes were proposed in this pull request? [SPARK-22479](https://github.com/apache/spark/pull/19708/files#diff-5c22ac5160d3c9d81225c5dd86265d27R31) adds a test case which sometimes fails because the used password string `123` matches `41230802`. This PR aims to fix the flakiness. - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/97343/consoleFull ```scala SaveIntoDataSourceCommandSuite: - simpleString is redacted *** FAILED *** "SaveIntoDataSourceCommand .org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider41230802, Map(password -> *(redacted), url -> *(redacted), driver -> mydriver), ErrorIfExists +- Range (0, 1, step=1, splits=Some(2)) " contained "123" (SaveIntoDataSourceCommandSuite.scala:42) ``` ## How was this patch tested? Pass the Jenkins with the updated test case Closes #22716 from dongjoon-hyun/SPARK-25726. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 6bbceb9fefe815d18001c6dd84f9ea2883d17a88) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9a74cb30 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9a74cb30 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9a74cb30 Branch: refs/heads/branch-2.2 Commit: 9a74cb300c2ba533ebec9bf1ebcb31b72591d4bf Parents: 66c7b42 Author: Dongjoon Hyun Authored: Sat Oct 13 18:01:28 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 18:03:11 2018 -0700 -- .../sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9a74cb30/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index cf340d0..4acdf04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -25,7 +25,7 @@ class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" -val PASS = "123" +val PASS = "mypassword" val DRIVER = "mydriver" val simpleString = SaveIntoDataSourceCommand( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25727][SQL] Add outputOrdering to otherCopyArgs in InMemoryRelation
Repository: spark Updated Branches: refs/heads/master 6bbceb9fe -> 6c3f2c6a6 [SPARK-25727][SQL] Add outputOrdering to otherCopyArgs in InMemoryRelation ## What changes were proposed in this pull request? Add `outputOrdering ` to `otherCopyArgs` in InMemoryRelation so that this field will be copied when we doing the tree transformation. ``` val data = Seq(100).toDF("count").cache() data.queryExecution.optimizedPlan.toJSON ``` The above code can generate the following error: ``` assertion failed: InMemoryRelation fields: output, cacheBuilder, statsOfPlanToCache, outputOrdering, values: List(count#178), CachedRDDBuilder(true,1,StorageLevel(disk, memory, deserialized, 1 replicas),*(1) Project [value#176 AS count#178] +- LocalTableScan [value#176] ,None), Statistics(sizeInBytes=12.0 B, hints=none) java.lang.AssertionError: assertion failed: InMemoryRelation fields: output, cacheBuilder, statsOfPlanToCache, outputOrdering, values: List(count#178), CachedRDDBuilder(true,1,StorageLevel(disk, memory, deserialized, 1 replicas),*(1) Project [value#176 AS count#178] +- LocalTableScan [value#176] ,None), Statistics(sizeInBytes=12.0 B, hints=none) at scala.Predef$.assert(Predef.scala:170) at org.apache.spark.sql.catalyst.trees.TreeNode.jsonFields(TreeNode.scala:611) at org.apache.spark.sql.catalyst.trees.TreeNode.org$apache$spark$sql$catalyst$trees$TreeNode$$collectJsonValue$1(TreeNode.scala:599) at org.apache.spark.sql.catalyst.trees.TreeNode.jsonValue(TreeNode.scala:604) at org.apache.spark.sql.catalyst.trees.TreeNode.toJSON(TreeNode.scala:590) ``` ## How was this patch tested? Added a test Closes #22715 from gatorsmile/copyArgs1. 
Authored-by: gatorsmile Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6c3f2c6a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6c3f2c6a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6c3f2c6a Branch: refs/heads/master Commit: 6c3f2c6a6aa69f80de5504961cfd61b9a61ea7ce Parents: 6bbceb9 Author: gatorsmile Authored: Sat Oct 13 22:10:17 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 22:10:17 2018 -0700 -- .../apache/spark/sql/execution/columnar/InMemoryRelation.scala | 2 +- .../sql/execution/columnar/InMemoryColumnarQuerySuite.scala| 6 ++ 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6c3f2c6a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 1a8fbac..b752b77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -206,7 +206,7 @@ case class InMemoryRelation( outputOrdering).asInstanceOf[this.type] } - override protected def otherCopyArgs: Seq[AnyRef] = Seq(statsOfPlanToCache) + override protected def otherCopyArgs: Seq[AnyRef] = Seq(statsOfPlanToCache, outputOrdering) override def simpleString: String = s"InMemoryRelation [${Utils.truncatedString(output, ", ")}], ${cacheBuilder.storageLevel}" http://git-wip-us.apache.org/repos/asf/spark/blob/6c3f2c6a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index efc2f20..b1b23e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -488,6 +488,12 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-25727 - otherCopyArgs in InMemoryRelation does not include outputOrdering") { +val data = Seq(100).toDF("count").cache() +val json = data.queryExecution.optimizedPlan.toJSON +assert(json.contains("outputOrdering") && json.contains("statsOfPlanToCache")) + } + test("SPARK-22673: InMemoryRelation should utilize existing stats of the plan to be cached") { // This test case depends on the size of parquet in statistics.
spark git commit: [SPARK-25727][SQL] Add outputOrdering to otherCopyArgs in InMemoryRelation
Repository: spark Updated Branches: refs/heads/branch-2.4 883ca3f99 -> 3e776d73b [SPARK-25727][SQL] Add outputOrdering to otherCopyArgs in InMemoryRelation ## What changes were proposed in this pull request? Add `outputOrdering ` to `otherCopyArgs` in InMemoryRelation so that this field will be copied when we doing the tree transformation. ``` val data = Seq(100).toDF("count").cache() data.queryExecution.optimizedPlan.toJSON ``` The above code can generate the following error: ``` assertion failed: InMemoryRelation fields: output, cacheBuilder, statsOfPlanToCache, outputOrdering, values: List(count#178), CachedRDDBuilder(true,1,StorageLevel(disk, memory, deserialized, 1 replicas),*(1) Project [value#176 AS count#178] +- LocalTableScan [value#176] ,None), Statistics(sizeInBytes=12.0 B, hints=none) java.lang.AssertionError: assertion failed: InMemoryRelation fields: output, cacheBuilder, statsOfPlanToCache, outputOrdering, values: List(count#178), CachedRDDBuilder(true,1,StorageLevel(disk, memory, deserialized, 1 replicas),*(1) Project [value#176 AS count#178] +- LocalTableScan [value#176] ,None), Statistics(sizeInBytes=12.0 B, hints=none) at scala.Predef$.assert(Predef.scala:170) at org.apache.spark.sql.catalyst.trees.TreeNode.jsonFields(TreeNode.scala:611) at org.apache.spark.sql.catalyst.trees.TreeNode.org$apache$spark$sql$catalyst$trees$TreeNode$$collectJsonValue$1(TreeNode.scala:599) at org.apache.spark.sql.catalyst.trees.TreeNode.jsonValue(TreeNode.scala:604) at org.apache.spark.sql.catalyst.trees.TreeNode.toJSON(TreeNode.scala:590) ``` ## How was this patch tested? Added a test Closes #22715 from gatorsmile/copyArgs1. 
Authored-by: gatorsmile Signed-off-by: Dongjoon Hyun (cherry picked from commit 6c3f2c6a6aa69f80de5504961cfd61b9a61ea7ce) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e776d73 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e776d73 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e776d73 Branch: refs/heads/branch-2.4 Commit: 3e776d73b87b8dd7cda603e409dbc4eb258748bb Parents: 883ca3f Author: gatorsmile Authored: Sat Oct 13 22:10:17 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 13 22:10:57 2018 -0700 -- .../apache/spark/sql/execution/columnar/InMemoryRelation.scala | 2 +- .../sql/execution/columnar/InMemoryColumnarQuerySuite.scala| 6 ++ 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e776d73/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 1a8fbac..b752b77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -206,7 +206,7 @@ case class InMemoryRelation( outputOrdering).asInstanceOf[this.type] } - override protected def otherCopyArgs: Seq[AnyRef] = Seq(statsOfPlanToCache) + override protected def otherCopyArgs: Seq[AnyRef] = Seq(statsOfPlanToCache, outputOrdering) override def simpleString: String = s"InMemoryRelation [${Utils.truncatedString(output, ", ")}], ${cacheBuilder.storageLevel}" http://git-wip-us.apache.org/repos/asf/spark/blob/3e776d73/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index efc2f20..b1b23e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -488,6 +488,12 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-25727 - otherCopyArgs in InMemoryRelation does not include outputOrdering") { +val data = Seq(100).toDF("count").cache() +val json = data.queryExecution.optimizedPlan.toJSON +assert(json.contains("outputOrdering") && json.contains("statsOfPlanToCache")) + } + test("SPARK-22673: InMemoryRelation should utilize existing stats
spark git commit: [SPARK-25760][SQL] Set AddJarCommand return empty
Repository: spark Updated Branches: refs/heads/master 734c6af0d -> 1117fc35f [SPARK-25760][SQL] Set AddJarCommand return empty ## What changes were proposed in this pull request? Only `AddJarCommand` return `0`, the user will be confused about what it means. This PR sets it to empty. ```sql spark-sql> add jar /Users/yumwang/spark/sql/hive/src/test/resources/TestUDTF.jar; ADD JAR /Users/yumwang/spark/sql/hive/src/test/resources/TestUDTF.jar 0 spark-sql> ``` ## How was this patch tested? manual tests ```sql spark-sql> add jar /Users/yumwang/spark/sql/hive/src/test/resources/TestUDTF.jar; ADD JAR /Users/yumwang/spark/sql/hive/src/test/resources/TestUDTF.jar spark-sql> ``` Closes #22747 from wangyum/AddJarCommand. Authored-by: Yuming Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1117fc35 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1117fc35 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1117fc35 Branch: refs/heads/master Commit: 1117fc35ff11ecc2873b4ec095ad243e8dcb5675 Parents: 734c6af Author: Yuming Wang Authored: Thu Oct 18 09:19:42 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 09:19:42 2018 -0700 -- .../scala/org/apache/spark/sql/execution/command/resources.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1117fc35/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala index 2e859cf..8fee02a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala @@ -38,7 +38,7 @@ case class AddJarCommand(path: String) extends RunnableCommand { override def 
run(sparkSession: SparkSession): Seq[Row] = { sparkSession.sessionState.resourceLoader.addJar(path) -Seq(Row(0)) +Seq.empty[Row] } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25758][ML] Deprecate computeCost on BisectingKMeans
Repository: spark Updated Branches: refs/heads/master 15524c41b -> c2962546d [SPARK-25758][ML] Deprecate computeCost on BisectingKMeans ## What changes were proposed in this pull request? The PR proposes to deprecate the `computeCost` method on `BisectingKMeans` in favor of the adoption of `ClusteringEvaluator` in order to evaluate the clustering. ## How was this patch tested? NA Closes #22756 from mgaido91/SPARK-25758. Authored-by: Marco Gaido Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2962546 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2962546 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2962546 Branch: refs/heads/master Commit: c2962546d9a5900a5628a31b83d2c4b22c3a7936 Parents: 15524c4 Author: Marco Gaido Authored: Thu Oct 18 10:32:25 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 10:32:25 2018 -0700 -- .../scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 5 + python/pyspark/ml/clustering.py| 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2962546/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 5cb16cc..2243d99 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -125,8 +125,13 @@ class BisectingKMeansModel private[ml] ( /** * Computes the sum of squared distances between the input points and their corresponding cluster * centers. + * + * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator + * instead. You can also get the cost on the training dataset in the summary. 
*/ @Since("2.0.0") + @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " + +"instead. You can also get the cost on the training dataset in the summary.", "2.4.0") def computeCost(dataset: Dataset[_]): Double = { SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol) val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol) http://git-wip-us.apache.org/repos/asf/spark/blob/c2962546/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 5ef4e76..11eb124 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -540,7 +540,13 @@ class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): """ Computes the sum of squared distances between the input points and their corresponding cluster centers. + +..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. + You can also get the cost on the training dataset in the summary. """ +warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator " + "instead. You can also get the cost on the training dataset in the summary.", + DeprecationWarning) return self._call_java("computeCost", dataset) @property - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25758][ML] Deprecate computeCost on BisectingKMeans
Repository: spark Updated Branches: refs/heads/branch-2.4 ac9a6f08a -> 71a6a9ce8 [SPARK-25758][ML] Deprecate computeCost on BisectingKMeans ## What changes were proposed in this pull request? The PR proposes to deprecate the `computeCost` method on `BisectingKMeans` in favor of the adoption of `ClusteringEvaluator` in order to evaluate the clustering. ## How was this patch tested? NA Closes #22756 from mgaido91/SPARK-25758. Authored-by: Marco Gaido Signed-off-by: Dongjoon Hyun (cherry picked from commit c2962546d9a5900a5628a31b83d2c4b22c3a7936) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71a6a9ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71a6a9ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71a6a9ce Branch: refs/heads/branch-2.4 Commit: 71a6a9ce8558913bc410918c14b6799be9baaeb3 Parents: ac9a6f0 Author: Marco Gaido Authored: Thu Oct 18 10:32:25 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 10:32:37 2018 -0700 -- .../scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 5 + python/pyspark/ml/clustering.py| 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71a6a9ce/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 5cb16cc..2243d99 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -125,8 +125,13 @@ class BisectingKMeansModel private[ml] ( /** * Computes the sum of squared distances between the input points and their corresponding cluster * centers. + * + * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator + * instead. 
You can also get the cost on the training dataset in the summary. */ @Since("2.0.0") + @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " + +"instead. You can also get the cost on the training dataset in the summary.", "2.4.0") def computeCost(dataset: Dataset[_]): Double = { SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol) val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol) http://git-wip-us.apache.org/repos/asf/spark/blob/71a6a9ce/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 5ef4e76..11eb124 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -540,7 +540,13 @@ class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): """ Computes the sum of squared distances between the input points and their corresponding cluster centers. + +..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. + You can also get the cost on the training dataset in the summary. """ +warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator " + "instead. You can also get the cost on the training dataset in the summary.", + DeprecationWarning) return self._call_java("computeCost", dataset) @property - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-21402][SQL][BACKPORT-2.2] Fix java array of structs deserialization
Repository: spark Updated Branches: refs/heads/branch-2.2 773c8236c -> 2e3b923e0 [SPARK-21402][SQL][BACKPORT-2.2] Fix java array of structs deserialization This PR is to backport #22708 to branch 2.2. ## What changes were proposed in this pull request? MapObjects expression is used to map array elements to java beans. Struct type of elements is inferred from java bean structure and ends up with mixed up field order. I used UnresolvedMapObjects instead of MapObjects, which allows to provide element type for MapObjects during analysis based on the resolved input data, not on the java bean. ## How was this patch tested? Added a test case. Built complete project on travis. dongjoon-hyun cloud-fan Closes #22768 from vofque/SPARK-21402-2.2. Lead-authored-by: Vladimir Kuriatkov Co-authored-by: Vladimir Kuriatkov Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2e3b923e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2e3b923e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2e3b923e Branch: refs/heads/branch-2.2 Commit: 2e3b923e0095d52607670905fd18c11e231b458f Parents: 773c823 Author: Vladimir Kuriatkov Authored: Thu Oct 18 13:39:50 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 13:39:50 2018 -0700 -- .../spark/sql/catalyst/JavaTypeInference.scala | 6 +- .../spark/sql/JavaBeanWithArraySuite.java | 168 +++ .../resources/test-data/with-array-fields.json | 3 + 3 files changed, 174 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2e3b923e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 2698fae..afbf9ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -267,12 +267,12 @@ object JavaTypeInference { case c if listType.isAssignableFrom(typeToken) => val et = elementType(typeToken) + val array = Invoke( -MapObjects( +UnresolvedMapObjects( p => deserializerFor(et, Some(p)), - getPath, - inferDataType(et)._1), + getPath), "array", ObjectType(classOf[Array[Any]])) http://git-wip-us.apache.org/repos/asf/spark/blob/2e3b923e/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java new file mode 100644 index 000..1cb8507 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.test.TestSparkSession; +import org.apache.spark.sql.types.*; + +public class JavaBeanWithArraySuite { + + private static final List RECORDS = new ArrayList<>(); + + static { +RECORDS.add(new Record(1, Arrays.asList(new Interval(111, 211), new Interval(121, 221; +RECORDS.add(new Record(2, Arrays.asList(new Interval(112, 212), new Interval(122, 222; +RECORDS.add(new Record(3, Arrays.asList(n
spark git commit: [SPARK-24499][DOC][FOLLOW-UP] Split the page of sql-programming-guide.html to multiple separate pages
Repository: spark Updated Branches: refs/heads/branch-2.4 715355164 -> fd5b24726 [SPARK-24499][DOC][FOLLOW-UP] Split the page of sql-programming-guide.html to multiple separate pages ## What changes were proposed in this pull request? Forgot to clean remove the link for `Upgrading From Spark SQL 2.4 to 3.0` when merging to 2.4 ## How was this patch tested? N/A Closes #22769 from gatorsmile/test2.4. Authored-by: gatorsmile Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd5b2472 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd5b2472 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd5b2472 Branch: refs/heads/branch-2.4 Commit: fd5b247262761271ac36d67fe66f7814acc664a9 Parents: 7153551 Author: gatorsmile Authored: Thu Oct 18 13:51:13 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 13:51:13 2018 -0700 -- docs/sql-migration-guide.md | 1 - 1 file changed, 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fd5b2472/docs/sql-migration-guide.md -- diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 71d83e8..a3fc52c 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -5,7 +5,6 @@ displayTitle: Migration Guide --- * [Spark SQL Upgrading Guide](sql-migration-guide-upgrade.html) - * [Upgrading From Spark SQL 2.4 to 3.0](sql-migration-guide-upgrade.html#upgrading-from-spark-sql-24-to-30) * [Upgrading From Spark SQL 2.3 to 2.4](sql-migration-guide-upgrade.html#upgrading-from-spark-sql-23-to-24) * [Upgrading From Spark SQL 2.3.0 to 2.3.1 and above](sql-migration-guide-upgrade.html#upgrading-from-spark-sql-230-to-231-and-above) * [Upgrading From Spark SQL 2.2 to 2.3](sql-migration-guide-upgrade.html#upgrading-from-spark-sql-22-to-23) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-21402][SQL][BACKPORT-2.3] Fix java array of structs deserialization
Repository: spark Updated Branches: refs/heads/branch-2.3 0726bc56f -> 61b301cc7 [SPARK-21402][SQL][BACKPORT-2.3] Fix java array of structs deserialization This PR is to backport #22708 to branch 2.3. ## What changes were proposed in this pull request? MapObjects expression is used to map array elements to java beans. Struct type of elements is inferred from java bean structure and ends up with mixed up field order. I used UnresolvedMapObjects instead of MapObjects, which allows to provide element type for MapObjects during analysis based on the resolved input data, not on the java bean. ## How was this patch tested? Added a test case. Built complete project on travis. dongjoon-hyun cloud-fan Closes #22767 from vofque/SPARK-21402-2.3. Authored-by: Vladimir Kuriatkov Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/61b301cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/61b301cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/61b301cc Branch: refs/heads/branch-2.3 Commit: 61b301cc7bf3fce4c034be3171291d5212c386e1 Parents: 0726bc5 Author: Vladimir Kuriatkov Authored: Thu Oct 18 14:46:03 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 18 14:46:03 2018 -0700 -- .../spark/sql/catalyst/JavaTypeInference.scala | 3 +- .../spark/sql/JavaBeanWithArraySuite.java | 154 +++ .../resources/test-data/with-array-fields.json | 3 + 3 files changed, 158 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/61b301cc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 3ecc137..7a226d7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -271,10 +271,9 @@ object JavaTypeInference { case c if listType.isAssignableFrom(typeToken) => val et = elementType(typeToken) -MapObjects( +UnresolvedMapObjects( p => deserializerFor(et, Some(p)), getPath, - inferDataType(et)._1, customCollectionCls = Some(c)) case _ if mapType.isAssignableFrom(typeToken) => http://git-wip-us.apache.org/repos/asf/spark/blob/61b301cc/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java new file mode 100644 index 000..70dd110 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanWithArraySuite.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.test.TestSparkSession; + +public class JavaBeanWithArraySuite { + + private static final List RECORDS = new ArrayList<>(); + + static { +RECORDS.add(new Record(1, Arrays.asList(new Interval(111, 211), new Interval(121, 221)))); +RECORDS.add(new Record(2, Arrays.asList(new Interval(112, 212), new Interval(122, 222)))); +RECORDS.add(new Record(3, Arrays.asList(new Interval(113, 213), new Interval(123, 223)))); + } + + private TestSparkSession spark; + + @Before + public void setUp() { +spark = ne
spark git commit: [SPARK-25269][SQL] SQL interface support specify StorageLevel when cache table
Repository: spark Updated Branches: refs/heads/master ac586bbb0 -> 9ad0f6ea8 [SPARK-25269][SQL] SQL interface support specify StorageLevel when cache table ## What changes were proposed in this pull request? SQL interface support specify `StorageLevel` when cache table. The semantic is: ```sql CACHE TABLE tableName OPTIONS('storageLevel' 'DISK_ONLY'); ``` All supported `StorageLevel` are: https://github.com/apache/spark/blob/eefdf9f9dd8afde49ad7d4e230e2735eb817ab0a/core/src/main/scala/org/apache/spark/storage/StorageLevel.scala#L172-L183 ## How was this patch tested? unit tests and manual tests. manual tests configuration: ``` --executor-memory 15G --executor-cores 5 --num-executors 50 ``` Data: Input Size / Records: 1037.7 GB / 11732805788 Result: ![image](https://user-images.githubusercontent.com/5399861/47213362-56a1c980-d3cd-11e8-82e7-28d7abc5923e.png) Closes #22263 from wangyum/SPARK-25269. Authored-by: Yuming Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ad0f6ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ad0f6ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ad0f6ea Branch: refs/heads/master Commit: 9ad0f6ea89435391ec16e436bc4c4d5bf6b68493 Parents: ac586bb Author: Yuming Wang Authored: Fri Oct 19 09:15:55 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 19 09:15:55 2018 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 3 +- .../spark/sql/execution/SparkSqlParser.scala| 3 +- .../spark/sql/execution/command/cache.scala | 23 +++- .../org/apache/spark/sql/CachedTableSuite.scala | 60 .../apache/spark/sql/hive/test/TestHive.scala | 2 +- 5 files changed, 86 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9ad0f6ea/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 0569986..e2d34d1 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -162,7 +162,8 @@ statement tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable | REFRESH (STRING | .*?) #refreshResource -| CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable +| CACHE LAZY? TABLE tableIdentifier +(OPTIONS options=tablePropertyList)? (AS? query)? #cacheTable | UNCACHE TABLE (IF EXISTS)? tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE http://git-wip-us.apache.org/repos/asf/spark/blob/9ad0f6ea/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 4ed14d3..364efea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -282,7 +282,8 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { throw new ParseException(s"It is not allowed to add database prefix `$database` to " + s"the table name in CACHE TABLE AS SELECT", ctx) } -CacheTableCommand(tableIdent, query, ctx.LAZY != null) +val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) +CacheTableCommand(tableIdent, query, ctx.LAZY != null, options) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/9ad0f6ea/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala 
index 6b00426..728604a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -17,16 +17,21 @@ package org.apache.spark.sql.e
spark git commit: [MINOR][DOC] Update the building doc to use Maven 3.5.4 and Java 8 only
Repository: spark Updated Branches: refs/heads/master ed9d0aac9 -> fc9ba9dcc [MINOR][DOC] Update the building doc to use Maven 3.5.4 and Java 8 only ## What changes were proposed in this pull request? Since we didn't test Java 9 ~ 11 up to now in the community, fix the document to describe Java 8 only. ## How was this patch tested? N/A (This is a document only change.) Closes #22781 from dongjoon-hyun/SPARK-JDK-DOC. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc9ba9dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc9ba9dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc9ba9dc Branch: refs/heads/master Commit: fc9ba9dcc6ad47fbd05f093b94e7e1358d5f Parents: ed9d0aa Author: Dongjoon Hyun Authored: Fri Oct 19 23:56:40 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 19 23:56:40 2018 -0700 -- docs/building-spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc9ba9dc/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 6bcc30d..8af90db 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -12,7 +12,7 @@ redirect_from: "building-with-maven.html" ## Apache Maven The Maven-based build is the build of reference for Apache Spark. -Building Spark using Maven requires Maven 3.3.9 or newer and Java 8+. +Building Spark using Maven requires Maven 3.5.4 and Java 8. Note that support for Java 7 was removed as of Spark 2.2.0. ### Setting up Maven's Memory Usage - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][DOC] Update the building doc to use Maven 3.5.4 and Java 8 only
Repository: spark Updated Branches: refs/heads/branch-2.4 d6a02c568 -> 869242c6b [MINOR][DOC] Update the building doc to use Maven 3.5.4 and Java 8 only ## What changes were proposed in this pull request? Since we didn't test Java 9 ~ 11 up to now in the community, fix the document to describe Java 8 only. ## How was this patch tested? N/A (This is a document only change.) Closes #22781 from dongjoon-hyun/SPARK-JDK-DOC. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit fc9ba9dcc6ad47fbd05f093b94e7e1358d5f) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/869242c6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/869242c6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/869242c6 Branch: refs/heads/branch-2.4 Commit: 869242c6b8008c30b7e527760df48d7cb8df4593 Parents: d6a02c5 Author: Dongjoon Hyun Authored: Fri Oct 19 23:56:40 2018 -0700 Committer: Dongjoon Hyun Committed: Fri Oct 19 23:56:53 2018 -0700 -- docs/building-spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/869242c6/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 1501f0b..7b9697c 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -12,7 +12,7 @@ redirect_from: "building-with-maven.html" ## Apache Maven The Maven-based build is the build of reference for Apache Spark. -Building Spark using Maven requires Maven 3.3.9 or newer and Java 8+. +Building Spark using Maven requires Maven 3.5.4 and Java 8. Note that support for Java 7 was removed as of Spark 2.2.0. ### Setting up Maven's Memory Usage - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOC][MINOR] Fix minor error in the code of graphx guide
Repository: spark Updated Branches: refs/heads/branch-2.3 5cef11acc -> 719ff7af6 [DOC][MINOR] Fix minor error in the code of graphx guide ## What changes were proposed in this pull request? Fix minor error in the code "sketch of pregel implementation" of GraphX guide. This fixed error relates to `[SPARK-12995][GraphX] Remove deprecate APIs from Pregel` ## How was this patch tested? N/A Closes #22780 from WeichenXu123/minor_doc_update1. Authored-by: WeichenXu Signed-off-by: Dongjoon Hyun (cherry picked from commit 3b4f35f568eb3844d2a789c8a409bc705477df6b) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/719ff7af Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/719ff7af Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/719ff7af Branch: refs/heads/branch-2.3 Commit: 719ff7af645389e878b2a452f4a4318cc9248de3 Parents: 5cef11a Author: WeichenXu Authored: Sat Oct 20 10:32:09 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 10:32:45 2018 -0700 -- docs/graphx-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/719ff7af/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 5c97a24..ed8f836 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -726,7 +726,7 @@ class GraphOps[VD, ED] { var g = mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg) ).cache() // compute the messages -var messages = g.mapReduceTriplets(sendMsg, mergeMsg) +var messages = GraphXUtils.mapReduceTriplets(g, sendMsg, mergeMsg) var activeMessages = messages.count() // Loop until no messages remain or maxIterations is achieved var i = 0 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOC][MINOR] Fix minor error in the code of graphx guide
Repository: spark Updated Branches: refs/heads/master fc9ba9dcc -> 3b4f35f56 [DOC][MINOR] Fix minor error in the code of graphx guide ## What changes were proposed in this pull request? Fix minor error in the code "sketch of pregel implementation" of GraphX guide. This fixed error relates to `[SPARK-12995][GraphX] Remove deprecate APIs from Pregel` ## How was this patch tested? N/A Closes #22780 from WeichenXu123/minor_doc_update1. Authored-by: WeichenXu Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b4f35f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b4f35f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b4f35f5 Branch: refs/heads/master Commit: 3b4f35f568eb3844d2a789c8a409bc705477df6b Parents: fc9ba9d Author: WeichenXu Authored: Sat Oct 20 10:32:09 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 10:32:09 2018 -0700 -- docs/graphx-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b4f35f5/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 3529334..cb96fd7 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -726,7 +726,7 @@ class GraphOps[VD, ED] { var g = mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg) ).cache() // compute the messages -var messages = g.mapReduceTriplets(sendMsg, mergeMsg) +var messages = GraphXUtils.mapReduceTriplets(g, sendMsg, mergeMsg) var activeMessages = messages.count() // Loop until no messages remain or maxIterations is achieved var i = 0 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOC][MINOR] Fix minor error in the code of graphx guide
Repository: spark Updated Branches: refs/heads/branch-2.4 869242c6b -> 0239277dd [DOC][MINOR] Fix minor error in the code of graphx guide ## What changes were proposed in this pull request? Fix minor error in the code "sketch of pregel implementation" of GraphX guide. This fixed error relates to `[SPARK-12995][GraphX] Remove deprecate APIs from Pregel` ## How was this patch tested? N/A Closes #22780 from WeichenXu123/minor_doc_update1. Authored-by: WeichenXu Signed-off-by: Dongjoon Hyun (cherry picked from commit 3b4f35f568eb3844d2a789c8a409bc705477df6b) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0239277d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0239277d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0239277d Branch: refs/heads/branch-2.4 Commit: 0239277dd3e56e355101381da80d6538a7297db9 Parents: 869242c Author: WeichenXu Authored: Sat Oct 20 10:32:09 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 10:32:23 2018 -0700 -- docs/graphx-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0239277d/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 3529334..cb96fd7 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -726,7 +726,7 @@ class GraphOps[VD, ED] { var g = mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg) ).cache() // compute the messages -var messages = g.mapReduceTriplets(sendMsg, mergeMsg) +var messages = GraphXUtils.mapReduceTriplets(g, sendMsg, mergeMsg) var activeMessages = messages.count() // Loop until no messages remain or maxIterations is achieved var i = 0 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOC][MINOR] Fix minor error in the code of graphx guide
Repository: spark Updated Branches: refs/heads/branch-2.2 d6542fa3f -> ca950c17f [DOC][MINOR] Fix minor error in the code of graphx guide ## What changes were proposed in this pull request? Fix minor error in the code "sketch of pregel implementation" of GraphX guide. This fixed error relates to `[SPARK-12995][GraphX] Remove deprecate APIs from Pregel` ## How was this patch tested? N/A Closes #22780 from WeichenXu123/minor_doc_update1. Authored-by: WeichenXu Signed-off-by: Dongjoon Hyun (cherry picked from commit 3b4f35f568eb3844d2a789c8a409bc705477df6b) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca950c17 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca950c17 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca950c17 Branch: refs/heads/branch-2.2 Commit: ca950c17f19f891d7fd8dd45dcc7af0e8f8fc58b Parents: d6542fa Author: WeichenXu Authored: Sat Oct 20 10:32:09 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 10:33:07 2018 -0700 -- docs/graphx-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ca950c17/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 46225dc..14da2d6 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -726,7 +726,7 @@ class GraphOps[VD, ED] { var g = mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg) ).cache() // compute the messages -var messages = g.mapReduceTriplets(sendMsg, mergeMsg) +var messages = GraphXUtils.mapReduceTriplets(g, sendMsg, mergeMsg) var activeMessages = messages.count() // Loop until no messages remain or maxIterations is achieved var i = 0 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25492][TEST] Refactor WideSchemaBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 5330c192b -> 62551ccee [SPARK-25492][TEST] Refactor WideSchemaBenchmark to use main method ## What changes were proposed in this pull request? Refactor `WideSchemaBenchmark` to use main method. 1. use `spark-submit`: ```console bin/spark-submit --class org.apache.spark.sql.execution.benchmark.WideSchemaBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar ``` 2. Generate benchmark result: ```console SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.WideSchemaBenchmark" ``` ## How was this patch tested? manual tests Closes #22501 from wangyum/SPARK-25492. Lead-authored-by: Yuming Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62551cce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62551cce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62551cce Branch: refs/heads/master Commit: 62551cceebf6aca8b6bd8164cd2ed85564726f6c Parents: 5330c19 Author: Yuming Wang Authored: Sat Oct 20 17:31:13 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 17:31:13 2018 -0700 -- .../benchmarks/WideSchemaBenchmark-results.txt | 216 +++ .../benchmark/WideSchemaBenchmark.scala | 134 +++- 2 files changed, 197 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62551cce/sql/core/benchmarks/WideSchemaBenchmark-results.txt -- diff --git a/sql/core/benchmarks/WideSchemaBenchmark-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-results.txt index 0b9f791..6347a6a 100644 --- a/sql/core/benchmarks/WideSchemaBenchmark-results.txt +++ b/sql/core/benchmarks/WideSchemaBenchmark-results.txt @@ -1,117 +1,145 @@ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 -Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz + +parsing 
large select expressions + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz parsing large select:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -1 select expressions 2 /4 0.0 2050147.0 1.0X -100 select expressions 6 /7 0.0 6123412.0 0.3X -2500 select expressions135 / 141 0.0 134623148.0 0.0X +1 select expressions 6 / 13 0.0 5997373.0 1.0X +100 select expressions 7 / 10 0.0 7204596.0 0.8X +2500 select expressions103 / 107 0.0 102962705.0 0.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 -Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz + +many column field read and write + + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz many column field r/w: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative -1 cols x 10 rows (read in-mem) 16 / 18 6.3 158.6 1.0X -1 cols x 10 rows (exec in-mem) 17 / 19 6.0 166.7 1.0X -1 cols x 10 rows (read parquet) 24 / 26 4.3 235.1 0.7X -1 cols x 10 rows (write parquet)81 / 85 1.2 811.3 0.2X -100 cols x 1000 rows (read in-mem) 17 / 19 6.0 166.2 1.0X -100 cols x 1000 rows (exec in-mem) 25 / 27 4.0 249.2 0.6X -100 cols x 1000 rows (read parquet) 23 / 25 4.4 226.0 0.7X -100 cols x 1000 rows (write parquet)83 / 87 1.2 831.0 0.2X -2500 cols x 40 rows (read in-mem) 132 / 137 0.8 1322.9 0.1X -2500 cols x 40 rows (exec in-mem)
spark git commit: [SPARK-25747][SQL] remove ColumnarBatchScan.needsUnsafeRowConversion
Repository: spark Updated Branches: refs/heads/master 62551ccee -> ab5752cb9 [SPARK-25747][SQL] remove ColumnarBatchScan.needsUnsafeRowConversion ## What changes were proposed in this pull request? `needsUnsafeRowConversion` is used in 2 places: 1. `ColumnarBatchScan.produceRows` 2. `FileSourceScanExec.doExecute` When we hit `ColumnarBatchScan.produceRows`, it means whole stage codegen is on but the vectorized reader is off. The vectorized reader can be off for several reasons: 1. the file format doesn't have a vectorized reader(json, csv, etc.) 2. the vectorized reader config is off 3. the schema is not supported Anyway when the vectorized reader is off, file format reader will always return unsafe rows, and other `ColumnarBatchScan` implementations also always return unsafe rows, so `ColumnarBatchScan.needsUnsafeRowConversion` is not needed. When we hit `FileSourceScanExec.doExecute`, it means whole stage codegen is off. For this case, we need the `needsUnsafeRowConversion` to convert `ColumnarRow` to `UnsafeRow`, if the file format reader returns batch. This PR removes `ColumnarBatchScan.needsUnsafeRowConversion`, and keep this flag only in `FileSourceScanExec` ## How was this patch tested? existing tests Closes #22750 from cloud-fan/minor. 
Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab5752cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab5752cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab5752cb Branch: refs/heads/master Commit: ab5752cb952e6536a68a988289e57100fdbba142 Parents: 62551cc Author: Wenchen Fan Authored: Sat Oct 20 17:45:04 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 17:45:04 2018 -0700 -- .../apache/spark/sql/execution/ColumnarBatchScan.scala| 10 +- .../apache/spark/sql/execution/DataSourceScanExec.scala | 7 --- .../sql/execution/columnar/InMemoryTableScanExec.scala| 2 -- .../execution/datasources/v2/DataSourceV2ScanExec.scala | 2 -- 4 files changed, 5 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab5752cb/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 9f6b593..7caff69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -34,8 +34,6 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { protected def supportsBatch: Boolean = true - protected def needsUnsafeRowConversion: Boolean = true - override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time")) @@ -159,17 +157,11 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { ctx.INPUT_ROW = row ctx.currentVars = null -// Always provide `outputVars`, so that the framework can help us build unsafe row if the input -// row is not unsafe row, i.e. 
`needsUnsafeRowConversion` is true. -val outputVars = output.zipWithIndex.map { case (a, i) => - BoundReference(i, a.dataType, a.nullable).genCode(ctx) -} -val inputRow = if (needsUnsafeRowConversion) null else row s""" |while ($limitNotReachedCond $input.hasNext()) { | InternalRow $row = (InternalRow) $input.next(); | $numOutputRows.add(1); - | ${consume(ctx, outputVars, inputRow).trim} + | ${consume(ctx, null, row).trim} | if (shouldStop()) return; |} """.stripMargin http://git-wip-us.apache.org/repos/asf/spark/blob/ab5752cb/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 738c066..a9b18ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -168,10 +168,11 @@ case class FileSourceScanExec( // Note that some vals referring the file-based relation are lazy intentionally // so that this plan can be canonicali
spark git commit: [SPARK-25779][SQL][TESTS] Remove SQL query tests for function documentation by DESCRIBE FUNCTION at SQLQueryTestSuite
Repository: spark Updated Branches: refs/heads/master ab5752cb9 -> b8c6ba9e6 [SPARK-25779][SQL][TESTS] Remove SQL query tests for function documentation by DESCRIBE FUNCTION at SQLQueryTestSuite Currently, there are some tests testing function descriptions: ```bash $ grep -ir "describe function" sql/core/src/test/resources/sql-tests/inputs sql/core/src/test/resources/sql-tests/inputs/json-functions.sql:describe function to_json; sql/core/src/test/resources/sql-tests/inputs/json-functions.sql:describe function extended to_json; sql/core/src/test/resources/sql-tests/inputs/json-functions.sql:describe function from_json; sql/core/src/test/resources/sql-tests/inputs/json-functions.sql:describe function extended from_json; ``` Looks there are not quite good points about testing them since we're not going to test documentation itself. For `DESCRIBE FCUNTION` functionality itself, they are already being tested here and there. See the test failures in https://github.com/apache/spark/pull/18749 (where I added examples to function descriptions) We better remove those tests so that people don't add such tests in the SQL tests. ## How was this patch tested? Manual. Closes #22776 from HyukjinKwon/SPARK-25779. 
Authored-by: hyukjinkwon Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8c6ba9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8c6ba9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8c6ba9e Branch: refs/heads/master Commit: b8c6ba9e644786e5da7e009eb8030ac375ccd75f Parents: ab5752c Author: hyukjinkwon Authored: Sat Oct 20 18:02:38 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 18:02:38 2018 -0700 -- .../sql-tests/inputs/json-functions.sql | 4 - .../sql-tests/results/json-functions.sql.out| 296 +++ 2 files changed, 114 insertions(+), 186 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8c6ba9e/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index bdd1fe4..8bfd7c0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -1,6 +1,4 @@ -- to_json -describe function to_json; -describe function extended to_json; select to_json(named_struct('a', 1, 'b', 2)); select to_json(named_struct('time', to_timestamp('2015-08-26', '-MM-dd')), map('timestampFormat', 'dd/MM/')); select to_json(array(named_struct('a', 1, 'b', 2))); @@ -15,8 +13,6 @@ select to_json(named_struct('a', 1, 'b', 2), map('mode', 1)); select to_json(); -- from_json -describe function from_json; -describe function extended from_json; select from_json('{"a":1}', 'a INT'); select from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/')); -- Check if errors handled http://git-wip-us.apache.org/repos/asf/spark/blob/b8c6ba9e/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 868eee8..c70a81e 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,196 +1,128 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 42 +-- Number of queries: 38 -- !query 0 -describe function to_json --- !query 0 schema -struct --- !query 0 output -Class: org.apache.spark.sql.catalyst.expressions.StructsToJson -Function: to_json -Usage: to_json(expr[, options]) - Returns a JSON string with a given struct value - - --- !query 1 -describe function extended to_json --- !query 1 schema -struct --- !query 1 output -Class: org.apache.spark.sql.catalyst.expressions.StructsToJson -Extended Usage: -Examples: - > SELECT to_json(named_struct('a', 1, 'b', 2)); - {"a":1,"b":2} - > SELECT to_json(named_struct('time', to_timestamp('2015-08-26', '-MM-dd')), map('timestampFormat', 'dd/MM/')); - {"time":"26/08/2015"} - &
spark git commit: [SPARK-25757][BUILD] Upgrade netty-all from 4.1.17.Final to 4.1.30.Final
Repository: spark Updated Branches: refs/heads/master 2fbbcd0d2 -> c77aa42f5 [SPARK-25757][BUILD] Upgrade netty-all from 4.1.17.Final to 4.1.30.Final ## What changes were proposed in this pull request? Upgrade netty dependency from 4.1.17 to 4.1.30. Explanation: Currently when sending a ChunkedByteBuffer with more than 16 chunks over the network will trigger a "merge" of all the blocks into one big transient array that is then sent over the network. This is problematic as the total memory for all chunks can be high (2GB) and this would then trigger an allocation of 2GB to merge everything, which will create OOM errors. And we can avoid this issue by upgrade the netty. https://github.com/netty/netty/pull/8038 ## How was this patch tested? Manual tests in some spark jobs. Closes #22765 from lipzhu/SPARK-25757. Authored-by: Zhu, Lipeng Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c77aa42f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c77aa42f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c77aa42f Branch: refs/heads/master Commit: c77aa42f55fe2c92bb9fcf93c92af0a768edfb68 Parents: 2fbbcd0 Author: Zhu, Lipeng Authored: Sat Oct 20 22:17:37 2018 -0700 Committer: Dongjoon Hyun Committed: Sat Oct 20 22:17:37 2018 -0700 -- dev/deps/spark-deps-hadoop-2.7 | 2 +- dev/deps/spark-deps-hadoop-3.1 | 2 +- pom.xml| 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c77aa42f/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 3b17f88..06173f7 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -149,7 +149,7 @@ metrics-json-3.1.5.jar metrics-jvm-3.1.5.jar minlog-1.3.0.jar netty-3.9.9.Final.jar -netty-all-4.1.17.Final.jar +netty-all-4.1.30.Final.jar objenesis-2.5.1.jar okhttp-3.8.1.jar okio-1.13.0.jar 
http://git-wip-us.apache.org/repos/asf/spark/blob/c77aa42f/dev/deps/spark-deps-hadoop-3.1 -- diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index c818b2c..62fddf0 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -165,7 +165,7 @@ metrics-jvm-3.1.5.jar minlog-1.3.0.jar mssql-jdbc-6.2.1.jre7.jar netty-3.9.9.Final.jar -netty-all-4.1.17.Final.jar +netty-all-4.1.30.Final.jar nimbus-jose-jwt-4.41.1.jar objenesis-2.5.1.jar okhttp-2.7.5.jar http://git-wip-us.apache.org/repos/asf/spark/blob/c77aa42f/pom.xml -- diff --git a/pom.xml b/pom.xml index 58a2841..b1f0a53 100644 --- a/pom.xml +++ b/pom.xml @@ -581,7 +581,7 @@ io.netty netty-all -4.1.17.Final +4.1.30.Final io.netty - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example
Repository: spark Updated Branches: refs/heads/master ff9ede092 -> 3b4556745 [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example ## What changes were proposed in this pull request? This PR aims to fix the following SparkR example in Spark 2.3.0 ~ 2.4.0. ```r > df <- read.df("examples/src/main/resources/people.csv", "csv") > namesAndAges <- select(df, "name", "age") ... Caused by: org.apache.spark.sql.AnalysisException: cannot resolve '`name`' given input columns: [_c0];; 'Project ['name, 'age] +- AnalysisBarrier +- Relation[_c0#97] csv ``` - https://dist.apache.org/repos/dist/dev/spark/v2.4.0-rc3-docs/_site/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.2/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.1/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.0/sql-programming-guide.html#manually-specifying-options ## How was this patch tested? Manual test in SparkR. (Please note that `RSparkSQLExample.R` fails at the last JDBC example) ```r > df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", > inferSchema=T, header=T) > namesAndAges <- select(df, "name", "age") ``` Closes #22791 from dongjoon-hyun/SPARK-25795. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b455674 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b455674 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b455674 Branch: refs/heads/master Commit: 3b4556745e90a13f4ae7ebae4ab682617de25c38 Parents: ff9ede0 Author: Dongjoon Hyun Authored: Mon Oct 22 16:34:33 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Oct 22 16:34:33 2018 -0700 -- examples/src/main/r/RSparkSQLExample.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b455674/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index a5ed723..effba94 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -114,7 +114,7 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example on:manual_load_options_csv$ -df <- read.df("examples/src/main/resources/people.csv", "csv") +df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) namesAndAges <- select(df, "name", "age") # $example off:manual_load_options_csv$ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example
Repository: spark Updated Branches: refs/heads/branch-2.4 f33d888a2 -> b9b594ade [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example ## What changes were proposed in this pull request? This PR aims to fix the following SparkR example in Spark 2.3.0 ~ 2.4.0. ```r > df <- read.df("examples/src/main/resources/people.csv", "csv") > namesAndAges <- select(df, "name", "age") ... Caused by: org.apache.spark.sql.AnalysisException: cannot resolve '`name`' given input columns: [_c0];; 'Project ['name, 'age] +- AnalysisBarrier +- Relation[_c0#97] csv ``` - https://dist.apache.org/repos/dist/dev/spark/v2.4.0-rc3-docs/_site/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.2/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.1/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.0/sql-programming-guide.html#manually-specifying-options ## How was this patch tested? Manual test in SparkR. (Please note that `RSparkSQLExample.R` fails at the last JDBC example) ```r > df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", > inferSchema=T, header=T) > namesAndAges <- select(df, "name", "age") ``` Closes #22791 from dongjoon-hyun/SPARK-25795. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 3b4556745e90a13f4ae7ebae4ab682617de25c38) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b9b594ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b9b594ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b9b594ad Branch: refs/heads/branch-2.4 Commit: b9b594ade9106ad96adb413c7a27ec7b4f8a849a Parents: f33d888 Author: Dongjoon Hyun Authored: Mon Oct 22 16:34:33 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Oct 22 16:34:48 2018 -0700 -- examples/src/main/r/RSparkSQLExample.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b9b594ad/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index a5ed723..effba94 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -114,7 +114,7 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example on:manual_load_options_csv$ -df <- read.df("examples/src/main/resources/people.csv", "csv") +df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) namesAndAges <- select(df, "name", "age") # $example off:manual_load_options_csv$ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example
Repository: spark Updated Branches: refs/heads/branch-2.3 d7a35877b -> 8fbf3ee91 [SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example ## What changes were proposed in this pull request? This PR aims to fix the following SparkR example in Spark 2.3.0 ~ 2.4.0. ```r > df <- read.df("examples/src/main/resources/people.csv", "csv") > namesAndAges <- select(df, "name", "age") ... Caused by: org.apache.spark.sql.AnalysisException: cannot resolve '`name`' given input columns: [_c0];; 'Project ['name, 'age] +- AnalysisBarrier +- Relation[_c0#97] csv ``` - https://dist.apache.org/repos/dist/dev/spark/v2.4.0-rc3-docs/_site/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.2/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.1/sql-programming-guide.html#manually-specifying-options - http://spark.apache.org/docs/2.3.0/sql-programming-guide.html#manually-specifying-options ## How was this patch tested? Manual test in SparkR. (Please note that `RSparkSQLExample.R` fails at the last JDBC example) ```r > df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", > inferSchema=T, header=T) > namesAndAges <- select(df, "name", "age") ``` Closes #22791 from dongjoon-hyun/SPARK-25795. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 3b4556745e90a13f4ae7ebae4ab682617de25c38) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8fbf3ee9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8fbf3ee9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8fbf3ee9 Branch: refs/heads/branch-2.3 Commit: 8fbf3ee91703fc714f3f01237485479562915933 Parents: d7a3587 Author: Dongjoon Hyun Authored: Mon Oct 22 16:34:33 2018 -0700 Committer: Dongjoon Hyun Committed: Mon Oct 22 16:35:05 2018 -0700 -- examples/src/main/r/RSparkSQLExample.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8fbf3ee9/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index a5ed723..effba94 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -114,7 +114,7 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example on:manual_load_options_csv$ -df <- read.df("examples/src/main/resources/people.csv", "csv") +df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) namesAndAges <- select(df, "name", "age") # $example off:manual_load_options_csv$ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25812][UI][TEST] Fix test failure in PagedTableSuite
Repository: spark Updated Branches: refs/heads/master 736fc0393 -> 65a8d1b87 [SPARK-25812][UI][TEST] Fix test failure in PagedTableSuite ## What changes were proposed in this pull request? In https://github.com/apache/spark/pull/22668, the PR was merged without PR builder test. And there is a test failure: https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-sbt-hadoop-2.7/5070/testReport/org.apache.spark.ui/PagedTableSuite/pageNavigation/ This PR is to fix it. ## How was this patch tested? Update the test case. Closes #22808 from gengliangwang/fixPagedTableSuite. Authored-by: Gengliang Wang Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65a8d1b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65a8d1b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65a8d1b8 Branch: refs/heads/master Commit: 65a8d1b87f98b4160c3e4039c97d6a1a5096aaf2 Parents: 736fc03 Author: Gengliang Wang Authored: Tue Oct 23 12:37:45 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 23 12:37:45 2018 -0700 -- core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65a8d1b8/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala index 74eeca2..cda98ae 100644 --- a/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala @@ -73,7 +73,7 @@ class PagedTableSuite extends SparkFunSuite { override def goButtonFormPath: String = "" } -assert(pagedTable.pageNavigation(1, 10, 1) === Nil) +assert((pagedTable.pageNavigation(1, 10, 1).head \\ "li").map(_.text.trim) === Seq("1")) assert( (pagedTable.pageNavigation(1, 10, 
2).head \\ "li").map(_.text.trim) === Seq("1", "2", ">")) assert( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options
Repository: spark Updated Branches: refs/heads/master 65a8d1b87 -> 4506dad8a [SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options ## What changes were proposed in this pull request? Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples ## How was this patch tested? Manual. Closes #22801 from dongjoon-hyun/SPARK-25656. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4506dad8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4506dad8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4506dad8 Branch: refs/heads/master Commit: 4506dad8a9613d4b6b319c0240119927265a67c1 Parents: 65a8d1b Author: Dongjoon Hyun Authored: Tue Oct 23 12:41:20 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 23 12:41:20 2018 -0700 -- docs/sql-data-sources-load-save-functions.md| 44 +++ .../examples/sql/JavaSQLDataSourceExample.java | 7 +++ examples/src/main/python/sql/datasource.py | 9 examples/src/main/r/RSparkSQLExample.R | 6 ++- examples/src/main/resources/users.orc | Bin 0 -> 547 bytes .../examples/sql/SQLDataSourceExample.scala | 7 +++ 6 files changed, 72 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/docs/sql-data-sources-load-save-functions.md -- diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md index e1dd0a3..e4c7b17 100644 --- a/docs/sql-data-sources-load-save-functions.md +++ b/docs/sql-data-sources-load-save-functions.md @@ -82,6 +82,50 @@ To load a CSV file you can use: +The extra options are also used during write operation. 
+For example, you can control bloom filters and dictionary encodings for ORC data sources. +The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`. +For Parquet, there exists `parquet.enable.dictionary`, too. +To find more detailed information about the extra ORC/Parquet options, +visit the official Apache ORC/Parquet websites. + + + + +{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} + + + +{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} + + + +{% include_example manual_save_options_orc python/sql/datasource.py %} + + + +{% include_example manual_save_options_orc r/RSparkSQLExample.R %} + + + + +{% highlight sql %} +CREATE TABLE users_with_options ( + name STRING, + favorite_color STRING, + favorite_numbers array +) USING ORC +OPTIONS ( + orc.bloom.filter.columns 'favorite_color', + orc.dictionary.key.threshold '1.0', + orc.column.encoding.direct 'name' +) +{% endhighlight %} + + + + + ### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java index ef3c904..cbe9dfd 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -123,6 +123,13 @@ public class JavaSQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv"); // $example off:manual_load_options_csv$ +// $example on:manual_save_options_orc$ +usersDF.write().format("orc") + 
.option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc"); +// $example off:manual_save_options_orc$ // $example on:direct_sql$ Dataset sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/python/sql/datasource.py --
spark git commit: [SPARK-25665][SQL][TEST] Refactor ObjectHashAggregateExecBenchmark to…
Repository: spark Updated Branches: refs/heads/master 6540c2f8f -> ccd07b736 [SPARK-25665][SQL][TEST] Refactor ObjectHashAggregateExecBenchmark to… ## What changes were proposed in this pull request? Refactor ObjectHashAggregateExecBenchmark to use main method ## How was this patch tested? Manually tested: ``` bin/spark-submit --class org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark --jars sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar,core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,sql/hive/target/spark-hive_2.11-3.0.0-SNAPSHOT.jar --packages org.spark-project.hive:hive-exec:1.2.1.spark2 sql/hive/target/spark-hive_2.11-3.0.0-SNAPSHOT-tests.jar ``` Generated results with: ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "hive/test:runMain org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark" ``` Closes #22804 from peter-toth/SPARK-25665. Lead-authored-by: Peter Toth Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ccd07b73 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ccd07b73 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ccd07b73 Branch: refs/heads/master Commit: ccd07b736640c87ac6980a1c7c2d706ef3bab1bf Parents: 6540c2f Author: Peter Toth Authored: Thu Oct 25 12:42:31 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 25 12:42:31 2018 -0700 -- ...ObjectHashAggregateExecBenchmark-results.txt | 45 .../ObjectHashAggregateExecBenchmark.scala | 218 +-- 2 files changed, 152 insertions(+), 111 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ccd07b73/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt -- diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt new file mode 100644 index 000..f3044da --- /dev/null +++ 
b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt @@ -0,0 +1,45 @@ + +Hive UDAF vs Spark AF + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +hive udaf vs spark af: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +hive udaf w/o group by6370 / 6400 0.0 97193.6 1.0X +spark af w/o group by 54 / 63 1.2 820.8 118.4X +hive udaf w/ group by 4492 / 4507 0.0 68539.5 1.4X +spark af w/ group by w/o fallback 58 / 64 1.1 881.7 110.2X +spark af w/ group by w/ fallback 136 / 142 0.5 2075.0 46.8X + + + +ObjectHashAggregateExec vs SortAggregateExec - typed_count + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +object agg v.s. sort agg:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +sort agg w/ group by41500 / 41630 2.5 395.8 1.0X +object agg w/ group by w/o fallback 10075 / 10122 10.4 96.1 4.1X +object agg w/ group by w/ fallback 28131 / 28205 3.7 268.3 1.5X +sort agg w/o group by 6182 / 6221 17.0 59.0 6.7X +object agg w/o group by w/o fallback 5435 / 5468 19.3 51.8 7.6X + + + +ObjectHashAggregateExec vs SortAggregateExec - percentile_approx + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +object agg v.s. sort agg:Best/Avg Time(ms)Rate(M/s) Per Row(n
spark git commit: [SPARK-25656][SQL][DOC][EXAMPLE][BRANCH-2.4] Add a doc and examples about extra data source options
Repository: spark Updated Branches: refs/heads/branch-2.4 1b075f26f -> db121a2a1 [SPARK-25656][SQL][DOC][EXAMPLE][BRANCH-2.4] Add a doc and examples about extra data source options ## What changes were proposed in this pull request? Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples. This is a backport of #22801. `orc.column.encoding.direct` is removed since it's not supported in ORC 1.5.2. ## How was this patch tested? Manual. Closes #22839 from dongjoon-hyun/SPARK-25656-2.4. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db121a2a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db121a2a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db121a2a Branch: refs/heads/branch-2.4 Commit: db121a2a1fde96fe77eedff18706df5c8e2e731d Parents: 1b075f2 Author: Dongjoon Hyun Authored: Thu Oct 25 14:15:03 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 25 14:15:03 2018 -0700 -- docs/sql-data-sources-load-save-functions.md| 43 +++ .../examples/sql/JavaSQLDataSourceExample.java | 6 +++ examples/src/main/python/sql/datasource.py | 8 examples/src/main/r/RSparkSQLExample.R | 6 ++- examples/src/main/resources/users.orc | Bin 0 -> 547 bytes .../examples/sql/SQLDataSourceExample.scala | 6 +++ 6 files changed, 68 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db121a2a/docs/sql-data-sources-load-save-functions.md -- diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md index e1dd0a3..a3191b2 100644 --- a/docs/sql-data-sources-load-save-functions.md +++ b/docs/sql-data-sources-load-save-functions.md @@ -82,6 +82,49 @@ To load a CSV file 
you can use: +The extra options are also used during write operation. +For example, you can control bloom filters and dictionary encodings for ORC data sources. +The following ORC example will create bloom filter on `favorite_color` and use dictionary encoding for `name` and `favorite_color`. +For Parquet, there exists `parquet.enable.dictionary`, too. +To find more detailed information about the extra ORC/Parquet options, +visit the official Apache ORC/Parquet websites. + + + + +{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} + + + +{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} + + + +{% include_example manual_save_options_orc python/sql/datasource.py %} + + + +{% include_example manual_save_options_orc r/RSparkSQLExample.R %} + + + + +{% highlight sql %} +CREATE TABLE users_with_options ( + name STRING, + favorite_color STRING, + favorite_numbers array +) USING ORC +OPTIONS ( + orc.bloom.filter.columns 'favorite_color', + orc.dictionary.key.threshold '1.0' +) +{% endhighlight %} + + + + + ### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that http://git-wip-us.apache.org/repos/asf/spark/blob/db121a2a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java index ef3c904..97e9ca3 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -123,6 +123,12 @@ public class JavaSQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv"); // $example off:manual_load_options_csv$ +// $example 
on:manual_save_options_orc$ +usersDF.write().format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .save("users_with_options.orc"); +// $example off:manual_save_options_orc$ // $example on:direct_sql$ Dataset sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); http://git-wip-us.apache.org/repos/asf/spark/blob/db121a2a/exam
spark git commit: [SPARK-25840][BUILD] `make-distribution.sh` should not fail due to missing LICENSE-binary
Repository: spark Updated Branches: refs/heads/branch-2.4 39e108f16 -> b739fb0d7 [SPARK-25840][BUILD] `make-distribution.sh` should not fail due to missing LICENSE-binary ## What changes were proposed in this pull request? We vote for the artifacts. All releases are in the form of the source materials needed to make changes to the software being released. (http://www.apache.org/legal/release-policy.html#artifacts) >From Spark 2.4.0, the source artifact and binary artifact starts to contain >own proper LICENSE files (LICENSE, LICENSE-binary). It's great to have them. >However, unfortunately, `dev/make-distribution.sh` inside source artifacts >start to fail because it expects `LICENSE-binary` and source artifact have >only the LICENSE file. https://dist.apache.org/repos/dist/dev/spark/v2.4.0-rc4-bin/spark-2.4.0.tgz `dev/make-distribution.sh` is used during the voting phase because we are voting on that source artifact instead of GitHub repository. Individual contributors usually don't have the downstream repository and starts to try build the voting source artifacts to help the verification for the source artifact during voting phase. (Personally, I did before.) This PR aims to recover that script to work in any way. This doesn't aim for source artifacts to reproduce the compiled artifacts. ## How was this patch tested? Manual. ``` $ rm LICENSE-binary $ dev/make-distribution.sh ``` Closes #22840 from dongjoon-hyun/SPARK-25840. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 79f3babcc6e189d7405464b9ac1eb1c017e51f5d) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b739fb0d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b739fb0d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b739fb0d Branch: refs/heads/branch-2.4 Commit: b739fb0d783adad68e7197caaa931a83eb1725bd Parents: 39e108f Author: Dongjoon Hyun Authored: Thu Oct 25 20:26:13 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 25 20:26:26 2018 -0700 -- dev/make-distribution.sh | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b739fb0d/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 668682f..84f4ae9 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -212,9 +212,13 @@ mkdir -p "$DISTDIR/examples/src/main" cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/" # Copy license and ASF files -cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" -cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" -cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" +if [ -e "$SPARK_HOME/LICENSE-binary" ]; then + cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" + cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" + cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" +else + echo "Skipping copying LICENSE files" +fi if [ -e "$SPARK_HOME/CHANGES.txt" ]; then cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25840][BUILD] `make-distribution.sh` should not fail due to missing LICENSE-binary
Repository: spark Updated Branches: refs/heads/master dc9b32080 -> 79f3babcc [SPARK-25840][BUILD] `make-distribution.sh` should not fail due to missing LICENSE-binary ## What changes were proposed in this pull request? We vote for the artifacts. All releases are in the form of the source materials needed to make changes to the software being released. (http://www.apache.org/legal/release-policy.html#artifacts) >From Spark 2.4.0, the source artifact and binary artifact starts to contain >own proper LICENSE files (LICENSE, LICENSE-binary). It's great to have them. >However, unfortunately, `dev/make-distribution.sh` inside source artifacts >start to fail because it expects `LICENSE-binary` and source artifact have >only the LICENSE file. https://dist.apache.org/repos/dist/dev/spark/v2.4.0-rc4-bin/spark-2.4.0.tgz `dev/make-distribution.sh` is used during the voting phase because we are voting on that source artifact instead of GitHub repository. Individual contributors usually don't have the downstream repository and starts to try build the voting source artifacts to help the verification for the source artifact during voting phase. (Personally, I did before.) This PR aims to recover that script to work in any way. This doesn't aim for source artifacts to reproduce the compiled artifacts. ## How was this patch tested? Manual. ``` $ rm LICENSE-binary $ dev/make-distribution.sh ``` Closes #22840 from dongjoon-hyun/SPARK-25840. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79f3babc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79f3babc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79f3babc Branch: refs/heads/master Commit: 79f3babcc6e189d7405464b9ac1eb1c017e51f5d Parents: dc9b320 Author: Dongjoon Hyun Authored: Thu Oct 25 20:26:13 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 25 20:26:13 2018 -0700 -- dev/make-distribution.sh | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79f3babc/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 668682f..84f4ae9 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -212,9 +212,13 @@ mkdir -p "$DISTDIR/examples/src/main" cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/" # Copy license and ASF files -cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" -cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" -cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" +if [ -e "$SPARK_HOME/LICENSE-binary" ]; then + cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" + cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" + cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" +else + echo "Skipping copying LICENSE files" +fi if [ -e "$SPARK_HOME/CHANGES.txt" ]; then cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][TEST][BRANCH-2.4] Regenerate golden file `datetime.sql.out`
Repository: spark Updated Branches: refs/heads/branch-2.4 b739fb0d7 -> adfd1057d [MINOR][TEST][BRANCH-2.4] Regenerate golden file `datetime.sql.out` ## What changes were proposed in this pull request? `datetime.sql.out` is a generated golden file, but it's a little bit broken during manual [reverting](https://github.com/dongjoon-hyun/spark/commit/5d744499667fcd08825bca0ac6d5d90d6e110ebc#diff-79dd276be45ede6f34e24ad7005b0a7cR87). This doesn't cause test failure because the difference is inside `comments` and blank lines. We had better fix this minor issue before RC5. ## How was this patch tested? Pass the Jenkins. Closes #22837 from dongjoon-hyun/fix_datetime_sql_out. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/adfd1057 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/adfd1057 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/adfd1057 Branch: refs/heads/branch-2.4 Commit: adfd1057dae3b48c05d7443e3aee23157965e6d1 Parents: b739fb0 Author: Dongjoon Hyun Authored: Thu Oct 25 20:37:07 2018 -0700 Committer: Dongjoon Hyun Committed: Thu Oct 25 20:37:07 2018 -0700 -- sql/core/src/test/resources/sql-tests/results/datetime.sql.out | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/adfd1057/sql/core/src/test/resources/sql-tests/results/datetime.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 4e1cfa6..63aa004 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -82,9 +82,10 @@ struct 1 2 2 3 + -- !query 9 select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15') --- !query 3 schema +-- !query 9 schema struct --- 
!query 3 output +-- !query 9 output 5 3 5 NULL4 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25797][SQL][DOCS][BACKPORT-2.3] Add migration doc for solving issues caused by view canonicalization approach change
Repository: spark Updated Branches: refs/heads/branch-2.2 5b1396596 -> 17d882adf [SPARK-25797][SQL][DOCS][BACKPORT-2.3] Add migration doc for solving issues caused by view canonicalization approach change ## What changes were proposed in this pull request? Since Spark 2.2, view definitions are stored in a different way from prior versions. This may cause Spark to be unable to read views created by prior versions. See [SPARK-25797](https://issues.apache.org/jira/browse/SPARK-25797) for more details. Basically, we have 2 options. 1) Make Spark 2.2+ able to get older view definitions back. Since the expanded text is buggy and unusable, we have to use original text (this is possible with [SPARK-25459](https://issues.apache.org/jira/browse/SPARK-25459)). However, because older Spark versions don't save the context for the database, we cannot always get correct view definitions without view default database. 2) Recreate the views by `ALTER VIEW AS` or `CREATE OR REPLACE VIEW AS`. This PR aims to add a migration doc to help users troubleshoot this issue by the above option 2. ## How was this patch tested? N/A. Docs are generated and checked locally ``` cd docs SKIP_API=1 jekyll serve --watch ``` Closes #22851 from seancxmao/SPARK-25797-2.3. 
Authored-by: seancxmao Signed-off-by: Dongjoon Hyun (cherry picked from commit 3e0160bacfbe4597f15ca410ca832617cdeeddca) Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17d882ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17d882ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17d882ad Branch: refs/heads/branch-2.2 Commit: 17d882adf0b1bbbd4350b6d46756fab0fd602683 Parents: 5b13965 Author: seancxmao Authored: Sun Oct 28 21:27:22 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Oct 28 21:27:42 2018 -0700 -- docs/sql-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/17d882ad/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 8cd4d05..758920e 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1548,6 +1548,8 @@ options. - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. 
Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use . Therefore, the initial schema inference occurs only at a table's first access. + - Since Spark 2.2, view definitions are stored in a different way from prior versions. This may cause Spark unable to read views created by prior versions. In such cases, you need to recreate the views using `ALTER VIEW AS` or `CREATE OR REPLACE VIEW AS` with newer Spark versions. + ## Upgrading From Spark SQL 2.0 to 2.1 - Datasource tables now store partition metadata in the Hive metastore. This means that Hive DDLs such as `ALTER TABLE PARTITION ... SET LOCATION` are now available for tables created with the Datasource API. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25797][SQL][DOCS][BACKPORT-2.3] Add migration doc for solving issues caused by view canonicalization approach change
Repository: spark Updated Branches: refs/heads/branch-2.3 53aeb3d65 -> 3e0160bac [SPARK-25797][SQL][DOCS][BACKPORT-2.3] Add migration doc for solving issues caused by view canonicalization approach change ## What changes were proposed in this pull request? Since Spark 2.2, view definitions are stored in a different way from prior versions. This may cause Spark unable to read views created by prior versions. See [SPARK-25797](https://issues.apache.org/jira/browse/SPARK-25797) for more details. Basically, we have 2 options. 1) Make Spark 2.2+ able to get older view definitions back. Since the expanded text is buggy and unusable, we have to use original text (this is possible with [SPARK-25459](https://issues.apache.org/jira/browse/SPARK-25459)). However, because older Spark versions don't save the context for the database, we cannot always get correct view definitions without view default database. 2) Recreate the views by `ALTER VIEW AS` or `CREATE OR REPLACE VIEW AS`. This PR aims to add migration doc to help users troubleshoot this issue by above option 2. ## How was this patch tested? N/A. Docs are generated and checked locally ``` cd docs SKIP_API=1 jekyll serve --watch ``` Closes #22851 from seancxmao/SPARK-25797-2.3. 
Authored-by: seancxmao Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e0160ba Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e0160ba Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e0160ba Branch: refs/heads/branch-2.3 Commit: 3e0160bacfbe4597f15ca410ca832617cdeeddca Parents: 53aeb3d Author: seancxmao Authored: Sun Oct 28 21:27:22 2018 -0700 Committer: Dongjoon Hyun Committed: Sun Oct 28 21:27:22 2018 -0700 -- docs/sql-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e0160ba/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 461806a..e5fa4c6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1973,6 +1973,8 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.2.1 and 2.3.0, the schema is always inferred at runtime when the data source tables have the columns that exist in both partition schema and data schema. The inferred schema does not have the partitioned columns. When reading the table, Spark respects the partition values of these overlapping columns instead of the values stored in the data source files. In 2.2.0 and 2.1.x release, the inferred schema is partitioned but the data of the table is invisible to users (i.e., the result set is empty). + - Since Spark 2.2, view definitions are stored in a different way from prior versions. This may cause Spark unable to read views created by prior versions. In such cases, you need to recreate the views using `ALTER VIEW AS` or `CREATE OR REPLACE VIEW AS` with newer Spark versions. + ## Upgrading From Spark SQL 2.0 to 2.1 - Datasource tables now store partition metadata in the Hive metastore. This means that Hive DDLs such as `ALTER TABLE PARTITION ... 
SET LOCATION` are now available for tables created with the Datasource API. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25848][SQL][TEST] Refactor CSVBenchmarks to use main method
Repository: spark Updated Branches: refs/heads/master a129f0795 -> 94de5609b [SPARK-25848][SQL][TEST] Refactor CSVBenchmarks to use main method ## What changes were proposed in this pull request? use spark-submit: `bin/spark-submit --class org.apache.spark.sql.execution.datasources.csv.CSVBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar` Generate benchmark result: `SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.datasources.csv.CSVBenchmark"` ## How was this patch tested? manual tests Closes #22845 from heary-cao/CSVBenchmarks. Authored-by: caoxuewen Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/94de5609 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/94de5609 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/94de5609 Branch: refs/heads/master Commit: 94de5609be27e2618d6d241ec9aa032fbc601b6e Parents: a129f07 Author: caoxuewen Authored: Tue Oct 30 09:18:55 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 30 09:18:55 2018 -0700 -- sql/core/benchmarks/CSVBenchmark-results.txt| 27 .../datasources/csv/CSVBenchmark.scala | 136 .../datasources/csv/CSVBenchmarks.scala | 158 --- 3 files changed, 163 insertions(+), 158 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/94de5609/sql/core/benchmarks/CSVBenchmark-results.txt -- diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt new file mode 100644 index 000..865575b --- /dev/null +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -0,0 +1,27 @@ + +Benchmark to measure CSV read/write performance + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parsing quoted values: Best/Avg 
Time(ms)Rate(M/s) Per Row(ns) Relative + +One quoted string 64733 / 64839 0.0 1294653.1 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Wide rows with 1000 columns: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Select 1000 columns 185609 / 189735 0.0 185608.6 1.0X +Select 100 columns 50195 / 51808 0.0 50194.8 3.7X +Select one column 39266 / 39293 0.0 39265.6 4.7X +count() 10959 / 11000 0.1 10958.5 16.9X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Count a dataset with 10 columns: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Select 10 columns + count() 24637 / 24768 0.4 2463.7 1.0X +Select 1 column + count() 20026 / 20076 0.5 2002.6 1.2X +count() 3754 / 3877 2.7 375.4 6.6X + http://git-wip-us.apache.org/repos/asf/spark/blob/94de5609/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala new file mode 100644 index 000..ce38b08 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License&
spark git commit: [SPARK-25833][SQL][DOCS] Update migration guide for Hive view compatibility
Repository: spark Updated Branches: refs/heads/master 9cf9a83af -> 49bea5a7e [SPARK-25833][SQL][DOCS] Update migration guide for Hive view compatibility ## What changes were proposed in this pull request? Both Spark and Hive support views. However in some cases views created by Hive are not readable by Spark. For example, if column aliases are not specified in view definition queries, both Spark and Hive will generate alias names, but in different ways. In order for Spark to be able to read views created by Hive, users should explicitly specify column aliases in view definition queries. Given that it's not uncommon that Hive and Spark are used together in enterprise data warehouse, this PR aims to explicitly describe this compatibility issue to help users troubleshoot this issue easily. ## How was this patch tested? Docs are manually generated and checked locally. ``` SKIP_API=1 jekyll serve ``` Closes #22868 from seancxmao/SPARK-25833. Authored-by: seancxmao Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/49bea5a7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/49bea5a7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/49bea5a7 Branch: refs/heads/master Commit: 49bea5a7e87ec3ce9cd9466725d81096a54a591b Parents: 9cf9a83 Author: seancxmao Authored: Tue Oct 30 23:05:31 2018 -0700 Committer: Dongjoon Hyun Committed: Tue Oct 30 23:05:31 2018 -0700 -- docs/sql-migration-guide-hive-compatibility.md | 15 +++ 1 file changed, 15 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/49bea5a7/docs/sql-migration-guide-hive-compatibility.md -- diff --git a/docs/sql-migration-guide-hive-compatibility.md b/docs/sql-migration-guide-hive-compatibility.md index 0234ea2..9484941 100644 --- a/docs/sql-migration-guide-hive-compatibility.md +++ b/docs/sql-migration-guide-hive-compatibility.md @@ -51,6 +51,21 @@ Spark SQL supports the vast majority of Hive 
features, such as: * Explain * Partitioned tables including dynamic partition insertion * View + * If column aliases are not specified in view definition queries, both Spark and Hive will +generate alias names, but in different ways. In order for Spark to be able to read views created +by Hive, users should explicitly specify column aliases in view definition queries. As an +example, Spark cannot read `v1` created as below by Hive. + +``` +CREATE VIEW v1 AS SELECT * FROM (SELECT c + 1 FROM (SELECT 1 c) t1) t2; +``` + +Instead, you should create `v1` as below with column aliases explicitly specified. + +``` +CREATE VIEW v1 AS SELECT * FROM (SELECT c + 1 AS inc_c FROM (SELECT 1 c) t1) t2; +``` + * All Hive DDL Functions, including: * `CREATE TABLE` * `CREATE TABLE AS SELECT` - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25618][SQL][TEST] Reduce time taken to execute KafkaContinuousSourceStressForDontFailOnDataLossSuite
Repository: spark Updated Branches: refs/heads/master 0ad93b093 -> 34c3bc9f1 [SPARK-25618][SQL][TEST] Reduce time taken to execute KafkaContinuousSourceStressForDontFailOnDataLossSuite ## What changes were proposed in this pull request? In this test, I have reduced the test time to 20 secs from 1 minute while reducing the sleep time from 1 sec to 100 milliseconds. With this change, I was able to run the test in 20+ seconds consistently on my laptop. I would like to see if it passes in Jenkins consistently. ## How was this patch tested? It's a test fix. Closes #22900 from dilipbiswal/SPARK-25618. Authored-by: Dilip Biswal Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34c3bc9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34c3bc9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34c3bc9f Branch: refs/heads/master Commit: 34c3bc9f1e2750bbcb91a8706ab78c6a58113350 Parents: 0ad93b0 Author: Dilip Biswal Authored: Wed Oct 31 02:57:39 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 31 02:57:39 2018 -0700 -- .../apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34c3bc9f/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala index 39c4e3f..491a9c6 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala @@ -221,7 +221,7 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with 
KafkaM .as[(String, String)] val query = startStream(kafka.map(kv => kv._2.toInt)) -val testTime = 1.minutes +val testTime = 20.seconds val startTime = System.currentTimeMillis() // Track the current existing topics val topics = mutable.ArrayBuffer[String]() @@ -252,7 +252,7 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with KafkaM testUtils.createTopic(topic, partitions = 1, overwrite = true) logInfo(s"Create topic $topic") case 3 => - Thread.sleep(1000) + Thread.sleep(100) case _ => // Push random messages for (topic <- topics) { val size = Random.nextInt(10) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25663][SPARK-25661][SQL][TEST] Refactor BuiltInDataSourceWriteBenchmark, DataSourceWriteBenchmark and AvroWriteBenchmark to use main method
Repository: spark Updated Branches: refs/heads/master 34c3bc9f1 -> f8484e49e [SPARK-25663][SPARK-25661][SQL][TEST] Refactor BuiltInDataSourceWriteBenchmark, DataSourceWriteBenchmark and AvroWriteBenchmark to use main method ## What changes were proposed in this pull request? Refactor BuiltInDataSourceWriteBenchmark, DataSourceWriteBenchmark and AvroWriteBenchmark to use main method. ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BuiltInDataSourceWriteBenchmark" SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark" ``` ## How was this patch tested? manual tests Closes #22861 from yucai/BuiltInDataSourceWriteBenchmark. Lead-authored-by: yucai Co-authored-by: Yucai Yu Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8484e49 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8484e49 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8484e49 Branch: refs/heads/master Commit: f8484e49ef83445dd57f8f5ba4b39d2f47bd3c80 Parents: 34c3bc9 Author: yucai Authored: Wed Oct 31 03:03:42 2018 -0700 Committer: Dongjoon Hyun Committed: Wed Oct 31 03:03:42 2018 -0700 -- .../benchmarks/AvroWriteBenchmark-results.txt | 10 +++ .../benchmark/AvroWriteBenchmark.scala | 27 .../BuiltInDataSourceWriteBenchmark-results.txt | 60 + .../BuiltInDataSourceWriteBenchmark.scala | 68 +++- .../benchmark/DataSourceWriteBenchmark.scala| 15 + 5 files changed, 108 insertions(+), 72 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/external/avro/benchmarks/AvroWriteBenchmark-results.txt -- diff --git a/external/avro/benchmarks/AvroWriteBenchmark-results.txt b/external/avro/benchmarks/AvroWriteBenchmark-results.txt new file mode 100644 index 000..fb2a773 --- /dev/null +++ 
b/external/avro/benchmarks/AvroWriteBenchmark-results.txt @@ -0,0 +1,10 @@ +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Avro writer benchmark: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Output Single Int Column 3213 / 3373 4.9 204.3 1.0X +Output Single Double Column 3313 / 3345 4.7 210.7 1.0X +Output Int and String Column 7303 / 7316 2.2 464.3 0.4X +Output Partitions 5309 / 5691 3.0 337.5 0.6X +Output Buckets7031 / 7557 2.2 447.0 0.5X + http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala -- diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala index df13b4a..0b11434 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala @@ -19,22 +19,19 @@ package org.apache.spark.sql.execution.benchmark /** * Benchmark to measure Avro data sources write performance. - * Usage: - * 1. with spark-submit: bin/spark-submit --class - * 2. with sbt: build/sbt "avro/test:runMain " + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + *--jars ,, + * , + * + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain " + * Results will be written to "benchmarks/AvroWriteBenchmark-results.txt". + * }}} */ object AvroWriteBenchmark extends DataSourceWriteBenchmark { - def main(args: Array[String]): Unit = { -/* -Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz -Avro writer benchmark: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative - -Output Single Int Column 2481 / 2499 6.3 157.8 1.0X -Output Single Double Column