spark git commit: [SPARK-18566][SQL] remove OverwriteOptions
Repository: spark Updated Branches: refs/heads/master f2ddabfa0 -> 3e307b495 [SPARK-18566][SQL] remove OverwriteOptions ## What changes were proposed in this pull request? `OverwriteOptions` was introduced in https://github.com/apache/spark/pull/15705, to carry the information of static partitions. However, after further refactor, this information becomes duplicated and we can remove `OverwriteOptions`. ## How was this patch tested? N/A Author: Wenchen FanCloses #15995 from cloud-fan/overwrite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e307b49 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e307b49 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e307b49 Branch: refs/heads/master Commit: 3e307b4959ecdab3f9c16484d172403357e7d09b Parents: f2ddabf Author: Wenchen Fan Authored: Wed Dec 14 11:30:34 2016 +0800 Committer: Wenchen Fan Committed: Wed Dec 14 11:30:34 2016 +0800 -- .../apache/spark/sql/catalyst/dsl/package.scala | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 5 +- .../plans/logical/basicLogicalOperators.scala | 22 +--- .../sql/catalyst/parser/PlanParserSuite.scala | 15 +-- .../org/apache/spark/sql/DataFrameWriter.scala | 4 +- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 127 ++- .../InsertIntoDataSourceCommand.scala | 6 +- .../InsertIntoHadoopFsRelationCommand.scala | 16 +-- .../apache/spark/sql/hive/HiveStrategies.scala | 2 +- .../CreateHiveTableAsSelectCommand.scala| 5 +- 11 files changed, 91 insertions(+), 115 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e307b49/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index e901683..66e52ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -367,7 +367,7 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan = InsertIntoTable( analysis.UnresolvedRelation(TableIdentifier(tableName)), - Map.empty, logicalPlan, OverwriteOptions(overwrite), false) + Map.empty, logicalPlan, overwrite, false) def as(alias: String): LogicalPlan = logicalPlan match { case UnresolvedRelation(tbl, _) => UnresolvedRelation(tbl, Option(alias)) http://git-wip-us.apache.org/repos/asf/spark/blob/3e307b49/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 7b8badc..3969fdb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -177,15 +177,12 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. 
Specified " + "partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) } -val overwrite = ctx.OVERWRITE != null -val staticPartitionKeys: Map[String, String] = - partitionKeys.filter(_._2.nonEmpty).map(t => (t._1, t._2.get)) InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, query, - OverwriteOptions(overwrite, if (overwrite) staticPartitionKeys else Map.empty), + ctx.OVERWRITE != null, ctx.EXISTS != null) } http://git-wip-us.apache.org/repos/asf/spark/blob/3e307b49/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c210b74..b9bdd53 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
spark git commit: [MINOR][SPARKR] fix kstest example error and add unit test
Repository: spark Updated Branches: refs/heads/master e104e55c1 -> f2ddabfa0 [MINOR][SPARKR] fix kstest example error and add unit test ## What changes were proposed in this pull request? While adding vignettes for kstest, I found some errors in the example: 1. There is a typo of kstest; 2. print.summary.KStest doesn't work with the example; Fix the example errors; Add a new unit test for print.summary.KStest; ## How was this patch tested? Manual test; Add new unit test; Author: wm...@hotmail.comCloses #16259 from wangmiao1981/ks. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2ddabfa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2ddabfa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2ddabfa Branch: refs/heads/master Commit: f2ddabfa09fda26ff0391d026dd67545dab33e01 Parents: e104e55 Author: wm...@hotmail.com Authored: Tue Dec 13 18:52:05 2016 -0800 Committer: Yanbo Liang Committed: Tue Dec 13 18:52:05 2016 -0800 -- R/pkg/R/mllib.R| 4 ++-- R/pkg/inst/tests/testthat/test_mllib.R | 6 ++ 2 files changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f2ddabfa/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 5df843c..d736bbb 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -1595,14 +1595,14 @@ setMethod("write.ml", signature(object = "ALSModel", path = "character"), #' \dontrun{ #' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25)) #' df <- createDataFrame(data) -#' test <- spark.ktest(df, "test", "norm", c(0, 1)) +#' test <- spark.kstest(df, "test", "norm", c(0, 1)) #' #' # get a summary of the test result #' testSummary <- summary(test) #' testSummary #' #' # print out the summary in an organized way -#' print.summary.KSTest(test) +#' print.summary.KSTest(testSummary) #' } #' @note spark.kstest since 2.1.0 setMethod("spark.kstest", signature(data = "SparkDataFrame"), http://git-wip-us.apache.org/repos/asf/spark/blob/f2ddabfa/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 986af4a..0f0d831 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -986,6 +986,12 @@ test_that("spark.kstest", { expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") + + # Test print.summary.KSTest + printStats <- capture.output(print.summary.KSTest(stats)) + expect_match(printStats[1], "Kolmogorov-Smirnov test summary:") + expect_match(printStats[5], + "Low presumption against null hypothesis: Sample follows theoretical distribution. ") }) test_that("spark.randomForest", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][SPARKR] fix kstest example error and add unit test
Repository: spark Updated Branches: refs/heads/branch-2.1 019d1fa3d -> 8ef005931 [MINOR][SPARKR] fix kstest example error and add unit test ## What changes were proposed in this pull request? While adding vignettes for kstest, I found some errors in the example: 1. There is a typo of kstest; 2. print.summary.KStest doesn't work with the example; Fix the example errors; Add a new unit test for print.summary.KStest; ## How was this patch tested? Manual test; Add new unit test; Author: wm...@hotmail.comCloses #16259 from wangmiao1981/ks. (cherry picked from commit f2ddabfa09fda26ff0391d026dd67545dab33e01) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ef00593 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ef00593 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ef00593 Branch: refs/heads/branch-2.1 Commit: 8ef005931a242d087f4879805571be0660aefaf9 Parents: 019d1fa Author: wm...@hotmail.com Authored: Tue Dec 13 18:52:05 2016 -0800 Committer: Yanbo Liang Committed: Tue Dec 13 18:52:22 2016 -0800 -- R/pkg/R/mllib.R| 4 ++-- R/pkg/inst/tests/testthat/test_mllib.R | 6 ++ 2 files changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8ef00593/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 5df843c..d736bbb 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -1595,14 +1595,14 @@ setMethod("write.ml", signature(object = "ALSModel", path = "character"), #' \dontrun{ #' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25)) #' df <- createDataFrame(data) -#' test <- spark.ktest(df, "test", "norm", c(0, 1)) +#' test <- spark.kstest(df, "test", "norm", c(0, 1)) #' #' # get a summary of the test result #' testSummary <- summary(test) #' testSummary #' #' # print out the summary in an organized way -#' print.summary.KSTest(test) +#' print.summary.KSTest(testSummary) #' } #' @note spark.kstest since 2.1.0 setMethod("spark.kstest", signature(data = "SparkDataFrame"), http://git-wip-us.apache.org/repos/asf/spark/blob/8ef00593/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 46dffe3..40c0446 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -986,6 +986,12 @@ test_that("spark.kstest", { expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") + + # Test print.summary.KSTest + printStats <- capture.output(print.summary.KSTest(stats)) + expect_match(printStats[1], "Kolmogorov-Smirnov test summary:") + expect_match(printStats[5], + "Low presumption against null hypothesis: Sample follows theoretical distribution. ") }) test_that("spark.randomForest", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18588][TESTS] Ignore KafkaSourceStressForDontFailOnDataLossSuite
Repository: spark Updated Branches: refs/heads/branch-2.1 5693ac8e5 -> 019d1fa3d [SPARK-18588][TESTS] Ignore KafkaSourceStressForDontFailOnDataLossSuite ## What changes were proposed in this pull request? Disable KafkaSourceStressForDontFailOnDataLossSuite for now. ## How was this patch tested? Jenkins Author: Shixiong ZhuCloses #16275 from zsxwing/ignore-flaky-test. (cherry picked from commit e104e55c16e229e521c517393b8163cbc3bbf85a) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/019d1fa3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/019d1fa3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/019d1fa3 Branch: refs/heads/branch-2.1 Commit: 019d1fa3d421b5750170429fc07b204692b7b58e Parents: 5693ac8 Author: Shixiong Zhu Authored: Tue Dec 13 18:36:36 2016 -0800 Committer: Reynold Xin Committed: Tue Dec 13 18:36:42 2016 -0800 -- .../scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/019d1fa3/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 544fbc5..5d2779a 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -845,7 +845,7 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { + ignore("stress test for failOnDataLoss=false") { val reader = spark .readStream .format("kafka") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18588][TESTS] Ignore KafkaSourceStressForDontFailOnDataLossSuite
Repository: spark Updated Branches: refs/heads/master 3ae63b808 -> e104e55c1 [SPARK-18588][TESTS] Ignore KafkaSourceStressForDontFailOnDataLossSuite ## What changes were proposed in this pull request? Disable KafkaSourceStressForDontFailOnDataLossSuite for now. ## How was this patch tested? Jenkins Author: Shixiong ZhuCloses #16275 from zsxwing/ignore-flaky-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e104e55c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e104e55c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e104e55c Branch: refs/heads/master Commit: e104e55c16e229e521c517393b8163cbc3bbf85a Parents: 3ae63b8 Author: Shixiong Zhu Authored: Tue Dec 13 18:36:36 2016 -0800 Committer: Reynold Xin Committed: Tue Dec 13 18:36:36 2016 -0800 -- .../scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e104e55c/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 544fbc5..5d2779a 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -845,7 +845,7 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { + ignore("stress test for failOnDataLoss=false") { val reader = spark .readStream .format("kafka") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
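Note: the change above only swaps ScalaTest's `test` registration for `ignore`. A minimal standalone sketch of that mechanism (plain ScalaTest, not part of the commit): an ignored test still compiles and is listed, but is reported as ignored instead of being run.

```scala
import org.scalatest.FunSuite

class ExampleSuite extends FunSuite {
  // Registered but skipped at run time -- how the flaky Kafka stress test is disabled above.
  ignore("stress test for failOnDataLoss=false") {
    assert(1 + 1 == 2)
  }

  // Still runs normally.
  test("a stable case") {
    assert(1 + 1 == 2)
  }
}
```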
spark git commit: [SPARK-18752][SQL] Follow-up: add scaladoc explaining isSrcLocal arg.
Repository: spark Updated Branches: refs/heads/master ae5b2d3e4 -> 3ae63b808 [SPARK-18752][SQL] Follow-up: add scaladoc explaining isSrcLocal arg. Author: Marcelo VanzinCloses #16257 from vanzin/SPARK-18752.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ae63b80 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ae63b80 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ae63b80 Branch: refs/heads/master Commit: 3ae63b808a6f5fafa191c680eb7f73a9543e46ce Parents: ae5b2d3 Author: Marcelo Vanzin Authored: Tue Dec 13 17:55:38 2016 -0800 Committer: Reynold Xin Committed: Tue Dec 13 17:55:38 2016 -0800 -- .../spark/sql/catalyst/catalog/ExternalCatalog.scala| 12 1 file changed, 12 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3ae63b80/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 0c72964..5233699 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -114,6 +114,12 @@ abstract class ExternalCatalog { def listTables(db: String, pattern: String): Seq[String] + /** + * Loads data into a table. + * + * @param isSrcLocal Whether the source data is local, as defined by the "LOAD DATA LOCAL" + * HiveQL command. + */ def loadTable( db: String, table: String, @@ -122,6 +128,12 @@ abstract class ExternalCatalog { holdDDLTime: Boolean, isSrcLocal: Boolean): Unit + /** + * Loads data into a partition. + * + * @param isSrcLocal Whether the source data is local, as defined by the "LOAD DATA LOCAL" + * HiveQL command. + */ def loadPartition( db: String, table: String, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
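Note: a short spark-shell sketch of the two statements the new scaladoc distinguishes, assuming a Hive-enabled `SparkSession` named `spark` and an existing table `t` (the table name and paths are placeholders):

```scala
// Source file on the client's local filesystem -> isSrcLocal = true
spark.sql("LOAD DATA LOCAL INPATH '/tmp/data.txt' INTO TABLE t")

// Source file on the default (distributed) filesystem -> isSrcLocal = false
spark.sql("LOAD DATA INPATH '/user/hive/warehouse/data.txt' INTO TABLE t")
```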
spark git commit: [SPARK-18746][SQL] Add implicit encoder for BigDecimal, timestamp and date
Repository: spark Updated Branches: refs/heads/master 594b14f1e -> ae5b2d3e4 [SPARK-18746][SQL] Add implicit encoder for BigDecimal, timestamp and date ## What changes were proposed in this pull request? Add implicit encoders for BigDecimal, timestamp and date. ## How was this patch tested? Add an unit test. Pass build, unit tests, and some tests below . Before: ``` scala> spark.createDataset(Seq(new java.math.BigDecimal(10))) :24: error: Unable to find encoder for type stored in a Dataset. Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._ Support for serializing other types will be added in future releases. spark.createDataset(Seq(new java.math.BigDecimal(10))) ^ scala> ``` After: ``` scala> spark.createDataset(Seq(new java.math.BigDecimal(10))) res0: org.apache.spark.sql.Dataset[java.math.BigDecimal] = [value: decimal(38,18)] ``` Author: Weiqing YangCloses #16176 from weiqingy/SPARK-18746. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ae5b2d3e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ae5b2d3e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ae5b2d3e Branch: refs/heads/master Commit: ae5b2d3e46cc4c460f539c4db1688309d1cdc66a Parents: 594b14f Author: Weiqing Yang Authored: Wed Dec 14 09:48:38 2016 +0800 Committer: Wenchen Fan Committed: Wed Dec 14 09:48:38 2016 +0800 -- .../scala/org/apache/spark/sql/SQLImplicits.scala | 15 ++- .../scala/org/apache/spark/sql/DatasetSuite.scala | 16 +++- 2 files changed, 29 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ae5b2d3e/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index 73d16d8..872a78b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -74,6 +74,19 @@ abstract class SQLImplicits { /** @since 1.6.0 */ implicit def newStringEncoder: Encoder[String] = Encoders.STRING + /** @since 2.2.0 */ + implicit def newJavaDecimalEncoder: Encoder[java.math.BigDecimal] = Encoders.DECIMAL + + /** @since 2.2.0 */ + implicit def newScalaDecimalEncoder: Encoder[scala.math.BigDecimal] = ExpressionEncoder() + + /** @since 2.2.0 */ + implicit def newDateEncoder: Encoder[java.sql.Date] = Encoders.DATE + + /** @since 2.2.0 */ + implicit def newTimeStampEncoder: Encoder[java.sql.Timestamp] = Encoders.TIMESTAMP + + // Boxed primitives /** @since 2.0.0 */ @@ -141,7 +154,7 @@ abstract class SQLImplicits { implicit def newFloatArrayEncoder: Encoder[Array[Float]] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newByteArrayEncoder: Encoder[Array[Byte]] = ExpressionEncoder() + implicit def newByteArrayEncoder: Encoder[Array[Byte]] = Encoders.BINARY /** @since 1.6.1 */ implicit def newShortArrayEncoder: Encoder[Array[Short]] = ExpressionEncoder() http://git-wip-us.apache.org/repos/asf/spark/blob/ae5b2d3e/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 3742115..c27b815 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -26,7 +26,6 @@ import 
org.apache.spark.sql.execution.{LogicalRDD, RDDScanExec, SortExec} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchange} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -1129,6 +1128,21 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val ds2 = Seq(WithMap("hi", Map(42L -> "foo"))).toDS checkDataset(ds2.map(t => t), WithMap("hi", Map(42L -> "foo"))) } + + test("SPARK-18746: add implicit encoder for BigDecimal, date, timestamp") { +// For this implicit encoder, 18 is the default scale +assert(spark.range(1).map { x => new java.math.BigDecimal(1) }.head == + new java.math.BigDecimal(1).setScale(18)) + +assert(spark.range(1).map { x => scala.math.BigDecimal(1, 18) }.head == +
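Note: a spark-shell style sketch of what the new implicit encoders enable, assuming a `SparkSession` named `spark` as in the examples above:

```scala
import spark.implicits._

// With this change the following need no hand-written encoders:
val decimals   = spark.createDataset(Seq(new java.math.BigDecimal(10)))   // decimal(38,18)
val bigDecs    = spark.createDataset(Seq(scala.math.BigDecimal(1)))
val dates      = spark.createDataset(Seq(java.sql.Date.valueOf("2016-12-14")))
val timestamps = spark.createDataset(Seq(new java.sql.Timestamp(System.currentTimeMillis())))
```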
spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes
Repository: spark Updated Branches: refs/heads/branch-2.1 25b97589e -> 5693ac8e5 [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes ## What changes were proposed in this pull request? Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content minimal since users can type `?spark.randomForest` to see the full doc. cc: jkbradley Author: Xiangrui MengCloses #16264 from mengxr/SPARK-18793. (cherry picked from commit 594b14f1ebd0b3db9f630e504be92228f11b4d9f) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5693ac8e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5693ac8e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5693ac8e Branch: refs/heads/branch-2.1 Commit: 5693ac8e5bd5df8aca1b0d6df0be072a45abcfbd Parents: 25b9758 Author: Xiangrui Meng Authored: Tue Dec 13 16:59:09 2016 -0800 Committer: Xiangrui Meng Committed: Tue Dec 13 16:59:15 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++ 1 file changed, 32 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5693ac8e/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 625b759..334daa5 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -449,6 +449,10 @@ SparkR supports the following machine learning models and algorithms. * Generalized Linear Model (GLM) +* Random Forest + +* Gradient-Boosted Trees (GBT) + * Naive Bayes Model * $k$-means Clustering @@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` + Random Forest + +`spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +In the following example, we use the `longley` dataset to train a random forest and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +summary(rfModel) +predictions <- predict(rfModel, df) +``` + + Gradient-Boosted Trees + +`spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +summary(gbtModel) +predictions <- predict(gbtModel, df) +``` + Naive Bayes Model Naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes
Repository: spark Updated Branches: refs/heads/master c68fb426d -> 594b14f1e [SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vignettes ## What changes were proposed in this pull request? Mention `spark.randomForest` and `spark.gbt` in vignettes. Keep the content minimal since users can type `?spark.randomForest` to see the full doc. cc: jkbradley Author: Xiangrui MengCloses #16264 from mengxr/SPARK-18793. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/594b14f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/594b14f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/594b14f1 Branch: refs/heads/master Commit: 594b14f1ebd0b3db9f630e504be92228f11b4d9f Parents: c68fb42 Author: Xiangrui Meng Authored: Tue Dec 13 16:59:09 2016 -0800 Committer: Xiangrui Meng Committed: Tue Dec 13 16:59:09 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 32 +++ 1 file changed, 32 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/594b14f1/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 625b759..334daa5 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -449,6 +449,10 @@ SparkR supports the following machine learning models and algorithms. * Generalized Linear Model (GLM) +* Random Forest + +* Gradient-Boosted Trees (GBT) + * Naive Bayes Model * $k$-means Clustering @@ -526,6 +530,34 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` + Random Forest + +`spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +In the following example, we use the `longley` dataset to train a random forest and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +summary(rfModel) +predictions <- predict(rfModel, df) +``` + + Gradient-Boosted Trees + +`spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +summary(gbtModel) +predictions <- predict(gbtModel, df) +``` + Naive Bayes Model Naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18834][SS] Expose event time stats through StreamingQueryProgress
Repository: spark Updated Branches: refs/heads/branch-2.1 f672bfdf9 -> 25b97589e [SPARK-18834][SS] Expose event time stats through StreamingQueryProgress ## What changes were proposed in this pull request? - Changed `StreamingQueryProgress.watermark` to `StreamingQueryProgress.queryTimestamps` which is a `Map[String, String]` containing the following keys: "eventTime.max", "eventTime.min", "eventTime.avg", "processingTime", "watermark". All of them UTC formatted strings. - Renamed `StreamingQuery.timestamp` to `StreamingQueryProgress.triggerTimestamp` to differentiate from `queryTimestamps`. It has the timestamp of when the trigger was started. ## How was this patch tested? Updated tests Author: Tathagata DasCloses #16258 from tdas/SPARK-18834. (cherry picked from commit c68fb426d4ac05414fb402aa1f30f4c98df103ad) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25b97589 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25b97589 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25b97589 Branch: refs/heads/branch-2.1 Commit: 25b97589e32ddc424df500059cd9962eb1b2fa6b Parents: f672bfd Author: Tathagata Das Authored: Tue Dec 13 14:14:25 2016 -0800 Committer: Tathagata Das Committed: Tue Dec 13 14:15:15 2016 -0800 -- .../streaming/EventTimeWatermarkExec.scala | 55 ++-- .../execution/streaming/ProgressReporter.scala | 38 ++ .../execution/streaming/StreamExecution.scala | 33 ++-- .../apache/spark/sql/streaming/progress.scala | 31 +++ .../streaming/StreamingQueryListenerSuite.scala | 3 ++ .../StreamingQueryStatusAndProgressSuite.scala | 16 -- .../sql/streaming/StreamingQuerySuite.scala | 2 + .../spark/sql/streaming/WatermarkSuite.scala| 49 + 8 files changed, 161 insertions(+), 66 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/25b97589/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index 4c8cb06..e8570d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.streaming -import scala.math.max - import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} @@ -28,24 +26,48 @@ import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 -/** Tracks the maximum positive long seen. 
*/ -class MaxLong(protected var currentValue: Long = 0) - extends AccumulatorV2[Long, Long] { +/** Class for collecting event time stats with an accumulator */ +case class EventTimeStats(var max: Long, var min: Long, var sum: Long, var count: Long) { + def add(eventTime: Long): Unit = { +this.max = math.max(this.max, eventTime) +this.min = math.min(this.min, eventTime) +this.sum += eventTime +this.count += 1 + } + + def merge(that: EventTimeStats): Unit = { +this.max = math.max(this.max, that.max) +this.min = math.min(this.min, that.min) +this.sum += that.sum +this.count += that.count + } + + def avg: Long = sum / count +} + +object EventTimeStats { + def zero: EventTimeStats = EventTimeStats( +max = Long.MinValue, min = Long.MaxValue, sum = 0L, count = 0L) +} + +/** Accumulator that collects stats on event time in a batch. */ +class EventTimeStatsAccum(protected var currentStats: EventTimeStats = EventTimeStats.zero) + extends AccumulatorV2[Long, EventTimeStats] { - override def isZero: Boolean = value == 0 - override def value: Long = currentValue - override def copy(): AccumulatorV2[Long, Long] = new MaxLong(currentValue) + override def isZero: Boolean = value == EventTimeStats.zero + override def value: EventTimeStats = currentStats + override def copy(): AccumulatorV2[Long, EventTimeStats] = new EventTimeStatsAccum(currentStats) override def reset(): Unit = { -currentValue = 0 +currentStats = EventTimeStats.zero } override def add(v: Long): Unit = { -currentValue = max(v, value) +currentStats.add(v) } - override def merge(other:
spark git commit: [SPARK-18834][SS] Expose event time stats through StreamingQueryProgress
Repository: spark Updated Branches: refs/heads/master aebf44e50 -> c68fb426d [SPARK-18834][SS] Expose event time stats through StreamingQueryProgress ## What changes were proposed in this pull request? - Changed `StreamingQueryProgress.watermark` to `StreamingQueryProgress.queryTimestamps` which is a `Map[String, String]` containing the following keys: "eventTime.max", "eventTime.min", "eventTime.avg", "processingTime", "watermark". All of them UTC formatted strings. - Renamed `StreamingQuery.timestamp` to `StreamingQueryProgress.triggerTimestamp` to differentiate from `queryTimestamps`. It has the timestamp of when the trigger was started. ## How was this patch tested? Updated tests Author: Tathagata DasCloses #16258 from tdas/SPARK-18834. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c68fb426 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c68fb426 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c68fb426 Branch: refs/heads/master Commit: c68fb426d4ac05414fb402aa1f30f4c98df103ad Parents: aebf44e Author: Tathagata Das Authored: Tue Dec 13 14:14:25 2016 -0800 Committer: Tathagata Das Committed: Tue Dec 13 14:14:25 2016 -0800 -- .../streaming/EventTimeWatermarkExec.scala | 55 ++-- .../execution/streaming/ProgressReporter.scala | 38 ++ .../execution/streaming/StreamExecution.scala | 33 ++-- .../apache/spark/sql/streaming/progress.scala | 31 +++ .../streaming/StreamingQueryListenerSuite.scala | 3 ++ .../StreamingQueryStatusAndProgressSuite.scala | 16 -- .../sql/streaming/StreamingQuerySuite.scala | 2 + .../spark/sql/streaming/WatermarkSuite.scala| 49 + 8 files changed, 161 insertions(+), 66 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c68fb426/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index 4c8cb06..e8570d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.streaming -import scala.math.max - import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} @@ -28,24 +26,48 @@ import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 -/** Tracks the maximum positive long seen. 
*/ -class MaxLong(protected var currentValue: Long = 0) - extends AccumulatorV2[Long, Long] { +/** Class for collecting event time stats with an accumulator */ +case class EventTimeStats(var max: Long, var min: Long, var sum: Long, var count: Long) { + def add(eventTime: Long): Unit = { +this.max = math.max(this.max, eventTime) +this.min = math.min(this.min, eventTime) +this.sum += eventTime +this.count += 1 + } + + def merge(that: EventTimeStats): Unit = { +this.max = math.max(this.max, that.max) +this.min = math.min(this.min, that.min) +this.sum += that.sum +this.count += that.count + } + + def avg: Long = sum / count +} + +object EventTimeStats { + def zero: EventTimeStats = EventTimeStats( +max = Long.MinValue, min = Long.MaxValue, sum = 0L, count = 0L) +} + +/** Accumulator that collects stats on event time in a batch. */ +class EventTimeStatsAccum(protected var currentStats: EventTimeStats = EventTimeStats.zero) + extends AccumulatorV2[Long, EventTimeStats] { - override def isZero: Boolean = value == 0 - override def value: Long = currentValue - override def copy(): AccumulatorV2[Long, Long] = new MaxLong(currentValue) + override def isZero: Boolean = value == EventTimeStats.zero + override def value: EventTimeStats = currentStats + override def copy(): AccumulatorV2[Long, EventTimeStats] = new EventTimeStatsAccum(currentStats) override def reset(): Unit = { -currentValue = 0 +currentStats = EventTimeStats.zero } override def add(v: Long): Unit = { -currentValue = max(v, value) +currentStats.add(v) } - override def merge(other: AccumulatorV2[Long, Long]): Unit = { -currentValue = max(value, other.value) + override def merge(other: AccumulatorV2[Long, EventTimeStats]):
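Note: a plain-Scala sketch of the bookkeeping done by the new accumulator, assuming the `EventTimeStats` case class from the diff above is on the classpath (the event-time values in milliseconds are illustrative):

```scala
val stats = EventTimeStats.zero
Seq(1000L, 2000L, 6000L).foreach(stats.add)
assert(stats.max == 6000L)
assert(stats.min == 1000L)
assert(stats.avg == 3000L)          // sum / count

// Partial results from two tasks merge the same way the accumulator does.
val other = EventTimeStats.zero
other.add(9000L)
stats.merge(other)
assert(stats.max == 9000L && stats.count == 4L)
```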
spark git commit: [SPARK-18843][CORE] Fix timeout in awaitResultInForkJoinSafely (branch 2.1, 2.0)
Repository: spark Updated Branches: refs/heads/branch-2.0 06f592c39 -> 1d5c7f452 [SPARK-18843][CORE] Fix timeout in awaitResultInForkJoinSafely (branch 2.1, 2.0) ## What changes were proposed in this pull request? This PR fixes the timeout value in `awaitResultInForkJoinSafely` for 2.1 and 2.0. Master has been fixed by https://github.com/apache/spark/pull/16230. ## How was this patch tested? Jenkins Author: Shixiong ZhuCloses #16268 from zsxwing/SPARK-18843. (cherry picked from commit f672bfdf9689c0ab74226b11785ada50b72cd488) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d5c7f45 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d5c7f45 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d5c7f45 Branch: refs/heads/branch-2.0 Commit: 1d5c7f4528aed3f53334c387ff715a10ca11cd71 Parents: 06f592c Author: Shixiong Zhu Authored: Tue Dec 13 14:09:25 2016 -0800 Committer: Shixiong Zhu Committed: Tue Dec 13 14:09:36 2016 -0800 -- core/src/main/scala/org/apache/spark/util/ThreadUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d5c7f45/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index d093e7b..a15c706 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -209,7 +209,7 @@ private[spark] object ThreadUtils { // `awaitPermission` is not actually used anywhere so it's safe to pass in null here. // See SPARK-13747. val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] - awaitable.result(Duration.Inf)(awaitPermission) + awaitable.result(atMost)(awaitPermission) } catch { case NonFatal(t) => throw new SparkException("Exception thrown in awaitResult: ", t) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18843][CORE] Fix timeout in awaitResultInForkJoinSafely (branch 2.1, 2.0)
Repository: spark
Updated Branches: refs/heads/branch-2.1 292a37f24 -> f672bfdf9

[SPARK-18843][CORE] Fix timeout in awaitResultInForkJoinSafely (branch 2.1, 2.0)

## What changes were proposed in this pull request?

This PR fixes the timeout value in `awaitResultInForkJoinSafely` for 2.1 and 2.0. Master has been fixed by https://github.com/apache/spark/pull/16230.

## How was this patch tested?

Jenkins

Author: Shixiong Zhu

Closes #16268 from zsxwing/SPARK-18843.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f672bfdf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f672bfdf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f672bfdf

Branch: refs/heads/branch-2.1
Commit: f672bfdf9689c0ab74226b11785ada50b72cd488
Parents: 292a37f
Author: Shixiong Zhu
Authored: Tue Dec 13 14:09:25 2016 -0800
Committer: Shixiong Zhu
Committed: Tue Dec 13 14:09:25 2016 -0800

--
 core/src/main/scala/org/apache/spark/util/ThreadUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f672bfdf/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
--
diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
index 60a6e82..2a21c6a 100644
--- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala
@@ -209,7 +209,7 @@ private[spark] object ThreadUtils {
       // `awaitPermission` is not actually used anywhere so it's safe to pass in null here.
       // See SPARK-13747.
       val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
-      awaitable.result(Duration.Inf)(awaitPermission)
+      awaitable.result(atMost)(awaitPermission)
     } catch {
       case NonFatal(t) =>
         throw new SparkException("Exception thrown in awaitResult: ", t)

- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
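Note: a standalone sketch of the behavior the fix restores, using plain `scala.concurrent` instead of Spark's internal `ThreadUtils` helper -- with a finite `atMost` a stuck future fails fast rather than blocking forever:

```scala
import java.util.concurrent.TimeoutException
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

val neverDone = Future { Thread.sleep(60000); 42 }   // does not finish within the timeout

try {
  Await.result(neverDone, 2.seconds)                 // analogous to awaitable.result(atMost)(...)
} catch {
  case _: TimeoutException => println("timed out after 2 seconds, as expected")
}
```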
spark git commit: [SPARK-18816][WEB UI] Executors Logs column only ran visibility check on initial table load
Repository: spark Updated Branches: refs/heads/branch-2.1 d5c4a5d06 -> 292a37f24 [SPARK-18816][WEB UI] Executors Logs column only ran visibility check on initial table load ## What changes were proposed in this pull request? When I added a visibility check for the logs column on the executors page in #14382 the method I used only ran the check on the initial DataTable creation and not subsequent page loads. I moved the check out of the table definition and instead it runs on each page load. The jQuery DataTable functionality used is the same. ## How was this patch tested? Tested Manually No visible UI changes to screenshot. Author: Alex BozarthCloses #16256 from ajbozarth/spark18816. (cherry picked from commit aebf44e50b6b04b848829adbbe08b0f74f31eb32) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/292a37f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/292a37f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/292a37f2 Branch: refs/heads/branch-2.1 Commit: 292a37f2455b12ef8dfbdaf5b905a69b8b5e3728 Parents: d5c4a5d Author: Alex Bozarth Authored: Tue Dec 13 21:37:46 2016 + Committer: Sean Owen Committed: Tue Dec 13 21:38:04 2016 + -- .../resources/org/apache/spark/ui/static/executorspage.js | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/292a37f2/core/src/main/resources/org/apache/spark/ui/static/executorspage.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index 1df6733..fe5db6a 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -412,10 +412,6 @@ $(document).ready(function () { ], "columnDefs": [ { -"targets": [ 15 ], -"visible": logsExist(response) -}, -{ "targets": [ 16 ], "visible": getThreadDumpEnabled() } @@ -423,7 +419,8 @@ $(document).ready(function () { "order": [[0, "asc"]] }; -$(selector).DataTable(conf); +var dt = $(selector).DataTable(conf); +dt.column(15).visible(logsExist(response)); $('#active-executors [data-toggle="tooltip"]').tooltip(); var sumSelector = "#summary-execs-table"; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18816][WEB UI] Executors Logs column only ran visibility check on initial table load
Repository: spark Updated Branches: refs/heads/master 9e8a9d7c6 -> aebf44e50 [SPARK-18816][WEB UI] Executors Logs column only ran visibility check on initial table load ## What changes were proposed in this pull request? When I added a visibility check for the logs column on the executors page in #14382 the method I used only ran the check on the initial DataTable creation and not subsequent page loads. I moved the check out of the table definition and instead it runs on each page load. The jQuery DataTable functionality used is the same. ## How was this patch tested? Tested Manually No visible UI changes to screenshot. Author: Alex BozarthCloses #16256 from ajbozarth/spark18816. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aebf44e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aebf44e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aebf44e5 Branch: refs/heads/master Commit: aebf44e50b6b04b848829adbbe08b0f74f31eb32 Parents: 9e8a9d7 Author: Alex Bozarth Authored: Tue Dec 13 21:37:46 2016 + Committer: Sean Owen Committed: Tue Dec 13 21:37:46 2016 + -- .../resources/org/apache/spark/ui/static/executorspage.js | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aebf44e5/core/src/main/resources/org/apache/spark/ui/static/executorspage.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index 1df6733..fe5db6a 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -412,10 +412,6 @@ $(document).ready(function () { ], "columnDefs": [ { -"targets": [ 15 ], -"visible": logsExist(response) -}, -{ "targets": [ 16 ], "visible": getThreadDumpEnabled() } @@ -423,7 +419,8 @@ $(document).ready(function () { "order": [[0, "asc"]] }; -$(selector).DataTable(conf); +var dt = $(selector).DataTable(conf); +dt.column(15).visible(logsExist(response)); $('#active-executors [data-toggle="tooltip"]').tooltip(); var sumSelector = "#summary-execs-table"; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18471][MLLIB] In LBFGS, avoid sending huge vectors of 0
Repository: spark Updated Branches: refs/heads/master e57e3938c -> 9e8a9d7c6 [SPARK-18471][MLLIB] In LBFGS, avoid sending huge vectors of 0 ## What changes were proposed in this pull request? CostFun used to send a dense vector of zeroes as a closure in a treeAggregate call. To avoid that, we replace treeAggregate by mapPartition + treeReduce, creating a zero vector inside the mapPartition block in-place. ## How was this patch tested? Unit test for module mllib run locally for correctness. As for performance we run an heavy optimization on our production data (50 iterations on 128 MB weight vectors) and have seen significant decrease in terms both of runtime and container being killed by lack of off-heap memory. Author: Anthony TruchetAuthor: sethah Author: Anthony Truchet Closes #16037 from AnthonyTruchet/ENG-17719-lbfgs-only. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e8a9d7c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e8a9d7c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e8a9d7c Branch: refs/heads/master Commit: 9e8a9d7c6a847bc5e77f9a1004029ec27616da9d Parents: e57e393 Author: Anthony Truchet Authored: Tue Dec 13 21:30:57 2016 + Committer: Sean Owen Committed: Tue Dec 13 21:30:57 2016 + -- .../apache/spark/mllib/optimization/LBFGS.scala | 28 +--- .../spark/mllib/optimization/LBFGSSuite.scala | 19 + 2 files changed, 37 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e8a9d7c/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index e0e41f7..7a714db 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -241,16 +241,24 @@ object LBFGS extends Logging { val bcW = data.context.broadcast(w) val localGradient = gradient - val (gradientSum, lossSum) = data.treeAggregate((Vectors.zeros(n), 0.0))( - seqOp = (c, v) => (c, v) match { case ((grad, loss), (label, features)) => -val l = localGradient.compute( - features, label, bcW.value, grad) -(grad, loss + l) - }, - combOp = (c1, c2) => (c1, c2) match { case ((grad1, loss1), (grad2, loss2)) => -axpy(1.0, grad2, grad1) -(grad1, loss1 + loss2) - }) + val seqOp = (c: (Vector, Double), v: (Double, Vector)) => +(c, v) match { + case ((grad, loss), (label, features)) => +val denseGrad = grad.toDense +val l = localGradient.compute(features, label, bcW.value, denseGrad) +(denseGrad, loss + l) +} + + val combOp = (c1: (Vector, Double), c2: (Vector, Double)) => +(c1, c2) match { case ((grad1, loss1), (grad2, loss2)) => + val denseGrad1 = grad1.toDense + val denseGrad2 = grad2.toDense + axpy(1.0, denseGrad2, denseGrad1) + (denseGrad1, loss1 + loss2) + } + + val zeroSparseVector = Vectors.sparse(n, Seq()) + val (gradientSum, lossSum) = data.treeAggregate((zeroSparseVector, 0.0))(seqOp, combOp) // broadcasted model is not needed anymore bcW.destroy() http://git-wip-us.apache.org/repos/asf/spark/blob/9e8a9d7c/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala index 75ae0eb..5729592 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala 
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/LBFGSSuite.scala @@ -230,6 +230,25 @@ class LBFGSSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers (weightLBFGS(0) ~= weightGD(0) relTol 0.02) && (weightLBFGS(1) ~= weightGD(1) relTol 0.02), "The weight differences between LBFGS and GD should be within 2%.") } + + test("SPARK-18471: LBFGS aggregator on empty partitions") { +val regParam = 0 + +val initialWeightsWithIntercept = Vectors.dense(0.0) +val convergenceTol = 1e-12 +val numIterations = 1 +val dataWithEmptyPartitions = sc.parallelize(Seq((1.0, Vectors.dense(2.0))), 2) + +LBFGS.runLBFGS( +
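Note: a toy spark-shell sketch of the closure-size trick described above, assuming a `SparkContext` named `sc`; the gradient and loss updates are placeholders, not the real MLlib gradient. The zero value shipped with each task is a sparse vector of zeros, and every task densifies it locally before accumulating.

```scala
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val n = 5
val data = sc.parallelize(Seq(
  (1.0, Vectors.dense(1.0, 0.0, 2.0, 0.0, 0.0)),
  (0.0, Vectors.dense(0.0, 3.0, 0.0, 0.0, 1.0))), 4)           // 4 slices, so some partitions are empty

val seqOp: ((Vector, Double), (Double, Vector)) => (Vector, Double) = {
  case ((grad, loss), (label, features)) =>
    val dense = grad.toDense                                   // densify inside the task, not in the closure
    features.foreachActive((i, v) => dense.values(i) += label * v)   // placeholder gradient update
    (dense, loss + label)                                      // placeholder loss update
}
val combOp: ((Vector, Double), (Vector, Double)) => (Vector, Double) = {
  case ((g1, l1), (g2, l2)) =>
    val d1 = g1.toDense
    g2.foreachActive((i, v) => d1.values(i) += v)
    (d1, l1 + l2)
}

val zero: (Vector, Double) = (Vectors.sparse(n, Seq()), 0.0)   // tiny when serialized with the task
val (gradientSum, lossSum) = data.treeAggregate(zero)(seqOp, combOp)
```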
spark git commit: [SPARK-18715][ML] Fix AIC calculations in Binomial GLM
Repository: spark Updated Branches: refs/heads/master 43298d157 -> e57e3938c [SPARK-18715][ML] Fix AIC calculations in Binomial GLM The AIC calculation in Binomial GLM seems to be off when the response or weight is non-integer: the result is different from that in R. This issue arises when one models rates, i.e, num of successes normalized over num of trials, and uses num of trials as weights. In this case, the effective likelihood is weight * label ~ binomial(weight, mu), where weight = number of trials, and weight * label = number of successes and mu = is the success rate. srowen sethah yanboliang HyukjinKwon zhengruifeng ## What changes were proposed in this pull request? I suggest changing the current aic calculation for the Binomial family from ``` -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => weight * dist.Binomial(1, mu).logProbabilityOf(math.round(y).toInt) }.sum() ``` to the following which generalizes to the case of real-valued response and weights. ``` -2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) => val wt = math.round(weight).toInt if (wt == 0){ 0.0 } else { dist.Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt) } }.sum() ``` ## How was this patch tested? I will write the unit test once the community wants to include the proposed change. For now, the following modifies existing tests in weighted Binomial GLM to illustrate the issue. The second label is changed from 0 to 0.5. ``` val datasetWithWeight = Seq( (1.0, 1.0, 0.0, 5.0), (0.5, 2.0, 1.0, 2.0), (1.0, 3.0, 2.0, 1.0), (0.0, 4.0, 3.0, 3.0) ).toDF("y", "w", "x1", "x2") val formula = (new RFormula() .setFormula("y ~ x1 + x2") .setFeaturesCol("features") .setLabelCol("label")) val output = formula.fit(datasetWithWeight).transform(datasetWithWeight).select("features", "label", "w") val glr = new GeneralizedLinearRegression() .setFamily("binomial") .setWeightCol("w") .setFitIntercept(false) .setRegParam(0) val model = glr.fit(output) model.summary.aic ``` The AIC from Spark is 17.3227, and the AIC from R is 15.66454. Author: actuaryzhangCloses #16149 from actuaryzhang/aic. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e57e3938 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e57e3938 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e57e3938 Branch: refs/heads/master Commit: e57e3938c69fb1d91970341f027f2ab5000d2daa Parents: 43298d1 Author: actuaryzhang Authored: Tue Dec 13 21:27:29 2016 + Committer: Sean Owen Committed: Tue Dec 13 21:27:29 2016 + -- .../GeneralizedLinearRegression.scala | 18 +-- .../GeneralizedLinearRegressionSuite.scala | 57 ++-- 2 files changed, 43 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e57e3938/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f137c8c..3891ae6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -215,6 +215,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val * Sets the value of param [[weightCol]]. * If this is not set or empty, we treat all instance weights as 1.0. 
* Default is not set, so all instances have weight one. + * In the Binomial family, weights correspond to number of trials and should be integer. + * Non-integer weights are rounded to integer in AIC calculation. * * @group setParam */ @@ -467,10 +469,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def variance(mu: Double): Double = mu * (1.0 - mu) +private def ylogy(y: Double, mu: Double): Double = { + if (y == 0) 0.0 else y * math.log(y / mu) +} + override def deviance(y: Double, mu: Double, weight: Double): Double = { - val my = 1.0 - y - 2.0 * weight * (y * math.log(math.max(y, 1.0) / mu) + -my * math.log(math.max(my, 1.0) / (1.0 - mu))) + 2.0 * weight * (ylogy(y, mu) + ylogy(1.0 - y, 1.0 - mu)) } override def aic( @@ -479,7 +483,13 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine numInstances:
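Note: a small Breeze-based sketch of the per-observation log-likelihood term proposed above, where `weight` is the number of trials and `y` is the observed success rate; this only illustrates the formula from the description, not the full GLM code path:

```scala
import breeze.stats.distributions.Binomial

// log-likelihood of (y * weight) successes in `weight` trials at modeled success probability mu
def binomialLogLik(y: Double, mu: Double, weight: Double): Double = {
  val wt = math.round(weight).toInt
  if (wt == 0) 0.0
  else Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt)
}

// e.g. 2 successes out of 5 trials observed as a rate of 0.4, with fitted mean 0.4;
// such terms are summed over all rows and multiplied by -2 in the AIC.
val term = binomialLogLik(y = 0.4, mu = 0.4, weight = 5.0)
```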
spark git commit: [SPARK-18840][YARN] Avoid throw exception when getting token renewal interval in non HDFS security environment
Repository: spark Updated Branches: refs/heads/branch-2.1 207107bca -> d5c4a5d06 [SPARK-18840][YARN] Avoid throw exception when getting token renewal interval in non HDFS security environment ## What changes were proposed in this pull request? Fix `java.util.NoSuchElementException` when running Spark in non-hdfs security environment. In the current code, we assume `HDFS_DELEGATION_KIND` token will be found in Credentials. But in some cloud environments, HDFS is not required, so we should avoid this exception. ## How was this patch tested? Manually verified in local environment. Author: jerryshaoCloses #16265 from jerryshao/SPARK-18840. (cherry picked from commit 43298d157d58d5d03ffab818f8cdfc6eac783c55) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d5c4a5d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d5c4a5d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d5c4a5d0 Branch: refs/heads/branch-2.1 Commit: d5c4a5d06b3282aec8300d27510393161773061b Parents: 207107b Author: jerryshao Authored: Tue Dec 13 10:37:45 2016 -0800 Committer: Marcelo Vanzin Committed: Tue Dec 13 10:37:56 2016 -0800 -- .../yarn/security/HDFSCredentialProvider.scala | 21 ++-- 1 file changed, 11 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d5c4a5d0/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala index 8d06d73..ebb176b 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala @@ -72,21 +72,22 @@ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. -sparkConf.get(PRINCIPAL).map { renewer => +sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } - val t = creds.getAllTokens.asScala -.filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) -.head - val newExpiration = t.renew(hadoopConf) - val identifier = new DelegationTokenIdentifier() - identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) - val interval = newExpiration - identifier.getIssueDate - logInfo(s"Renewal Interval is $interval") - interval + val hdfsToken = creds.getAllTokens.asScala +.find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) + hdfsToken.map { t => +val newExpiration = t.renew(hadoopConf) +val identifier = new DelegationTokenIdentifier() +identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) +val interval = newExpiration - identifier.getIssueDate +logInfo(s"Renewal Interval is $interval") +interval + } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18840][YARN] Avoid throwing an exception when getting the token renewal interval in a non-HDFS security environment
Repository: spark Updated Branches: refs/heads/master 5572ccf86 -> 43298d157 [SPARK-18840][YARN] Avoid throw exception when getting token renewal interval in non HDFS security environment ## What changes were proposed in this pull request? Fix `java.util.NoSuchElementException` when running Spark in non-hdfs security environment. In the current code, we assume `HDFS_DELEGATION_KIND` token will be found in Credentials. But in some cloud environments, HDFS is not required, so we should avoid this exception. ## How was this patch tested? Manually verified in local environment. Author: jerryshaoCloses #16265 from jerryshao/SPARK-18840. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43298d15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43298d15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43298d15 Branch: refs/heads/master Commit: 43298d157d58d5d03ffab818f8cdfc6eac783c55 Parents: 5572ccf Author: jerryshao Authored: Tue Dec 13 10:37:45 2016 -0800 Committer: Marcelo Vanzin Committed: Tue Dec 13 10:37:45 2016 -0800 -- .../yarn/security/HDFSCredentialProvider.scala | 21 ++-- 1 file changed, 11 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/43298d15/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala -- diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala index 8d06d73..ebb176b 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala @@ -72,21 +72,22 @@ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. -sparkConf.get(PRINCIPAL).map { renewer => +sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } - val t = creds.getAllTokens.asScala -.filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) -.head - val newExpiration = t.renew(hadoopConf) - val identifier = new DelegationTokenIdentifier() - identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) - val interval = newExpiration - identifier.getIssueDate - logInfo(s"Renewal Interval is $interval") - interval + val hdfsToken = creds.getAllTokens.asScala +.find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) + hdfsToken.map { t => +val newExpiration = t.renew(hadoopConf) +val identifier = new DelegationTokenIdentifier() +identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) +val interval = newExpiration - identifier.getIssueDate +logInfo(s"Renewal Interval is $interval") +interval + } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
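The substance of the fix in both branches above is the switch from an unconditional `filter(...).head`, which throws `java.util.NoSuchElementException` on an empty collection, to `find(...)` plus `flatMap`, so a missing HDFS delegation token simply yields `None`. A Hadoop-free sketch of the same pattern; `Token` and its fields are hypothetical stand-ins, not the real Hadoop classes:

```scala
object TokenLookupSketch {
  // Stand-in for a delegation token: kind plus the two timestamps the
  // credential provider subtracts to get the renewal interval.
  final case class Token(kind: String, newExpiration: Long, issueDate: Long)

  // Option-safe lookup: no principal or no HDFS token => None, never an exception.
  def renewalInterval(principal: Option[String], tokens: Seq[Token]): Option[Long] =
    principal.flatMap { _ =>
      tokens.find(_.kind == "HDFS_DELEGATION_TOKEN")
        .map(t => t.newExpiration - t.issueDate)
    }

  def main(args: Array[String]): Unit = {
    val hdfs = Token("HDFS_DELEGATION_TOKEN", newExpiration = 200L, issueDate = 50L)
    println(renewalInterval(Some("alice"), Seq(hdfs))) // Some(150)
    println(renewalInterval(Some("alice"), Seq.empty)) // None: the pre-fix code threw here
  }
}
```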
spark git commit: [SPARK-17932][SQL][FOLLOWUP] Change statement `SHOW TABLES EXTENDED` to `SHOW TABLE EXTENDED`
Repository: spark Updated Branches: refs/heads/master f280ccf44 -> 5572ccf86 [SPARK-17932][SQL][FOLLOWUP] Change statement `SHOW TABLES EXTENDED` to `SHOW TABLE EXTENDED` ## What changes were proposed in this pull request? Change the statement `SHOW TABLES [EXTENDED] [(IN|FROM) database_name] [[LIKE] 'identifier_with_wildcards'] [PARTITION(partition_spec)]` to the following statements: - SHOW TABLES [(IN|FROM) database_name] [[LIKE] 'identifier_with_wildcards'] - SHOW TABLE EXTENDED [(IN|FROM) database_name] LIKE 'identifier_with_wildcards' [PARTITION(partition_spec)] After this change, the statements `SHOW TABLE/SHOW TABLES` have the same syntax with that HIVE has. ## How was this patch tested? Modified the test sql file `show-tables.sql`; Modified the test suite `DDLSuite`. Author: jiangxingboCloses #16262 from jiangxb1987/show-table-extended. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5572ccf8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5572ccf8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5572ccf8 Branch: refs/heads/master Commit: 5572ccf86b084eb5938fe62fd5d9973ec14d555d Parents: f280ccf Author: jiangxingbo Authored: Tue Dec 13 19:04:34 2016 +0100 Committer: Herman van Hovell Committed: Tue Dec 13 19:04:34 2016 +0100 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 6 +++-- .../spark/sql/execution/SparkSqlParser.scala| 26 ++-- .../spark/sql/execution/command/tables.scala| 7 +++--- .../resources/sql-tests/inputs/show-tables.sql | 8 +++--- .../sql-tests/results/show-tables.sql.out | 14 ++- .../spark/sql/execution/command/DDLSuite.scala | 6 ++--- 6 files changed, 41 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5572ccf8/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 075c73d..63055b6 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -120,8 +120,10 @@ statement (USING resource (',' resource)*)? #createFunction | DROP TEMPORARY? FUNCTION (IF EXISTS)? qualifiedName #dropFunction | EXPLAIN (LOGICAL | FORMATTED | EXTENDED | CODEGEN)? statement#explain -| SHOW TABLES EXTENDED? ((FROM | IN) db=identifier)? -(LIKE? pattern=STRING)? partitionSpec? #showTables +| SHOW TABLES ((FROM | IN) db=identifier)? +(LIKE? pattern=STRING)? #showTables +| SHOW TABLE EXTENDED ((FROM | IN) db=identifier)? +LIKE pattern=STRING partitionSpec? #showTable | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties http://git-wip-us.apache.org/repos/asf/spark/blob/5572ccf8/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 4400174..cab1b22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -126,23 +126,33 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * Create a [[ShowTablesCommand]] logical plan. 
* Example SQL : * {{{ - * SHOW TABLES [EXTENDED] [(IN|FROM) database_name] [[LIKE] 'identifier_with_wildcards'] - * [PARTITION(partition_spec)]; + * SHOW TABLES [(IN|FROM) database_name] [[LIKE] 'identifier_with_wildcards']; * }}} */ override def visitShowTables(ctx: ShowTablesContext): LogicalPlan = withOrigin(ctx) { +ShowTablesCommand( + Option(ctx.db).map(_.getText), + Option(ctx.pattern).map(string), + isExtended = false) + } + + /** + * Create a [[ShowTablesCommand]] logical plan. + * Example SQL : + * {{{ + * SHOW TABLE EXTENDED [(IN|FROM) database_name] LIKE 'identifier_with_wildcards' + * [PARTITION(partition_spec)]; + * }}} + */ + override def visitShowTable(ctx:
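The practical effect of the grammar split above is that listing tables and inspecting a single table are now separate statements, matching Hive. A usage sketch, assuming a local build that already contains this patch and a SparkSession named `spark`; the table `t1` is created only for illustration:

```scala
import org.apache.spark.sql.SparkSession

object ShowTableSyntaxSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("show-table-syntax-sketch")
      .master("local[*]")
      .getOrCreate()

    spark.sql("CREATE TABLE IF NOT EXISTS t1(a INT) USING parquet")

    // Listing form: EXTENDED is no longer accepted here, LIKE stays optional.
    spark.sql("SHOW TABLES IN default LIKE 't*'").show()

    // Detail form: singular TABLE, with EXTENDED and LIKE now mandatory.
    spark.sql("SHOW TABLE EXTENDED IN default LIKE 't1'").show(truncate = false)

    spark.stop()
  }
}
```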
spark git commit: [SPARK-18835][SQL] Don't expose Guava types in the JavaTypeInference API.
Repository: spark Updated Branches: refs/heads/master fb3081d3b -> f280ccf44 [SPARK-18835][SQL] Don't expose Guava types in the JavaTypeInference API. This avoids issues during maven tests because of shading. Author: Marcelo VanzinCloses #16260 from vanzin/SPARK-18835. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f280ccf4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f280ccf4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f280ccf4 Branch: refs/heads/master Commit: f280ccf449f62a00eb4042dfbcf7a0715850fd4c Parents: fb3081d Author: Marcelo Vanzin Authored: Tue Dec 13 10:02:19 2016 -0800 Committer: Marcelo Vanzin Committed: Tue Dec 13 10:02:19 2016 -0800 -- .../apache/spark/sql/catalyst/JavaTypeInference.scala | 12 +++- .../scala/org/apache/spark/sql/UDFRegistration.scala| 4 +--- 2 files changed, 12 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f280ccf4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 7e8e4da..8b53d98 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst import java.beans.{Introspector, PropertyDescriptor} import java.lang.{Iterable => JIterable} +import java.lang.reflect.Type import java.util.{Iterator => JIterator, List => JList, Map => JMap} import scala.language.existentials @@ -56,10 +57,19 @@ object JavaTypeInference { /** * Infers the corresponding SQL data type of a Java type. + * @param beanType Java type + * @return (SQL data type, nullable) + */ + private[sql] def inferDataType(beanType: Type): (DataType, Boolean) = { +inferDataType(TypeToken.of(beanType)) + } + + /** + * Infers the corresponding SQL data type of a Java type. 
* @param typeToken Java type * @return (SQL data type, nullable) */ - private[sql] def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { + private def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { typeToken.getRawType match { case c: Class[_] if c.isAnnotationPresent(classOf[SQLUserDefinedType]) => (c.getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance(), true) http://git-wip-us.apache.org/repos/asf/spark/blob/f280ccf4/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index c8be89c..d94185b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -23,8 +23,6 @@ import java.lang.reflect.{ParameterizedType, Type} import scala.reflect.runtime.universe.TypeTag import scala.util.Try -import com.google.common.reflect.TypeToken - import org.apache.spark.annotation.InterfaceStability import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ @@ -446,7 +444,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val udfReturnType = udfInterfaces(0).getActualTypeArguments.last var returnType = returnDataType if (returnType == null) { -returnType = JavaTypeInference.inferDataType(TypeToken.of(udfReturnType))._1 +returnType = JavaTypeInference.inferDataType(udfReturnType)._1 } udfInterfaces(0).getActualTypeArguments.length match { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
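The refactoring pattern in this commit is worth calling out: keep the Guava `TypeToken`-based entry point private and expose only an overload that takes the JDK's `java.lang.reflect.Type`, so callers and the public API surface never reference a class that may be shaded differently during maven tests. A condensed sketch of that shape; `InferSketch` and its simplified return type are illustrative stand-ins, not the real `JavaTypeInference`:

```scala
import java.lang.reflect.Type
import com.google.common.reflect.TypeToken

object InferSketch {
  // Externally visible overload: only JDK types appear in the signature.
  def inferDataType(beanType: Type): (String, Boolean) =
    inferDataType(TypeToken.of(beanType))

  // Guava stays an implementation detail behind a private method.
  private def inferDataType(typeToken: TypeToken[_]): (String, Boolean) =
    (typeToken.getRawType.getSimpleName, true)

  def main(args: Array[String]): Unit = {
    println(inferDataType(classOf[String])) // (String,true)
  }
}
```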
spark git commit: [SPARK-18835][SQL] Don't expose Guava types in the JavaTypeInference API.
Repository: spark Updated Branches: refs/heads/branch-2.1 9f0e3be62 -> 207107bca [SPARK-18835][SQL] Don't expose Guava types in the JavaTypeInference API. This avoids issues during maven tests because of shading. Author: Marcelo VanzinCloses #16260 from vanzin/SPARK-18835. (cherry picked from commit f280ccf449f62a00eb4042dfbcf7a0715850fd4c) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/207107bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/207107bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/207107bc Branch: refs/heads/branch-2.1 Commit: 207107bca5e550657b02892eef74230787972d10 Parents: 9f0e3be Author: Marcelo Vanzin Authored: Tue Dec 13 10:02:19 2016 -0800 Committer: Marcelo Vanzin Committed: Tue Dec 13 10:02:29 2016 -0800 -- .../apache/spark/sql/catalyst/JavaTypeInference.scala | 12 +++- .../scala/org/apache/spark/sql/UDFRegistration.scala| 4 +--- 2 files changed, 12 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/207107bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 04f0cfc..61c153c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst import java.beans.{Introspector, PropertyDescriptor} import java.lang.{Iterable => JIterable} +import java.lang.reflect.Type import java.util.{Iterator => JIterator, List => JList, Map => JMap} import scala.language.existentials @@ -56,10 +57,19 @@ object JavaTypeInference { /** * Infers the corresponding SQL data type of a Java type. + * @param beanType Java type + * @return (SQL data type, nullable) + */ + private[sql] def inferDataType(beanType: Type): (DataType, Boolean) = { +inferDataType(TypeToken.of(beanType)) + } + + /** + * Infers the corresponding SQL data type of a Java type. 
* @param typeToken Java type * @return (SQL data type, nullable) */ - private[sql] def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { + private def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { typeToken.getRawType match { case c: Class[_] if c.isAnnotationPresent(classOf[SQLUserDefinedType]) => (c.getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance(), true) http://git-wip-us.apache.org/repos/asf/spark/blob/207107bc/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index c8be89c..d94185b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -23,8 +23,6 @@ import java.lang.reflect.{ParameterizedType, Type} import scala.reflect.runtime.universe.TypeTag import scala.util.Try -import com.google.common.reflect.TypeToken - import org.apache.spark.annotation.InterfaceStability import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ @@ -446,7 +444,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val udfReturnType = udfInterfaces(0).getActualTypeArguments.last var returnType = returnDataType if (returnType == null) { -returnType = JavaTypeInference.inferDataType(TypeToken.of(udfReturnType))._1 +returnType = JavaTypeInference.inferDataType(udfReturnType)._1 } udfInterfaces(0).getActualTypeArguments.length match { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-13747][CORE] Fix potential ThreadLocal leaks in RPC when using ForkJoinPool
Repository: spark Updated Branches: refs/heads/master d53f18cae -> fb3081d3b [SPARK-13747][CORE] Fix potential ThreadLocal leaks in RPC when using ForkJoinPool ## What changes were proposed in this pull request? Some places in SQL may call `RpcEndpointRef.askWithRetry` (e.g., ParquetFileFormat.buildReader -> SparkContext.broadcast -> ... -> BlockManagerMaster.updateBlockInfo -> RpcEndpointRef.askWithRetry), which will finally call `Await.result`. It may cause `java.lang.IllegalArgumentException: spark.sql.execution.id is already set` when running in Scala ForkJoinPool. This PR includes the following changes to fix this issue: - Remove `ThreadUtils.awaitResult` - Rename `ThreadUtils. awaitResultInForkJoinSafely` to `ThreadUtils.awaitResult` - Replace `Await.result` in RpcTimeout with `ThreadUtils.awaitResult`. ## How was this patch tested? Jenkins Author: Shixiong ZhuCloses #16230 from zsxwing/fix-SPARK-13747. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb3081d3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb3081d3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb3081d3 Branch: refs/heads/master Commit: fb3081d3b38a50aa5e023c603e1b191e57f7c876 Parents: d53f18c Author: Shixiong Zhu Authored: Tue Dec 13 09:53:22 2016 -0800 Committer: Yin Huai Committed: Tue Dec 13 09:53:22 2016 -0800 -- .../scala/org/apache/spark/rpc/RpcTimeout.scala | 12 ++ .../org/apache/spark/util/ThreadUtils.scala | 41 .../apache/spark/rdd/AsyncRDDActionsSuite.scala | 3 +- .../OutputCommitCoordinatorSuite.scala | 3 +- scalastyle-config.xml | 1 - .../sql/execution/basicPhysicalOperators.scala | 2 +- .../exchange/BroadcastExchangeExec.scala| 3 +- 7 files changed, 23 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb3081d3/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala -- diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala index 2761d39..efd2648 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala @@ -24,7 +24,7 @@ import scala.concurrent.duration._ import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.util.Utils +import org.apache.spark.util.{ThreadUtils, Utils} /** * An exception thrown if RpcTimeout modifies a [[TimeoutException]]. 
@@ -72,15 +72,9 @@ private[spark] class RpcTimeout(val duration: FiniteDuration, val timeoutProp: S * is still not ready */ def awaitResult[T](future: Future[T]): T = { -val wrapAndRethrow: PartialFunction[Throwable, T] = { - case NonFatal(t) => -throw new SparkException("Exception thrown in awaitResult", t) -} try { - // scalastyle:off awaitresult - Await.result(future, duration) - // scalastyle:on awaitresult -} catch addMessageIfTimeout.orElse(wrapAndRethrow) + ThreadUtils.awaitResult(future, duration) +} catch addMessageIfTimeout } } http://git-wip-us.apache.org/repos/asf/spark/blob/fb3081d3/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 60a6e82..1aa4456 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -19,7 +19,7 @@ package org.apache.spark.util import java.util.concurrent._ -import scala.concurrent.{Await, Awaitable, ExecutionContext, ExecutionContextExecutor} +import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor} import scala.concurrent.duration.Duration import scala.concurrent.forkjoin.{ForkJoinPool => SForkJoinPool, ForkJoinWorkerThread => SForkJoinWorkerThread} import scala.util.control.NonFatal @@ -180,39 +180,30 @@ private[spark] object ThreadUtils { // scalastyle:off awaitresult /** - * Preferred alternative to `Await.result()`. This method wraps and re-throws any exceptions - * thrown by the underlying `Await` call, ensuring that this thread's stack trace appears in - * logs. - */ - @throws(classOf[SparkException]) - def awaitResult[T](awaitable: Awaitable[T], atMost: Duration): T = { -try { - Await.result(awaitable, atMost) - // scalastyle:on awaitresult -} catch { - case
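Although the diff above is truncated, the shape of the change is visible: `RpcTimeout.awaitResult` now delegates to the single surviving `ThreadUtils.awaitResult`, whose point is to avoid `Await.result` and therefore the `BlockContext`/managed-blocking machinery, under which a ForkJoinPool can run other queued tasks on the blocked thread and pollute ThreadLocals such as `spark.sql.execution.id`. A stripped-down sketch of that idea, not the exact Spark code:

```scala
import scala.concurrent.{Awaitable, CanAwait, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

object AwaitSketch {
  // Block the current thread directly via Awaitable.result instead of
  // Await.result, so no BlockContext/ManagedBlocker is involved. The CanAwait
  // permit is compile-time evidence and never dereferenced, so a null witness suffices.
  def awaitResultDirect[T](awaitable: Awaitable[T], atMost: Duration): T = {
    val permit = null.asInstanceOf[CanAwait]
    awaitable.result(atMost)(permit)
  }

  def main(args: Array[String]): Unit = {
    val f = Future { 21 * 2 }
    println(awaitResultDirect(f, 10.seconds)) // 42
  }
}
```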
spark git commit: [SPARK-18675][SQL] CTAS for hive serde table should work for all hive versions
Repository: spark Updated Branches: refs/heads/master 096f868b7 -> d53f18cae [SPARK-18675][SQL] CTAS for hive serde table should work for all hive versions ## What changes were proposed in this pull request? Before hive 1.1, when inserting into a table, hive will create the staging directory under a common scratch directory. After the writing is finished, hive will simply empty the table directory and move the staging directory to it. After hive 1.1, hive will create the staging directory under the table directory, and when moving staging directory to table directory, hive will still empty the table directory, but will exclude the staging directory there. In `InsertIntoHiveTable`, we simply copy the code from hive 1.2, which means we will always create the staging directory under the table directory, no matter what the hive version is. This causes problems if the hive version is prior to 1.1, because the staging directory will be removed by hive when hive is trying to empty the table directory. This PR copies the code from hive 0.13, so that we have 2 branches to create staging directory. If hive version is prior to 1.1, we'll go to the old style branch(i.e. create the staging directory under a common scratch directory), else, go to the new style branch(i.e. create the staging directory under the table directory) ## How was this patch tested? new test Author: Wenchen FanCloses #16104 from cloud-fan/hive-0.13. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d53f18ca Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d53f18ca Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d53f18ca Branch: refs/heads/master Commit: d53f18cae41c6c77a0cff3f1fd266e4c1b9ea79a Parents: 096f868 Author: Wenchen Fan Authored: Tue Dec 13 09:46:58 2016 -0800 Committer: Yin Huai Committed: Tue Dec 13 09:46:58 2016 -0800 -- .../hive/execution/InsertIntoHiveTable.scala| 68 +--- .../spark/sql/hive/client/VersionsSuite.scala | 19 +- 2 files changed, 75 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d53f18ca/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index db2239d..82c7b1a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -22,7 +22,6 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.exec.TaskRunner @@ -86,6 +85,7 @@ case class InsertIntoHiveTable( val hadoopConf = sessionState.newHadoopConf() val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") + val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive") private def executionId: String = { val rand: Random = new Random @@ -93,7 +93,7 @@ case class InsertIntoHiveTable( "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } - private def getStagingDir(inputPath: Path, hadoopConf: Configuration): Path = { + private def getStagingDir(inputPath: Path): Path = { val inputPathUri: URI = inputPath.toUri val inputPathName: String = 
inputPathUri.getPath val fs: FileSystem = inputPath.getFileSystem(hadoopConf) @@ -121,21 +121,69 @@ case class InsertIntoHiveTable( return dir } - private def getExternalScratchDir(extURI: URI, hadoopConf: Configuration): Path = { -getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), hadoopConf) + private def getExternalScratchDir(extURI: URI): Path = { +getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath)) } - def getExternalTmpPath(path: Path, hadoopConf: Configuration): Path = { + def getExternalTmpPath(path: Path): Path = { +import org.apache.spark.sql.hive.client.hive._ + +val hiveVersion = externalCatalog.asInstanceOf[HiveExternalCatalog].client.version +// Before Hive 1.1, when inserting into a table, Hive will create the staging directory under +// a common scratch directory. After the writing is finished, Hive will simply empty the table +// directory and move the staging directory to it. +// After Hive 1.1, Hive will
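The diff is cut off above, but the control flow it introduces is a two-way branch on the Hive client version: pre-1.1 Hive wipes the whole table directory on load, so the staging directory has to live under the shared scratch directory, while 1.1+ excludes the staging prefix when emptying the table directory, so staging can sit inside it. A hedged sketch of that decision; `chooseStagingParent` and the version strings are illustrative, not the real `InsertIntoHiveTable` code (which compares parsed client versions):

```scala
object StagingDirSketch {
  // Client versions that still need the old layout (staging under scratchDir).
  private val preHive11 = Set("0.12", "0.13", "0.14", "1.0")

  def chooseStagingParent(hiveVersion: String, tableDir: String, scratchDir: String): String =
    if (preHive11.contains(hiveVersion)) scratchDir else tableDir

  def main(args: Array[String]): Unit = {
    println(chooseStagingParent("0.13", "/warehouse/t", "/tmp/hive")) // /tmp/hive
    println(chooseStagingParent("1.2",  "/warehouse/t", "/tmp/hive")) // /warehouse/t
  }
}
```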
spark git commit: [MINOR][CORE][SQL] Remove explicit RDD and Partition overrides
Repository: spark Updated Branches: refs/heads/master 46d30ac48 -> 096f868b7 [MINOR][CORE][SQL] Remove explicit RDD and Partition overrides ## What changes were proposed in this pull request? I **believe** that I _only_ removed duplicated code (that adds nothing but noise). I'm gonna remove the comment after Jenkins has built the changes with no issues and Spark devs has agreed to include the changes. Remove explicit `RDD` and `Partition` overrides (that turn out code duplication) ## How was this patch tested? Local build. Awaiting Jenkins. â¦cation) Author: Jacek LaskowskiCloses #16145 from jaceklaskowski/rdd-overrides-removed. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/096f868b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/096f868b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/096f868b Branch: refs/heads/master Commit: 096f868b74d01c3dfc8f09e1e7dfc0ebab65226f Parents: 46d30ac Author: Jacek Laskowski Authored: Tue Dec 13 09:40:16 2016 + Committer: Sean Owen Committed: Tue Dec 13 09:40:16 2016 + -- core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala | 4 .../scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala| 4 2 files changed, 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/096f868b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala index 29d5d74..26eaa9a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala @@ -25,10 +25,6 @@ import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx - - override def hashCode(): Int = index - - override def equals(other: Any): Boolean = super.equals(other) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/096f868b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index 5f0c264..862ee05 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -33,10 +33,6 @@ private final class ShuffledRowRDDPartition( val startPreShufflePartitionIndex: Int, val endPreShufflePartitionIndex: Int) extends Partition { override val index: Int = postShufflePartitionIndex - - override def hashCode(): Int = postShufflePartitionIndex - - override def equals(other: Any): Boolean = super.equals(other) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
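The removed members above were redundant because the base `org.apache.spark.Partition` trait (as of this commit) already defines `hashCode` in terms of `index` and keeps reference-based equality, so re-declaring the same members in each concrete partition class changed nothing. A tiny illustration with a local stand-in trait that mirrors that contract, not the Spark trait itself:

```scala
object PartitionOverrideSketch {
  // Local stand-in mirroring the base trait's contract.
  trait Partition extends Serializable {
    def index: Int
    override def hashCode(): Int = index
    override def equals(other: Any): Boolean = super.equals(other)
  }

  // Post-cleanup shape: no re-declared hashCode/equals, behaviour unchanged.
  final class ShuffledPartition(val idx: Int) extends Partition {
    override val index: Int = idx
  }

  def main(args: Array[String]): Unit = {
    val a = new ShuffledPartition(3)
    val b = new ShuffledPartition(3)
    println(a.hashCode == b.hashCode) // true: both hash to the index
    println(a == b)                   // false: equality is still by reference
  }
}
```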