spark git commit: [SPARK-17183][SPARK-17983][SPARK-18101][SQL] put hive serde table schema to table properties like data source table
Repository: spark Updated Branches: refs/heads/master 6e2701815 -> 95ec4e25b [SPARK-17183][SPARK-17983][SPARK-18101][SQL] put hive serde table schema to table properties like data source table ## What changes were proposed in this pull request? For data source tables, we will put its table schema, partition columns, etc. to table properties, to work around some hive metastore issues, e.g. not case-preserving, bad decimal type support, etc. We should also do this for hive serde tables, to reduce the difference between hive serde tables and data source tables, e.g. column names should be case preserving. ## How was this patch tested? existing tests, and a new test in `HiveExternalCatalog` Author: Wenchen Fan Closes #14750 from cloud-fan/minor1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95ec4e25 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95ec4e25 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95ec4e25 Branch: refs/heads/master Commit: 95ec4e25bb65f37f80222ffe70a95993a9149f80 Parents: 6e27018 Author: Wenchen Fan Authored: Sat Nov 5 00:58:50 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:58:50 2016 -0700 -- .../sql/catalyst/catalog/ExternalCatalog.scala | 8 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 6 - .../org/apache/spark/sql/types/DataType.scala | 24 ++ .../catalyst/catalog/ExternalCatalogSuite.scala | 20 ++ .../org/apache/spark/sql/DataFrameWriter.scala | 10 +- .../spark/sql/execution/SparkSqlParser.scala| 4 +- .../spark/sql/execution/SparkStrategies.scala | 6 +- .../spark/sql/execution/command/ddl.scala | 4 +- .../spark/sql/execution/datasources/rules.scala | 5 +- .../spark/sql/hive/HiveExternalCatalog.scala| 218 ++- .../input1-2-d3aa54d5436b7b59ff5c7091b7ca6145 | 4 +- .../input2-1-e0efeda558cd0194f4764a5735147b16 | 4 +- .../input2-2-aa9ab0598e0cb7a12c719f9b3d98dbfd | 4 +- .../input2-4-235f92683416fab031e6e7490487b15b | 6 +- ...w_columns-2-b74990316ec4245fd8a7011e684b39da | 6 +- .../hive/PartitionedTablePerfStatsSuite.scala | 9 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 17 files changed, 245 insertions(+), 97 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95ec4e25/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index a5e0252..14dd707 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.catalog -import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException} +import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException, NoSuchTableException} import org.apache.spark.sql.catalyst.expressions.Expression @@ -39,6 +39,12 @@ abstract class ExternalCatalog { } } + protected def requireTableExists(db: String, table: String): Unit = { +if (!tableExists(db, table)) { + throw new NoSuchTableException(db = db, table = table) +} + } + protected def requireFunctionExists(db: String, funcName: String): Unit = { if (!functionExists(db, funcName)) { throw new NoSuchFunctionException(db = db, func = funcName) 
http://git-wip-us.apache.org/repos/asf/spark/blob/95ec4e25/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index ea675b7..bc39688 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -64,12 +64,6 @@ class InMemoryCatalog( catalog(db).tables(table).partitions.contains(spec) } - private def requireTableExists(db: String, table: String): Unit = { -if (!tableExists(db, table)) { - throw new NoSuchTableException(db = db, table = table) -} - } - private def requireTableNotExists(db: String, table: String): Unit = { if (tableExists(db, table)) { throw new TableAlreadyE
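A minimal sketch of the mechanism this commit relies on: a Spark SQL schema can be round-tripped through string-valued table properties with column-name case intact, which the Hive metastore alone would lose. The property key below is illustrative, not necessarily the exact key `HiveExternalCatalog` uses.

```
import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("userId", LongType),
  StructField("eventTime", TimestampType)))

// Serialize the schema into a table property, then restore it.
val props = Map("spark.sql.sources.schema" -> schema.json)
val restored = DataType.fromJson(props("spark.sql.sources.schema"))
  .asInstanceOf[StructType]

assert(restored == schema)  // "userId" keeps its case; Hive would lowercase it
```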
spark git commit: [SPARK-18260] Make from_json null safe
Repository: spark Updated Branches: refs/heads/master 8a9ca1924 -> 6e2701815 [SPARK-18260] Make from_json null safe ## What changes were proposed in this pull request? `from_json` is currently not safe against `null` rows. This PR adds a fix and a regression test for it. ## How was this patch tested? Regression test Author: Burak Yavuz Closes #15771 from brkyvz/json_fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e270181 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e270181 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e270181 Branch: refs/heads/master Commit: 6e2701815761d5870111cb56300e30d3059b39ed Parents: 8a9ca19 Author: Burak Yavuz Authored: Sat Nov 5 00:07:51 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:07:51 2016 -0700 -- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 +++- .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 8 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e270181/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index e034735..89fe7c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -498,7 +498,9 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: override def children: Seq[Expression] = child :: Nil override def eval(input: InternalRow): Any = { -try parser.parse(child.eval(input).toString).head catch { +val json = child.eval(input) +if (json == null) return null +try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } } http://git-wip-us.apache.org/repos/asf/spark/blob/6e270181/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f9db649..3bfa0bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -344,6 +344,14 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + test("from_json null input column") { +val schema = StructType(StructField("a", IntegerType) :: Nil) +checkEvaluation( + JsonToStruct(schema, Map.empty, Literal(null)), + null +) + } + test("to_json") { val schema = StructType(StructField("a", IntegerType) :: Nil) val struct = Literal.create(create_row(1), schema) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18260] Make from_json null safe
Repository: spark Updated Branches: refs/heads/branch-2.1 707630147 -> 42386e796 [SPARK-18260] Make from_json null safe ## What changes were proposed in this pull request? `from_json` is currently not safe against `null` rows. This PR adds a fix and a regression test for it. ## How was this patch tested? Regression test Author: Burak Yavuz Closes #15771 from brkyvz/json_fix. (cherry picked from commit 6e2701815761d5870111cb56300e30d3059b39ed) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42386e79 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42386e79 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42386e79 Branch: refs/heads/branch-2.1 Commit: 42386e796f6519d22092fba88a8c42cba6511d7c Parents: 7076301 Author: Burak Yavuz Authored: Sat Nov 5 00:07:51 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:08:00 2016 -0700 -- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 +++- .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 8 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42386e79/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index e034735..89fe7c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -498,7 +498,9 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: override def children: Seq[Expression] = child :: Nil override def eval(input: InternalRow): Any = { -try parser.parse(child.eval(input).toString).head catch { +val json = child.eval(input) +if (json == null) return null +try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } } http://git-wip-us.apache.org/repos/asf/spark/blob/42386e79/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f9db649..3bfa0bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -344,6 +344,14 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + test("from_json null input column") { +val schema = StructType(StructField("a", IntegerType) :: Nil) +checkEvaluation( + JsonToStruct(schema, Map.empty, Literal(null)), + null +) + } + test("to_json") { val schema = StructType(StructField("a", IntegerType) :: Nil) val struct = Literal.create(create_row(1), schema) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
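A usage sketch of the fixed behavior, assuming a SparkSession named `spark`: a null value in the input column now yields a null result instead of a NullPointerException from calling `toString` on null.

```
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{IntegerType, StructType}
import spark.implicits._

val schema = new StructType().add("a", IntegerType)
val df = Seq("""{"a": 1}""", null).toDF("json")

// First row parses to a struct [1]; the null row maps to null, not an NPE.
df.select(from_json($"json", schema)).collect()
```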
spark git commit: [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used
Repository: spark Updated Branches: refs/heads/branch-2.1 491db67a5 -> 707630147 [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used ## What changes were proposed in this pull request? Add comments. ## How was this patch tested? Build passed. Author: Weiqing Yang Closes #15776 from weiqingy/SPARK-17710. (cherry picked from commit 8a9ca1924792d1a7c733bdfd757996b3ade0d63d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70763014 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70763014 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70763014 Branch: refs/heads/branch-2.1 Commit: 707630147e51114aa90f58f375df43bb2b5f7fb4 Parents: 491db67 Author: Weiqing Yang Authored: Fri Nov 4 23:44:46 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:44:53 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70763014/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 22c28fb..1de66af 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2539,6 +2539,8 @@ private[util] object CallerContext extends Logging { val callerContextSupported: Boolean = { SparkHadoopUtil.get.conf.getBoolean("hadoop.caller.context.enabled", false) && { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname Class.forName("org.apache.hadoop.ipc.CallerContext") Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") @@ -2604,6 +2606,8 @@ private[spark] class CallerContext( def setCurrentContext(): Unit = { if (CallerContext.callerContextSupported) { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") val builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used
Repository: spark Updated Branches: refs/heads/master 0f7c9e84e -> 8a9ca1924 [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used ## What changes were proposed in this pull request? Add comments. ## How was this patch tested? Build passed. Author: Weiqing Yang Closes #15776 from weiqingy/SPARK-17710. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a9ca192 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a9ca192 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a9ca192 Branch: refs/heads/master Commit: 8a9ca1924792d1a7c733bdfd757996b3ade0d63d Parents: 0f7c9e8 Author: Weiqing Yang Authored: Fri Nov 4 23:44:46 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:44:46 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a9ca192/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 22c28fb..1de66af 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2539,6 +2539,8 @@ private[util] object CallerContext extends Logging { val callerContextSupported: Boolean = { SparkHadoopUtil.get.conf.getBoolean("hadoop.caller.context.enabled", false) && { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname Class.forName("org.apache.hadoop.ipc.CallerContext") Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") @@ -2604,6 +2606,8 @@ private[spark] class CallerContext( def setCurrentContext(): Unit = { if (CallerContext.callerContextSupported) { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") val builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
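For reference, the reflective probe isolated as a standalone sketch: it detects an optional Hadoop class at runtime without a compile-time dependency. `Class.forName` is used directly because `Utils.classForName` trips the `ClassCircularityError` described in SPARK-17714.

```
val callerContextSupported: Boolean =
  try {
    // Present only on newer Hadoop versions, so probe reflectively.
    Class.forName("org.apache.hadoop.ipc.CallerContext")
    Class.forName("org.apache.hadoop.ipc.CallerContext$Builder")
    true
  } catch {
    case _: ClassNotFoundException | _: NoClassDefFoundError => false
  }
```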
spark git commit: [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/branch-2.1 0a303a694 -> 491db67a5 [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError closes #15774 (cherry picked from commit 0f7c9e84e0d00813bf56712097677add5657f19f) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/491db67a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/491db67a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/491db67a Branch: refs/heads/branch-2.1 Commit: 491db67a5fd067ef5e767ac4a07144722302d95a Parents: 0a303a6 Author: Reynold Xin Authored: Fri Nov 4 23:34:29 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:35:04 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 12 2 files changed, 12 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/491db67a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 96d2dfc..9262e93 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/491db67a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 55f0487..6fa7b04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -923,6 +923,18 @@ class DatasetSuite extends QueryTest with SharedSQLContext { .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } + Seq(true, false).foreach { eager => def testCheckpointing(testName: String)(f: => Unit): Unit = { test(s"Dataset.checkpoint() - $testName (eager = $eager)") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/master 0e3312ee7 -> 0f7c9e84e [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError closes #15774 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f7c9e84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f7c9e84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f7c9e84 Branch: refs/heads/master Commit: 0f7c9e84e0d00813bf56712097677add5657f19f Parents: 0e3312e Author: Reynold Xin Authored: Fri Nov 4 23:34:29 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:34:29 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 12 2 files changed, 12 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f7c9e84/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 96d2dfc..9262e93 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/0f7c9e84/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 55f0487..6fa7b04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -923,6 +923,18 @@ class DatasetSuite extends QueryTest with SharedSQLContext { .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } + Seq(true, false).foreach { eager => def testCheckpointing(testName: String)(f: => Unit): Unit = { test(s"Dataset.checkpoint() - $testName (eager = $eager)") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18189][SQL][FOLLOWUP] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/branch-2.0 399597b04 -> 8b99e204a [SPARK-18189][SQL][FOLLOWUP] Move test from ReplSuite to prevent java.lang.ClassCircularityError ## What changes were proposed in this pull request? Move the test which is causing java.lang.ClassCircularityError from ReplSuite to DatasetSuite. ## How was this patch tested? > build/mvn -DskipTests -Phadoop-2.3 -Pyarn -Phive -Phive-thriftserver > -Pkinesis-asl -Pmesos clean package > build/mvn -Dtest=none -DwildcardSuites=org.apache.spark.repl.ReplSuite test Author: Ergin Seyfe Closes #15774 from seyfe/fix_replsuite_test_error_branch2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b99e204 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b99e204 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b99e204 Branch: refs/heads/branch-2.0 Commit: 8b99e204a9a056fd071f9bd75f3e0a29f90bccc0 Parents: 399597b Author: Ergin Seyfe Authored: Fri Nov 4 23:29:20 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:29:20 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 13 + 2 files changed, 13 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8b99e204/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 8deafe3..f7d7a4f 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/8b99e204/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index f897cfb..6113e5d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -882,6 +882,19 @@ class DatasetSuite extends QueryTest with SharedSQLContext { df.withColumn("b", expr("0")).as[ClassData] .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + + // This is moved from ReplSuite to prevent java.lang.ClassCircularityError. 
+ test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } } case class Generic[T](id: T, value: Double) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17337][SPARK-16804][SQL][BRANCH-2.0] Backport subquery related PRs
Repository: spark Updated Branches: refs/heads/branch-2.0 c864e8a80 -> 399597b04 [SPARK-17337][SPARK-16804][SQL][BRANCH-2.0] Backport subquery related PRs ## What changes were proposed in this pull request? This PR backports two subquery related PRs to branch-2.0: - https://github.com/apache/spark/pull/14411 - https://github.com/apache/spark/pull/15761 ## How was this patch tested? Added a tests to `SubquerySuite`. Author: Nattavut Sutyanyong Author: Herman van Hovell Closes #15772 from hvanhovell/SPARK-17337-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/399597b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/399597b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/399597b0 Branch: refs/heads/branch-2.0 Commit: 399597b04a83bbe3cc748c21446de0d808d08155 Parents: c864e8a Author: Herman van Hovell Authored: Fri Nov 4 15:54:58 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 15:54:58 2016 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 13 ++ .../sql/catalyst/optimizer/Optimizer.scala | 16 ++- .../catalyst/analysis/AnalysisErrorSuite.scala | 17 .../org/apache/spark/sql/SubquerySuite.scala| 44 4 files changed, 89 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 617f3e0..6332f92 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1025,6 +1025,19 @@ class Analyzer( case e: Expand => failOnOuterReferenceInSubTree(e, "an EXPAND") e +case l : LocalLimit => + failOnOuterReferenceInSubTree(l, "a LIMIT") + l +// Since LIMIT is represented as GlobalLimit(, (LocalLimit (, child)) +// and we are walking bottom up, we will fail on LocalLimit before +// reaching GlobalLimit. +// The code below is just a safety net. +case g : GlobalLimit => + failOnOuterReferenceInSubTree(g, "a LIMIT") + g +case s : Sample => + failOnOuterReferenceInSubTree(s, "a TABLESAMPLE") + s case p => failOnOuterReference(p) p http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 4c06038..f0992b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1020,7 +1020,7 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // state and all the input rows processed before. In another word, the order of input rows // matters for non-deterministic expressions, while pushing down predicates changes the order. case filter @ Filter(condition, project @ Project(fields, grandChild)) - if fields.forall(_.deterministic) => + if fields.forall(_.deterministic) && canPushThroughCondition(grandChild, condition) => // Create a map of Aliases to their values from the child projection. // e.g., 'SELECT a + b AS c, d ...' produces Map(c -> a + b). 
@@ -1161,6 +1161,20 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper {
       filter
     }
   }
+
+  /**
+   * Check if we can safely push a filter through a projection, by making sure that predicate
+   * subqueries in the condition do not contain the same attributes as the plan they are moved
+   * into. This can happen when the plan and predicate subquery have the same source.
+   */
+  private def canPushThroughCondition(plan: LogicalPlan, condition: Expression): Boolean = {
+    val attributes = plan.outputSet
+    val matched = condition.find {
+      case PredicateSubquery(p, _, _, _) => p.outputSet.intersect(attributes).nonEmpty
+      case _ => false
+    }
+    matched.isEmpty
+  }
 }

 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
--
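A hedged sketch of the two behaviors these backports change, assuming a SparkSession `spark` with hypothetical tables t1(a INT) and t2(a INT) registered:

```
// 1. A correlated subquery under a LIMIT (or TABLESAMPLE) is now rejected
//    at analysis time with an AnalysisException instead of being planned:
// spark.sql(
//   "SELECT * FROM t1 WHERE a IN (SELECT a FROM t2 WHERE t2.a = t1.a LIMIT 1)")

// 2. With canPushThroughCondition, a filter whose predicate subquery shares
//    its source with the plan under the projection is no longer pushed
//    through the Project, avoiding an incorrect self-referencing plan:
spark.sql("SELECT b FROM (SELECT a AS b FROM t1) v WHERE b IN (SELECT a FROM t1)")
```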
spark git commit: [SPARK-18197][CORE] Optimise AppendOnlyMap implementation
Repository: spark Updated Branches: refs/heads/branch-2.1 cfe76028b -> a2d7e25e7 [SPARK-18197][CORE] Optimise AppendOnlyMap implementation ## What changes were proposed in this pull request? This improvement works by using the fastest comparison test first and we observed a 1% throughput performance improvement on PageRank (HiBench large profile) with this change. We used tprof and before the change in AppendOnlyMap.changeValue (where the optimisation occurs) this method was being used for 8053 profiling ticks representing 0.72% of the overall application time. After this change we observed this method only occurring for 2786 ticks and for 0.25% of the overall time. ## How was this patch tested? Existing unit tests and for performance we used HiBench large, profiling with tprof and IBM Healthcenter. Author: Adam Roberts Closes #15714 from a-roberts/patch-9. (cherry picked from commit a42d738c5de08bd395a7c220c487146173c6c163) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2d7e25e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2d7e25e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2d7e25e Branch: refs/heads/branch-2.1 Commit: a2d7e25e7c85ce17c8ceac5e1806afe96d3acc14 Parents: cfe7602 Author: Adam Roberts Authored: Fri Nov 4 12:06:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 12:06:12 2016 -0700 -- .../org/apache/spark/util/collection/AppendOnlyMap.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2d7e25e/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala index 6b74a29..bcb95b4 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala @@ -140,16 +140,16 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) var i = 1 while (true) { val curKey = data(2 * pos) - if (k.eq(curKey) || k.equals(curKey)) { -val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) -data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] -return newValue - } else if (curKey.eq(null)) { + if (curKey.eq(null)) { val newValue = updateFunc(false, null.asInstanceOf[V]) data(2 * pos) = k data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] incrementSize() return newValue + } else if (k.eq(curKey) || k.equals(curKey)) { +val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) +data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] +return newValue } else { val delta = i pos = (pos + delta) & mask - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18197][CORE] Optimise AppendOnlyMap implementation
Repository: spark Updated Branches: refs/heads/master 14f235d56 -> a42d738c5 [SPARK-18197][CORE] Optimise AppendOnlyMap implementation ## What changes were proposed in this pull request? This improvement works by using the fastest comparison test first and we observed a 1% throughput performance improvement on PageRank (HiBench large profile) with this change. We used tprof and before the change in AppendOnlyMap.changeValue (where the optimisation occurs) this method was being used for 8053 profiling ticks representing 0.72% of the overall application time. After this change we observed this method only occurring for 2786 ticks and for 0.25% of the overall time. ## How was this patch tested? Existing unit tests and for performance we used HiBench large, profiling with tprof and IBM Healthcenter. Author: Adam Roberts Closes #15714 from a-roberts/patch-9. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a42d738c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a42d738c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a42d738c Branch: refs/heads/master Commit: a42d738c5de08bd395a7c220c487146173c6c163 Parents: 14f235d Author: Adam Roberts Authored: Fri Nov 4 12:06:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 12:06:06 2016 -0700 -- .../org/apache/spark/util/collection/AppendOnlyMap.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a42d738c/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala index 6b74a29..bcb95b4 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala @@ -140,16 +140,16 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) var i = 1 while (true) { val curKey = data(2 * pos) - if (k.eq(curKey) || k.equals(curKey)) { -val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) -data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] -return newValue - } else if (curKey.eq(null)) { + if (curKey.eq(null)) { val newValue = updateFunc(false, null.asInstanceOf[V]) data(2 * pos) = k data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] incrementSize() return newValue + } else if (k.eq(curKey) || k.equals(curKey)) { +val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) +data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] +return newValue } else { val delta = i pos = (pos + delta) & mask - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
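A self-contained sketch of the reordered probe (not the Spark class itself): testing the cheap `eq null` empty-slot case before the potentially costly `equals` favors the common path, consistent with the drop in profiling ticks reported above.

```
// Lookup in an open-addressed table laid out as [key0, value0, key1, ...],
// with quadratic probing as in AppendOnlyMap. Returns the slot index or -1.
def probe(data: Array[AnyRef], mask: Int, k: AnyRef): Int = {
  var pos = k.hashCode & mask
  var i = 1
  while (true) {
    val curKey = data(2 * pos)
    if (curKey eq null) {
      return -1                                    // empty slot: key is absent
    } else if ((k eq curKey) || k.equals(curKey)) {
      return pos                                   // found the key
    } else {
      pos = (pos + i) & mask                       // quadratic probing
      i += 1
    }
  }
  -1 // unreachable
}
```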
spark git commit: Closing some stale/invalid pull requests
Repository: spark Updated Branches: refs/heads/master 27602c337 -> 14f235d56 Closing some stale/invalid pull requests Closes #15758 Closes #15753 Closes #12708 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14f235d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14f235d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14f235d5 Branch: refs/heads/master Commit: 14f235d5643bca75e270652c15154d86e57a7a70 Parents: 27602c3 Author: Reynold Xin Authored: Fri Nov 4 01:27:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 01:27:06 2016 -0700 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.0 dae1581d9 -> c864e8a80 [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. (cherry picked from commit 27602c33751cebf6cd173c0de103454608cf6625) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c864e8a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c864e8a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c864e8a8 Branch: refs/heads/branch-2.0 Commit: c864e8a8020f4890f1839766851e7f4917da5c70 Parents: dae1581 Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:17:15 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c864e8a8/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.1 8e145a94b -> cfe76028b [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. (cherry picked from commit 27602c33751cebf6cd173c0de103454608cf6625) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cfe76028 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cfe76028 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cfe76028 Branch: refs/heads/branch-2.1 Commit: cfe76028bb116d72eab6601bff3b2a1856597370 Parents: 8e145a9 Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:17:07 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cfe76028/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/master a08463b1d -> 27602c337 [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27602c33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27602c33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27602c33 Branch: refs/heads/master Commit: 27602c33751cebf6cd173c0de103454608cf6625 Parents: a08463b Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:15:33 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27602c33/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
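The corrected helper, isolated with a quick sanity check matching the table in the commit message:

```
def nextPowerOf2(n: Int): Int =
  if (n == 0) {
    1
  } else {
    val highBit = Integer.highestOneBit(n)
    if (highBit == n) n else highBit << 1
  }

// 0 -> 1 (was 2 before this fix), 1 -> 1, 2 -> 2, 3 -> 4, 4 -> 4, 5 -> 8
assert((0 to 5).map(nextPowerOf2) == Seq(1, 1, 2, 4, 4, 8))
```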
spark git commit: [SPARK-18259][SQL] Do not capture Throwable in QueryExecution
Repository: spark Updated Branches: refs/heads/branch-2.1 37550c492 -> 91d567150 [SPARK-18259][SQL] Do not capture Throwable in QueryExecution ## What changes were proposed in this pull request? `QueryExecution.toString` currently captures `java.lang.Throwable`s; this is far from a best practice and can lead to confusing situation or invalid application states. This PR fixes this by only capturing `AnalysisException`s. ## How was this patch tested? Added a `QueryExecutionSuite`. Author: Herman van Hovell Closes #15760 from hvanhovell/SPARK-18259. (cherry picked from commit aa412c55e31e61419d3de57ef4b13e50f9b38af0) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91d56715 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91d56715 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91d56715 Branch: refs/heads/branch-2.1 Commit: 91d567150b305d05acb8543da5cbf21df244352d Parents: 37550c4 Author: Herman van Hovell Authored: Thu Nov 3 21:59:59 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 22:00:23 2016 -0700 -- .../spark/sql/execution/QueryExecution.scala| 2 +- .../sql/execution/QueryExecutionSuite.scala | 50 2 files changed, 51 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91d56715/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index cb45a6d..b3ef29f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -104,7 +104,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { ReuseSubquery(sparkSession.sessionState.conf)) protected def stringOrError[A](f: => A): String = -try f.toString catch { case e: Throwable => e.toString } +try f.toString catch { case e: AnalysisException => e.toString } /** http://git-wip-us.apache.org/repos/asf/spark/blob/91d56715/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala new file mode 100644 index 000..8bceab3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.test.SharedSQLContext
+
+class QueryExecutionSuite extends SharedSQLContext {
+  test("toString() exception/error handling") {
+    val badRule = new SparkStrategy {
+      var mode: String = ""
+      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
+        case "exception" => throw new AnalysisException(mode)
+        case "error" => throw new Error(mode)
+        case _ => Nil
+      }
+    }
+    spark.experimental.extraStrategies = badRule :: Nil
+
+    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)
+
+    // Nothing!
+    badRule.mode = ""
+    assert(qe.toString.contains("OneRowRelation"))
+
+    // Throw an AnalysisException - this should be captured.
+    badRule.mode = "exception"
+    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))
+
+    // Throw an Error - this should not be captured.
+    badRule.mode = "error"
+    val error = intercept[Error](qe.toString)
+    assert(error.getMessage.contains("error"))
+  }
+}
--
spark git commit: [SPARK-18259][SQL] Do not capture Throwable in QueryExecution
Repository: spark Updated Branches: refs/heads/master dc4c60098 -> aa412c55e [SPARK-18259][SQL] Do not capture Throwable in QueryExecution ## What changes were proposed in this pull request? `QueryExecution.toString` currently captures `java.lang.Throwable`s; this is far from a best practice and can lead to confusing situation or invalid application states. This PR fixes this by only capturing `AnalysisException`s. ## How was this patch tested? Added a `QueryExecutionSuite`. Author: Herman van Hovell Closes #15760 from hvanhovell/SPARK-18259. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aa412c55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aa412c55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aa412c55 Branch: refs/heads/master Commit: aa412c55e31e61419d3de57ef4b13e50f9b38af0 Parents: dc4c600 Author: Herman van Hovell Authored: Thu Nov 3 21:59:59 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 21:59:59 2016 -0700 -- .../spark/sql/execution/QueryExecution.scala| 2 +- .../sql/execution/QueryExecutionSuite.scala | 50 2 files changed, 51 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aa412c55/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index cb45a6d..b3ef29f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -104,7 +104,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { ReuseSubquery(sparkSession.sessionState.conf)) protected def stringOrError[A](f: => A): String = -try f.toString catch { case e: Throwable => e.toString } +try f.toString catch { case e: AnalysisException => e.toString } /** http://git-wip-us.apache.org/repos/asf/spark/blob/aa412c55/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala new file mode 100644 index 000..8bceab3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.test.SharedSQLContext
+
+class QueryExecutionSuite extends SharedSQLContext {
+  test("toString() exception/error handling") {
+    val badRule = new SparkStrategy {
+      var mode: String = ""
+      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
+        case "exception" => throw new AnalysisException(mode)
+        case "error" => throw new Error(mode)
+        case _ => Nil
+      }
+    }
+    spark.experimental.extraStrategies = badRule :: Nil
+
+    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)
+
+    // Nothing!
+    badRule.mode = ""
+    assert(qe.toString.contains("OneRowRelation"))
+
+    // Throw an AnalysisException - this should be captured.
+    badRule.mode = "exception"
+    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))
+
+    // Throw an Error - this should not be captured.
+    badRule.mode = "error"
+    val error = intercept[Error](qe.toString)
+    assert(error.getMessage.contains("error"))
+  }
+}

-
To unsubscribe, e-mail: commits-unsubscr..
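A standalone analogue of why the narrowing matters: the old handler folded even fatal errors into the plan string, while the fixed one lets them propagate.

```
import org.apache.spark.sql.AnalysisException

def old[A](f: => A): String =
  try f.toString catch { case e: Throwable => e.toString }

def fixed[A](f: => A): String =
  try f.toString catch { case e: AnalysisException => e.toString }

old(throw new OutOfMemoryError("heap"))    // swallowed: returns the message
// fixed(throw new OutOfMemoryError("heap")) would now throw, as it should
```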
spark git commit: [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0
Repository: spark Updated Branches: refs/heads/branch-2.1 af60b1ebb -> 37550c492 [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0 ## What changes were proposed in this pull request? Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0. This does not actually implement any of the change in SPARK-18138, just peppers the documentation with notices about it. ## How was this patch tested? Doc build Author: Sean Owen Closes #15733 from srowen/SPARK-18138. (cherry picked from commit dc4c60098641cf64007e2f0e36378f000ad5f6b1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37550c49 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37550c49 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37550c49 Branch: refs/heads/branch-2.1 Commit: 37550c49218e1890f8adc10c9549a23dc072e21f Parents: af60b1e Author: Sean Owen Authored: Thu Nov 3 17:27:23 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 17:27:44 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 12 docs/building-spark.md | 6 ++ docs/index.md | 4 docs/programming-guide.md | 4 python/pyspark/context.py | 4 5 files changed, 30 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 63478c8..9f0f607 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging { // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") + warnDeprecatedVersions() + /* - * | Private variables. These variables keep the internal state of the context, and are| | not accessible by the outside world. They're mutable since we want to initialize all | @@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging { value } + private def warnDeprecatedVersions(): Unit = { +val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3) +if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) { + logWarning("Support for Java 7 is deprecated as of Spark 2.0.0") +} +if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) { + logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0") +} + } + /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index ebe46a4..2b404bd 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -13,6 +13,7 @@ redirect_from: "building-with-maven.html" The Maven-based build is the build of reference for Apache Spark. Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. +Note that support for Java 7 is deprecated as of Spark 2.0.0 and may be removed in Spark 2.2.0. 
### Setting up Maven's Memory Usage @@ -79,6 +80,9 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro +Note that support for versions of Hadoop before 2.6 is deprecated as of Spark 2.1.0 and may be +removed in Spark 2.2.0. + You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later. @@ -129,6 +133,8 @@ To produce a Spark package compiled with Scala 2.10, use the `-Dscala-2.10` prop ./dev/change-scala-version.sh 2.10 ./build/mvn -Pyarn -Phadoop-2.4 -Dscala-2.10 -DskipTests clean package + +Note that support for Scala 2.10 is deprecated as of Spark 2.1.0 and may be removed in Spark 2.2.0. ## Building submodules individually http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index
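To see what the new warnDeprecatedVersions() check keys on, here is a small standalone illustration (the version strings are assumed examples; the split pattern is copied from the diff above): java.version values such as 1.7.0_80 split on the regex so that index 1 holds the major version for Java 7/8-era strings.

// Scala sketch, runnable in a REPL; inputs are illustrative.
Seq("1.7.0_80", "1.8.0_102").foreach { v =>
  val parts = v.split("[+.\\-]+", 3)  // e.g. Array("1", "7", "0_80")
  if (parts.length >= 2 && parts(1).toInt == 7) {
    println(s"$v -> Support for Java 7 is deprecated as of Spark 2.0.0")
  }
}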
spark git commit: [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0
Repository: spark Updated Branches: refs/heads/master f22954ad4 -> dc4c60098 [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0 ## What changes were proposed in this pull request? Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0. This does not actually implement any of the changes in SPARK-18138; it just peppers the documentation with notices about them. ## How was this patch tested? Doc build Author: Sean Owen Closes #15733 from srowen/SPARK-18138. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c6009 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c6009 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c6009 Branch: refs/heads/master Commit: dc4c60098641cf64007e2f0e36378f000ad5f6b1 Parents: f22954a Author: Sean Owen Authored: Thu Nov 3 17:27:23 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 17:27:23 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 12 docs/building-spark.md | 6 ++ docs/index.md | 4 docs/programming-guide.md | 4 python/pyspark/context.py | 4 5 files changed, 30 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 63478c8..9f0f607 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging { // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") + warnDeprecatedVersions() + /* - * | Private variables. These variables keep the internal state of the context, and are| | not accessible by the outside world. They're mutable since we want to initialize all | @@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging { value } + private def warnDeprecatedVersions(): Unit = { +val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3) +if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) { + logWarning("Support for Java 7 is deprecated as of Spark 2.0.0") +} +if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) { + logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0") +} + } + /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index ebe46a4..2b404bd 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -13,6 +13,7 @@ redirect_from: "building-with-maven.html" The Maven-based build is the build of reference for Apache Spark. Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. +Note that support for Java 7 is deprecated as of Spark 2.0.0 and may be removed in Spark 2.2.0. ### Setting up Maven's Memory Usage @@ -79,6 +80,9 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro +Note that support for versions of Hadoop before 2.6 is deprecated as of Spark 2.1.0 and may be +removed in Spark 2.2.0.
+ You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later. @@ -129,6 +133,8 @@ To produce a Spark package compiled with Scala 2.10, use the `-Dscala-2.10` prop ./dev/change-scala-version.sh 2.10 ./build/mvn -Pyarn -Phadoop-2.4 -Dscala-2.10 -DskipTests clean package + +Note that support for Scala 2.10 is deprecated as of Spark 2.1.0 and may be removed in Spark 2.2.0. ## Building submodules individually http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index a7a92f6..fe51439 100644 --- a/docs/index.md +++ b/docs/index.md @@ -28,6 +28,10 @@ Spark runs on Java 7+,
spark git commit: [SPARK-18257][SS] Improve error reporting for FileStressSuite
Repository: spark Updated Branches: refs/heads/branch-2.1 2daca62cd -> af60b1ebb [SPARK-18257][SS] Improve error reporting for FileStressSuite ## What changes were proposed in this pull request? This patch improves error reporting for FileStressSuite, when there is an error in Spark itself (not user code). This works by simply tightening the exception verification, and gets rid of the unnecessary thread for starting the stream. Also renamed the class to FileStreamStressSuite to make it more obvious it is a streaming suite. ## How was this patch tested? This is a test only change and I manually verified error reporting by injecting some bug in the addBatch code for FileStreamSink. Author: Reynold Xin Closes #15757 from rxin/SPARK-18257. (cherry picked from commit f22954ad49bf5a32c7b6d8487cd38ffe0da904ca) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af60b1eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af60b1eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af60b1eb Branch: refs/heads/branch-2.1 Commit: af60b1ebbf5cb91dc724aad9d3d7476ce9085ac9 Parents: 2daca62 Author: Reynold Xin Authored: Thu Nov 3 15:30:45 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 15:30:55 2016 -0700 -- .../sql/streaming/FileStreamStressSuite.scala | 156 +++ .../spark/sql/streaming/FileStressSuite.scala | 153 -- 2 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af60b1eb/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala new file mode 100644 index 000..28412ea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.io.File +import java.util.UUID + +import scala.util.Random +import scala.util.control.NonFatal + +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.util.Utils + +/** + * A stress test for streaming queries that read and write files. This test consists of + * two threads: + * - one that writes out `numRecords` distinct integers to files of random sizes (the total + *number of records is fixed but each file's size / creation time is random). + * - another that continually restarts a buggy streaming query (i.e. fails with 5% probability on + *any partition). + * + * At the end, the resulting files are loaded and the answer is checked.
+ */ +class FileStreamStressSuite extends StreamTest { + import testImplicits._ + + // Error message thrown in the streaming job for testing recovery. + private val injectedErrorMsg = "test suite injected failure!" + + testQuietly("fault tolerance stress test - unpartitioned output") { +stressTest(partitionWrites = false) + } + + testQuietly("fault tolerance stress test - partitioned output") { +stressTest(partitionWrites = true) + } + + def stressTest(partitionWrites: Boolean): Unit = { +val numRecords = 1 +val inputDir = Utils.createTempDir(namePrefix = "stream.input").getCanonicalPath +val stagingDir = Utils.createTempDir(namePrefix = "stream.staging").getCanonicalPath +val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath +val checkpoint = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + +@volatile +var continue = true +@volatile +var stream: StreamingQuery = null + +val writer = new Thread("stream writer") { + override def run(): Unit = { +var
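To make the "buggy streaming query" concrete, here is a hedged sketch of the kind of per-record fault injection the suite performs (the names and placement are illustrative; the real suite raises the failure inside the query that writes the output files):

import scala.util.Random

// Fail roughly 5% of the time so that restarts exercise recovery.
val injectedErrorMsg = "test suite injected failure!"
def maybeFail[T](value: T): T = {
  if (Random.nextDouble() < 0.05) sys.error(injectedErrorMsg)
  value
}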
spark git commit: [SPARK-18257][SS] Improve error reporting for FileStressSuite
Repository: spark Updated Branches: refs/heads/master e89202523 -> f22954ad4 [SPARK-18257][SS] Improve error reporting for FileStressSuite ## What changes were proposed in this pull request? This patch improves error reporting for FileStressSuite, when there is an error in Spark itself (not user code). This works by simply tightening the exception verification, and gets rid of the unnecessary thread for starting the stream. Also renamed the class to FileStreamStressSuite to make it more obvious it is a streaming suite. ## How was this patch tested? This is a test only change and I manually verified error reporting by injecting some bug in the addBatch code for FileStreamSink. Author: Reynold Xin Closes #15757 from rxin/SPARK-18257. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f22954ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f22954ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f22954ad Branch: refs/heads/master Commit: f22954ad49bf5a32c7b6d8487cd38ffe0da904ca Parents: e892025 Author: Reynold Xin Authored: Thu Nov 3 15:30:45 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 15:30:45 2016 -0700 -- .../sql/streaming/FileStreamStressSuite.scala | 156 +++ .../spark/sql/streaming/FileStressSuite.scala | 153 -- 2 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f22954ad/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala new file mode 100644 index 000..28412ea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.io.File +import java.util.UUID + +import scala.util.Random +import scala.util.control.NonFatal + +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.util.Utils + +/** + * A stress test for streaming queries that read and write files. This test consists of + * two threads: + * - one that writes out `numRecords` distinct integers to files of random sizes (the total + *number of records is fixed but each file's size / creation time is random). + * - another that continually restarts a buggy streaming query (i.e. fails with 5% probability on + *any partition). + * + * At the end, the resulting files are loaded and the answer is checked. + */ +class FileStreamStressSuite extends StreamTest { + import testImplicits._ + + // Error message thrown in the streaming job for testing recovery.
+ private val injectedErrorMsg = "test suite injected failure!" + + testQuietly("fault tolerance stress test - unpartitioned output") { +stressTest(partitionWrites = false) + } + + testQuietly("fault tolerance stress test - partitioned output") { +stressTest(partitionWrites = true) + } + + def stressTest(partitionWrites: Boolean): Unit = { +val numRecords = 1 +val inputDir = Utils.createTempDir(namePrefix = "stream.input").getCanonicalPath +val stagingDir = Utils.createTempDir(namePrefix = "stream.staging").getCanonicalPath +val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath +val checkpoint = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + +@volatile +var continue = true +@volatile +var stream: StreamingQuery = null + +val writer = new Thread("stream writer") { + override def run(): Unit = { +var i = numRecords +while (i > 0) { + val count = Random.nextInt(100) + var j = 0 +
spark git commit: [SPARK-18237][HIVE] hive.exec.stagingdir has no effect
Repository: spark Updated Branches: refs/heads/master b17057c0a -> 16293311c [SPARK-18237][HIVE] hive.exec.stagingdir has no effect `hive.exec.stagingdir` has no effect in Spark 2.0.1. Hive confs in hive-site.xml will be loaded into `hadoopConf`, so we should use `hadoopConf` in `InsertIntoHiveTable` instead of `SessionState.conf` Author: 福星 Closes #15744 from ClassNotFoundExp/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16293311 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16293311 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16293311 Branch: refs/heads/master Commit: 16293311cdb25a62733a9aae4355659b971a3ce1 Parents: b17057c Author: 福星 Authored: Thu Nov 3 12:02:01 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 12:02:01 2016 -0700 -- .../apache/spark/sql/hive/execution/InsertIntoHiveTable.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16293311/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 15be12c..e333fc7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -76,7 +76,8 @@ case class InsertIntoHiveTable( def output: Seq[Attribute] = Seq.empty - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") + val hadoopConf = sessionState.newHadoopConf() + val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") private def executionId: String = { val rand: Random = new Random @@ -163,7 +164,6 @@ case class InsertIntoHiveTable( // instances within the closure, since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc val tableLocation = table.hiveQlTable.getDataLocation -val hadoopConf = sessionState.newHadoopConf() val tmpLocation = getExternalTmpPath(tableLocation, hadoopConf) val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean
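A brief usage sketch of the behavior being fixed (hedged: the staging path below is an arbitrary example, and this assumes a Hive-enabled build): because hive-site.xml entries and session confs are folded into the Hadoop configuration, reading `hive.exec.stagingdir` through `hadoopConf` is what makes the user's setting take effect.

import org.apache.spark.sql.SparkSession

// Illustrative only: set a custom staging dir and run an insert; with the
// fix, intermediate ".hive-staging" dirs land under /tmp/spark-staging.
val spark = SparkSession.builder()
  .appName("staging-dir-demo")
  .config("hive.exec.stagingdir", "/tmp/spark-staging/.hive-staging")
  .enableHiveSupport()
  .getOrCreate()
spark.sql("CREATE TABLE IF NOT EXISTS t (a INT)")
spark.sql("INSERT INTO t VALUES (1)")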
spark git commit: [SPARK-18237][HIVE] hive.exec.stagingdir has no effect
Repository: spark Updated Branches: refs/heads/branch-2.1 4f91630c8 -> 3e139e239 [SPARK-18237][HIVE] hive.exec.stagingdir has no effect `hive.exec.stagingdir` has no effect in Spark 2.0.1. Hive confs in hive-site.xml will be loaded into `hadoopConf`, so we should use `hadoopConf` in `InsertIntoHiveTable` instead of `SessionState.conf` Author: 福星 Closes #15744 from ClassNotFoundExp/master. (cherry picked from commit 16293311cdb25a62733a9aae4355659b971a3ce1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e139e23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e139e23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e139e23 Branch: refs/heads/branch-2.1 Commit: 3e139e2390085cfb42f7136f150b0fa08c14eb61 Parents: 4f91630 Author: 福星 Authored: Thu Nov 3 12:02:01 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 12:02:08 2016 -0700 -- .../apache/spark/sql/hive/execution/InsertIntoHiveTable.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e139e23/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 15be12c..e333fc7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -76,7 +76,8 @@ case class InsertIntoHiveTable( def output: Seq[Attribute] = Seq.empty - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") + val hadoopConf = sessionState.newHadoopConf() + val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") private def executionId: String = { val rand: Random = new Random @@ -163,7 +164,6 @@ case class InsertIntoHiveTable( // instances within the closure, since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc val tableLocation = table.hiveQlTable.getDataLocation -val hadoopConf = sessionState.newHadoopConf() val tmpLocation = getExternalTmpPath(tableLocation, hadoopConf) val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean
spark git commit: [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog
Repository: spark Updated Branches: refs/heads/branch-2.1 c2876bfbf -> 4f91630c8 [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog ## What changes were proposed in this pull request? This patch renames partitionProviderIsHive to tracksPartitionsInCatalog, as the old name was too Hive specific. ## How was this patch tested? Should be covered by existing tests. Author: Reynold Xin Closes #15750 from rxin/SPARK-18244. (cherry picked from commit b17057c0a69b9c56e503483d97f5dc209eef0884) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f91630c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f91630c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f91630c Branch: refs/heads/branch-2.1 Commit: 4f91630c8100ee3a6fd168bc4247ca6fadd0a736 Parents: c2876bf Author: Reynold Xin Authored: Thu Nov 3 11:48:05 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 11:48:17 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 9 + .../sql/catalyst/trees/TreeNodeSuite.scala | 2 +- .../command/createDataSourceTables.scala| 2 +- .../spark/sql/execution/command/ddl.scala | 4 ++-- .../spark/sql/execution/command/tables.scala| 2 +- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 7 --- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../spark/sql/execution/command/DDLSuite.scala | 2 +- .../spark/sql/hive/HiveExternalCatalog.scala| 21 10 files changed, 30 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f91630c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 7c3bec8..34748a0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -138,8 +138,9 @@ case class BucketSpec( * Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the *underlying table but not supported by Spark SQL yet. - * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive - *metastore. + * @param tracksPartitionsInCatalog whether this table's partition metadata is stored in the + * catalog. If false, it is inferred automatically based on file + * structure. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -158,7 +159,7 @@ case class CatalogTable( viewText: Option[String] = None, comment: Option[String] = None, unsupportedFeatures: Seq[String] = Seq.empty, -partitionProviderIsHive: Boolean = false) { +tracksPartitionsInCatalog: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -217,7 +218,7 @@ case class CatalogTable( if (properties.nonEmpty) s"Properties: $tableProperties" else "", if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "", s"$storage", -if (partitionProviderIsHive) "Partition Provider: Hive" else "") +if (tracksPartitionsInCatalog) "Partition Provider: Catalog" else "") output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")") } http://git-wip-us.apache.org/repos/asf/spark/blob/4f91630c/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 3eff12f..af1eaa1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -489,7 +489,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -&
spark git commit: [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog
Repository: spark Updated Branches: refs/heads/master 27daf6bcd -> b17057c0a [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog ## What changes were proposed in this pull request? This patch renames partitionProviderIsHive to tracksPartitionsInCatalog, as the old name was too Hive specific. ## How was this patch tested? Should be covered by existing tests. Author: Reynold Xin Closes #15750 from rxin/SPARK-18244. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b17057c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b17057c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b17057c0 Branch: refs/heads/master Commit: b17057c0a69b9c56e503483d97f5dc209eef0884 Parents: 27daf6b Author: Reynold Xin Authored: Thu Nov 3 11:48:05 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 11:48:05 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 9 + .../sql/catalyst/trees/TreeNodeSuite.scala | 2 +- .../command/createDataSourceTables.scala| 2 +- .../spark/sql/execution/command/ddl.scala | 4 ++-- .../spark/sql/execution/command/tables.scala| 2 +- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 7 --- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../spark/sql/execution/command/DDLSuite.scala | 2 +- .../spark/sql/hive/HiveExternalCatalog.scala| 21 10 files changed, 30 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b17057c0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 7c3bec8..34748a0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -138,8 +138,9 @@ case class BucketSpec( * Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the *underlying table but not supported by Spark SQL yet. - * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive - *metastore. + * @param tracksPartitionsInCatalog whether this table's partition metadata is stored in the + * catalog. If false, it is inferred automatically based on file + * structure. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -158,7 +159,7 @@ case class CatalogTable( viewText: Option[String] = None, comment: Option[String] = None, unsupportedFeatures: Seq[String] = Seq.empty, -partitionProviderIsHive: Boolean = false) { +tracksPartitionsInCatalog: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -217,7 +218,7 @@ case class CatalogTable( if (properties.nonEmpty) s"Properties: $tableProperties" else "", if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "", s"$storage", -if (partitionProviderIsHive) "Partition Provider: Hive" else "") +if (tracksPartitionsInCatalog) "Partition Provider: Catalog" else "") output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")") } http://git-wip-us.apache.org/repos/asf/spark/blob/b17057c0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 3eff12f..af1eaa1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -489,7 +489,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -> -1, -"partitionProviderIsHive" -> false, +"tracksPartitionsInCata
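To show the renamed flag in context, a hedged construction sketch (the identifier, schema, and storage values are placeholders; the parameter names follow the case class quoted above):

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

// Partition metadata is tracked by the catalog, so toString renders
// "Partition Provider: Catalog" for this table.
val table = CatalogTable(
  identifier = TableIdentifier("events", Some("db")),
  tableType = CatalogTableType.MANAGED,
  storage = CatalogStorageFormat.empty,
  schema = new StructType().add("id", IntegerType).add("day", StringType),
  partitionColumnNames = Seq("day"),
  tracksPartitionsInCatalog = true)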
spark git commit: [SQL] minor - internal doc improvement for InsertIntoTable.
Repository: spark Updated Branches: refs/heads/branch-2.1 bc7f05f5f -> 71104c9c9 [SQL] minor - internal doc improvement for InsertIntoTable. ## What changes were proposed in this pull request? I was reading this part of the code and was really confused by the "partition" parameter. This patch adds some documentation for it to reduce confusion in the future. I also looked around other logical plans but most of them are either already documented, or pretty self-evident to people that know Spark SQL. ## How was this patch tested? N/A - doc change only. Author: Reynold Xin Closes #15749 from rxin/doc-improvement. (cherry picked from commit 0ea5d5b24c1f7b29efeac0e72d271aba279523f7) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71104c9c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71104c9c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71104c9c Branch: refs/heads/branch-2.1 Commit: 71104c9c97a648c94e6619279ad49752c01c89c3 Parents: bc7f05f Author: Reynold Xin Authored: Thu Nov 3 02:45:54 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:46:01 2016 -0700 -- .../plans/logical/basicLogicalOperators.scala | 16 ++ .../hive/execution/InsertIntoHiveTable.scala| 31 2 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71104c9c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 7a15c22..65ceab2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -360,6 +360,22 @@ case class OverwriteOptions( } } +/** + * Insert some data into a table. + * + * @param table the logical plan representing the table. In the future this should be a + * [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] once we converge Hive tables + * and data source tables. + * @param partition a map from the partition key to the partition value (optional). If the partition + * value is optional, dynamic partition insert will be performed. + * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS ...` would have + * Map('a' -> Some('1'), 'b' -> Some('2')), + * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` + * would have Map('a' -> Some('1'), 'b' -> None). + * @param child the logical plan representing data to write to. + * @param overwrite overwrite existing table or partitions. + * @param ifNotExists If true, only write if the table or partition does not exist. 
+ */ case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], http://git-wip-us.apache.org/repos/asf/spark/blob/71104c9c/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 05164d7..15be12c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -35,13 +35,35 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException import org.apache.spark.util.SerializableJobConf +/** + * Command for writing data out to a Hive table. + * + * This class is mostly a mess, for legacy reasons (since it evolved in organic ways and had to + * follow Hive's internal implementations closely, which itself was a mess too). Please don't + * blame Reynold for this! He was just moving code arou
spark git commit: [SQL] minor - internal doc improvement for InsertIntoTable.
Repository: spark Updated Branches: refs/heads/master 937af592e -> 0ea5d5b24 [SQL] minor - internal doc improvement for InsertIntoTable. ## What changes were proposed in this pull request? I was reading this part of the code and was really confused by the "partition" parameter. This patch adds some documentation for it to reduce confusion in the future. I also looked around other logical plans but most of them are either already documented, or pretty self-evident to people that know Spark SQL. ## How was this patch tested? N/A - doc change only. Author: Reynold Xin Closes #15749 from rxin/doc-improvement. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0ea5d5b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0ea5d5b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0ea5d5b2 Branch: refs/heads/master Commit: 0ea5d5b24c1f7b29efeac0e72d271aba279523f7 Parents: 937af59 Author: Reynold Xin Authored: Thu Nov 3 02:45:54 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:45:54 2016 -0700 -- .../plans/logical/basicLogicalOperators.scala | 16 ++ .../hive/execution/InsertIntoHiveTable.scala| 31 2 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0ea5d5b2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 7a15c22..65ceab2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -360,6 +360,22 @@ case class OverwriteOptions( } } +/** + * Insert some data into a table. + * + * @param table the logical plan representing the table. In the future this should be a + * [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] once we converge Hive tables + * and data source tables. + * @param partition a map from the partition key to the partition value (optional). If the partition + * value is optional, dynamic partition insert will be performed. + * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS ...` would have + * Map('a' -> Some('1'), 'b' -> Some('2')), + * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` + * would have Map('a' -> Some('1'), 'b' -> None). + * @param child the logical plan representing data to write to. + * @param overwrite overwrite existing table or partitions. + * @param ifNotExists If true, only write if the table or partition does not exist. 
+ */ case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], http://git-wip-us.apache.org/repos/asf/spark/blob/0ea5d5b2/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 05164d7..15be12c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -35,13 +35,35 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException import org.apache.spark.util.SerializableJobConf +/** + * Command for writing data out to a Hive table. + * + * This class is mostly a mess, for legacy reasons (since it evolved in organic ways and had to + * follow Hive's internal implementations closely, which itself was a mess too). Please don't + * blame Reynold for this! He was just moving code around! + * + * In the future we should converge the write path for Hive with the normal data source write path,
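The new `partition` parameter doc is easiest to read with concrete values; the following is a plain-Scala illustration of the two cases it describes:

// INSERT INTO tbl PARTITION (a=1, b=2) ...  (fully static partition spec)
val staticSpec: Map[String, Option[String]] =
  Map("a" -> Some("1"), "b" -> Some("2"))

// INSERT INTO tbl PARTITION (a=1, b) ...  (b is dynamic, so its value is None)
val dynamicSpec: Map[String, Option[String]] =
  Map("a" -> Some("1"), "b" -> None)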
spark git commit: [SPARK-18219] Move commit protocol API (internal) from sql/core to core module
Repository: spark Updated Branches: refs/heads/branch-2.1 c4c5328f2 -> bc7f05f5f [SPARK-18219] Move commit protocol API (internal) from sql/core to core module ## What changes were proposed in this pull request? This patch moves the new commit protocol API from sql/core to core module, so we can use it in the future in the RDD API. As part of this patch, I also moved the specification of the random UUID for the write path out of the commit protocol, and instead pass in a job id. ## How was this patch tested? N/A Author: Reynold Xin Closes #15731 from rxin/SPARK-18219. (cherry picked from commit 937af592e65f4dd878aafcabf8fe2cfe7fa3d9b3) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc7f05f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc7f05f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc7f05f5 Branch: refs/heads/branch-2.1 Commit: bc7f05f5f03653c623190b8178bcbe981a41c2f3 Parents: c4c5328 Author: Reynold Xin Authored: Thu Nov 3 02:42:48 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:43:03 2016 -0700 -- .../spark/internal/io/FileCommitProtocol.scala | 126 + .../io/HadoopMapReduceCommitProtocol.scala | 111 .../datasources/FileCommitProtocol.scala| 257 --- .../datasources/FileFormatWriter.scala | 3 +- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../SQLHadoopMapReduceCommitProtocol.scala | 72 ++ .../execution/streaming/FileStreamSink.scala| 9 +- .../streaming/ManifestFileCommitProtocol.scala | 6 +- .../org/apache/spark/sql/internal/SQLConf.scala | 4 +- 9 files changed, 327 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc7f05f5/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala new file mode 100644 index 000..fb80205 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.util.Utils + + +/** + * An interface to define how a single Spark job commits its outputs. Three notes: + * + * 1. Implementations must be serializable, as the committer instance instantiated on the driver + *will be used for tasks on executors. + * 2. Implementations should have a constructor with either 2 or 3 arguments: +*(jobId: String, path: String) or (jobId: String, path: String, isAppend: Boolean). + * 3. A committer should not be reused across multiple Spark jobs.
+ * + * The proper call sequence is: + * + * 1. Driver calls setupJob. + * 2. As part of each task's execution, executor calls setupTask and then commitTask + *(or abortTask if task failed). + * 3. When all necessary tasks completed successfully, the driver calls commitJob. If the job + *failed to execute (e.g. too many failed tasks), the job should call abortJob. + */ +abstract class FileCommitProtocol { + import FileCommitProtocol._ + + /** + * Sets up a job. Must be called on the driver before any other methods can be invoked. + */ + def setupJob(jobContext: JobContext): Unit + + /** + * Commits a job after the writes succeed. Must be called on the driver. + */ + def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit + + /** + * Aborts a job after the writes fail. Must be called on the driver. + * + * Calling this function is a best-effort attempt, because it is possible that the driver + * just crashes (or is killed) before it can call abort. + */ + def abortJob(jobContext: JobContext): Unit + + /** + * Sets up a task within
spark git commit: [SPARK-18219] Move commit protocol API (internal) from sql/core to core module
Repository: spark Updated Branches: refs/heads/master 96cc1b567 -> 937af592e [SPARK-18219] Move commit protocol API (internal) from sql/core to core module ## What changes were proposed in this pull request? This patch moves the new commit protocol API from sql/core to core module, so we can use it in the future in the RDD API. As part of this patch, I also moved the specification of the random UUID for the write path out of the commit protocol, and instead pass in a job id. ## How was this patch tested? N/A Author: Reynold Xin Closes #15731 from rxin/SPARK-18219. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/937af592 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/937af592 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/937af592 Branch: refs/heads/master Commit: 937af592e65f4dd878aafcabf8fe2cfe7fa3d9b3 Parents: 96cc1b56 Author: Reynold Xin Authored: Thu Nov 3 02:42:48 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:42:48 2016 -0700 -- .../spark/internal/io/FileCommitProtocol.scala | 126 + .../io/HadoopMapReduceCommitProtocol.scala | 111 .../datasources/FileCommitProtocol.scala| 257 --- .../datasources/FileFormatWriter.scala | 3 +- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../SQLHadoopMapReduceCommitProtocol.scala | 72 ++ .../execution/streaming/FileStreamSink.scala| 9 +- .../streaming/ManifestFileCommitProtocol.scala | 6 +- .../org/apache/spark/sql/internal/SQLConf.scala | 4 +- 9 files changed, 327 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/937af592/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala new file mode 100644 index 000..fb80205 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.util.Utils + + +/** + * An interface to define how a single Spark job commits its outputs. Three notes: + * + * 1. Implementations must be serializable, as the committer instance instantiated on the driver + *will be used for tasks on executors. + * 2. Implementations should have a constructor with either 2 or 3 arguments: +*(jobId: String, path: String) or (jobId: String, path: String, isAppend: Boolean). + * 3. A committer should not be reused across multiple Spark jobs. + * + * The proper call sequence is: + * + * 1. Driver calls setupJob. + * 2. 
As part of each task's execution, executor calls setupTask and then commitTask + *(or abortTask if task failed). + * 3. When all necessary tasks completed successfully, the driver calls commitJob. If the job + *failed to execute (e.g. too many failed tasks), the job should call abortJob. + */ +abstract class FileCommitProtocol { + import FileCommitProtocol._ + + /** + * Sets up a job. Must be called on the driver before any other methods can be invoked. + */ + def setupJob(jobContext: JobContext): Unit + + /** + * Commits a job after the writes succeed. Must be called on the driver. + */ + def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit + + /** + * Aborts a job after the writes fail. Must be called on the driver. + * + * Calling this function is a best-effort attempt, because it is possible that the driver + * just crashes (or is killed) before it can call abort. + */ + def abortJob(jobContext: JobContext): Unit + + /** + * Sets up a task within a job. + * Must be called before any other task related methods can be invoked. + */
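A hedged driver-side sketch of that call sequence, using the concrete HadoopMapReduceCommitProtocol from this patch (the job id and output path are made-up values, and the task-side setupTask/commitTask calls are elided):

import org.apache.hadoop.mapreduce.Job
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol

// Hadoop's Job implements JobContext, which is what the protocol expects.
val committer = new HadoopMapReduceCommitProtocol(jobId = "job-0", path = "/tmp/out")
val job = Job.getInstance()
committer.setupJob(job)
try {
  // ... tasks run on executors: setupTask, write files, commitTask ...
  committer.commitJob(job, taskCommits = Seq.empty)
} catch {
  case e: Exception =>
    committer.abortJob(job)
    throw e
}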
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.1 2cf39d638 -> 965c964c2 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. (cherry picked from commit d24e736471f34ef8f2c12766393379c4213fe96e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/965c964c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/965c964c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/965c964c Branch: refs/heads/branch-2.1 Commit: 965c964c2657aaf575f0e00ce6b74a8f05172c06 Parents: 2cf39d6 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:51:16 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new 
OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http:/
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.0 3253ae7f7 -> dae1581d9 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. (cherry picked from commit d24e736471f34ef8f2c12766393379c4213fe96e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dae1581d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dae1581d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dae1581d Branch: refs/heads/branch-2.0 Commit: dae1581d9461346511098dc83938939a0f930048 Parents: 3253ae7 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:51:26 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new 
OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http:/
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/master 9ddec8636 -> d24e73647 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d24e7364 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d24e7364 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d24e7364 Branch: refs/heads/master Commit: d24e736471f34ef8f2c12766393379c4213fe96e Parents: 9ddec86 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:50:50 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, 
String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/
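For readers tracing the fix, here is a minimal standalone sketch of the patched rounding logic — a copy of the `nextPowerOf2` change above, for illustration only, not the `OpenHashSet` internals verbatim:

```scala
// Standalone copy of the patched capacity rounding.
object NextPowerOf2Sketch {
  def nextPowerOf2(n: Int): Int = {
    if (n == 0) {
      2 // zero now rounds up to the smallest usable capacity instead of failing
    } else {
      val highBit = Integer.highestOneBit(n)
      if (highBit == n) n else highBit << 1
    }
  }

  def main(args: Array[String]): Unit = {
    // Before the patch, a zero initial capacity tripped the
    // `require(initialCapacity >= 1, ...)` check shown in the diff above.
    assert(nextPowerOf2(0) == 2)
    assert(nextPowerOf2(3) == 4)
    assert(nextPowerOf2(8) == 8)
  }
}
```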
spark git commit: [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion
Repository: spark Updated Branches: refs/heads/branch-2.1 1eef8e5cd -> 2aff2ea81 [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion ## What changes were proposed in this pull request? RuntimeReplaceable is used to create aliases for expressions, but the way it deals with type coercion is pretty weird (each expression is responsible for how to handle type coercion, which does not obey the normal implicit type cast rules). This patch simplifies its handling by allowing the analyzer to traverse into the actual expression of a RuntimeReplaceable. ## How was this patch tested? - Correctness should be guaranteed by existing unit tests already - Removed SQLCompatibilityFunctionSuite and moved it to sql-compatibility-functions.sql - Added a new test case in sql-compatibility-functions.sql for verifying explain behavior. Author: Reynold Xin Closes #15723 from rxin/SPARK-18214. (cherry picked from commit fd90541c35af2bccf0155467bec8cea7c8865046) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2aff2ea8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2aff2ea8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2aff2ea8 Branch: refs/heads/branch-2.1 Commit: 2aff2ea81d260a47e7762b2990ed62a91e5d0198 Parents: 1eef8e5 Author: Reynold Xin Authored: Wed Nov 2 15:53:02 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 15:53:09 2016 -0700 -- .../sql/catalyst/analysis/TypeCoercion.scala| 2 - .../sql/catalyst/expressions/Expression.scala | 30 ++--- .../expressions/datetimeExpressions.scala | 2 - .../catalyst/expressions/nullExpressions.scala | 75 --- .../sql/catalyst/optimizer/finishAnalysis.scala | 2 +- .../expressions/NullFunctionsSuite.scala| 19 ++- .../inputs/sql-compatibility-functions.sql | 25 .../resources/sql-tests/results/array.sql.out | 5 +- .../results/sql-compatibility-functions.sql.out | 124 +++ .../sql/SQLCompatibilityFunctionSuite.scala | 98 --- .../apache/spark/sql/SQLQueryTestSuite.scala| 4 +- 11 files changed, 204 insertions(+), 182 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2aff2ea8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 01b04c0..6662a9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -528,8 +528,6 @@ object TypeCoercion { NaNvl(l, Cast(r, DoubleType)) case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType => NaNvl(Cast(l, DoubleType), r) - - case e: RuntimeReplaceable => e.replaceForTypeCoercion() } } http://git-wip-us.apache.org/repos/asf/spark/blob/2aff2ea8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 726a231..221f830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -186,7 +186,7 @@ abstract class Expression extends TreeNode[Expression] { */ def prettyName: String =
nodeName.toLowerCase - protected def flatArguments = productIterator.flatMap { + protected def flatArguments: Iterator[Any] = productIterator.flatMap { case t: Traversable[_] => t case single => single :: Nil } @@ -229,26 +229,16 @@ trait Unevaluable extends Expression { * An expression that gets replaced at runtime (currently by the optimizer) into a different * expression for evaluation. This is mainly used to provide compatibility with other databases. * For example, we use this to support "nvl" by replacing it with "coalesce". + * + * A RuntimeReplaceable should have the original parameters along with a "child" expression in the + * case class constructor, and define a normal constructor that accepts only the original + * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL + * works correctly, the implementation should also override flatArguments
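For reference, a hedged sketch of the constructor pattern the new scaladoc describes, modeled on the `Nvl` rewrite in `nullExpressions.scala` (member signatures here are illustrative, not the full patch):

```scala
// The case class keeps the original arguments plus a pre-built `child`;
// the auxiliary constructor is what callers actually use, and the analyzer
// then type-coerces `child` like any ordinary expression tree.
case class Nvl(left: Expression, right: Expression, child: Expression)
  extends RuntimeReplaceable {

  def this(left: Expression, right: Expression) =
    this(left, right, Coalesce(Seq(left, right)))

  // Overridden so EXPLAIN output and generated SQL show the user-facing
  // arguments rather than the expanded `child`.
  override def flatArguments: Iterator[Any] = Iterator(left, right)
  override def sql: String = s"$prettyName(${left.sql}, ${right.sql})"
}
```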
spark git commit: [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion
Repository: spark Updated Branches: refs/heads/master 37d95227a -> fd90541c3 [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion ## What changes were proposed in this pull request? RuntimeReplaceable is used to create aliases for expressions, but the way it deals with type coercion is pretty weird (each expression is responsible for how to handle type coercion, which does not obey the normal implicit type cast rules). This patch simplifies its handling by allowing the analyzer to traverse into the actual expression of a RuntimeReplaceable. ## How was this patch tested? - Correctness should be guaranteed by existing unit tests already - Removed SQLCompatibilityFunctionSuite and moved it to sql-compatibility-functions.sql - Added a new test case in sql-compatibility-functions.sql for verifying explain behavior. Author: Reynold Xin Closes #15723 from rxin/SPARK-18214. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd90541c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd90541c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd90541c Branch: refs/heads/master Commit: fd90541c35af2bccf0155467bec8cea7c8865046 Parents: 37d9522 Author: Reynold Xin Authored: Wed Nov 2 15:53:02 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 15:53:02 2016 -0700 -- .../sql/catalyst/analysis/TypeCoercion.scala| 2 - .../sql/catalyst/expressions/Expression.scala | 30 ++--- .../expressions/datetimeExpressions.scala | 2 - .../catalyst/expressions/nullExpressions.scala | 75 --- .../sql/catalyst/optimizer/finishAnalysis.scala | 2 +- .../expressions/NullFunctionsSuite.scala| 19 ++- .../inputs/sql-compatibility-functions.sql | 25 .../resources/sql-tests/results/array.sql.out | 5 +- .../results/sql-compatibility-functions.sql.out | 124 +++ .../sql/SQLCompatibilityFunctionSuite.scala | 98 --- .../apache/spark/sql/SQLQueryTestSuite.scala| 4 +- 11 files changed, 204 insertions(+), 182 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fd90541c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 01b04c0..6662a9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -528,8 +528,6 @@ object TypeCoercion { NaNvl(l, Cast(r, DoubleType)) case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType => NaNvl(Cast(l, DoubleType), r) - - case e: RuntimeReplaceable => e.replaceForTypeCoercion() } } http://git-wip-us.apache.org/repos/asf/spark/blob/fd90541c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 726a231..221f830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -186,7 +186,7 @@ abstract class Expression extends TreeNode[Expression] { */ def prettyName: String = nodeName.toLowerCase - protected def
flatArguments: Iterator[Any] = productIterator.flatMap { case t: Traversable[_] => t case single => single :: Nil } @@ -229,26 +229,16 @@ trait Unevaluable extends Expression { * An expression that gets replaced at runtime (currently by the optimizer) into a different * expression for evaluation. This is mainly used to provide compatibility with other databases. * For example, we use this to support "nvl" by replacing it with "coalesce". + * + * A RuntimeReplaceable should have the original parameters along with a "child" expression in the + * case class constructor, and define a normal constructor that accepts only the original + * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL + * works correctly, the implementation should also override flatArguments method and sql method. */ -trait RuntimeReplaceable extends Unevaluable { - /** - * Method for concr
spark git commit: [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts
Repository: spark Updated Branches: refs/heads/branch-2.1 bd3ea6595 -> 1eef8e5cd [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts ## What changes were proposed in this pull request? Adds a `snapshots-and-staging` profile so that RCs of projects like Hadoop and HBase can be used in developer-only build and test runs. There's a comment above the profile telling people not to use this in production. There's no attempt to do the same for SBT, as Ivy is different. ## How was this patch tested? Tested by building against the Hadoop 2.7.3 RC 1 JARs without the profile (and without any local copy of the 2.7.3 artifacts), the build failed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive -Dhadoop.version=2.7.3 ... [INFO] [INFO] Building Spark Project Launcher 2.1.0-SNAPSHOT [INFO] Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.pom [WARNING] The POM for org.apache.hadoop:hadoop-client:jar:2.7.3 is missing, no dependency information available Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar [INFO] [INFO] Reactor Summary: [INFO] [INFO] Spark Project Parent POM ... SUCCESS [ 4.482 s] [INFO] Spark Project Tags . SUCCESS [ 17.402 s] [INFO] Spark Project Sketch ... SUCCESS [ 11.252 s] [INFO] Spark Project Networking ... SUCCESS [ 13.458 s] [INFO] Spark Project Shuffle Streaming Service SUCCESS [ 9.043 s] [INFO] Spark Project Unsafe ... SUCCESS [ 16.027 s] [INFO] Spark Project Launcher . FAILURE [ 1.653 s] [INFO] Spark Project Core . SKIPPED ... ``` With the profile, the build completed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive,snapshots-and-staging -Dhadoop.version=2.7.3 ``` Author: Steve Loughran Closes #14646 from steveloughran/stevel/SPARK-17058-support-asf-snapshots. (cherry picked from commit 37d95227a21de602b939dae84943ba007f434513) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1eef8e5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1eef8e5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1eef8e5c Branch: refs/heads/branch-2.1 Commit: 1eef8e5cd09dfb8b77044ef9864321618e8ea8c8 Parents: bd3ea65 Author: Steve Loughran Authored: Wed Nov 2 11:52:29 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:52:38 2016 -0700 -- pom.xml | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1eef8e5c/pom.xml -- diff --git a/pom.xml b/pom.xml index aaf7cfa..04d2eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -2694,6 +2694,54 @@ + <profile> + <id>snapshots-and-staging</id> + <properties> + <asf.staging>https://repository.apache.org/content/groups/staging/</asf.staging> + <asf.snapshots>https://repository.apache.org/content/repositories/snapshots/</asf.snapshots> + </properties> + <repositories> + <repository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </repository> + <repository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </pluginRepository> + <pluginRepository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </pluginRepository> + </pluginRepositories> + </profile>
spark git commit: [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts
Repository: spark Updated Branches: refs/heads/master 3c24299b7 -> 37d95227a [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts ## What changes were proposed in this pull request? Adds a `snapshots-and-staging` profile so that RCs of projects like Hadoop and HBase can be used in developer-only build and test runs. There's a comment above the profile telling people not to use this in production. There's no attempt to do the same for SBT, as Ivy is different. ## How was this patch tested? Tested by building against the Hadoop 2.7.3 RC 1 JARs without the profile (and without any local copy of the 2.7.3 artifacts), the build failed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive -Dhadoop.version=2.7.3 ... [INFO] [INFO] Building Spark Project Launcher 2.1.0-SNAPSHOT [INFO] Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.pom [WARNING] The POM for org.apache.hadoop:hadoop-client:jar:2.7.3 is missing, no dependency information available Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar [INFO] [INFO] Reactor Summary: [INFO] [INFO] Spark Project Parent POM ... SUCCESS [ 4.482 s] [INFO] Spark Project Tags . SUCCESS [ 17.402 s] [INFO] Spark Project Sketch ... SUCCESS [ 11.252 s] [INFO] Spark Project Networking ... SUCCESS [ 13.458 s] [INFO] Spark Project Shuffle Streaming Service SUCCESS [ 9.043 s] [INFO] Spark Project Unsafe ... SUCCESS [ 16.027 s] [INFO] Spark Project Launcher . FAILURE [ 1.653 s] [INFO] Spark Project Core . SKIPPED ... ``` With the profile, the build completed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive,snapshots-and-staging -Dhadoop.version=2.7.3 ``` Author: Steve Loughran Closes #14646 from steveloughran/stevel/SPARK-17058-support-asf-snapshots. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37d95227 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37d95227 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37d95227 Branch: refs/heads/master Commit: 37d95227a21de602b939dae84943ba007f434513 Parents: 3c24299 Author: Steve Loughran Authored: Wed Nov 2 11:52:29 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:52:29 2016 -0700 -- pom.xml | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37d95227/pom.xml -- diff --git a/pom.xml b/pom.xml index aaf7cfa..04d2eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -2694,6 +2694,54 @@ + <profile> + <id>snapshots-and-staging</id> + <properties> + <asf.staging>https://repository.apache.org/content/groups/staging/</asf.staging> + <asf.snapshots>https://repository.apache.org/content/repositories/snapshots/</asf.snapshots> + </properties> + <repositories> + <repository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </repository> + <repository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </pluginRepository> + <pluginRepository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </pluginRepository> + </pluginRepositories> + </profile>
spark git commit: [SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value (for branch 2.0)
Repository: spark Updated Branches: refs/heads/branch-2.0 1696bcfad -> 3253ae7f7 [SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value (for branch 2.0) ## What changes were proposed in this pull request? When multiple records have the minimum value, the answer of `StatFunctions.multipleApproxQuantiles` is wrong. ## How was this patch tested? add a test case Author: wangzhenhua Closes #15732 from wzhfy/percentile2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3253ae7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3253ae7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3253ae7f Branch: refs/heads/branch-2.0 Commit: 3253ae7f722a996cf0af21608e1a27d5d2a12004 Parents: 1696bcf Author: wangzhenhua Authored: Wed Nov 2 11:49:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:49:30 2016 -0700 -- .../spark/sql/execution/stat/StatFunctions.scala | 4 +++- .../org/apache/spark/sql/DataFrameStatSuite.scala | 13 + 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 7e2ebe8..acc42a0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -337,7 +337,9 @@ object StatFunctions extends Logging { res.prepend(head) // If necessary, add the minimum element: val currHead = currentSamples.head - if (currHead.value < head.value) { + // don't add the minimum element if `currentSamples` has only one element (both `currHead` and + // `head` point to the same element) + if (currHead.value <= head.value && currentSamples.length > 1) { res.prepend(currentSamples.head) } res.toArray http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 73026c7..571e2ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -152,6 +152,19 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } } + test("approximate quantile, multiple records with the minimum value in a partition") { +val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5) +val df = spark.sparkContext.makeRDD(data, 4).toDF("col") +val epsilons = List(0.1, 0.05, 0.001) +val quantile = 0.5 +val expected = 1 +for (epsilon <- epsilons) { + val Array(answer) = df.stat.approxQuantile("col", Array(quantile), epsilon) + val error = 2 * data.length * epsilon + assert(math.abs(answer - expected) < error) +} + } + test("crosstab") { val rng = new Random() val data = Seq.tabulate(25)(i => (rng.nextInt(5), rng.nextInt(10)))
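For context, a minimal sketch of the behavior the new test pins down, assuming a live `SparkSession` named `spark` (the tolerance mirrors the suite above, not a new API):

```scala
import spark.implicits._

// Four partitions, so several partitions begin with the duplicated minimum.
val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5)
val df = spark.sparkContext.makeRDD(data, 4).toDF("col")

val epsilon = 0.05
val Array(answer) = df.stat.approxQuantile("col", Array(0.5), epsilon)
// The rank error is bounded by epsilon * n per side, so the suite accepts
// any answer within 2 * n * epsilon of the expected median (here, 1).
assert(math.abs(answer - 1) < 2 * data.length * epsilon)
```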
spark git commit: [SPARK-17895] Improve doc for rangeBetween and rowsBetween
Repository: spark Updated Branches: refs/heads/master 4af0ce2d9 -> 742e0fea5 [SPARK-17895] Improve doc for rangeBetween and rowsBetween ## What changes were proposed in this pull request? Copied description for row and range based frame boundary from https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala#L56 Added examples to show different behavior of rangeBetween and rowsBetween when involving duplicate values. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: buzhihuojie Closes #15727 from david-weiluo-ren/improveDocForRangeAndRowsBetween. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/742e0fea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/742e0fea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/742e0fea Branch: refs/heads/master Commit: 742e0fea5391857964e90d396641ecf95cac4248 Parents: 4af0ce2 Author: buzhihuojie Authored: Wed Nov 2 11:36:20 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:36:20 2016 -0700 -- .../apache/spark/sql/expressions/Window.scala | 55 .../spark/sql/expressions/WindowSpec.scala | 55 2 files changed, 110 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/742e0fea/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 0b26d86..327bc37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -121,6 +121,32 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -144,6 +170,35 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. 
This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unboun
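To see the two frame types side by side, here is a hedged sketch combining the two doc examples above into one query (assumes `spark.implicits._` is imported so the `'id` symbol syntax resolves):

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b"))
  .toDF("id", "category")
val w = Window.partitionBy('category).orderBy('id)

// For the duplicate id = 1 rows in category "a", the row frame sums differ
// per row (2 and 3) while the range frame gives both rows the same sum (4),
// because a range boundary is keyed on the ORDER BY value, not row position.
df.withColumn("rows_sum", sum('id) over w.rowsBetween(0, 1))
  .withColumn("range_sum", sum('id) over w.rangeBetween(0, 1))
  .show()
```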
spark git commit: [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union
Repository: spark Updated Branches: refs/heads/branch-2.1 a885d5bbc -> 0093257ea [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union ## What changes were proposed in this pull request? When a user appended a column using a "nondeterministic" function to a DataFrame, e.g., `rand`, `randn`, and `monotonically_increasing_id`, the expected semantic is the following: - The value in each row should remain unchanged, as if we materialize the column immediately, regardless of later DataFrame operations. However, since we use `TaskContext.getPartitionId` to get the partition index from the current thread, the values from nondeterministic columns might change if we call `union` or `coalesce` after. `TaskContext.getPartitionId` returns the partition index of the current Spark task, which might not be the corresponding partition index of the DataFrame where we defined the column. See the unit tests below or JIRA for examples. This PR uses the partition index from `RDD.mapPartitionWithIndex` instead of `TaskContext` and fixes the partition initialization logic in whole-stage codegen, normal codegen, and codegen fallback. `initializeStatesForPartition(partitionIndex: Int)` was added to `Projection`, `Nondeterministic`, and `Predicate` (codegen) and initialized right after object creation in `mapPartitionWithIndex`. `newPredicate` now returns a `Predicate` instance rather than a function for proper initialization. ## How was this patch tested? Unit tests. (Actually I'm not very confident that this PR fixed all issues without introducing new ones ...) cc: rxin davies Author: Xiangrui Meng Closes #15567 from mengxr/SPARK-14393. (cherry picked from commit 02f203107b8eda1f1576e36c4f12b0e3bc5e910e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0093257e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0093257e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0093257e Branch: refs/heads/branch-2.1 Commit: 0093257ea94d3a197ca061b54c04685d7c1f616a Parents: a885d5b Author: Xiangrui Meng Authored: Wed Nov 2 11:41:49 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:42:01 2016 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 16 +- .../sql/catalyst/expressions/Expression.scala | 19 +-- .../catalyst/expressions/InputFileName.scala| 2 +- .../expressions/MonotonicallyIncreasingID.scala | 11 +++-- .../sql/catalyst/expressions/Projection.scala | 22 ++--- .../catalyst/expressions/SparkPartitionID.scala | 13 +++-- .../expressions/codegen/CodeGenerator.scala | 14 ++ .../expressions/codegen/CodegenFallback.scala | 18 +-- .../codegen/GenerateMutableProjection.scala | 4 ++ .../expressions/codegen/GeneratePredicate.scala | 18 +-- .../codegen/GenerateSafeProjection.scala| 4 ++ .../codegen/GenerateUnsafeProjection.scala | 4 ++ .../sql/catalyst/expressions/package.scala | 10 +++- .../sql/catalyst/expressions/predicates.scala | 4 -- .../expressions/randomExpressions.scala | 14 +++--- .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../expressions/ExpressionEvalHelper.scala | 5 +- .../codegen/CodegenExpressionCachingSuite.scala | 13 +++-- .../sql/execution/DataSourceScanExec.scala | 6 ++- .../spark/sql/execution/ExistingRDD.scala | 3 +- .../spark/sql/execution/GenerateExec.scala | 3 +- .../apache/spark/sql/execution/SparkPlan.scala | 4 +- .../sql/execution/WholeStageCodegenExec.scala | 8 ++- .../sql/execution/basicPhysicalOperators.scala | 8 +-- 
.../columnar/InMemoryTableScanExec.scala| 5 +- .../joins/BroadcastNestedLoopJoinExec.scala | 7 +-- .../execution/joins/CartesianProductExec.scala | 8 +-- .../spark/sql/execution/joins/HashJoin.scala| 2 +- .../sql/execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala| 6 ++- .../spark/sql/DataFrameFunctionsSuite.scala | 52 .../sql/hive/execution/HiveTableScanExec.scala | 3 +- 32 files changed, 231 insertions(+), 78 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0093257e/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de..e018af3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -788,14 +788,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method w
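For context, a small sketch of the user-visible expectation this patch restores, based on the JIRA discussion rather than the patch itself (assumes a `SparkSession` named `spark`):

```scala
import org.apache.spark.sql.functions.rand

val df = spark.range(10).withColumn("r", rand(42))

// The "r" values should behave as if materialized where the column was
// defined. Before this fix, coalesce could shift the partition index seen
// by the generator and silently re-draw different values.
val direct = df.collect().map(_.getDouble(1)).toSeq.sorted
val coalesced = df.coalesce(1).collect().map(_.getDouble(1)).toSeq.sorted
assert(direct == coalesced)
```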
spark git commit: [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union
Repository: spark Updated Branches: refs/heads/master 742e0fea5 -> 02f203107 [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union ## What changes were proposed in this pull request? When a user appended a column using a "nondeterministic" function to a DataFrame, e.g., `rand`, `randn`, and `monotonically_increasing_id`, the expected semantic is the following: - The value in each row should remain unchanged, as if we materialize the column immediately, regardless of later DataFrame operations. However, since we use `TaskContext.getPartitionId` to get the partition index from the current thread, the values from nondeterministic columns might change if we call `union` or `coalesce` after. `TaskContext.getPartitionId` returns the partition index of the current Spark task, which might not be the corresponding partition index of the DataFrame where we defined the column. See the unit tests below or JIRA for examples. This PR uses the partition index from `RDD.mapPartitionWithIndex` instead of `TaskContext` and fixes the partition initialization logic in whole-stage codegen, normal codegen, and codegen fallback. `initializeStatesForPartition(partitionIndex: Int)` was added to `Projection`, `Nondeterministic`, and `Predicate` (codegen) and initialized right after object creation in `mapPartitionWithIndex`. `newPredicate` now returns a `Predicate` instance rather than a function for proper initialization. ## How was this patch tested? Unit tests. (Actually I'm not very confident that this PR fixed all issues without introducing new ones ...) cc: rxin davies Author: Xiangrui Meng Closes #15567 from mengxr/SPARK-14393. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02f20310 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02f20310 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02f20310 Branch: refs/heads/master Commit: 02f203107b8eda1f1576e36c4f12b0e3bc5e910e Parents: 742e0fe Author: Xiangrui Meng Authored: Wed Nov 2 11:41:49 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:41:49 2016 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 16 +- .../sql/catalyst/expressions/Expression.scala | 19 +-- .../catalyst/expressions/InputFileName.scala| 2 +- .../expressions/MonotonicallyIncreasingID.scala | 11 +++-- .../sql/catalyst/expressions/Projection.scala | 22 ++--- .../catalyst/expressions/SparkPartitionID.scala | 13 +++-- .../expressions/codegen/CodeGenerator.scala | 14 ++ .../expressions/codegen/CodegenFallback.scala | 18 +-- .../codegen/GenerateMutableProjection.scala | 4 ++ .../expressions/codegen/GeneratePredicate.scala | 18 +-- .../codegen/GenerateSafeProjection.scala| 4 ++ .../codegen/GenerateUnsafeProjection.scala | 4 ++ .../sql/catalyst/expressions/package.scala | 10 +++- .../sql/catalyst/expressions/predicates.scala | 4 -- .../expressions/randomExpressions.scala | 14 +++--- .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../expressions/ExpressionEvalHelper.scala | 5 +- .../codegen/CodegenExpressionCachingSuite.scala | 13 +++-- .../sql/execution/DataSourceScanExec.scala | 6 ++- .../spark/sql/execution/ExistingRDD.scala | 3 +- .../spark/sql/execution/GenerateExec.scala | 3 +- .../apache/spark/sql/execution/SparkPlan.scala | 4 +- .../sql/execution/WholeStageCodegenExec.scala | 8 ++- .../sql/execution/basicPhysicalOperators.scala | 8 +-- .../columnar/InMemoryTableScanExec.scala| 5 +- .../joins/BroadcastNestedLoopJoinExec.scala | 7 +-- 
.../execution/joins/CartesianProductExec.scala | 8 +-- .../spark/sql/execution/joins/HashJoin.scala| 2 +- .../sql/execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala| 6 ++- .../spark/sql/DataFrameFunctionsSuite.scala | 52 .../sql/hive/execution/HiveTableScanExec.scala | 3 +- 32 files changed, 231 insertions(+), 78 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02f20310/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de..e018af3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -788,14 +788,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method which skips closure cleaning. It is a - * performance API to be used carefully only if we are sur
spark git commit: [SPARK-17895] Improve doc for rangeBetween and rowsBetween
Repository: spark Updated Branches: refs/heads/branch-2.1 9be069125 -> a885d5bbc [SPARK-17895] Improve doc for rangeBetween and rowsBetween ## What changes were proposed in this pull request? Copied description for row and range based frame boundary from https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala#L56 Added examples to show different behavior of rangeBetween and rowsBetween when involving duplicate values. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: buzhihuojie Closes #15727 from david-weiluo-ren/improveDocForRangeAndRowsBetween. (cherry picked from commit 742e0fea5391857964e90d396641ecf95cac4248) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a885d5bb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a885d5bb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a885d5bb Branch: refs/heads/branch-2.1 Commit: a885d5bbce9dba66b394850b3aac51ae97cb18dd Parents: 9be0691 Author: buzhihuojie Authored: Wed Nov 2 11:36:20 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:36:26 2016 -0700 -- .../apache/spark/sql/expressions/Window.scala | 55 .../spark/sql/expressions/WindowSpec.scala | 55 2 files changed, 110 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a885d5bb/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 0b26d86..327bc37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -121,6 +121,32 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -144,6 +170,35 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). 
An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| +
spark git commit: [SPARK-17683][SQL] Support ArrayType in Literal.apply
Repository: spark Updated Branches: refs/heads/branch-2.1 41491e540 -> 9be069125 [SPARK-17683][SQL] Support ArrayType in Literal.apply ## What changes were proposed in this pull request? This pr is to add pattern-matching entries for array data in `Literal.apply`. ## How was this patch tested? Added tests in `LiteralExpressionSuite`. Author: Takeshi YAMAMURO Closes #15257 from maropu/SPARK-17683. (cherry picked from commit 4af0ce2d96de3397c9bc05684cad290a52486577) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9be06912 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9be06912 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9be06912 Branch: refs/heads/branch-2.1 Commit: 9be069125f7e94df9d862f307b87965baf9416e3 Parents: 41491e5 Author: Takeshi YAMAMURO Authored: Wed Nov 2 11:29:26 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:29:39 2016 -0700 -- .../sql/catalyst/expressions/literals.scala | 57 +++- .../expressions/LiteralExpressionSuite.scala| 27 +- 2 files changed, 82 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9be06912/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a597a17..1985e68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -17,14 +17,25 @@ package org.apache.spark.sql.catalyst.expressions +import java.lang.{Boolean => JavaBoolean} +import java.lang.{Byte => JavaByte} +import java.lang.{Double => JavaDouble} +import java.lang.{Float => JavaFloat} +import java.lang.{Integer => JavaInteger} +import java.lang.{Long => JavaLong} +import java.lang.{Short => JavaShort} +import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter +import scala.math.{BigDecimal, BigInt} + import org.json4s.JsonAST._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -46,12 +57,17 @@ object Literal { case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) -case d: java.math.BigDecimal => +case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) +case a: Array[_] => + val elementType = componentTypeToDataType(a.getClass.getComponentType()) + val dataType = ArrayType(elementType) + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + Literal(convert(a), dataType) case i: CalendarInterval => Literal(i, CalendarIntervalType) case null => Literal(null, NullType) case v: Literal 
=> v @@ -60,6 +76,45 @@ object Literal { } /** + * Returns the Spark SQL DataType for a given class object. Since this type needs to be resolved + * in runtime, we use match-case idioms for class objects here. However, there are similar + * functions in other files (e.g., HiveInspectors), so these functions need to merged into one. + */ + private[this] def componentTypeToDataType(clz: Class[_]): DataType = clz match { +// primitive types +case JavaShort.TYPE => ShortType +case JavaInteger.TYPE => IntegerType +case JavaLong.TYPE => LongType +case JavaDouble.TYPE => DoubleType +case JavaByte.TYPE => ByteType +case JavaFloat.TYPE => FloatType +case JavaBoolean.TYPE => BooleanType + +// java classes +case _ if clz == classOf[Date] => DateType +case _ if clz == classOf[Timestamp] => TimestampType +case _ if clz == classOf[JavaBigDecimal] => DecimalType.SYSTEM_DEFAULT +case _ if clz == classOf[Array[Byte]] => BinaryType +case _ if clz == classOf[JavaShort] => ShortType +case _ if
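As a usage note (hedged; grounded in the new `LiteralExpressionSuite` cases), the user-visible effect is that array values now flow through `Literal.apply`, for example via `functions.lit`:

```scala
import org.apache.spark.sql.functions.lit

// Previously only Array[Byte] matched (as BinaryType) and other arrays hit
// Literal.apply's unsupported-type error; now primitive and supported boxed
// arrays map to an ArrayType column of the corresponding element type.
val arrayCol = lit(Array(1, 2, 3)) // column of ArrayType(IntegerType)
```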
spark git commit: [SPARK-17683][SQL] Support ArrayType in Literal.apply
Repository: spark Updated Branches: refs/heads/master f151bd1af -> 4af0ce2d9 [SPARK-17683][SQL] Support ArrayType in Literal.apply ## What changes were proposed in this pull request? This pr is to add pattern-matching entries for array data in `Literal.apply`. ## How was this patch tested? Added tests in `LiteralExpressionSuite`. Author: Takeshi YAMAMURO Closes #15257 from maropu/SPARK-17683. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4af0ce2d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4af0ce2d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4af0ce2d Branch: refs/heads/master Commit: 4af0ce2d96de3397c9bc05684cad290a52486577 Parents: f151bd1 Author: Takeshi YAMAMURO Authored: Wed Nov 2 11:29:26 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:29:26 2016 -0700 -- .../sql/catalyst/expressions/literals.scala | 57 +++- .../expressions/LiteralExpressionSuite.scala| 27 +- 2 files changed, 82 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4af0ce2d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a597a17..1985e68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -17,14 +17,25 @@ package org.apache.spark.sql.catalyst.expressions +import java.lang.{Boolean => JavaBoolean} +import java.lang.{Byte => JavaByte} +import java.lang.{Double => JavaDouble} +import java.lang.{Float => JavaFloat} +import java.lang.{Integer => JavaInteger} +import java.lang.{Long => JavaLong} +import java.lang.{Short => JavaShort} +import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter +import scala.math.{BigDecimal, BigInt} + import org.json4s.JsonAST._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -46,12 +57,17 @@ object Literal { case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) -case d: java.math.BigDecimal => +case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) +case a: Array[_] => + val elementType = componentTypeToDataType(a.getClass.getComponentType()) + val dataType = ArrayType(elementType) + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + Literal(convert(a), dataType) case i: CalendarInterval => Literal(i, CalendarIntervalType) case null => Literal(null, NullType) case v: Literal => v @@ -60,6 +76,45 @@ object Literal { } /** + * Returns the Spark SQL DataType for a given class 
object. Since this type needs to be resolved + * in runtime, we use match-case idioms for class objects here. However, there are similar + * functions in other files (e.g., HiveInspectors), so these functions need to merged into one. + */ + private[this] def componentTypeToDataType(clz: Class[_]): DataType = clz match { +// primitive types +case JavaShort.TYPE => ShortType +case JavaInteger.TYPE => IntegerType +case JavaLong.TYPE => LongType +case JavaDouble.TYPE => DoubleType +case JavaByte.TYPE => ByteType +case JavaFloat.TYPE => FloatType +case JavaBoolean.TYPE => BooleanType + +// java classes +case _ if clz == classOf[Date] => DateType +case _ if clz == classOf[Timestamp] => TimestampType +case _ if clz == classOf[JavaBigDecimal] => DecimalType.SYSTEM_DEFAULT +case _ if clz == classOf[Array[Byte]] => BinaryType +case _ if clz == classOf[JavaShort] => ShortType +case _ if clz == classOf[JavaInteger] => IntegerType +case _ if clz == classOf[JavaLong] => LongType +case
spark git commit: [SPARK-17532] Add lock debugging info to thread dumps.
Repository: spark Updated Branches: refs/heads/master 85c5424d4 -> 2dc048081 [SPARK-17532] Add lock debugging info to thread dumps. ## What changes were proposed in this pull request? This adds information to the web UI thread dump page about the JVM locks held by threads and the locks that threads are blocked waiting to acquire. This should help find cases where lock contention is causing Spark applications to run slowly. ## How was this patch tested? Tested by applying this patch and viewing the change in the web UI. ![thread-lock-info](https://cloud.githubusercontent.com/assets/87915/18493057/6e5da870-79c3-11e6-8c20-f54c18a37544.png) Additions: - A "Thread Locking" column with the locks held by the thread or that are blocking the thread - Links from a blocked thread to the thread holding the lock - Stack frames show where threads are inside `synchronized` blocks, "holding Monitor(...)" Author: Ryan Blue Closes #15088 from rdblue/SPARK-17532-add-thread-lock-info. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2dc04808 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2dc04808 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2dc04808 Branch: refs/heads/master Commit: 2dc048081668665f85623839d5f663b402e42555 Parents: 85c5424 Author: Ryan Blue Authored: Wed Nov 2 00:08:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 00:08:30 2016 -0700 -- .../org/apache/spark/ui/static/table.js | 3 +- .../spark/ui/exec/ExecutorThreadDumpPage.scala | 12 +++ .../apache/spark/util/ThreadStackTrace.scala| 6 +++- .../scala/org/apache/spark/util/Utils.scala | 34 +--- 4 files changed, 49 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/resources/org/apache/spark/ui/static/table.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bf..0315ebf 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") -threadCell.after("" + +threadCell.after("" + stackTraceText + "") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); +$("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a0ef80d..c6a0744 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -48,6 +48,16 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage } }.map { thread => val threadId = thread.threadId +val blockedBy = thread.blockedByThreadId match { + case Some(blockedByThreadId) =>
+ + Blocked by + Thread {thread.blockedByThreadId} {thread.blockedByLock} + + case None => Text("") +} +val heldLocks = thread.holdingLocks.mkString(", ") + {threadId} {thread.threadName} {thread.threadState} + {blockedBy}{heldLocks} {thread.stackTrace} } @@ -86,6 +97,7 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage Thread ID Thread Name Thread State + Thread Locks {dumpRows} http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala b/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.s
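For the curious, the lock details shown in the new column come from the JVM's `ThreadMXBean`; a rough standalone sketch of that approach (not Spark's exact `Utils` code) looks like:

```scala
import java.lang.management.ManagementFactory

val bean = ManagementFactory.getThreadMXBean
// Ask the JVM to report owned monitors and ownable synchronizers too.
val infos = bean.dumpAllThreads(true, true)
for (ti <- infos) {
  val held = (ti.getLockedMonitors ++ ti.getLockedSynchronizers).mkString(", ")
  val blockedOn = Option(ti.getLockName).getOrElse("-")
  println(s"${ti.getThreadId} ${ti.getThreadName} [${ti.getThreadState}] " +
    s"waiting on: $blockedOn; holding: $held")
}
```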
spark git commit: [SPARK-17532] Add lock debugging info to thread dumps.
Repository: spark Updated Branches: refs/heads/branch-2.1 4c4bf87ac -> 3b624bedf [SPARK-17532] Add lock debugging info to thread dumps. ## What changes were proposed in this pull request? This adds information to the web UI thread dump page about the JVM locks held by threads and the locks that threads are blocked waiting to acquire. This should help find cases where lock contention is causing Spark applications to run slowly. ## How was this patch tested? Tested by applying this patch and viewing the change in the web UI. ![thread-lock-info](https://cloud.githubusercontent.com/assets/87915/18493057/6e5da870-79c3-11e6-8c20-f54c18a37544.png) Additions: - A "Thread Locking" column with the locks held by the thread or that are blocking the thread - Links from a blocked thread to the thread holding the lock - Stack frames show where threads are inside `synchronized` blocks, "holding Monitor(...)" Author: Ryan Blue Closes #15088 from rdblue/SPARK-17532-add-thread-lock-info. (cherry picked from commit 2dc048081668665f85623839d5f663b402e42555) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b624bed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b624bed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b624bed Branch: refs/heads/branch-2.1 Commit: 3b624bedf0f0ecd5dcfcc262a3ca8b4e33662533 Parents: 4c4bf87 Author: Ryan Blue Authored: Wed Nov 2 00:08:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 00:08:37 2016 -0700 -- .../org/apache/spark/ui/static/table.js | 3 +- .../spark/ui/exec/ExecutorThreadDumpPage.scala | 12 +++ .../apache/spark/util/ThreadStackTrace.scala| 6 +++- .../scala/org/apache/spark/util/Utils.scala | 34 +--- 4 files changed, 49 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/resources/org/apache/spark/ui/static/table.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bf..0315ebf 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") -threadCell.after("" + +threadCell.after("" + stackTraceText + "") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); +$("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a0ef80d..c6a0744 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -48,6 +48,16 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage } }.map { thread => val
threadId = thread.threadId +val blockedBy = thread.blockedByThreadId match { + case Some(blockedByThreadId) => + + Blocked by + Thread {thread.blockedByThreadId} {thread.blockedByLock} + + case None => Text("") +} +val heldLocks = thread.holdingLocks.mkString(", ") + {threadId} {thread.threadName} {thread.threadState} + {blockedBy}{heldLocks} {thread.stackTrace} } @@ -86,6 +97,7 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage Thread ID Thread Name Thread State + Thread Locks {dumpRows} http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala -- diff --git a/core/src/main/scala/or
spark git commit: [SPARK-18192] Support all file formats in structured streaming
Repository: spark Updated Branches: refs/heads/branch-2.1 e6509c245 -> 85dd07374 [SPARK-18192] Support all file formats in structured streaming ## What changes were proposed in this pull request? This patch adds support for all file formats in structured streaming sinks. This is actually a very small change thanks to all the previous refactoring done using the new internal commit protocol API. ## How was this patch tested? Updated FileStreamSinkSuite to add test cases for json, text, and parquet. Author: Reynold Xin Closes #15711 from rxin/SPARK-18192. (cherry picked from commit a36653c5b7b2719f8bfddf4ddfc6e1b828ac9af1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/85dd0737 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/85dd0737 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/85dd0737 Branch: refs/heads/branch-2.1 Commit: 85dd073743946383438aabb9f1281e6075f25cc5 Parents: e6509c2 Author: Reynold Xin Authored: Tue Nov 1 23:37:03 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:37:11 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 8 +-- .../sql/streaming/FileStreamSinkSuite.scala | 62 +--- 2 files changed, 32 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/85dd0737/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index d980e6a..3f956c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -37,7 +36,6 @@ import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -292,7 +290,7 @@ case class DataSource( case s: StreamSinkProvider => s.createSink(sparkSession.sqlContext, options, partitionColumns, outputMode) - case parquet: parquet.ParquetFileFormat => + case fileFormat: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val path = caseInsensitiveOptions.getOrElse("path", { throw new IllegalArgumentException("'path' is not specified") @@ -301,7 +299,7 @@ case class DataSource( throw new IllegalArgumentException( s"Data source $className does not support $outputMode output mode") } -new FileStreamSink(sparkSession, path, parquet, partitionColumns, options) +new FileStreamSink(sparkSession, path, fileFormat, partitionColumns, options) case _ => throw new UnsupportedOperationException( @@ -516,7 +514,7 @@ case 
class DataSource( val plan = data.logicalPlan plan.resolve(name :: Nil, data.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( - s"Unable to resolve ${name} given [${plan.output.map(_.name).mkString(", ")}]") + s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]") }.asInstanceOf[Attribute] } // For partitioned relation r, r.schema's column ordering can be different from the column http://git-wip-us.apache.org/repos/asf/spark/blob/85dd0737/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 902cf05..0f
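For context, a minimal sketch of what this change enables from the user's side: any built-in `FileFormat` can now back a streaming file sink, not just Parquet. The paths, app name, and checkpoint location below are hypothetical, and the sketch assumes Spark 2.1.

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

val spark = SparkSession.builder().appName("file-sink-formats").getOrCreate()
import spark.implicits._

val lines = spark.readStream.format("text").load("/tmp/stream-in")

// Before this patch only format("parquet") reached the FileStreamSink branch;
// "json", "text" and "csv" fell through to UnsupportedOperationException.
val query = lines.as[String].map(_.toUpperCase).writeStream
  .format("json") // likewise "text", "csv" or "parquet"
  .option("path", "/tmp/stream-out")
  .option("checkpointLocation", "/tmp/stream-checkpoint")
  .outputMode(OutputMode.Append) // FileStreamSink still supports only Append mode
  .start()
```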
spark git commit: [SPARK-18192] Support all file formats in structured streaming
Repository: spark Updated Branches: refs/heads/master abefe2ec4 -> a36653c5b [SPARK-18192] Support all file formats in structured streaming ## What changes were proposed in this pull request? This patch adds support for all file formats in structured streaming sinks. This is actually a very small change thanks to all the previous refactoring done using the new internal commit protocol API. ## How was this patch tested? Updated FileStreamSinkSuite to add test cases for json, text, and parquet. Author: Reynold Xin Closes #15711 from rxin/SPARK-18192. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a36653c5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a36653c5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a36653c5 Branch: refs/heads/master Commit: a36653c5b7b2719f8bfddf4ddfc6e1b828ac9af1 Parents: abefe2e Author: Reynold Xin Authored: Tue Nov 1 23:37:03 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:37:03 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 8 +-- .../sql/streaming/FileStreamSinkSuite.scala | 62 +--- 2 files changed, 32 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a36653c5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index d980e6a..3f956c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -37,7 +36,6 @@ import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -292,7 +290,7 @@ case class DataSource( case s: StreamSinkProvider => s.createSink(sparkSession.sqlContext, options, partitionColumns, outputMode) - case parquet: parquet.ParquetFileFormat => + case fileFormat: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val path = caseInsensitiveOptions.getOrElse("path", { throw new IllegalArgumentException("'path' is not specified") @@ -301,7 +299,7 @@ case class DataSource( throw new IllegalArgumentException( s"Data source $className does not support $outputMode output mode") } -new FileStreamSink(sparkSession, path, parquet, partitionColumns, options) +new FileStreamSink(sparkSession, path, fileFormat, partitionColumns, options) case _ => throw new UnsupportedOperationException( @@ -516,7 +514,7 @@ case class DataSource( val plan = data.logicalPlan plan.resolve(name :: Nil, 
data.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( - s"Unable to resolve ${name} given [${plan.output.map(_.name).mkString(", ")}]") + s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]") }.asInstanceOf[Attribute] } // For partitioned relation r, r.schema's column ordering can be different from the column http://git-wip-us.apache.org/repos/asf/spark/blob/a36653c5/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 902cf05..0f140f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala ++
spark git commit: [SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PARTITION for Datasource tables
Repository: spark Updated Branches: refs/heads/branch-2.1 39d2fdb51 -> e6509c245 [SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PARTITION for Datasource tables There are a couple issues with the current 2.1 behavior when inserting into Datasource tables with partitions managed by Hive. (1) OVERWRITE TABLE ... PARTITION will actually overwrite the entire table instead of just the specified partition. (2) INSERT|OVERWRITE does not work with partitions that have custom locations. This PR fixes both of these issues for Datasource tables managed by Hive. The behavior for legacy tables or when `manageFilesourcePartitions = false` is unchanged. There is one other issue in that INSERT OVERWRITE with dynamic partitions will overwrite the entire table instead of just the updated partitions, but this behavior is pretty complicated to implement for Datasource tables. We should address that in a future release. Unit tests. Author: Eric Liang Closes #15705 from ericl/sc-4942. (cherry picked from commit abefe2ec428dc24a4112c623fb6fbe4b2ca60a2b) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6509c24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6509c24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6509c24 Branch: refs/heads/branch-2.1 Commit: e6509c2459e7ece3c3c6bcd143b8cc71f8f4d5c8 Parents: 39d2fdb Author: Eric Liang Authored: Wed Nov 2 14:15:10 2016 +0800 Committer: Reynold Xin Committed: Tue Nov 1 23:23:55 2016 -0700 -- .../apache/spark/sql/catalyst/dsl/package.scala | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 9 +++- .../plans/logical/basicLogicalOperators.scala | 19 ++- .../sql/catalyst/parser/PlanParserSuite.scala | 15 -- .../org/apache/spark/sql/DataFrameWriter.scala | 4 +- .../datasources/CatalogFileIndex.scala | 5 +- .../datasources/DataSourceStrategy.scala| 30 +-- .../InsertIntoDataSourceCommand.scala | 6 +-- .../apache/spark/sql/hive/HiveStrategies.scala | 3 +- .../CreateHiveTableAsSelectCommand.scala| 5 +- .../PartitionProviderCompatibilitySuite.scala | 52 11 files changed, 129 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 66e52ca..e901683 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -367,7 +367,7 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan = InsertIntoTable( analysis.UnresolvedRelation(TableIdentifier(tableName)), - Map.empty, logicalPlan, overwrite, false) + Map.empty, logicalPlan, OverwriteOptions(overwrite), false) def as(alias: String): LogicalPlan = logicalPlan match { case UnresolvedRelation(tbl, _) => UnresolvedRelation(tbl, Option(alias)) http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 38e9bb6..ac1577b 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -177,12 +177,19 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. Specified " + "partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) } +val overwrite = ctx.OVERWRITE != null +val overwritePartition = + if (overwrite && partitionKeys.nonEmpty && dynamicPartitionKeys.isEmpty) { +Some(partitionKeys.map(t => (t._1, t._2.get))) + } else { +None + } InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, query, - ctx.OVERWRITE != null, + OverwriteOptions(overwrite, overwritePartition), ctx.EXISTS != null) } http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/s
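To make the fixed semantics concrete, a hedged sketch follows; the table names `logs` and `staging` are hypothetical, and it assumes a Datasource table whose partitions are tracked in the metastore (`spark.sql.hive.manageFilesourcePartitions` left at its 2.1 default of true):

```
spark.sql("CREATE TABLE logs (msg STRING, day STRING) USING parquet PARTITIONED BY (day)")

// Static partition spec: with this fix only day='2016-11-01' is replaced;
// previously the entire table's contents were overwritten.
spark.sql("INSERT OVERWRITE TABLE logs PARTITION (day = '2016-11-01') SELECT 'hello'")

// Dynamic partition spec: still overwrites the whole table in 2.1, the
// limitation the commit message defers to a future release.
spark.sql("INSERT OVERWRITE TABLE logs PARTITION (day) SELECT msg, day FROM staging")
```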
spark git commit: [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files
Repository: spark Updated Branches: refs/heads/branch-2.1 1bbf9ff63 -> 39d2fdb51 [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files ## What changes were proposed in this pull request? When the metadata logs for various parts of Structured Streaming are stored on non-HDFS filesystems such as NFS or ext4, the HDFSMetadataLog class leaves hidden HDFS-style checksum (CRC) files in the log directory, one file per batch. This PR modifies HDFSMetadataLog so that it detects the use of a filesystem that doesn't use CRC files and removes the CRC files. ## How was this patch tested? Modified an existing test case in HDFSMetadataLogSuite to check whether HDFSMetadataLog correctly removes CRC files on the local POSIX filesystem. Ran the entire regression suite. Author: frreiss Closes #15027 from frreiss/fred-17475. (cherry picked from commit 620da3b4828b3580c7ed7339b2a07938e6be1bb1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39d2fdb5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39d2fdb5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39d2fdb5 Branch: refs/heads/branch-2.1 Commit: 39d2fdb51233ed9b1aaf3adaa3267853f5e58c0f Parents: 1bbf9ff Author: frreiss Authored: Tue Nov 1 23:00:17 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:00:28 2016 -0700 -- .../apache/spark/sql/execution/streaming/HDFSMetadataLog.scala | 5 + .../spark/sql/execution/streaming/HDFSMetadataLogSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39d2fdb5/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c7235320..9a0f87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -148,6 +148,11 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String) // It will fail if there is an existing file (someone has committed the batch) logDebug(s"Attempting to write log #${batchIdToPath(batchId)}") fileManager.rename(tempPath, batchIdToPath(batchId)) + + // SPARK-17475: HDFSMetadataLog should not leak CRC files + // If the underlying filesystem didn't rename the CRC file, delete it. 
+ val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc") + if (fileManager.exists(crcPath)) fileManager.delete(crcPath) return } catch { case e: IOException if isFileAlreadyExistsException(e) => http://git-wip-us.apache.org/repos/asf/spark/blob/39d2fdb5/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 9c1d26d..d03e08d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -119,6 +119,12 @@ class HDFSMetadataLogSuite extends SparkFunSuite with SharedSQLContext { assert(metadataLog.get(1).isEmpty) assert(metadataLog.get(2).isDefined) assert(metadataLog.getLatest().get._1 == 2) + + // There should be exactly one file, called "2", in the metadata directory. + // This check also tests for regressions of SPARK-17475 + val allFiles = new File(metadataLog.metadataPath.toString).listFiles().toSeq + assert(allFiles.size == 1) + assert(allFiles(0).getName() == "2") } }
spark git commit: [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files
Repository: spark Updated Branches: refs/heads/master 1bbf9ff63 -> 620da3b48 [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files ## What changes were proposed in this pull request? When the metadata logs for various parts of Structured Streaming are stored on non-HDFS filesystems such as NFS or ext4, the HDFSMetadataLog class leaves hidden HDFS-style checksum (CRC) files in the log directory, one file per batch. This PR modifies HDFSMetadataLog so that it detects the use of a filesystem that doesn't use CRC files and removes the CRC files. ## How was this patch tested? Modified an existing test case in HDFSMetadataLogSuite to check whether HDFSMetadataLog correctly removes CRC files on the local POSIX filesystem. Ran the entire regression suite. Author: frreiss Closes #15027 from frreiss/fred-17475. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/620da3b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/620da3b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/620da3b4 Branch: refs/heads/master Commit: 620da3b4828b3580c7ed7339b2a07938e6be1bb1 Parents: 1bbf9ff Author: frreiss Authored: Tue Nov 1 23:00:17 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:00:17 2016 -0700 -- .../apache/spark/sql/execution/streaming/HDFSMetadataLog.scala | 5 + .../spark/sql/execution/streaming/HDFSMetadataLogSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/620da3b4/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c7235320..9a0f87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -148,6 +148,11 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String) // It will fail if there is an existing file (someone has committed the batch) logDebug(s"Attempting to write log #${batchIdToPath(batchId)}") fileManager.rename(tempPath, batchIdToPath(batchId)) + + // SPARK-17475: HDFSMetadataLog should not leak CRC files + // If the underlying filesystem didn't rename the CRC file, delete it. 
+ val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc") + if (fileManager.exists(crcPath)) fileManager.delete(crcPath) return } catch { case e: IOException if isFileAlreadyExistsException(e) => http://git-wip-us.apache.org/repos/asf/spark/blob/620da3b4/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 9c1d26d..d03e08d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -119,6 +119,12 @@ class HDFSMetadataLogSuite extends SparkFunSuite with SharedSQLContext { assert(metadataLog.get(1).isEmpty) assert(metadataLog.get(2).isDefined) assert(metadataLog.getLatest().get._1 == 2) + + // There should be exactly one file, called "2", in the metadata directory. + // This check also tests for regressions of SPARK-17475 + val allFiles = new File(metadataLog.metadataPath.toString).listFiles().toSeq + assert(allFiles.size == 1) + assert(allFiles(0).getName() == "2") } }
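The naming convention the fix relies on: Hadoop's `ChecksumFileSystem` shadows every file `name` with a hidden `.name.crc` twin, so the stale checksum's path can be derived from the temp file's path. A small illustration (the path is hypothetical):

```
import org.apache.hadoop.fs.Path

val tempPath = new Path("/metadata/.batch-2.tmp") // hypothetical temp log file
val crcPath = new Path(tempPath.getParent, s".${tempPath.getName}.crc")
println(crcPath) // prints /metadata/..batch-2.tmp.crc — the hidden checksum twin
```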
[spark] Git Push Summary
Repository: spark Updated Branches: refs/heads/branch-2.1 [created] 1bbf9ff63
spark git commit: [SPARK-17992][SQL] Return all partitions from HiveShim when Hive throws a metastore exception when attempting to fetch partitions by filter
Repository: spark Updated Branches: refs/heads/master 1ecfafa08 -> 1bbf9ff63 [SPARK-17992][SQL] Return all partitions from HiveShim when Hive throws a metastore exception when attempting to fetch partitions by filter (Link to Jira issue: https://issues.apache.org/jira/browse/SPARK-17992) ## What changes were proposed in this pull request? We recently added table partition pruning for partitioned Hive tables converted to using `TableFileCatalog`. When the Hive configuration option `hive.metastore.try.direct.sql` is set to `false`, Hive will throw an exception for unsupported filter expressions. For example, attempting to filter on an integer partition column will throw a `org.apache.hadoop.hive.metastore.api.MetaException`. I discovered this behavior because VideoAmp uses the CDH version of Hive with a Postgresql metastore DB. In this configuration, CDH sets `hive.metastore.try.direct.sql` to `false` by default, and queries that filter on a non-string partition column will fail. Rather than throw an exception in query planning, this patch catches this exception, logs a warning and returns all table partitions instead. Clients of this method are already expected to handle the possibility that the filters will not be honored. ## How was this patch tested? A unit test was added. Author: Michael Allman Closes #15673 from mallman/spark-17992-catch_hive_partition_filter_exception. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bbf9ff6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bbf9ff6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bbf9ff6 Branch: refs/heads/master Commit: 1bbf9ff634745148e782370009aa31d3a042638c Parents: 1ecfafa Author: Michael Allman Authored: Tue Nov 1 22:20:19 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 22:20:19 2016 -0700 -- .../apache/spark/sql/hive/client/HiveShim.scala | 31 ++-- .../sql/hive/client/HiveClientBuilder.scala | 56 ++ .../spark/sql/hive/client/HiveClientSuite.scala | 61 .../spark/sql/hive/client/VersionsSuite.scala | 77 +--- 4 files changed, 160 insertions(+), 65 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1bbf9ff6/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 85edaf6..3d9642d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -29,7 +29,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hadoop.hive.metastore.api.{Function => HiveFunction, FunctionType, NoSuchObjectException, PrincipalType, ResourceType, ResourceUri} +import org.apache.hadoop.hive.metastore.api.{Function => HiveFunction, FunctionType, MetaException, PrincipalType, ResourceType, ResourceUri} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.metadata.{Hive, HiveException, Partition, Table} import org.apache.hadoop.hive.ql.plan.AddPartitionDesc @@ -43,6 +43,7 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, FunctionResource, FunctionResourceType} import 
org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegralType, StringType} import org.apache.spark.util.Utils @@ -586,17 +587,31 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] } else { logDebug(s"Hive metastore filter is '$filter'.") +val tryDirectSqlConfVar = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL +val tryDirectSql = + hive.getConf.getBoolean(tryDirectSqlConfVar.varname, tryDirectSqlConfVar.defaultBoolVal) try { + // Hive may throw an exception when calling this method in some circumstances, such as + // when filtering on a non-string partition column when the hive config key + // hive.metastore.try.direct.sql is false getPartitionsByFilterMethod.invoke(hive, table, filter) .asInstanceOf[JArrayList[Partition]] } catch { - case e: InvocationTargetException => -// SPARK-18167 retry to investigate the flaky test. This should be revert
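In outline, the fallback looks like the sketch below. This is a simplified, self-contained rendering rather than the actual `HiveShim` code, which goes through reflection and Hive's `MetaException`; all names here are stand-ins:

```
import scala.util.control.NonFatal

case class Partition(spec: Map[String, String]) // stand-in for Hive's partition type

// tryFilter may throw (e.g. when hive.metastore.try.direct.sql=false);
// getAll is the safe but slower fallback.
def getPartitionsByFilter(
    tryFilter: () => Seq[Partition],
    getAll: () => Seq[Partition]): Seq[Partition] = {
  try {
    tryFilter()
  } catch {
    case NonFatal(e) =>
      // Over-returning is safe: callers re-apply the predicate to whatever
      // partitions come back, so only performance is affected.
      Console.err.println(s"Caught exception fetching partitions by filter; " +
        s"returning all partitions instead: ${e.getMessage}")
      getAll()
  }
}
```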
spark git commit: [SPARK-18216][SQL] Make Column.expr public
Repository: spark Updated Branches: refs/heads/master 77a98162d -> ad4832a9f [SPARK-18216][SQL] Make Column.expr public ## What changes were proposed in this pull request? Column.expr is private[sql], but it's actually a really useful field to have for debugging. We should open it up, similar to how we use QueryExecution. ## How was this patch tested? N/A - this is a simple visibility change. Author: Reynold Xin Closes #15724 from rxin/SPARK-18216. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad4832a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad4832a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad4832a9 Branch: refs/heads/master Commit: ad4832a9faf2c0c869bbcad9d71afe1cecbd3ec8 Parents: 77a9816 Author: Reynold Xin Authored: Tue Nov 1 21:20:53 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 21:20:53 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/Column.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad4832a9/sql/core/src/main/scala/org/apache/spark/sql/Column.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 05e867b..249408e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -118,6 +118,9 @@ class TypedColumn[-T, U]( * $"a" === $"b" * }}} * + * Note that the internal Catalyst expression can be accessed via "expr", but this method is for + * debugging purposes only and can change in any future Spark releases. + * * @groupname java_expr_ops Java-specific expression operators * @groupname expr_ops Expression operators * @groupname df_ops DataFrame functions @@ -126,7 +129,7 @@ class TypedColumn[-T, U]( * @since 1.3.0 */ @InterfaceStability.Stable -class Column(protected[sql] val expr: Expression) extends Logging { +class Column(val expr: Expression) extends Logging { def this(name: String) = this(name match { case "*" => UnresolvedStar(None)
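With the field public, the Catalyst tree behind a `Column` can be inspected directly; the printed output below is illustrative:

```
import org.apache.spark.sql.functions.col

val c = col("a") + 1
println(c.expr)          // e.g. ('a + 1), the unresolved Add expression
println(c.expr.getClass) // the Catalyst Expression node backing the Column
```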
spark git commit: [SPARK-18182] Expose ReplayListenerBus.read() overload which takes string iterator
Repository: spark Updated Branches: refs/heads/master 6e6298154 -> b929537b6 [SPARK-18182] Expose ReplayListenerBus.read() overload which takes string iterator The `ReplayListenerBus.read()` method is used when implementing a custom `ApplicationHistoryProvider`. The current interface only exposes a `read()` method which takes an `InputStream` and performs stream-to-lines conversion itself, but it would also be useful to expose an overloaded method which accepts an iterator of strings, thereby enabling events to be provided from non-`InputStream` sources. Author: Josh Rosen Closes #15698 from JoshRosen/replay-listener-bus-interface. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b929537b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b929537b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b929537b Branch: refs/heads/master Commit: b929537b6eb0f8f34497c3dbceea8045bf5dffdb Parents: 6e62981 Author: Josh Rosen Authored: Tue Nov 1 16:49:41 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 16:49:41 2016 -0700 -- .../apache/spark/scheduler/ReplayListenerBus.scala | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b929537b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 2424586..0bd5a6b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -53,13 +53,24 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { sourceName: String, maybeTruncated: Boolean = false, eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { +val lines = Source.fromInputStream(logData).getLines() +replay(lines, sourceName, maybeTruncated, eventsFilter) + } + /** + * Overloaded variant of [[replay()]] which accepts an iterator of lines instead of an + * [[InputStream]]. Exposed for use by custom ApplicationHistoryProvider implementations. + */ + def replay( + lines: Iterator[String], + sourceName: String, + maybeTruncated: Boolean, + eventsFilter: ReplayEventsFilter): Unit = { var currentLine: String = null var lineNumber: Int = 0 try { - val lineEntries = Source.fromInputStream(logData) -.getLines() + val lineEntries = lines .zipWithIndex .filter { case (line, _) => eventsFilter(line) }
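A hedged sketch of how the new overload could be driven from a non-`InputStream` source such as a database cursor. `ReplayListenerBus` is `private[spark]`, so real code has to live under an `org.apache.spark` package; the object name and source name below are hypothetical:

```
package org.apache.spark.deploy.history // hypothetical location inside the spark namespace

import org.apache.spark.scheduler.ReplayListenerBus
import org.apache.spark.scheduler.ReplayListenerBus.SELECT_ALL_FILTER

object StringSourceReplay {
  // eventJsonLines: JSON-encoded SparkListenerEvents, one per element,
  // e.g. rows streamed out of an event store rather than a log file.
  def replay(eventJsonLines: Iterator[String]): Unit = {
    val bus = new ReplayListenerBus()
    // listeners would be registered on the bus before replaying
    bus.replay(
      eventJsonLines,
      sourceName = "my-event-store", // used only in log/error messages
      maybeTruncated = false,
      eventsFilter = SELECT_ALL_FILTER)
  }
}
```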
spark git commit: [SPARK-17350][SQL] Disable default use of KryoSerializer in Thrift Server
Repository: spark Updated Branches: refs/heads/master 01dd00830 -> 6e6298154 [SPARK-17350][SQL] Disable default use of KryoSerializer in Thrift Server In SPARK-4761 / #3621 (December 2014) we enabled Kryo serialization by default in the Spark Thrift Server. However, I don't think that the original rationale for doing this still holds now that most Spark SQL serialization is now performed via encoders and our UnsafeRow format. In addition, the use of Kryo as the default serializer can introduce performance problems because the creation of new KryoSerializer instances is expensive and we haven't performed instance-reuse optimizations in several code paths (including DirectTaskResult deserialization). Given all of this, I propose to revert back to using JavaSerializer as the default serializer in the Thrift Server. /cc liancheng Author: Josh Rosen Closes #14906 from JoshRosen/disable-kryo-in-thriftserver. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e629815 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e629815 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e629815 Branch: refs/heads/master Commit: 6e6298154aba63831a292117797798131a646869 Parents: 01dd008 Author: Josh Rosen Authored: Tue Nov 1 16:23:47 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 16:23:47 2016 -0700 -- docs/configuration.md | 5 ++--- .../apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala | 10 -- 2 files changed, 2 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e629815/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 780fc94..0017219 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -767,7 +767,7 @@ Apart from these, the following properties are also available, and may be useful spark.kryo.referenceTracking - true (false when using Spark SQL Thrift Server) + true Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple @@ -838,8 +838,7 @@ Apart from these, the following properties are also available, and may be useful spark.serializer -org.apache.spark.serializer.JavaSerializer (org.apache.spark.serializer. 
-KryoSerializer when using Spark SQL Thrift Server) +org.apache.spark.serializer.JavaSerializer Class to use for serializing objects that will be sent over the network or need to be cached http://git-wip-us.apache.org/repos/asf/spark/blob/6e629815/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 6389115..78a3094 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream -import scala.collection.JavaConverters._ - import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} @@ -37,8 +35,6 @@ private[hive] object SparkSQLEnv extends Logging { def init() { if (sqlContext == null) { val sparkConf = new SparkConf(loadDefaults = true) - val maybeSerializer = sparkConf.getOption("spark.serializer") - val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf @@ -47,12 +43,6 @@ private[hive] object SparkSQLEnv extends Logging { sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) -.set( - "spark.serializer", - maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) -.set( - "spark.kryo.referenceTracking", - maybeKryoReferenceTracking.getOrElse("false")) val sparkSession = SparkSession.builder.config(sparkConf).enableHiveSupport().getOrCreate() sparkContext = sparkSession.sparkContext
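Deployments that benefited from the old defaults can restore them explicitly; a sketch using the two keys removed above, which can equally be set in `spark-defaults.conf`:

```
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.referenceTracking", "false")
```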
spark git commit: [SPARK-18114][HOTFIX] Fix line-too-long style error from backport of SPARK-18114
Repository: spark Updated Branches: refs/heads/branch-2.0 4176da8be -> a01b95060 [SPARK-18114][HOTFIX] Fix line-too-long style error from backport of SPARK-18114 ## What changes were proposed in this pull request? Fix style error introduced in cherry-pick of https://github.com/apache/spark/pull/15643 to branch-2.0. ## How was this patch tested? Existing tests Author: Sean Owen Closes #15719 from srowen/SPARK-18114.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a01b9506 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a01b9506 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a01b9506 Branch: refs/heads/branch-2.0 Commit: a01b950602c4bb56c5a7d6213cdf6b7515ff36ec Parents: 4176da8 Author: Sean Owen Authored: Tue Nov 1 12:43:50 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 12:43:50 2016 -0700 -- .../spark/scheduler/cluster/mesos/MesosClusterScheduler.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a01b9506/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index cbf97c3..94827e4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -448,7 +448,8 @@ private[spark] class MesosClusterScheduler( } desc.schedulerProperties .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) } - .foreach { case (key, value) => options ++= Seq("--conf", s$key=${shellEscape(value)}.stripMargin) } + .foreach { case (key, value) => +options ++= Seq("--conf", s$key=${shellEscape(value)}.stripMargin) } options }
spark git commit: [SPARK-18167] Disable flaky SQLQuerySuite test
Repository: spark Updated Branches: refs/heads/master d0272b436 -> cfac17ee1 [SPARK-18167] Disable flaky SQLQuerySuite test We now know it's a persistent environmental issue that is causing this test to sometimes fail. One hypothesis is that some configuration is leaked from another suite, and depending on suite ordering this can cause this test to fail. I am planning on mining the jenkins logs to try to narrow down which suite could be causing this. For now, disable the test. Author: Eric Liang Closes #15720 from ericl/disable-flaky-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cfac17ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cfac17ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cfac17ee Branch: refs/heads/master Commit: cfac17ee1cec414663b957228e469869eb7673c1 Parents: d0272b4 Author: Eric Liang Authored: Tue Nov 1 12:35:34 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 12:35:34 2016 -0700 -- .../scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cfac17ee/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8b91693..b9353b5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1565,7 +1565,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } - test("SPARK-10562: partition by column with mixed case name") { + ignore("SPARK-10562: partition by column with mixed case name") { def runOnce() { withTable("tbl10562") { val df = Seq(2012 -> "a").toDF("Year", "val")
spark git commit: [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy
Repository: spark Updated Branches: refs/heads/master 8a538c97b -> d0272b436 [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy ## What changes were proposed in this pull request? Aggregation without window/group-by expressions will fail in `checkAnalysis`, but the error message is misleading; we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query (actually it's added by the function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually tested. Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also added new test SQL statements in `group-by.sql`. Author: jiangxingbo Closes #15672 from jiangxb1987/groupBy-empty. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0272b43 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0272b43 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0272b43 Branch: refs/heads/master Commit: d0272b436512b71f04313e109d3d21a6e9deefca Parents: 8a538c9 Author: jiangxingbo Authored: Tue Nov 1 11:25:11 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:25:11 2016 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 ++ .../resources/sql-tests/inputs/group-by.sql | 41 +-- .../sql-tests/results/group-by.sql.out | 116 --- .../org/apache/spark/sql/SQLQuerySuite.scala| 35 -- 4 files changed, 140 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d0272b43/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9a7c2a9..3455a56 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper { s"appear in the arguments of an aggregate function.") } } + case e: Attribute if groupingExprs.isEmpty => +// Collect all [[AggregateExpressions]]s.
+val aggExprs = aggregateExprs.filter(_.collect { + case a: AggregateExpression => a +}.nonEmpty) +failAnalysis( + s"grouping expressions sequence is empty, " + +s"and '${e.sql}' is not an aggregate function. " + +s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " + +s"function(s) or wrap '${e.sql}' in first() (or first_value) " + +s"if you don't care which value you get." +) case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) => failAnalysis( s"expression '${e.sql}' is neither present in the group by, " + http://git-wip-us.apache.org/repos/asf/spark/blob/d0272b43/sql/core/src/test/resources/sql-tests/inputs/group-by.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 6741703..d950ec8 100644 --- a/sql/core/src/test/reso
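The two remedies the new message suggests, as a runnable sketch (assumes an active `SparkSession` named `spark`; the table and column names are hypothetical):

```
import spark.implicits._

Seq(("a", 1), ("b", 2)).toDF("col", "n").createOrReplaceTempView("tbl")

// spark.sql("select col, count(col) from tbl") // fails analysis as shown above

// Either group by the non-aggregate column...
spark.sql("select col, count(col) from tbl group by col").show()
// ...or wrap it in first() when any value per group will do:
spark.sql("select first(col), count(col) from tbl").show()
```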
spark git commit: [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy
Repository: spark Updated Branches: refs/heads/branch-2.0 58655f51f -> 4176da8be [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy ## What changes were proposed in this pull request? Aggregation without window/group-by expressions will fail in `checkAnalysis`, but the error message is misleading; we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query (actually it's added by the function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually tested. Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also added new test SQL statements in `group-by.sql`. Author: jiangxingbo Closes #15672 from jiangxb1987/groupBy-empty. (cherry picked from commit d0272b436512b71f04313e109d3d21a6e9deefca) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4176da8b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4176da8b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4176da8b Branch: refs/heads/branch-2.0 Commit: 4176da8be57bb0b36b9f2c580a547713c2048d17 Parents: 58655f5 Author: jiangxingbo Authored: Tue Nov 1 11:25:11 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:25:18 2016 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 ++ .../resources/sql-tests/inputs/group-by.sql | 41 +-- .../sql-tests/results/group-by.sql.out | 116 --- .../org/apache/spark/sql/SQLQuerySuite.scala| 35 -- 4 files changed, 140 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4176da8b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 790566c..10e0eef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper { s"appear in the arguments of an aggregate function.") } } + case e: Attribute if groupingExprs.isEmpty => +// Collect all [[AggregateExpressions]]s.
+val aggExprs = aggregateExprs.filter(_.collect { + case a: AggregateExpression => a +}.nonEmpty) +failAnalysis( + s"grouping expressions sequence is empty, " + +s"and '${e.sql}' is not an aggregate function. " + +s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " + +s"function(s) or wrap '${e.sql}' in first() (or first_value) " + +s"if you don't care which value you get." +) case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) => failAnalysis( s"expression '${e.sql}' is neither present in the group by, " + http://git-wip-us.apache.org/repos/asf/spark/blob/4176da8b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/s
spark git commit: [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/branch-2.0 4d2672a40 -> 58655f51f [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset ## What changes were proposed in this pull request? Likewise [DataSet.scala](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala#L156) KeyValueGroupedDataset should mark the queryExecution as transient. As mentioned in the Jira ticket, without transient we saw serialization issues like ``` Caused by: java.io.NotSerializableException: org.apache.spark.sql.execution.QueryExecution Serialization stack: - object not serializable (class: org.apache.spark.sql.execution.QueryExecution, value: == ``` ## How was this patch tested? Run the query which is specified in the Jira ticket before and after: ``` val a = spark.createDataFrame(sc.parallelize(Seq((1,2),(3,4.as[(Int,Int)] val grouped = a.groupByKey( {x:(Int,Int)=>x._1} ) val mappedGroups = grouped.mapGroups((k,x)=> {(k,1)} ) val yyy = sc.broadcast(1) val last = mappedGroups.rdd.map(xx=> { val simpley = yyy.value 1 } ) ``` Author: Ergin Seyfe Closes #15706 from seyfe/keyvaluegrouped_serialization. (cherry picked from commit 8a538c97b556f80f67c80519af0ce879557050d5) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58655f51 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58655f51 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58655f51 Branch: refs/heads/branch-2.0 Commit: 58655f51f65d852ec65a65b54f26b3c8eac8cc60 Parents: 4d2672a Author: Ergin Seyfe Authored: Tue Nov 1 11:18:42 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:18:50 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 + .../apache/spark/sql/KeyValueGroupedDataset.scala | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58655f51/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index f7d7a4f..8deafe3 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,4 +473,21 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } + + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val output = runInterpreter("local", + s""" + |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) + |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) + |val broadcasted = sc.broadcast($resultValue) + | + |// Using broadcast triggers serialization issue in KeyValueGroupedDataset + |val dataset = mapGroups.map(_ => broadcasted.value) + |dataset.collect() + """.stripMargin) +assertDoesNotContain("error:", output) +assertDoesNotContain("Exception", output) +assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/58655f51/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 8eec42a..407d036 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.expressions.ReduceAggregator class KeyValueGroupedDataset[K, V] private[sql]( kEncoder: Encoder[K], vEncoder: Encoder[V], -val queryExecution: QueryExecution, +@transient val queryExecution: QueryExecution, private val dataAttributes: Seq[Attribute], private val groupingAttributes: Seq[Attribute]) extends Serializable {
spark git commit: [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/master 8cdf143f4 -> 8a538c97b [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset ## What changes were proposed in this pull request? Likewise [DataSet.scala](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala#L156) KeyValueGroupedDataset should mark the queryExecution as transient. As mentioned in the Jira ticket, without transient we saw serialization issues like ``` Caused by: java.io.NotSerializableException: org.apache.spark.sql.execution.QueryExecution Serialization stack: - object not serializable (class: org.apache.spark.sql.execution.QueryExecution, value: == ``` ## How was this patch tested? Run the query which is specified in the Jira ticket before and after: ``` val a = spark.createDataFrame(sc.parallelize(Seq((1,2),(3,4.as[(Int,Int)] val grouped = a.groupByKey( {x:(Int,Int)=>x._1} ) val mappedGroups = grouped.mapGroups((k,x)=> {(k,1)} ) val yyy = sc.broadcast(1) val last = mappedGroups.rdd.map(xx=> { val simpley = yyy.value 1 } ) ``` Author: Ergin Seyfe Closes #15706 from seyfe/keyvaluegrouped_serialization. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a538c97 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a538c97 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a538c97 Branch: refs/heads/master Commit: 8a538c97b556f80f67c80519af0ce879557050d5 Parents: 8cdf143 Author: Ergin Seyfe Authored: Tue Nov 1 11:18:42 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:18:42 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 + .../apache/spark/sql/KeyValueGroupedDataset.scala | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a538c97/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 9262e93..96d2dfc 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,4 +473,21 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } + + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val output = runInterpreter("local", + s""" + |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) + |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) + |val broadcasted = sc.broadcast($resultValue) + | + |// Using broadcast triggers serialization issue in KeyValueGroupedDataset + |val dataset = mapGroups.map(_ => broadcasted.value) + |dataset.collect() + """.stripMargin) +assertDoesNotContain("error:", output) +assertDoesNotContain("Exception", output) +assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/8a538c97/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 4cb0313..31ce8eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.expressions.ReduceAggregator class KeyValueGroupedDataset[K, V] private[sql]( kEncoder: Encoder[K], vEncoder: Encoder[V], -val queryExecution: QueryExecution, +@transient val queryExecution: QueryExecution, private val dataAttributes: Seq[Attribute], private val groupingAttributes: Seq[Attribute]) extends Serializable {
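The mechanism behind the one-line fix, as a self-contained illustration: a field marked `@transient` is skipped by Java serialization, so an enclosing `Serializable` object can cross the closure boundary even when the field's type (here a stand-in for `QueryExecution`) is not serializable:

```
class NotSer // no Serializable marker, like QueryExecution

class Holder(@transient val heavy: NotSer) extends Serializable

val out = new java.io.ObjectOutputStream(new java.io.ByteArrayOutputStream())
out.writeObject(new Holder(new NotSer)) // succeeds: `heavy` is skipped
// Without @transient this line would throw java.io.NotSerializableException.
```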
spark git commit: [SPARK-18103][FOLLOW-UP][SQL][MINOR] Rename `MetadataLogFileCatalog` to `MetadataLogFileIndex`
Repository: spark Updated Branches: refs/heads/master 8ac09108f -> 8cdf143f4 [SPARK-18103][FOLLOW-UP][SQL][MINOR] Rename `MetadataLogFileCatalog` to `MetadataLogFileIndex` ## What changes were proposed in this pull request? This is a follow-up to https://github.com/apache/spark/pull/15634. ## How was this patch tested? N/A Author: Liwei Lin Closes #15712 from lw-lin/18103. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8cdf143f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8cdf143f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8cdf143f Branch: refs/heads/master Commit: 8cdf143f4b1ca5c6bc0256808e6f42d9ef299cbd Parents: 8ac0910 Author: Liwei Lin Authored: Tue Nov 1 11:17:35 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:17:35 2016 -0700 -- .../streaming/MetadataLogFileCatalog.scala | 60 .../streaming/MetadataLogFileIndex.scala| 60 2 files changed, 60 insertions(+), 60 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8cdf143f/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala deleted file mode 100644 index aeaa134..000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.streaming - -import scala.collection.mutable - -import org.apache.hadoop.fs.{FileStatus, Path} - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.datasources._ - - -/** - * A [[FileIndex]] that generates the list of files to processing by reading them from the - * metadata log files generated by the [[FileStreamSink]]. 
- */ -class MetadataLogFileIndex(sparkSession: SparkSession, path: Path) - extends PartitioningAwareFileIndex(sparkSession, Map.empty, None) { - - private val metadataDirectory = new Path(path, FileStreamSink.metadataDir) - logInfo(s"Reading streaming file log from $metadataDirectory") - private val metadataLog = -new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString) - private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory) - private var cachedPartitionSpec: PartitionSpec = _ - - override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = { -new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f) - } - - override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = { -allFilesFromLog.toArray.groupBy(_.getPath.getParent) - } - - override def rootPaths: Seq[Path] = path :: Nil - - override def refresh(): Unit = { } - - override def partitionSpec(): PartitionSpec = { -if (cachedPartitionSpec == null) { - cachedPartitionSpec = inferPartitioning() -} -cachedPartitionSpec - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/8cdf143f/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala new file mode 100644 index 000..aeaa134 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding
spark git commit: [SPARK-18107][SQL] Insert overwrite statement runs much slower in spark-sql than it does in hive-client
Repository: spark Updated Branches: refs/heads/master d9d146500 -> dd85eb544 [SPARK-18107][SQL] Insert overwrite statement runs much slower in spark-sql than it does in hive-client ## What changes were proposed in this pull request? As reported on the JIRA, an insert overwrite statement runs much slower in Spark than in hive-client. There is a patch [HIVE-11940](https://github.com/apache/hive/commit/ba21806b77287e237e1aa68fa169d2a81e07346d) which largely improves insert overwrite performance in Hive, but it landed after Hive 2.0.0, and because Spark SQL uses an older Hive library we cannot benefit from that improvement. The reporter verified that there is also a big performance gap between Hive 1.2.1 (520.037 secs) and Hive 2.0.1 (35.975 secs) on insert overwrite execution. Instead of upgrading to Hive 2.0 in Spark SQL, which might not be a trivial task, this patch deletes the partition before asking Hive to load data files into it. Note: the case reported on the JIRA is insert overwrite into a partition. Since `Hive.loadTable` also uses the same function to replace files, insert overwrite into a table should have the same issue; we can take the same approach and delete the table's data first. A follow-up will extend this patch to cover that case. ## How was this patch tested? Jenkins tests. There are existing tests using insert overwrite statements; those should pass. I also added a new test specifically for insert overwrite into a partition. As for the performance improvement, I don't have a Hive 2.0 environment, so verification is left to the reporter; please refer to the JIRA. Author: Liang-Chi Hsieh Closes #15667 from viirya/improve-hive-insertoverwrite.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd85eb54 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd85eb54 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd85eb54 Branch: refs/heads/master Commit: dd85eb5448c8f2672260b57e94c0da0eaac12616 Parents: d9d1465 Author: Liang-Chi Hsieh Authored: Tue Nov 1 00:24:08 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 00:24:08 2016 -0700 -- .../hive/execution/InsertIntoHiveTable.scala| 24 +- .../sql/hive/execution/SQLQuerySuite.scala | 33 2 files changed, 56 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dd85eb54/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index c3c4e29..2843100 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException @@ -257,7 +258,28 @@ case class InsertIntoHiveTable( table.catalogTable.identifier.table, partitionSpec) +var doHiveOverwrite = overwrite + if (oldPart.isEmpty || !ifNotExists) { + // SPARK-18107: Insert overwrite runs much slower than hive-client. + // Newer Hive largely improves insert overwrite performance. As Spark uses older Hive + // version and we may not want to catch up new Hive version every time. We delete the + // Hive partition first and then load data file into the Hive partition. + if (oldPart.nonEmpty && overwrite) { +oldPart.get.storage.locationUri.map { uri => + val partitionPath = new Path(uri) + val fs = partitionPath.getFileSystem(hadoopConf) + if (fs.exists(partitionPath)) { +if (!fs.delete(partitionPath, true)) { + throw new RuntimeException( +"Cannot remove partition directory '" + partitionPath.toString) +} +// Don't let Hive do overwrite operation since it is slower. +doHiveOverwrite = false +
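A minimal, self-contained sketch of the delete-before-load idea in the hunk above, using only the Hadoop `FileSystem` API; the object and method names are illustrative, not part of the patch. A caller that successfully deletes the directory can then skip Hive's slow overwrite path, which is exactly what the `doHiveOverwrite` flag tracks.

```
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object DeleteBeforeLoad {
  /** Returns true if the partition directory existed and was removed. */
  def deletePartitionDir(locationUri: String, hadoopConf: Configuration): Boolean = {
    val partitionPath = new Path(locationUri)
    val fs = partitionPath.getFileSystem(hadoopConf)
    if (fs.exists(partitionPath)) {
      if (!fs.delete(partitionPath, true)) { // recursive delete
        throw new RuntimeException(s"Cannot remove partition directory '$partitionPath'")
      }
      true // caller may now ask Hive to load without its own overwrite step
    } else {
      false
    }
  }
}
```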
spark git commit: [SPARK-18024][SQL] Introduce an internal commit protocol API
Repository: spark Updated Branches: refs/heads/master 7d6c87155 -> d9d146500 [SPARK-18024][SQL] Introduce an internal commit protocol API ## What changes were proposed in this pull request? This patch introduces an internal commit protocol API that is used by the batch data source to do write commits. It currently has only one implementation that uses Hadoop MapReduce's OutputCommitter API. In the future, this commit API can be used to unify streaming and batch commits. ## How was this patch tested? Should be covered by existing write tests. Author: Reynold Xin Author: Eric Liang Closes #15707 from rxin/SPARK-18024-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9d14650 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9d14650 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9d14650 Branch: refs/heads/master Commit: d9d1465009fb40550467089ede315496552374c5 Parents: 7d6c871 Author: Reynold Xin Authored: Mon Oct 31 22:23:38 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 22:23:38 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 17 +- .../datasources/FileCommitProtocol.scala| 254 +++ .../execution/datasources/OutputWriter.scala| 26 +- .../sql/execution/datasources/WriteOutput.scala | 167 .../execution/datasources/csv/CSVRelation.scala | 17 +- .../datasources/json/JsonFileFormat.scala | 17 +- .../datasources/parquet/ParquetFileFormat.scala | 8 +- .../parquet/ParquetOutputWriter.scala | 19 +- .../datasources/text/TextFileFormat.scala | 17 +- .../org/apache/spark/sql/internal/SQLConf.scala | 29 ++- .../spark/sql/hive/orc/OrcFileFormat.scala | 28 +- .../sql/sources/CommitFailureTestSource.scala | 10 +- .../spark/sql/sources/SimpleTextRelation.scala | 19 +- 13 files changed, 387 insertions(+), 241 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9d14650/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 5e9e6ff..cb3ca1b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -41,17 +41,11 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration private[libsvm] class LibSVMOutputWriter( -stagingDir: String, -fileNamePrefix: String, +path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { - override val path: String = { -val compressionExtension = TextOutputWriter.getCompressionExtension(context) -new Path(stagingDir, fileNamePrefix + ".libsvm" + compressionExtension).toString - } - private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { @@ -135,11 +129,14 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour dataSchema: StructType): OutputWriterFactory = { new OutputWriterFactory { override def newInstance( - stagingDir: String, - fileNamePrefix: String, + path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { -new LibSVMOutputWriter(stagingDir, fileNamePrefix, dataSchema, context) +new LibSVMOutputWriter(path, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { +".libsvm" + TextOutputWriter.getCompressionExtension(context) } } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/d9d14650/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala new file mode 100644 index 000..1ce9ae4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"
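The new `FileCommitProtocol.scala` is truncated above. As a rough orientation only, the shape of such a commit protocol can be sketched as the trait below; this is an illustrative distillation, not the actual API — the real trait threads Hadoop job/task contexts through these calls.

```
// Illustrative distillation of a file commit protocol; NOT the actual API.
trait SimpleCommitProtocol {
  def setupJob(): Unit
  def setupTask(): Unit
  /** Returns a temp path for the task to write to; made visible on commit. */
  def newTaskTempFile(dir: Option[String], ext: String): String
  def commitTask(): Unit
  def abortTask(): Unit
  /** Driver-side: publish all task outputs once every task has committed. */
  def commitJob(): Unit
  def abortJob(): Unit
}
```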
spark git commit: [SPARK-18167][SQL] Retry when the SQLQuerySuite test flakes
Repository: spark Updated Branches: refs/heads/master efc254a82 -> 7d6c87155 [SPARK-18167][SQL] Retry when the SQLQuerySuite test flakes ## What changes were proposed in this pull request? This will re-run the flaky test a few times after it fails. This will help determine if it's due to nondeterministic test setup, or because of some environment issue (e.g. leaked config from another test). cc yhuai Author: Eric Liang Closes #15708 from ericl/spark-18167-3. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d6c8715 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d6c8715 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d6c8715 Branch: refs/heads/master Commit: 7d6c87155c740cf622c2c600a8ca64154d24c422 Parents: efc254a Author: Eric Liang Authored: Mon Oct 31 20:23:22 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 20:23:22 2016 -0700 -- .../sql/hive/execution/SQLQuerySuite.scala | 28 ++-- 1 file changed, 20 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d6c8715/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 2735d3a..f64010a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1566,14 +1566,26 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-10562: partition by column with mixed case name") { -withTable("tbl10562") { - val df = Seq(2012 -> "a").toDF("Year", "val") - df.write.partitionBy("Year").saveAsTable("tbl10562") - checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) - checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) +def runOnce() { + withTable("tbl10562") { +val df = Seq(2012 -> "a").toDF("Year", "val") +df.write.partitionBy("Year").saveAsTable("tbl10562") +checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) +checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) + } +} +try { + runOnce() +} catch { + case t: Throwable => +// Retry to gather more test data. TODO(ekl) revert this once we deflake this test. +runOnce() +runOnce() +runOnce() +throw t } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
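The retry wrapper above is inlined into one test. A generic version of the same gather-more-signal pattern could look like the sketch below — illustrative only, and slightly different from the patch in that failures of the extra runs are swallowed so the original error is always the one rethrown.

```
object RetryForDiagnostics {
  /** Run `body`; on failure, rerun it `extraRuns` times for signal, then rethrow. */
  def withRetries(extraRuns: Int)(body: => Unit): Unit = {
    try body
    catch {
      case t: Throwable =>
        (1 to extraRuns).foreach(_ => scala.util.Try(body))
        throw t
    }
  }

  def main(args: Array[String]): Unit =
    withRetries(3) { assert(math.random < 2) } // stand-in for the flaky suite body
}
```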
spark git commit: [SPARK-18087][SQL] Optimize insert to not require REPAIR TABLE
Repository: spark Updated Branches: refs/heads/master 6633b97b5 -> efc254a82 [SPARK-18087][SQL] Optimize insert to not require REPAIR TABLE ## What changes were proposed in this pull request? When inserting into datasource tables with partitions managed by the hive metastore, we need to notify the metastore of newly added partitions. Previously this was implemented via `msck repair table`, but this is more expensive than needed. This optimizes the insertion path to add only the updated partitions. ## How was this patch tested? Existing tests (I verified manually that tests fail if the repair operation is omitted). Author: Eric Liang Closes #15633 from ericl/spark-18087. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/efc254a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/efc254a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/efc254a8 Branch: refs/heads/master Commit: efc254a82bc3331d78023f00d29d4c4318dfb734 Parents: 6633b97 Author: Eric Liang Authored: Mon Oct 31 19:46:55 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 19:46:55 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 27 +++--- .../InsertIntoHadoopFsRelationCommand.scala | 3 +- .../datasources/PartitioningUtils.scala | 12 .../sql/execution/datasources/WriteOutput.scala | 29 ++-- 5 files changed, 52 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/efc254a8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 9961098..d980e6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -528,7 +528,7 @@ case class DataSource( columns, bucketSpec, format, -() => Unit, // No existing table needs to be refreshed. +_ => Unit, // No existing table needs to be refreshed. 
options, data.logicalPlan, mode) http://git-wip-us.apache.org/repos/asf/spark/blob/efc254a8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index f0bcf94..34b77ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, Inte import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SimpleCatalogRelation} +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation @@ -34,7 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} -import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, DDLUtils, ExecutedCommandExec} +import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, DDLUtils, ExecutedCommandExec} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -179,24 +180,30 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { "Cannot overwrite a path that is also being read from.") } + def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = { +if (l.catalogTable.isDefined && +l.catalogTable.get.partitionColumnNames.nonEmpty && +l.catalogTable.get.partitionProviderIsHive) { + val metastoreUpdater = AlterTableAddPartition
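The callback above issues an `AlterTableAddPartitionCommand` for just the partitions an insert touched, instead of a full `MSCK REPAIR TABLE`. As a hedged sketch of what that amounts to at the SQL level — the object and method names are illustrative, and value escaping is omitted:

```
object AddPartitionsSql {
  type TablePartitionSpec = Map[String, String] // column -> value, as in the patch

  def render(table: String, specs: Seq[TablePartitionSpec]): String = {
    val parts = specs.map { spec =>
      spec.map { case (k, v) => s"$k='$v'" }.mkString("PARTITION (", ", ", ")")
    }
    s"ALTER TABLE $table ADD IF NOT EXISTS ${parts.mkString(" ")}"
  }

  def main(args: Array[String]): Unit =
    println(render("t", Seq(Map("ds" -> "2016-10-31"), Map("ds" -> "2016-11-01"))))
}
```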
spark git commit: [SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid breaking history server (branch 2.0)
Repository: spark Updated Branches: refs/heads/branch-2.0 9f924747d -> 300d596a5 [SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid breaking history server (branch 2.0) ## What changes were proposed in this pull request? Backport #15663 to branch-2.0 and fixed conflicts in `ReplayListenerBus`. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15695 from zsxwing/fix-event-log-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/300d596a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/300d596a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/300d596a Branch: refs/heads/branch-2.0 Commit: 300d596a5177ae372194f73f717174b7ff7acd36 Parents: 9f92474 Author: Shixiong Zhu Authored: Mon Oct 31 16:03:44 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 16:03:44 2016 -0700 -- .../spark/scheduler/ReplayListenerBus.scala | 17 .../query-event-logs-version-2.0.0.txt | 4 ++ .../query-event-logs-version-2.0.1.txt | 4 ++ .../streaming/StreamingQueryListenerSuite.scala | 42 4 files changed, 67 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/300d596a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index d32f5eb..c65e7a2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -25,6 +25,7 @@ import com.fasterxml.jackson.core.JsonParseException import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.util.JsonProtocol /** @@ -57,6 +58,10 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { try { postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { + case e: ClassNotFoundException if KNOWN_REMOVED_CLASSES.contains(e.getMessage) => +// Ignore events generated by Structured Streaming in Spark 2.0.0 and 2.0.1. +// It's safe since no place uses them. +logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated if (!maybeTruncated || lines.hasNext) { @@ -78,3 +83,15 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { } } + +private[spark] object ReplayListenerBus { + + /** + * Classes that were removed. Structured Streaming doesn't use them any more. However, parsing + * old json may fail and we can just ignore these failures. 
+ */ + val KNOWN_REMOVED_CLASSES = Set( +"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress", +"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated" + ) +} http://git-wip-us.apache.org/repos/asf/spark/blob/300d596a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt -- diff --git a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt new file mode 100644 index 000..aa7e9a8 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt @@ -0,0 +1,4 @@ +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}}} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}},"exception":null,"stackTrace":[]} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@514502dc","offsetDesc":"[-]"}},"exception":"Quer
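A compact sketch of the replay-tolerance pattern introduced here: when a log line deserializes to a class that no longer exists, warn and skip instead of failing the whole replay. `parseEvent` is a hypothetical stand-in for `JsonProtocol.sparkEventFromJson`.

```
object TolerantReplaySketch {
  // Same class names as KNOWN_REMOVED_CLASSES in the patch above.
  val knownRemovedClasses = Set(
    "org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress",
    "org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated")

  // Stand-in parser: pretend every line refers to a removed class.
  def parseEvent(line: String): Any =
    throw new ClassNotFoundException(knownRemovedClasses.head)

  def replay(lines: Iterator[String]): Unit = lines.foreach { line =>
    try {
      parseEvent(line) // in Spark this result would be posted to listeners
    } catch {
      case e: ClassNotFoundException if knownRemovedClasses.contains(e.getMessage) =>
        Console.err.println(s"Dropped incompatible Structured Streaming log: $line")
    }
  }

  def main(args: Array[String]): Unit = replay(Iterator("""{"Event":"..."}"""))
}
```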
[1/2] spark git commit: [SPARK-18103][SQL] Rename *FileCatalog to *FileIndex
Repository: spark Updated Branches: refs/heads/master 3ad99f166 -> 90d3b91f4 http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index d1de863..624ab74 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -200,7 +200,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val rootPaths: Seq[Path] = if (lazyPruningEnabled) { Seq(metastoreRelation.hiveQlTable.getDataLocation) } else { -// By convention (for example, see TableFileCatalog), the definition of a +// By convention (for example, see CatalogFileIndex), the definition of a // partitioned table's paths depends on whether that table has any actual partitions. // Partitioned tables without partitions use the location of the table's base path. // Partitioned tables with partitions use the locations of those partitions' data @@ -227,7 +227,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val logicalRelation = cached.getOrElse { val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong val fileCatalog = { - val catalog = new TableFileCatalog( + val catalog = new CatalogFileIndex( sparkSession, metastoreRelation.catalogTable, sizeInBytes) if (lazyPruningEnabled) { catalog http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index ecdf4f1..fc35304 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, SaveMode} import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils @@ -321,17 +321,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("DROP TABLE cachedTable") } - test("cache a table using TableFileCatalog") { + test("cache a table using CatalogFileIndex") { withTable("test") { sql("CREATE TABLE test(i int) PARTITIONED BY (p int) STORED AS parquet") val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") - val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) + val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0) val dataSchema = StructType(tableMeta.schema.filterNot { f => tableMeta.partitionColumnNames.contains(f.name) }) val relation = HadoopFsRelation( -location = tableFileCatalog, +location = catalogFileIndex, partitionSchema = tableMeta.partitionSchema, 
dataSchema = dataSchema, bucketSpec = None, @@ -343,7 +343,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) - val sameCatalog = new TableFileCatalog(spark, tableMeta, 0) + val sameCatalog = new CatalogFileIndex(spark, tableMeta, 0) val sameRelation = HadoopFsRelation( location = sameCatalog, partitionSchema = tableMeta.partitionSchema, http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 476383a..d8e31c4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scal
[2/2] spark git commit: [SPARK-18103][SQL] Rename *FileCatalog to *FileIndex
[SPARK-18103][SQL] Rename *FileCatalog to *FileIndex ## What changes were proposed in this pull request? To reduce the number of components in SQL named *Catalog, rename *FileCatalog to *FileIndex. A FileIndex is responsible for returning the list of partitions / files to scan given a filtering expression. ``` TableFileCatalog => CatalogFileIndex FileCatalog => FileIndex ListingFileCatalog => InMemoryFileIndex MetadataLogFileCatalog => MetadataLogFileIndex PrunedTableFileCatalog => PrunedInMemoryFileIndex ``` cc yhuai marmbrus ## How was this patch tested? N/A Author: Eric Liang Author: Eric Liang Closes #15634 from ericl/rename-file-provider. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90d3b91f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90d3b91f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90d3b91f Branch: refs/heads/master Commit: 90d3b91f4cb59d84fea7105d54ef8c87a7d5c6a2 Parents: 3ad99f1 Author: Eric Liang Authored: Sun Oct 30 13:14:45 2016 -0700 Committer: Reynold Xin Committed: Sun Oct 30 13:14:45 2016 -0700 -- .../spark/metrics/source/StaticSources.scala| 2 +- .../spark/sql/execution/CacheManager.scala | 2 +- .../datasources/CatalogFileIndex.scala | 110 + .../sql/execution/datasources/DataSource.scala | 10 +- .../sql/execution/datasources/FileCatalog.scala | 70 --- .../sql/execution/datasources/FileIndex.scala | 70 +++ .../datasources/HadoopFsRelation.scala | 4 +- .../datasources/InMemoryFileIndex.scala | 87 .../datasources/ListingFileCatalog.scala| 87 .../PartitioningAwareFileCatalog.scala | 437 --- .../PartitioningAwareFileIndex.scala| 437 +++ .../datasources/PruneFileSourcePartitions.scala | 6 +- .../datasources/TableFileCatalog.scala | 110 - .../streaming/CompactibleFileStreamLog.scala| 4 +- .../execution/streaming/FileStreamSource.scala | 4 +- .../streaming/MetadataLogFileCatalog.scala | 6 +- .../datasources/FileCatalogSuite.scala | 36 +- .../datasources/FileSourceStrategySuite.scala | 2 +- .../ParquetPartitionDiscoverySuite.scala| 2 +- .../sql/streaming/FileStreamSinkSuite.scala | 6 +- .../sql/streaming/FileStreamSourceSuite.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +- .../spark/sql/hive/CachedTableSuite.scala | 10 +- .../hive/PartitionedTablePerfStatsSuite.scala | 2 +- .../PruneFileSourcePartitionsSuite.scala| 6 +- 25 files changed, 758 insertions(+), 758 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala -- diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index b54885b..3f7cfd9 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -76,7 +76,7 @@ object HiveCatalogMetrics extends Source { val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched")) /** - * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog. + * Tracks the total number of files discovered off of the filesystem by InMemoryFileIndex. 
*/ val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered")) http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index fb72c67..526623a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -177,7 +177,7 @@ class CacheManager extends Logging { /** * Traverses a given `plan` and searches for the occurrences of `qualifiedPath` in the - * [[org.apache.spark.sql.execution.datasources.FileCatalog]] of any [[HadoopFsRelation]] nodes + * [[org.apache.spark.sql.execution.datasources.FileIndex]] of any [[HadoopFsRelation]] nodes * in the plan. If found, we refresh the metadata and return true. Otherwise, this method returns * false. */ http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/core/src/main/scala/org/apache/spark/sql/
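The commit message defines a FileIndex as the component "responsible for returning the list of partitions / files to scan given a filtering expression". A toy distillation of that contract, for orientation only — the real trait works with Catalyst `Expression`s and Hadoop `FileStatus`es, not the simplified types below:

```
// Toy contract, not the real trait.
trait ToyFileIndex {
  /** Root directories backing this index. */
  def rootPaths: Seq[String]
  /** Files to scan, pruned by a predicate over partition values. */
  def listFiles(partitionFilter: Map[String, String] => Boolean): Seq[String]
}
```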
spark git commit: [SPARK-18167][SQL] Add debug code for SQLQuerySuite flakiness when metastore partition pruning is enabled
Repository: spark Updated Branches: refs/heads/master 59cccbda4 -> d2d438d1d [SPARK-18167][SQL] Add debug code for SQLQuerySuite flakiness when metastore partition pruning is enabled ## What changes were proposed in this pull request? org.apache.spark.sql.hive.execution.SQLQuerySuite is flaking when hive partition pruning is enabled. Based on the stack traces, it seems to be an old issue where Hive fails to cast a numeric partition column ("Invalid character string format for type DECIMAL"). There are two possibilities here: either we are somehow corrupting the partition table to have non-decimal values in that column, or there is a transient issue with Derby. This PR logs the result of the retry when this exception is encountered, so we can confirm what is going on. ## How was this patch tested? n/a cc yhuai Author: Eric Liang Closes #15676 from ericl/spark-18167. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2d438d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2d438d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2d438d1 Branch: refs/heads/master Commit: d2d438d1d549628a0183e468ed11d6e85b5d6061 Parents: 59cccbd Author: Eric Liang Authored: Sat Oct 29 06:49:57 2016 +0200 Committer: Reynold Xin Committed: Sat Oct 29 06:49:57 2016 +0200 -- .../org/apache/spark/sql/hive/client/HiveShim.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d2d438d1/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 3238770..4bbbd66 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -24,6 +24,7 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Set => JS import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ +import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} @@ -585,7 +586,19 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] } else { logDebug(s"Hive metastore filter is '$filter'.") -getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]] +try { + getPartitionsByFilterMethod.invoke(hive, table, filter) +.asInstanceOf[JArrayList[Partition]] +} catch { + case e: InvocationTargetException => +// SPARK-18167 retry to investigate the flaky test. This should be reverted before +// the release is cut. +val retry = Try(getPartitionsByFilterMethod.invoke(hive, table, filter)) +val full = Try(getAllPartitionsMethod.invoke(hive, table)) +logError("getPartitionsByFilter failed, retry success = " + retry.isSuccess) +logError("getPartitionsByFilter failed, full fetch success = " + full.isSuccess) +throw e +} } partitions.asScala.toSeq - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
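Generalizing the temporary debug code above: when the primary call fails, probe alternative code paths with `Try`, log whether each succeeds, and rethrow the original error. A hedged sketch, with all names illustrative:

```
import scala.util.Try

object ProbeOnFailure {
  def callWithProbes[T](primary: => T)(probes: (String, () => Any)*): T = {
    try primary
    catch {
      case e: Throwable =>
        probes.foreach { case (name, probe) =>
          Console.err.println(s"$name success = ${Try(probe()).isSuccess}")
        }
        throw e
    }
  }
}

// Usage, mirroring the patch (the two probe calls stand in for the reflective
// getPartitionsByFilter / getAllPartitions invocations):
//   callWithProbes(getPartitionsByFilter(table, filter))(
//     "retry" -> (() => getPartitionsByFilter(table, filter)),
//     "full fetch" -> (() => getAllPartitions(table)))
```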
spark git commit: [SPARK-18126][SPARK-CORE] getIteratorZipWithIndex accepts negative value as index
Repository: spark Updated Branches: refs/heads/master 29cea8f33 -> a76846cfb [SPARK-18126][SPARK-CORE] getIteratorZipWithIndex accepts negative value as index ## What changes were proposed in this pull request? `Utils.getIteratorZipWithIndex` was added to deal with more than 2147483647 records in one partition. The method accepted `startIndex` < 0, which leads to a negative index. This PR just adds a defensive check on `startIndex` to make sure it is >= 0. ## How was this patch tested? Added a new unit test. Author: Miao Wang Closes #15639 from wangmiao1981/zip. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a76846cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a76846cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a76846cf Branch: refs/heads/master Commit: a76846cfb1c2d6c8f4d647426030b59de20d9433 Parents: 29cea8f Author: Miao Wang Authored: Thu Oct 27 01:17:32 2016 +0200 Committer: Reynold Xin Committed: Thu Oct 27 01:17:32 2016 +0200 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 1 + core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 3 +++ 2 files changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a76846cf/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index e57eb0d..6027b07 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1765,6 +1765,7 @@ private[spark] object Utils extends Logging { */ def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { new Iterator[(T, Long)] { + require(startIndex >= 0, "startIndex should be >= 0.") var index: Long = startIndex - 1L def hasNext: Boolean = iterator.hasNext def next(): (T, Long) = { http://git-wip-us.apache.org/repos/asf/spark/blob/a76846cf/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index aeb2969..15ef32f 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -401,6 +401,9 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(iterator.toArray === Array( (0, -1L + Int.MaxValue), (1, 0L + Int.MaxValue), (2, 1L + Int.MaxValue) )) +intercept[IllegalArgumentException] { + Utils.getIteratorZipWithIndex(Iterator(0, 1, 2), -1L) +} } test("doesDirectoryContainFilesNewerThan") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
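Putting the two hunks together, the patched utility and the boundary it protects look like this self-contained version; the `next()` body, elided in the diff, increments the index and pairs it with the element.

```
object ZipWithIndexDemo {
  def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = {
    new Iterator[(T, Long)] {
      require(startIndex >= 0, "startIndex should be >= 0.")
      var index: Long = startIndex - 1L
      def hasNext: Boolean = iterator.hasNext
      def next(): (T, Long) = {
        index += 1
        (iterator.next(), index)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    // Indices can exceed Int.MaxValue, which is the point of using Long.
    val it = getIteratorZipWithIndex(Iterator(0, 1, 2), Int.MaxValue.toLong - 1L)
    println(it.toList) // List((0,2147483646), (1,2147483647), (2,2147483648))
  }
}
```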
spark git commit: [SPARK-18094][SQL][TESTS] Move group analytics test cases from `SQLQuerySuite` into a query file test.
Repository: spark Updated Branches: refs/heads/master dcdda1978 -> 5b7d403c1 [SPARK-18094][SQL][TESTS] Move group analytics test cases from `SQLQuerySuite` into a query file test. ## What changes were proposed in this pull request? Currently we have several test cases for group analytics(ROLLUP/CUBE/GROUPING SETS) in `SQLQuerySuite`, should better move them into a query file test. The following test cases are moved to `group-analytics.sql`: ``` test("rollup") test("grouping sets when aggregate functions containing groupBy columns") test("cube") test("grouping sets") test("grouping and grouping_id") test("grouping and grouping_id in having") test("grouping and grouping_id in sort") ``` This is followup work of #15582 ## How was this patch tested? Modified query file `group-analytics.sql`, which will be tested by `SQLQueryTestSuite`. Author: jiangxingbo Closes #15624 from jiangxb1987/group-analytics-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b7d403c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b7d403c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b7d403c Branch: refs/heads/master Commit: 5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 Parents: dcdda19 Author: jiangxingbo Authored: Wed Oct 26 23:51:16 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 23:51:16 2016 +0200 -- .../sql-tests/inputs/group-analytics.sql| 46 +++- .../sql-tests/results/group-analytics.sql.out | 247 ++- .../org/apache/spark/sql/SQLQuerySuite.scala| 189 -- 3 files changed, 290 insertions(+), 192 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b7d403c/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql index 2f78349..f813538 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -10,4 +10,48 @@ SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE; -- ROLLUP on overlapping columns SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP; -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; \ No newline at end of file +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; + +CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES +("dotNET", 2012, 1), ("Java", 2012, 2), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 3) +AS courseSales(course, year, earnings); + +-- ROLLUP +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; + +-- CUBE +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year; + +-- GROUPING SETS +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year); + +-- GROUPING SETS with aggregate functions containing groupBy columns +SELECT course, SUM(earnings) AS sum FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum; +SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER 
BY course, sum; + +-- GROUPING/GROUPING_ID +SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales +GROUP BY CUBE(course, year); +SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year; +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year); + +-- GROUPING/GROUPING_ID in having clause +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0; +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0; + +-- GROUPING/GROUPING_ID in orderBy clause +SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY
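For the GROUPING/GROUPING_ID queries above, it helps to recall the encoding: with n grouping columns, GROUPING_ID(c1, ..., cn) is the integer whose bits are GROUPING(c1)...GROUPING(cn), most significant first, where GROUPING(c) = 1 iff column c is aggregated away in that grouping set. A small worked example (illustrative code, not part of the test suite):

```
object GroupingIdDemo {
  def groupingId(groupingBits: Seq[Int]): Int =
    groupingBits.foldLeft(0)((acc, b) => acc * 2 + b)

  def main(args: Array[String]): Unit = {
    // (GROUPING(course), GROUPING(year)) for the four CUBE(course, year) grouping sets:
    Seq(Seq(0, 0), Seq(0, 1), Seq(1, 0), Seq(1, 1)).foreach { bits =>
      println(s"$bits -> GROUPING_ID = ${groupingId(bits)}") // 0, 1, 2, 3
    }
  }
}
```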
spark git commit: [SPARK-18063][SQL] Failed to infer constraints over multiple aliases
Repository: spark Updated Branches: refs/heads/branch-2.0 773fbfef1 -> 5b81b0102 [SPARK-18063][SQL] Failed to infer constraints over multiple aliases ## What changes were proposed in this pull request? The `UnaryNode.getAliasedConstraints` function fails to replace all expressions by their alias where constraints contains more than one expression to be replaced. For example: ``` val tr = LocalRelation('a.int, 'b.string, 'c.int) val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) multiAlias.analyze.constraints ``` currently outputs: ``` ExpressionSet(Seq( IsNotNull(resolveColumn(multiAlias.analyze, "x")), IsNotNull(resolveColumn(multiAlias.analyze, "y")) ) ``` The constraint `resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)` is missing. ## How was this patch tested? Add new test cases in `ConstraintPropagationSuite`. Author: jiangxingbo Closes #15597 from jiangxb1987/alias-constraints. (cherry picked from commit fa7d9d70825a6816495d239da925d0087f7cb94f) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b81b010 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b81b010 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b81b010 Branch: refs/heads/branch-2.0 Commit: 5b81b01026bc215c7982a640a794cd36ea720959 Parents: 773fbfe Author: jiangxingbo Authored: Wed Oct 26 20:12:20 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 20:12:44 2016 +0200 -- .../sql/catalyst/plans/logical/LogicalPlan.scala| 16 ++-- .../catalyst/plans/ConstraintPropagationSuite.scala | 8 2 files changed, 18 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b81b010/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 6d77991..9c152fb88 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -293,15 +293,19 @@ abstract class UnaryNode extends LogicalPlan { * expressions with the corresponding alias */ protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { -projectList.flatMap { +var allConstraints = child.constraints.asInstanceOf[Set[Expression]] +projectList.foreach { case a @ Alias(e, _) => -child.constraints.map(_ transform { +// For every alias in `projectList`, replace the reference in constraints by its attribute. +allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute -}).union(Set(EqualNullSafe(e, a.toAttribute))) - case _ => -Set.empty[Expression] -}.toSet +}) +allConstraints += EqualNullSafe(e, a.toAttribute) + case _ => // Don't change. 
+} + +allConstraints -- child.constraints } override protected def validConstraints: Set[Expression] = child.constraints http://git-wip-us.apache.org/repos/asf/spark/blob/5b81b010/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 8d6a49a..8068ce9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -128,8 +128,16 @@ class ConstraintPropagationSuite extends SparkFunSuite { ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), +resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z") + +val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) +verifyConstraints(multiAlias.analyze.constraints, + ExpressionSet(Seq(IsNotNull(resolveColumn(multiAlias.a
spark git commit: [SPARK-18063][SQL] Failed to infer constraints over multiple aliases
Repository: spark Updated Branches: refs/heads/master 7ac70e7ba -> fa7d9d708 [SPARK-18063][SQL] Failed to infer constraints over multiple aliases ## What changes were proposed in this pull request? The `UnaryNode.getAliasedConstraints` function fails to replace all expressions by their alias where constraints contains more than one expression to be replaced. For example: ``` val tr = LocalRelation('a.int, 'b.string, 'c.int) val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) multiAlias.analyze.constraints ``` currently outputs: ``` ExpressionSet(Seq( IsNotNull(resolveColumn(multiAlias.analyze, "x")), IsNotNull(resolveColumn(multiAlias.analyze, "y")) ) ``` The constraint `resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)` is missing. ## How was this patch tested? Add new test cases in `ConstraintPropagationSuite`. Author: jiangxingbo Closes #15597 from jiangxb1987/alias-constraints. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa7d9d70 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa7d9d70 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa7d9d70 Branch: refs/heads/master Commit: fa7d9d70825a6816495d239da925d0087f7cb94f Parents: 7ac70e7 Author: jiangxingbo Authored: Wed Oct 26 20:12:20 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 20:12:20 2016 +0200 -- .../sql/catalyst/plans/logical/LogicalPlan.scala| 16 ++-- .../catalyst/plans/ConstraintPropagationSuite.scala | 8 2 files changed, 18 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa7d9d70/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0972547..b0a4145 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -293,15 +293,19 @@ abstract class UnaryNode extends LogicalPlan { * expressions with the corresponding alias */ protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { -projectList.flatMap { +var allConstraints = child.constraints.asInstanceOf[Set[Expression]] +projectList.foreach { case a @ Alias(e, _) => -child.constraints.map(_ transform { +// For every alias in `projectList`, replace the reference in constraints by its attribute. +allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute -}).union(Set(EqualNullSafe(e, a.toAttribute))) - case _ => -Set.empty[Expression] -}.toSet +}) +allConstraints += EqualNullSafe(e, a.toAttribute) + case _ => // Don't change. 
+} + +allConstraints -- child.constraints } override protected def validConstraints: Set[Expression] = child.constraints http://git-wip-us.apache.org/repos/asf/spark/blob/fa7d9d70/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 8d6a49a..8068ce9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -128,8 +128,16 @@ class ConstraintPropagationSuite extends SparkFunSuite { ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), +resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z") + +val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) +verifyConstraints(multiAlias.analyze.constraints, + ExpressionSet(Seq(IsNotNull(resolveColumn(multiAlias.analyze, "x")), +IsNotNull(resolveColumn(multiAlias.analyze, "y")), +resolveColumn(multiAlias
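A toy reconstruction of the fixed logic, run on the example from the commit message. Expressions are modeled as plain strings purely for illustration; the real code transforms Catalyst expression trees. The pre-fix code rewrote the child's constraints against one alias at a time, so a constraint mentioning two aliased expressions, like `a = c + 10`, never ended up fully rewritten.

```
object AliasedConstraintsDemo {
  // Start from the child's constraints, substitute every aliased expression
  // with its attribute across *all* constraints, and add an EqualNullSafe
  // fact per alias -- mirroring the fixed getAliasedConstraints.
  def aliased(childConstraints: Set[String], aliases: Seq[(String, String)]): Set[String] = {
    var all = childConstraints
    aliases.foreach { case (expr, attr) =>
      all = all.map(_.replace(expr, attr)) // rewrite references in every constraint
      all += s"($expr <=> $attr)"          // the alias fact itself
    }
    all -- childConstraints
  }

  def main(args: Array[String]): Unit = {
    // tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) from the message:
    val out = aliased(
      Set("(a = c + 10)", "isnotnull(a)", "isnotnull(c)"),
      Seq("a" -> "x", "c" -> "y"))
    out.foreach(println)
    // Includes "(x = y + 10)", the constraint the pre-fix code dropped.
  }
}
```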
spark git commit: [SPARK-17698][SQL] Join predicates should not contain filter clauses
Repository: spark Updated Branches: refs/heads/branch-2.0 b959dab32 -> 3d5878751 [SPARK-17698][SQL] Join predicates should not contain filter clauses

## What changes were proposed in this pull request?

This is a backport of https://github.com/apache/spark/pull/15272 to the 2.0 branch. JIRA: https://issues.apache.org/jira/browse/SPARK-17698

`ExtractEquiJoinKeys` is incorrectly using filter predicates as the join condition for joins. `canEvaluate` [0] tries to see if an `Expression` can be evaluated using the output of a given `Plan`. In the case of filter predicates (e.g. `a.id='1'`), the `Expression` passed for the right hand side (i.e. '1') is a `Literal`, which does not have any attribute references. Thus `expr.references` is an empty set, which is trivially a subset of any set. This leads to `canEvaluate` returning `true`, and `a.id='1'` being treated as a join predicate. While this does not lead to incorrect results, in the case of bucketed + sorted tables we might miss out on avoiding an unnecessary shuffle + sort. See the example below.

[0] : https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala#L91

```
val df = (1 until 10).toDF("id").coalesce(1)
hc.sql("DROP TABLE IF EXISTS table1").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table1")
hc.sql("DROP TABLE IF EXISTS table2").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table2")

sqlContext.sql("""
  SELECT a.id, b.id
  FROM table1 a
  FULL OUTER JOIN table2 b
  ON a.id = b.id AND a.id='1' AND b.id='1'
""").explain(true)
```

BEFORE: This is doing shuffle + sort over the table scan outputs, which is not needed, as both tables are bucketed and sorted on the same columns and have the same number of buckets. This should be a single-stage job.

```
SortMergeJoin [id#38, cast(id#38 as double), 1.0], [id#39, 1.0, cast(id#39 as double)], FullOuter
:- *Sort [id#38 ASC NULLS FIRST, cast(id#38 as double) ASC NULLS FIRST, 1.0 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(id#38, cast(id#38 as double), 1.0, 200)
:     +- *FileScan parquet default.table1[id#38] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *Sort [id#39 ASC NULLS FIRST, 1.0 ASC NULLS FIRST, cast(id#39 as double) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(id#39, 1.0, cast(id#39 as double), 200)
      +- *FileScan parquet default.table2[id#39] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

AFTER:

```
SortMergeJoin [id#32], [id#33], FullOuter, ((cast(id#32 as double) = 1.0) && (cast(id#33 as double) = 1.0))
:- *FileScan parquet default.table1[id#32] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *FileScan parquet default.table2[id#33] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

## How was this patch tested?

- Added a new test case for this scenario: `SPARK-17698 Join predicates should not contain filter clauses`
- Ran all the tests in `BucketedReadSuite`

Author: Tejas Patil Closes #15600 from tejasapatil/SPARK-17698_2.0_backport.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d587875 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d587875 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d587875 Branch: refs/heads/branch-2.0 Commit: 3d587875102fc2f10f03956ef50457203cb4a840 Parents: b959dab Author: Tejas Patil Authored: Sat Oct 22 16:32:49 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 16:32:49 2016 -0700 -- .../sql/catalyst/expressions/predicates.scala | 5 +- .../spark/sql/catalyst/optimizer/joins.scala| 4 +- .../spark/sql/catalyst/planning/patterns.scala | 2 + .../spark/sql/sources/BucketedReadSuite.scala | 82 +--- 4 files changed, 79 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3d587875/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 100087e..abe0f08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -84,8 +84
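A toy model of the `canEvaluate` behavior described above (illustrative types only): an expression is considered evaluable against a plan iff the attributes it references are a subset of the plan's output, and a `Literal` references nothing, so it slips through for any plan.

```
object CanEvaluateDemo {
  sealed trait Expr { def references: Set[String] }
  case class AttributeRef(name: String) extends Expr { val references = Set(name) }
  case class Literal(value: String) extends Expr { val references = Set.empty[String] }

  def canEvaluate(expr: Expr, planOutput: Set[String]): Boolean =
    expr.references.subsetOf(planOutput)

  def main(args: Array[String]): Unit = {
    println(canEvaluate(AttributeRef("b.id"), Set("a.id"))) // false, as expected
    println(canEvaluate(Literal("1"), Set("a.id")))         // true -- the surprising case
  }
}
```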
spark git commit: [SPARK-928][CORE] Add support for Unsafe-based serializer in Kryo
Repository: spark Updated Branches: refs/heads/master 4f1dcd3dc -> bc167a2a5 [SPARK-928][CORE] Add support for Unsafe-based serializer in Kryo ## What changes were proposed in this pull request? Now that we have migrated to Kryo 3.0.0 in https://issues.apache.org/jira/browse/SPARK-11416, we can give users the option to use an Unsafe-based SerDe. It can be turned on by setting `spark.kryo.unsafe` to `true` (this is the configuration key read by the diff and documented in `docs/configuration.md` below). ## How was this patch tested? Ran existing tests
```
Benchmark Kryo Unsafe vs safe Serialization:  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-----------------------------------------------------------------------------------------------------
basicTypes: Int unsafe:true                         160 /  178         98.5          10.1       1.0X
basicTypes: Long unsafe:true                        210 /  218         74.9          13.4       0.8X
basicTypes: Float unsafe:true                       203 /  213         77.5          12.9       0.8X
basicTypes: Double unsafe:true                      226 /  235         69.5          14.4       0.7X
Array: Int unsafe:true                             1087 / 1101         14.5          69.1       0.1X
Array: Long unsafe:true                            2758 / 2844          5.7         175.4       0.1X
Array: Float unsafe:true                           1511 / 1552         10.4          96.1       0.1X
Array: Double unsafe:true                          2942 / 2972          5.3         187.0       0.1X
Map of string->Double unsafe:true                  2645 / 2739          5.9         168.2       0.1X
basicTypes: Int unsafe:false                        211 /  218         74.7          13.4       0.8X
basicTypes: Long unsafe:false                       247 /  253         63.6          15.7       0.6X
basicTypes: Float unsafe:false                      211 /  216         74.5          13.4       0.8X
basicTypes: Double unsafe:false                     227 /  233         69.2          14.4       0.7X
Array: Int unsafe:false                            3012 / 3032          5.2         191.5       0.1X
Array: Long unsafe:false                           4463 / 4515          3.5         283.8       0.0X
Array: Float unsafe:false                          2788 / 2868          5.6         177.2       0.1X
Array: Double unsafe:false                         3558 / 3752          4.4         226.2       0.0X
Map of string->Double unsafe:false                 2806 / 2933          5.6         178.4       0.1X
```
Author: Sandeep Singh Author: Sandeep Singh Closes #12913 from techaddict/SPARK-928. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc167a2a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc167a2a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc167a2a Branch: refs/heads/master Commit: bc167a2a53f5a795d089e8a884569b1b3e2cd439 Parents: 4f1dcd3 Author: Sandeep Singh Authored: Sat Oct 22 12:03:37 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 12:03:37 2016 -0700 -- .../spark/serializer/KryoSerializer.scala | 36 +++-- .../apache/spark/serializer/KryoBenchmark.scala | 139 +++ .../spark/serializer/KryoSerializerSuite.scala | 1 + .../serializer/UnsafeKryoSerializerSuite.scala | 33 + docs/configuration.md | 8 ++ 5 files changed, 206 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc167a2a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 1fba552..0d26281 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -27,6 +27,7 @@ import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, KryoException, Serializer => KryoClassSerializer} import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput} +import com.esotericsoftware.kryo.io.{UnsafeInput => KryoUnsafeInput, UnsafeOutput => KryoUnsafeOutput} import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.avro.generic.{GenericData, GenericRecord} @@ -78,8 +79,15 @@ class KryoSerializer(conf: SparkConf) .filter(!_.isEmpty) private val avroSchemas = conf.getAvroSchema + // whether to use unsafe based IO for
serialization + private val useUnsafe = conf.getBoolean("spark.kryo.unsafe", false) - def newKryoOutput(): KryoOutput = new KryoOut
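For readers wanting to try this out, a minimal sketch of opting in from user code, assuming a plain SparkConf-based setup (note the key the code actually reads is `spark.kryo.unsafe`):
```scala
import org.apache.spark.SparkConf

// Switch the serializer to Kryo and opt in to the Unsafe-based
// Input/Output streams added by this patch.
val conf = new SparkConf()
  .setAppName("kryo-unsafe-demo")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.unsafe", "true")
```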
spark git commit: [SPARK-18051][SPARK CORE] fix bug of custom PartitionCoalescer causing serialization exception
Repository: spark Updated Branches: refs/heads/master 5fa9f8795 -> 4f1dcd3dc [SPARK-18051][SPARK CORE] fix bug of custom PartitionCoalescer causing serialization exception ## What changes were proposed in this pull request? Add a require check in `CoalescedRDD` to make sure the passed-in `partitionCoalescer` is `Serializable`, and update the documentation for the `RDD.coalesce` API. ## How was this patch tested? Manual. (Test code is in JIRA [SPARK-18051].) Author: WeichenXu Closes #15587 from WeichenXu123/fix_coalescer_bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f1dcd3d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f1dcd3d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f1dcd3d Branch: refs/heads/master Commit: 4f1dcd3dce270268b42fbe59409790364fa5c5df Parents: 5fa9f87 Author: WeichenXu Authored: Sat Oct 22 11:59:28 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 11:59:28 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala | 4 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f1dcd3d/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index 9c198a6..2cba1fe 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -80,6 +80,10 @@ private[spark] class CoalescedRDD[T: ClassTag]( require(maxPartitions > 0 || maxPartitions == prev.partitions.length, s"Number of partitions ($maxPartitions) must be positive.") + if (partitionCoalescer.isDefined) { +require(partitionCoalescer.get.isInstanceOf[Serializable], + "The partition coalescer passed in must be serializable.") + } override def getPartitions: Array[Partition] = { val pc = partitionCoalescer.getOrElse(new DefaultPartitionCoalescer()) http://git-wip-us.apache.org/repos/asf/spark/blob/4f1dcd3d/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index be11957..db535de 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -432,7 +432,8 @@ abstract class RDD[T: ClassTag]( * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the - * data distributed using a hash partitioner. + * data distributed using a hash partitioner. The optional partition coalescer + * passed in must be serializable. */ def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
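For users hitting the new check, a minimal sketch of a custom coalescer that satisfies it (the round-robin strategy and all names below are illustrative, not part of the patch):
```scala
import org.apache.spark.rdd.{PartitionCoalescer, PartitionGroup, RDD}

// Mixing in Serializable is what the new require() in CoalescedRDD checks;
// without it the job now fails fast with a clear message instead of a
// serialization exception at runtime.
class RoundRobinCoalescer extends PartitionCoalescer with Serializable {
  override def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup] = {
    // Pack the parent's partitions round-robin into maxPartitions groups.
    val groups = Array.fill(maxPartitions)(new PartitionGroup())
    parent.partitions.zipWithIndex.foreach { case (p, i) =>
      groups(i % maxPartitions).partitions += p
    }
    groups
  }
}

// rdd is any existing RDD; shuffle = false keeps this a narrow dependency:
// val coalesced = rdd.coalesce(4, shuffle = false, Some(new RoundRobinCoalescer))
```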
spark git commit: [SPARK-16606][MINOR] Tiny follow-up to SPARK-16606, to correct more instances of the same log message typo
Repository: spark Updated Branches: refs/heads/branch-2.0 d3c78c4f3 -> a0c03c925 [SPARK-16606][MINOR] Tiny follow-up to , to correct more instances of the same log message typo ## What changes were proposed in this pull request? Tiny follow-up to SPARK-16606 / https://github.com/apache/spark/pull/14533 , to correct more instances of the same log message typo ## How was this patch tested? Existing tests (no functional change anyway) Author: Sean Owen Closes #15586 from srowen/SPARK-16606.2. (cherry picked from commit 7178c56433cd138dae53db9194c55e3f4fa0fa69) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0c03c92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0c03c92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0c03c92 Branch: refs/heads/branch-2.0 Commit: a0c03c92545c147015308cce195dfc2e8a3074fb Parents: d3c78c4 Author: Sean Owen Authored: Fri Oct 21 22:20:52 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 22:21:07 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a0c03c92/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index a7de115..13d3e75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -802,7 +802,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.conf.set(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } @@ -814,7 +814,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.conf.set(k, v) } if (options.nonEmpty) { -logWarning("Use an existing SparkSession, some configuration may not take effect.") +logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16606][MINOR] Tiny follow-up to SPARK-16606, to correct more instances of the same log message typo
Repository: spark Updated Branches: refs/heads/master 3fbf5a58c -> 7178c5643 [SPARK-16606][MINOR] Tiny follow-up to , to correct more instances of the same log message typo ## What changes were proposed in this pull request? Tiny follow-up to SPARK-16606 / https://github.com/apache/spark/pull/14533 , to correct more instances of the same log message typo ## How was this patch tested? Existing tests (no functional change anyway) Author: Sean Owen Closes #15586 from srowen/SPARK-16606.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7178c564 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7178c564 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7178c564 Branch: refs/heads/master Commit: 7178c56433cd138dae53db9194c55e3f4fa0fa69 Parents: 3fbf5a5 Author: Sean Owen Authored: Fri Oct 21 22:20:52 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 22:20:52 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7178c564/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index baae550..3045eb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -814,7 +814,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } @@ -826,7 +826,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { -logWarning("Use an existing SparkSession, some configuration may not take effect.") +logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
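For reference, a small scenario that triggers the corrected warning (names are arbitrary; assumes no session exists yet):
```scala
import org.apache.spark.sql.SparkSession

val first = SparkSession.builder().appName("first").getOrCreate()

// The builder finds the existing session, applies the option to it, and
// logs the corrected warning, since some settings can no longer change.
val second = SparkSession.builder()
  .config("spark.sql.shuffle.partitions", "10")
  .getOrCreate()

assert(first eq second)
```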
spark git commit: [SPARK-18042][SQL] OutputWriter should expose file path written
Repository: spark Updated Branches: refs/heads/master c9720b219 -> 3fbf5a58c [SPARK-18042][SQL] OutputWriter should expose file path written ## What changes were proposed in this pull request? This patch adds a new "path" method on OutputWriter that returns the path of the file written by the OutputWriter. This is part of the necessary work to consolidate structured streaming and batch write paths. The batch write path has a nice feature that each data source can define the extension of the files, and allow Spark to specify the staging directory and the prefix for the files. However, in the streaming path we need to collect the list of files written, and there is no interface right now to do that. ## How was this patch tested? N/A - there is no behavior change and this should be covered by existing tests. Author: Reynold Xin Closes #15580 from rxin/SPARK-18042. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3fbf5a58 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3fbf5a58 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3fbf5a58 Branch: refs/heads/master Commit: 3fbf5a58c236fc5d5fee39cb29e7f5c7e01c0ee7 Parents: c9720b2 Author: Reynold Xin Authored: Fri Oct 21 17:27:18 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 17:27:18 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 8 +- .../execution/datasources/OutputWriter.scala| 17 +++- .../execution/datasources/csv/CSVRelation.scala | 8 +- .../datasources/json/JsonFileFormat.scala | 8 +- .../datasources/parquet/ParquetFileFormat.scala | 2 +- .../datasources/parquet/ParquetOptions.scala| 2 +- .../parquet/ParquetOutputWriter.scala | 24 +--- .../datasources/text/TextFileFormat.scala | 25 +++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 29 ++-- .../sql/sources/CommitFailureTestSource.scala | 3 ++ .../spark/sql/sources/SimpleTextRelation.scala | 3 ++ 11 files changed, 90 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3fbf5a58/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index fff8668..5e9e6ff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.text.TextOutputWriter import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -46,12 +47,17 @@ private[libsvm] class LibSVMOutputWriter( context: TaskAttemptContext) extends OutputWriter { + override val path: String = { +val compressionExtension = TextOutputWriter.getCompressionExtension(context) +new Path(stagingDir, fileNamePrefix + ".libsvm" + compressionExtension).toString + } + private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { -new Path(stagingDir, fileNamePrefix + 
extension) +new Path(path) } }.getRecordWriter(context) } http://git-wip-us.apache.org/repos/asf/spark/blob/3fbf5a58/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala index f4cefda..fbf6e96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -42,11 +42,12 @@ abstract class OutputWriterFactory extends Serializable { * @param fileNamePrefix Prefix of the file name. The returned OutputWriter must make sure this * prefix is used in the actual file name. For example, if the prefix is * &quo
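For data source authors, a rough sketch of what honoring the new contract could look like (the class name, constructor shape, and `.echo` extension are invented for illustration; this is not code from the patch):
```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter

private class EchoOutputWriter(
    stagingDir: String,
    fileNamePrefix: String,
    context: TaskAttemptContext) extends OutputWriter {

  // The full path of the file this writer will produce. Exposing it lets
  // callers (e.g. the streaming sink) collect the list of files written.
  override val path: String =
    new Path(stagingDir, fileNamePrefix + ".echo").toString

  override def write(row: Row): Unit = {
    // write the row to `path` using the format's record writer
  }

  override def close(): Unit = {
    // flush and close the underlying record writer
  }
}
```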
spark git commit: [SPARK-18021][SQL] Refactor file name specification for data sources
Repository: spark Updated Branches: refs/heads/master 947f4f252 -> 7f9ec19ea [SPARK-18021][SQL] Refactor file name specification for data sources ## What changes were proposed in this pull request? Currently each data source OutputWriter is responsible for specifying the entire file name for each file output. This, however, does not make any sense because we rely on file naming schemes for certain behaviors in Spark SQL, e.g. bucket id. The current approach allows individual data sources to break the implementation of bucketing. On the flip side, we also don't want to move file naming entirely out of data sources, because different data sources do want to specify different extensions. This patch divides file name specification into two parts: the first part is a prefix specified by the caller of OutputWriter (in WriteOutput), and the second part is the suffix that can be specified by the OutputWriter itself. Note that a side effect of this change is that now all file based data sources also support bucketing automatically. There are also some other minor cleanups: - Removed the UUID passed through generic Configuration string - Some minor rewrites for better clarity - Renamed "path" in multiple places to "stagingDir", to more accurately reflect its meaning ## How was this patch tested? This should be covered by existing data source tests. Author: Reynold Xin Closes #15562 from rxin/SPARK-18021. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f9ec19e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f9ec19e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f9ec19e Branch: refs/heads/master Commit: 7f9ec19eae60abe589ffd22259a9065e7e353a57 Parents: 947f4f2 Author: Reynold Xin Authored: Thu Oct 20 12:18:56 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 12:18:56 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 16 +++--- .../execution/datasources/OutputWriter.scala| 17 +++--- .../sql/execution/datasources/WriteOutput.scala | 56 +--- .../execution/datasources/csv/CSVRelation.scala | 18 +++ .../datasources/json/JsonFileFormat.scala | 17 +++--- .../datasources/parquet/ParquetFileFormat.scala | 7 ++- .../parquet/ParquetOutputWriter.scala | 32 +++ .../datasources/text/TextFileFormat.scala | 21 .../spark/sql/hive/orc/OrcFileFormat.scala | 21 .../spark/sql/sources/BucketedWriteSuite.scala | 5 -- .../sql/sources/CommitFailureTestSource.scala | 6 +-- .../spark/sql/sources/SimpleTextRelation.scala | 26 + 12 files changed, 99 insertions(+), 143 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f9ec19e/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 8577803..fff8668 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -40,7 +40,8 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration private[libsvm] class LibSVMOutputWriter( -path: String, +stagingDir: String, +fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { @@ -50,11 +51,7 @@ private[libsvm] class LibSVMOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { 
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { -val configuration = context.getConfiguration -val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) -val taskAttemptId = context.getTaskAttemptID -val split = taskAttemptId.getTaskID.getId -new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") +new Path(stagingDir, fileNamePrefix + extension) } }.getRecordWriter(context) } @@ -132,12 +129,11 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour dataSchema: StructType): OutputWriterFactory = { new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { -if (bucketId.isDefined) { sys.er
spark git commit: [SPARK-15780][SQL] Support mapValues on KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/master fb0894b3a -> 84b245f2d [SPARK-15780][SQL] Support mapValues on KeyValueGroupedDataset ## What changes were proposed in this pull request? Add mapValues to KeyValueGroupedDataset ## How was this patch tested? New test in DatasetSuite for groupBy function, mapValues, flatMap Author: Koert Kuipers Closes #13526 from koertkuipers/feat-keyvaluegroupeddataset-mapvalues. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84b245f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84b245f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84b245f2 Branch: refs/heads/master Commit: 84b245f2dd31c1cebbf12458bf11f67e287e93f4 Parents: fb0894b Author: Koert Kuipers Authored: Thu Oct 20 10:08:12 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 10:08:12 2016 -0700 -- .../sql/catalyst/plans/logical/object.scala | 13 ++ .../spark/sql/KeyValueGroupedDataset.scala | 42 .../org/apache/spark/sql/DatasetSuite.scala | 11 + 3 files changed, 66 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index fefe5a3..0ab4c90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -230,6 +230,19 @@ object AppendColumns { encoderFor[U].namedExpressions, child) } + + def apply[T : Encoder, U : Encoder]( + func: T => U, + inputAttributes: Seq[Attribute], + child: LogicalPlan): AppendColumns = { +new AppendColumns( + func.asInstanceOf[Any => Any], + implicitly[Encoder[T]].clsTag.runtimeClass, + implicitly[Encoder[T]].schema, + UnresolvedDeserializer(encoderFor[T].deserializer, inputAttributes), + encoderFor[U].namedExpressions, + child) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 828eb94..4cb0313 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -67,6 +67,48 @@ class KeyValueGroupedDataset[K, V] private[sql]( groupingAttributes) /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. The grouping key is unchanged by this. + * + * {{{ + * // Create values grouped by key from a Dataset[(K, V)] + * ds.groupByKey(_._1).mapValues(_._2) // Scala + * }}} + * + * @since 2.1.0 + */ + def mapValues[W : Encoder](func: V => W): KeyValueGroupedDataset[K, W] = { +val withNewData = AppendColumns(func, dataAttributes, logicalPlan) +val projected = Project(withNewData.newColumns ++ groupingAttributes, withNewData) +val executed = sparkSession.sessionState.executePlan(projected) + +new KeyValueGroupedDataset( + encoderFor[K], + encoderFor[W], + executed, + withNewData.newColumns, + groupingAttributes) + } + + /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. 
The grouping key is unchanged by this. + * + * {{{ + * // Create Integer values grouped by String key from a Dataset<Tuple2<String, Integer>> + * Dataset<Tuple2<String, Integer>> ds = ...; + * KeyValueGroupedDataset<String, Integer> grouped = + * ds.groupByKey(t -> t._1, Encoders.STRING()).mapValues(t -> t._2, Encoders.INT()); // Java 8 + * }}} + * + * @since 2.1.0 + */ + def mapValues[W](func: MapFunction[V, W], encoder: Encoder[W]): KeyValueGroupedDataset[K, W] = { +implicit val uEnc = encoder +mapValues { (v: V) => func.call(v) } + } + + /** * Returns a [[Dataset]] that contains each unique key. This is equivalent to doing mapping * over the Dataset to extract the keys and then running a distinct operation on those. * http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/
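In user code the new Scala overload reads as follows (a minimal sketch, assuming a SparkSession in scope as `spark`):
```scala
import spark.implicits._

val ds = Seq(("a", 1), ("a", 2), ("b", 3)).toDS()

// Project each value to its Int payload; the String grouping key is
// untouched, so no re-grouping is needed.
val perKeySum = ds.groupByKey(_._1)
  .mapValues(_._2)
  .reduceGroups(_ + _)   // Dataset[(String, Int)]: ("a", 3), ("b", 3)
```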
spark git commit: [SPARK-17698][SQL] Join predicates should not contain filter clauses
Repository: spark Updated Branches: refs/heads/master e895bc254 -> fb0894b3a [SPARK-17698][SQL] Join predicates should not contain filter clauses ## What changes were proposed in this pull request? Jira: https://issues.apache.org/jira/browse/SPARK-17698 `ExtractEquiJoinKeys` is incorrectly using filter predicates as the join condition for joins. `canEvaluate` [0] tries to see if an `Expression` can be evaluated using the output of a given `Plan`. In the case of filter predicates (e.g. `a.id='1'`), the `Expression` passed for the right hand side (i.e. '1') is a `Literal` which does not have any attribute references. Thus `expr.references` is an empty set, which theoretically is a subset of any set. This leads to `canEvaluate` returning `true`, and `a.id='1'` is treated as a join predicate. While this does not lead to incorrect results, in the case of bucketed + sorted tables we might miss out on avoiding an unnecessary shuffle + sort. See the example below. [0]: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala#L91
```
val df = (1 until 10).toDF("id").coalesce(1)
hc.sql("DROP TABLE IF EXISTS table1").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table1")
hc.sql("DROP TABLE IF EXISTS table2").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table2")
sqlContext.sql("""
  SELECT a.id, b.id
  FROM table1 a
  FULL OUTER JOIN table2 b
  ON a.id = b.id AND a.id='1' AND b.id='1'
""").explain(true)
```
BEFORE: This is doing shuffle + sort over the table scan outputs, which is not needed as both tables are bucketed and sorted on the same columns and have the same number of buckets. This should be a single-stage job.
```
SortMergeJoin [id#38, cast(id#38 as double), 1.0], [id#39, 1.0, cast(id#39 as double)], FullOuter
:- *Sort [id#38 ASC NULLS FIRST, cast(id#38 as double) ASC NULLS FIRST, 1.0 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(id#38, cast(id#38 as double), 1.0, 200)
:     +- *FileScan parquet default.table1[id#38] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
+- *Sort [id#39 ASC NULLS FIRST, 1.0 ASC NULLS FIRST, cast(id#39 as double) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(id#39, 1.0, cast(id#39 as double), 200)
      +- *FileScan parquet default.table2[id#39] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
```
AFTER:
```
SortMergeJoin [id#32], [id#33], FullOuter, ((cast(id#32 as double) = 1.0) && (cast(id#33 as double) = 1.0))
:- *FileScan parquet default.table1[id#32] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
+- *FileScan parquet default.table2[id#33] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
```
## How was this patch tested? - Added a new test case for this scenario: `SPARK-17698 Join predicates should not contain filter clauses` - Ran all the tests in `BucketedReadSuite` Author: Tejas Patil Closes #15272 from tejasapatil/SPARK-17698_join_predicate_filter_clause.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb0894b3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb0894b3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb0894b3 Branch: refs/heads/master Commit: fb0894b3a87331a731129ad3fc7ebe598d90a6ee Parents: e895bc2 Author: Tejas Patil Authored: Thu Oct 20 09:50:55 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 09:50:55 2016 -0700 -- .../sql/catalyst/expressions/predicates.scala | 5 +- .../spark/sql/catalyst/optimizer/joins.scala| 4 +- .../spark/sql/catalyst/planning/patterns.scala | 2 + .../spark/sql/sources/BucketedReadSuite.scala | 124 +++ 4 files changed, 109 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb0894b3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 799858a..9394e39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -84,8 +84,9 @@ trait PredicateHelper { * * For example consider a join
spark git commit: [SPARK-17991][SQL] Enable metastore partition pruning by default.
Repository: spark Updated Branches: refs/heads/master 39755169f -> 4bd17c460 [SPARK-17991][SQL] Enable metastore partition pruning by default. ## What changes were proposed in this pull request? This should apply to non-converted metastore relations. WIP to see if this causes any test failures. ## How was this patch tested? Existing tests. Author: Eric Liang Closes #15475 from ericl/try-enabling-pruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4bd17c46 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4bd17c46 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4bd17c46 Branch: refs/heads/master Commit: 4bd17c4606764242bc29888b8eedc8e4b5a00f46 Parents: 3975516 Author: Eric Liang Authored: Wed Oct 19 23:55:05 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:55:05 2016 -0700 -- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4bd17c46/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9061b1b..ebf4fad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -267,7 +267,7 @@ object SQLConf { .doc("When true, some predicates will be pushed down into the Hive metastore so that " + "unmatching partitions can be eliminated earlier.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_FILESOURCE_PARTITION_PRUNING = SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
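Users who need the old behavior (for example, a metastore that cannot evaluate the pushed-down predicates) can flip the flag back per session; the key below is the one defined in SQLConf:
```scala
// The new default is true; opt out at runtime if needed.
spark.conf.set("spark.sql.hive.metastorePartitionPruning", "false")
```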
spark git commit: [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing
Repository: spark Updated Branches: refs/heads/branch-2.0 995f602d2 -> 4131623a8 [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing ## What changes were proposed in this pull request? - Fix bug of RDD `zipWithIndex` generating wrong result when one partition contains more than 2147483647 records. - Fix bug of RDD `zipWithUniqueId` generating wrong result when one partition contains more than 2147483647 records. ## How was this patch tested? test added. Author: WeichenXu Closes #15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow. (cherry picked from commit 39755169fb5bb07332eef263b4c18ede1528812d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4131623a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4131623a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4131623a Branch: refs/heads/branch-2.0 Commit: 4131623a8585fe99f79d82c24ab3b8b506d0d616 Parents: 995f602 Author: WeichenXu Authored: Wed Oct 19 23:41:38 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:41:46 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../org/apache/spark/rdd/ZippedWithIndexRDD.scala| 5 ++--- .../src/main/scala/org/apache/spark/util/Utils.scala | 15 +++ .../scala/org/apache/spark/util/UtilsSuite.scala | 7 +++ 4 files changed, 25 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 34d32aa..7013396 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag]( def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => - iter.zipWithIndex.map { case (item, i) => + Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index 32931d5..dff6737 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] -firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => - (x._1, split.startIndex + x._2) -} +val parentIter = firstParent[T].iterator(split.prev, context) +Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } } http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 3d862f4..1686edb 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1768,6 +1768,21 @@ private[spark] object Utils extends Logging { } /** + * 
Generate a zipWithIndex iterator, avoid index value overflowing problem + * in scala's zipWithIndex + */ + def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { +new Iterator[(T, Long)] { + var index: Long = startIndex - 1L + def hasNext: Boolean = iterator.hasNext + def next(): (T, Long) = { +index += 1L +(iterator.next(), index) + } +} + } + + /** * Creates a symlink. * * @param src absolute path to the source http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 2741ad7..b67482a 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/sr
spark git commit: [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing
Repository: spark Updated Branches: refs/heads/master f313117bc -> 39755169f [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing ## What changes were proposed in this pull request? - Fix bug of RDD `zipWithIndex` generating wrong result when one partition contains more than 2147483647 records. - Fix bug of RDD `zipWithUniqueId` generating wrong result when one partition contains more than 2147483647 records. ## How was this patch tested? test added. Author: WeichenXu Closes #15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39755169 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39755169 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39755169 Branch: refs/heads/master Commit: 39755169fb5bb07332eef263b4c18ede1528812d Parents: f313117 Author: WeichenXu Authored: Wed Oct 19 23:41:38 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:41:38 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../org/apache/spark/rdd/ZippedWithIndexRDD.scala| 5 ++--- .../src/main/scala/org/apache/spark/util/Utils.scala | 15 +++ .../scala/org/apache/spark/util/UtilsSuite.scala | 7 +++ 4 files changed, 25 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 6dc334c..be11957 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag]( def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => - iter.zipWithIndex.map { case (item, i) => + Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index b5738b9..b0e5ba0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] -firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => - (x._1, split.startIndex + x._2) -} +val parentIter = firstParent[T].iterator(split.prev, context) +Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } } http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7fba901..bfc6094 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1760,6 +1760,21 @@ private[spark] object Utils extends Logging { } /** + * Generate a zipWithIndex iterator, avoid index value overflowing problem + * in scala's zipWithIndex + */ + 
def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { +new Iterator[(T, Long)] { + var index: Long = startIndex - 1L + def hasNext: Boolean = iterator.hasNext + def next(): (T, Long) = { +index += 1L +(iterator.next(), index) + } +} + } + + /** * Creates a symlink. * * @param src absolute path to the source http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index b427f7f..4dda80f 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -396,6 +396,13 @@ class UtilsSuite extends SparkFun
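The user-facing API is unchanged; the fix only matters once a single partition crosses Int.MaxValue records. A quick sketch of the affected operators (assumes an existing SparkContext `sc`):
```scala
val rdd = sc.parallelize(Seq("a", "b", "c"), numSlices = 1)

// Indices are now accumulated as Longs inside each partition, so they
// no longer wrap around after 2147483647 records.
rdd.zipWithIndex().collect()    // Array((a,0), (b,1), (c,2))
rdd.zipWithUniqueId().collect() // ids are k, n+k, 2n+k, ... for partition k
```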
spark git commit: [SPARK-16078][SQL] Backport: from_utc_timestamp/to_utc_timestamp should not depend on local timezone
Repository: spark Updated Branches: refs/heads/branch-1.6 b95ac0d00 -> 82e98f126 [SPARK-16078][SQL] Backport: from_utc_timestamp/to_utc_timestamp should not depend on local timezone ## What changes were proposed in this pull request? Back-port of https://github.com/apache/spark/pull/13784 to `branch-1.6` ## How was this patch tested? Existing tests. Author: Davies Liu Closes #15554 from srowen/SPARK-16078. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82e98f12 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82e98f12 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82e98f12 Branch: refs/heads/branch-1.6 Commit: 82e98f1265f98b49893e04590989b623169d66d9 Parents: b95ac0d Author: Davies Liu Authored: Wed Oct 19 22:55:30 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 22:55:30 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 35 +-- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 74 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/82e98f12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 03c39f8..91eca24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -658,16 +658,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.gen(ctx) s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin } @@ -783,16 +784,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.gen(ctx) s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin } http://git-wip-us.apache.org/repos/asf/spark/blob/82e98f12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 157ac2b..36fe11c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -55,6 +55,7 @@ object DateTimeUtils { // this is year -17999, calculation: 50 * daysIn400Year final val YearZero = -17999 final val toYearZero = to2001 + 7304850 + final val TimeZoneGMT = TimeZone.getTimeZone("GMT") @transient lazy val defaultTimeZone = TimeZone.getDefault @@ -855,13 +856,37 @@ object DateTimeUtils { } /** + * Convert the timestamp `ts` from one timezone to another. + * + * TODO: Because of DST, the conversion between UTC and human time is not exactly one-to-one + * mapping, the conversion here may return wrong result, we should make the timestamp + * timezone-aware. + *
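For context, the user-facing functions whose codegen this touches are `from_utc_timestamp` and `to_utc_timestamp`; a small sketch against the DataFrame API (a DataFrame `df` with a timestamp column `ts` is assumed):
```scala
import org.apache.spark.sql.functions.{col, from_utc_timestamp, to_utc_timestamp}

// Both conversions now go through DateTimeUtils.convertTz with explicit
// time zones, so results no longer depend on the JVM's default zone.
val converted = df.select(
  from_utc_timestamp(col("ts"), "Asia/Seoul").as("seoul_time"),
  to_utc_timestamp(col("ts"), "PST").as("utc_from_pst"))
```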
spark git commit: [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException
Repository: spark Updated Branches: refs/heads/branch-2.0 cdd2570e6 -> 995f602d2 [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException ## What changes were proposed in this pull request? This PR proposes to type-check the second argument, `ascendingOrder`, and fail with a clear analysis error rather than throwing a `ClassCastException`.
```sql
select sort_array(array('b', 'd'), '1');
```
**Before**
```
16/10/19 13:16:08 ERROR SparkSQLDriver: Failed in [select sort_array(array('b', 'd'), '1')]
java.lang.ClassCastException: org.apache.spark.unsafe.types.UTF8String cannot be cast to java.lang.Boolean
at scala.runtime.BoxesRunTime.unboxToBoolean(BoxesRunTime.java:85)
at org.apache.spark.sql.catalyst.expressions.SortArray.nullSafeEval(collectionOperations.scala:185)
at org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:416)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:297)
```
**After**
```
Error in query: cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7;
```
## How was this patch tested? Unit test in `DataFrameFunctionsSuite`. Author: hyukjinkwon Closes #15532 from HyukjinKwon/SPARK-17989.
(cherry picked from commit 4b2011ec9da1245923b5cbd883240fef0dbf3ef0) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/995f602d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/995f602d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/995f602d Branch: refs/heads/branch-2.0 Commit: 995f602d27bdcf9e6787d93dbea2357e6dc6ccaa Parents: cdd2570 Author: hyukjinkwon Authored: Wed Oct 19 19:36:21 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 19:36:53 2016 -0700 -- .../expressions/collectionOperations.scala | 8 +++- .../test/resources/sql-tests/inputs/array.sql | 6 ++ .../resources/sql-tests/results/array.sql.out | 21 +--- 3 files changed, 31 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/995f602d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 2e8ea11..1efe2cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -112,7 +112,13 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - TypeCheckResult.TypeCheckSuccess + ascendingOrder match { +case Literal(_: Boolean, BooleanType) => + TypeCheckResult.TypeCheckSuccess +case _ => + TypeCheckResult.TypeCheckFailure( +"Sort order in second argument requires a boolean literal.") + } case ArrayType(dt, _) => TypeCheckResult.TypeCheckFailure( s"$prettyName does not support sorting array of type ${dt.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/995f602d/sql/core/src/test/resources/sql-tests/inputs/array.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 4038a0d..984321a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -71,6 +71,12 @@ select sort_array(timestamp_array) from primitive_arrays; +-- sort_array with an invalid string literal for the argument of sort order.
spark git commit: [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException
Repository: spark Updated Branches: refs/heads/master 444c2d22e -> 4b2011ec9 [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException ## What changes were proposed in this pull request? This PR proposes to type-check the second argument, `ascendingOrder`, and fail with a clear analysis error rather than throwing a `ClassCastException`.
```sql
select sort_array(array('b', 'd'), '1');
```
**Before**
```
16/10/19 13:16:08 ERROR SparkSQLDriver: Failed in [select sort_array(array('b', 'd'), '1')]
java.lang.ClassCastException: org.apache.spark.unsafe.types.UTF8String cannot be cast to java.lang.Boolean
at scala.runtime.BoxesRunTime.unboxToBoolean(BoxesRunTime.java:85)
at org.apache.spark.sql.catalyst.expressions.SortArray.nullSafeEval(collectionOperations.scala:185)
at org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:416)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:297)
```
**After**
```
Error in query: cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7;
```
## How was this patch tested? Unit test in `DataFrameFunctionsSuite`. Author: hyukjinkwon Closes #15532 from HyukjinKwon/SPARK-17989.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b2011ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b2011ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b2011ec Branch: refs/heads/master Commit: 4b2011ec9da1245923b5cbd883240fef0dbf3ef0 Parents: 444c2d2 Author: hyukjinkwon Authored: Wed Oct 19 19:36:21 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 19:36:21 2016 -0700 -- .../expressions/collectionOperations.scala | 8 +++- .../test/resources/sql-tests/inputs/array.sql | 6 ++ .../resources/sql-tests/results/array.sql.out | 21 +--- 3 files changed, 31 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b2011ec/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index c020029..f56bb39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -124,7 +124,13 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - TypeCheckResult.TypeCheckSuccess + ascendingOrder match { +case Literal(_: Boolean, BooleanType) => + TypeCheckResult.TypeCheckSuccess +case _ => + TypeCheckResult.TypeCheckFailure( +"Sort order in second argument requires a boolean literal.") + } case ArrayType(dt, _) => TypeCheckResult.TypeCheckFailure( s"$prettyName does not support sorting array of type ${dt.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/4b2011ec/sql/core/src/test/resources/sql-tests/inputs/array.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 4038a0d..984321a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -71,6 +71,12 @@ select sort_array(timestamp_array) from primitive_arrays; +-- sort_array with an invalid string literal for the argument of sort order. +select sort_array(array('b', 'd'), '1'); + +-- sort_array with an invalid null literal casted as boolean
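For contrast with the rejected call above, the happy path (a sketch assuming a SparkSession named `spark`):
```scala
// The second argument must be a boolean literal; anything else now yields
// a clean analysis error instead of a ClassCastException.
spark.sql("SELECT sort_array(array('b', 'd', 'a'), true)").show()  // ["a","b","d"]
spark.sql("SELECT sort_array(array('b', 'd', 'a'), false)").show() // ["d","b","a"]
```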
spark git commit: [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame
Repository: spark Updated Branches: refs/heads/branch-2.0 3796a98cf -> cdd2570e6 [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame ## What changes were proposed in this pull request? In http://spark.apache.org/docs/latest/sql-programming-guide.html, in the section "Untyped Dataset Operations (aka DataFrame Operations)", the link to the R DataFrame docs is broken; it returns "The requested URL /docs/latest/api/R/DataFrame.html was not found on this server." The correct link for Spark 2.0 is SparkDataFrame.html. ## How was this patch tested? Manually checked. Author: Tommy YU Closes #15543 from Wenpei/spark-18001. (cherry picked from commit f39852e59883c214b0d007faffb406570ea3084b) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cdd2570e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cdd2570e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cdd2570e Branch: refs/heads/branch-2.0 Commit: cdd2570e6dbfc5af68d0c9a49e4493e4e5e53020 Parents: 3796a98 Author: Tommy YU Authored: Tue Oct 18 21:15:32 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 21:15:40 2016 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cdd2570e/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0a6bdb6..3a90323 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -140,7 +140,7 @@ As an example, the following creates a DataFrame based on the content of a JSON ## Untyped Dataset Operations (aka DataFrame Operations) -DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/DataFrame.html). +DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/SparkDataFrame.html). As mentioned above, in Spark 2.0, DataFrames are just Dataset of `Row`s in Scala and Java API. These operations are also referred as "untyped transformations" in contrast to "typed transformations" come with strongly typed Scala/Java Datasets. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame
Repository: spark Updated Branches: refs/heads/master 4329c5cea -> f39852e59 [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame ## What changes were proposed in this pull request? In http://spark.apache.org/docs/latest/sql-programming-guide.html, in the section "Untyped Dataset Operations (aka DataFrame Operations)", the link to the R DataFrame docs is broken; it returns "The requested URL /docs/latest/api/R/DataFrame.html was not found on this server." The correct link for Spark 2.0 is SparkDataFrame.html. ## How was this patch tested? Manually checked. Author: Tommy YU Closes #15543 from Wenpei/spark-18001. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f39852e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f39852e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f39852e5 Branch: refs/heads/master Commit: f39852e59883c214b0d007faffb406570ea3084b Parents: 4329c5c Author: Tommy YU Authored: Tue Oct 18 21:15:32 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 21:15:32 2016 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f39852e5/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3f1b73a..d334a86 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -140,7 +140,7 @@ As an example, the following creates a DataFrame based on the content of a JSON ## Untyped Dataset Operations (aka DataFrame Operations) -DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/DataFrame.html). +DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/SparkDataFrame.html). As mentioned above, in Spark 2.0, DataFrames are just Dataset of `Row`s in Scala and Java API. These operations are also referred as "untyped transformations" in contrast to "typed transformations" come with strongly typed Scala/Java Datasets. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17841][STREAMING][KAFKA] drain commitQueue
Repository: spark Updated Branches: refs/heads/branch-2.0 6ef923137 -> f6b87939c [SPARK-17841][STREAMING][KAFKA] drain commitQueue ## What changes were proposed in this pull request? Actually drain commit queue rather than just iterating it. iterator() on a concurrent linked queue won't remove items from the queue, poll() will. ## How was this patch tested? Unit tests Author: cody koeninger Closes #15407 from koeninger/SPARK-17841. (cherry picked from commit cd106b050ff789b6de539956a7f01159ab15c820) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6b87939 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6b87939 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6b87939 Branch: refs/heads/branch-2.0 Commit: f6b87939cb90bf4a0996b3728c1bccdf5e24dd4e Parents: 6ef9231 Author: cody koeninger Authored: Tue Oct 18 14:01:49 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 14:01:59 2016 -0700 -- .../spark/streaming/kafka010/DirectKafkaInputDStream.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f6b87939/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala -- diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 432537e..7e57bb1 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -282,13 +282,13 @@ private[spark] class DirectKafkaInputDStream[K, V]( protected def commitAll(): Unit = { val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]() -val it = commitQueue.iterator() -while (it.hasNext) { - val osr = it.next +var osr = commitQueue.poll() +while (null != osr) { val tp = osr.topicPartition val x = m.get(tp) val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) } m.put(tp, new OffsetAndMetadata(offset)) + osr = commitQueue.poll() } if (!m.isEmpty) { consumer.commitAsync(m, commitCallback.get) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17841][STREAMING][KAFKA] drain commitQueue
Repository: spark Updated Branches: refs/heads/master cd662bc7a -> cd106b050 [SPARK-17841][STREAMING][KAFKA] drain commitQueue ## What changes were proposed in this pull request? Actually drain the commit queue rather than just iterating over it: iterator() on a ConcurrentLinkedQueue won't remove items from the queue, while poll() will. ## How was this patch tested? Unit tests Author: cody koeninger Closes #15407 from koeninger/SPARK-17841. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd106b05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd106b05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd106b05 Branch: refs/heads/master Commit: cd106b050ff789b6de539956a7f01159ab15c820 Parents: cd662bc Author: cody koeninger Authored: Tue Oct 18 14:01:49 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 14:01:49 2016 -0700 -- .../spark/streaming/kafka010/DirectKafkaInputDStream.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd106b05/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala -- diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 432537e..7e57bb1 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -282,13 +282,13 @@ private[spark] class DirectKafkaInputDStream[K, V]( protected def commitAll(): Unit = { val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]() -val it = commitQueue.iterator() -while (it.hasNext) { - val osr = it.next +var osr = commitQueue.poll() +while (null != osr) { val tp = osr.topicPartition val x = m.get(tp) val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) } m.put(tp, new OffsetAndMetadata(offset)) + osr = commitQueue.poll() } if (!m.isEmpty) { consumer.commitAsync(m, commitCallback.get)
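As a self-contained illustration of the fix (a sketch, not Spark's code): iterating a `ConcurrentLinkedQueue` leaves every element in place, while a `poll()` loop removes the head on each pass and therefore actually drains the queue.

```scala
import java.util.concurrent.ConcurrentLinkedQueue

object DrainDemo extends App {
  val queue = new ConcurrentLinkedQueue[String]()
  Seq("a", "b", "c").foreach(queue.add)

  // iterator() only reads; afterwards all three elements are still queued.
  val it = queue.iterator()
  while (it.hasNext) it.next()
  assert(queue.size == 3)

  // poll() removes and returns the head (or null when empty), draining the queue.
  var elem = queue.poll()
  while (elem != null) {
    // process elem here
    elem = queue.poll()
  }
  assert(queue.isEmpty)
}
```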
spark git commit: Revert "[SPARK-17985][CORE] Bump commons-lang3 version to 3.5."
Repository: spark Updated Branches: refs/heads/master b3130c7b6 -> cd662bc7a Revert "[SPARK-17985][CORE] Bump commons-lang3 version to 3.5." This reverts commit bfe7885aee2f406c1bbde08e30809a0b4bb070d2. The commit caused build failures on Hadoop 2.2 profile: ``` [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1489: value read is not a member of object org.apache.commons.io.IOUtils [error] var numBytes = IOUtils.read(gzInputStream, buf) [error] ^ [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1492: value read is not a member of object org.apache.commons.io.IOUtils [error] numBytes = IOUtils.read(gzInputStream, buf) [error]^ ``` Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd662bc7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd662bc7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd662bc7 Branch: refs/heads/master Commit: cd662bc7a2050264f40650442858a85c4827b608 Parents: b3130c7 Author: Reynold Xin Authored: Tue Oct 18 13:56:35 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:56:35 2016 -0700 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 525dcef..b30f8c3 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 562fe64..5b3a765 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 747521a..e323efe 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index afd4502..77d97e5 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar 
http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 687b855..572edfa 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/docs/streaming-flume-integration.md -- diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index a5d36da..767e1f9 100644 --- a/docs/streaming-flume-integration.
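For context on the build failure quoted above: `IOUtils.read` only exists in newer commons-io releases than the one the Hadoop 2.2 profile resolves to. A hedged sketch of an equivalent read loop that avoids the helper entirely — this mirrors the intent, not Spark's actual replacement code:

```scala
import java.io.InputStream

// Read until `buf` is full or EOF, returning the number of bytes read --
// roughly what commons-io's IOUtils.read(in, buf) provides.
def readFully(in: InputStream, buf: Array[Byte]): Int = {
  var offset = 0
  var eof = false
  while (offset < buf.length && !eof) {
    val n = in.read(buf, offset, buf.length - offset)
    if (n == -1) eof = true else offset += n
  }
  offset
}
```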
spark git commit: [SPARK-17955][SQL] Make DataFrameReader.jdbc call DataFrameReader.format("jdbc").load
Repository: spark Updated Branches: refs/heads/master 4518642ab -> b3130c7b6 [SPARK-17955][SQL] Make DataFrameReader.jdbc call DataFrameReader.format("jdbc").load ## What changes were proposed in this pull request? This PR proposes to make `DataFrameReader.jdbc` call `DataFrameReader.format("jdbc").load` consistently with other APIs in `DataFrameReader`/`DataFrameWriter` and avoid calling `sparkSession.baseRelationToDataFrame(..)` here and there. The changes were mostly copied from `DataFrameWriter.jdbc()` which was recently updated. ```diff -val params = extraOptions.toMap ++ connectionProperties.asScala.toMap -val options = new JDBCOptions(url, table, params) -val relation = JDBCRelation(parts, options)(sparkSession) -sparkSession.baseRelationToDataFrame(relation) +this.extraOptions = this.extraOptions ++ connectionProperties.asScala +// explicit url and dbtable should override all +this.extraOptions += ("url" -> url, "dbtable" -> table) +format("jdbc").load() ``` ## How was this patch tested? Existing tests should cover this. Author: hyukjinkwon Closes #15499 from HyukjinKwon/SPARK-17955. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3130c7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3130c7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3130c7b Branch: refs/heads/master Commit: b3130c7b6a1ab4975023f08c3ab02ee8d2c7e995 Parents: 4518642 Author: hyukjinkwon Authored: Tue Oct 18 13:49:02 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:49:02 2016 -0700 -- .../main/scala/org/apache/spark/sql/DataFrameReader.scala| 8 .../main/scala/org/apache/spark/sql/DataFrameWriter.scala| 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3130c7b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index ac33585..b7b2203 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -232,10 +232,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { parts: Array[Partition], connectionProperties: Properties): DataFrame = { // connectionProperties should override settings in extraOptions. 
-val params = extraOptions.toMap ++ connectionProperties.asScala.toMap -val options = new JDBCOptions(url, table, params) -val relation = JDBCRelation(parts, options)(sparkSession) -sparkSession.baseRelationToDataFrame(relation) +this.extraOptions = this.extraOptions ++ connectionProperties.asScala +// explicit url and dbtable should override all +this.extraOptions += ("url" -> url, "dbtable" -> table) +format("jdbc").load() } /** http://git-wip-us.apache.org/repos/asf/spark/blob/b3130c7b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 35ef050..5be3277 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -426,8 +426,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { assertNotPartitioned("jdbc") assertNotBucketed("jdbc") -// connectionProperties should override settings in extraOptions -this.extraOptions = this.extraOptions ++ (connectionProperties.asScala) +// connectionProperties should override settings in extraOptions. +this.extraOptions = this.extraOptions ++ connectionProperties.asScala // explicit url and dbtable should override all this.extraOptions += ("url" -> url, "dbtable" -> table) format("jdbc").save()
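With this change, `DataFrameReader.jdbc` becomes thin sugar over the generic source API, so the two reads below should be equivalent; the URL, table name, credentials, and the `spark` session are placeholders for illustration:

```scala
import java.util.Properties

val url = "jdbc:postgresql://localhost/testdb" // placeholder
val props = new Properties()
props.setProperty("user", "test")              // placeholder

// Convenience API:
val df1 = spark.read.jdbc(url, "people", props)

// The generic path it now delegates to:
val df2 = spark.read
  .format("jdbc")
  .option("url", url)
  .option("dbtable", "people")
  .option("user", "test")
  .load()
```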
spark git commit: [MINOR][DOC] Add more built-in sources in sql-programming-guide.md
Repository: spark Updated Branches: refs/heads/master bfe7885ae -> 20dd11096 [MINOR][DOC] Add more built-in sources in sql-programming-guide.md ## What changes were proposed in this pull request? Add more built-in sources in sql-programming-guide.md. ## How was this patch tested? Manually. Author: Weiqing Yang Closes #15522 from weiqingy/dsDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20dd1109 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20dd1109 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20dd1109 Branch: refs/heads/master Commit: 20dd11096cfda51e47b9dbe3b715a12ccbb4ce1d Parents: bfe7885 Author: Weiqing Yang Authored: Tue Oct 18 13:38:14 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:38:14 2016 -0700 -- docs/sql-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20dd1109/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dcc828c..3f1b73a 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -422,8 +422,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config You can also manually specify the data source that will be used along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can also use their short -names (`json`, `parquet`, `jdbc`). DataFrames loaded from any data source type can be converted into other types -using this syntax. +names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data +source type can be converted into other types using this syntax.
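A short Scala sketch of what the amended paragraph describes — loading with one built-in short name and converting to another; the paths and an active `spark` session are assumed for illustration:

```scala
// Load with the "csv" short name instead of the fully qualified class name.
val df = spark.read
  .format("csv")
  .option("header", "true")
  .load("examples/src/main/resources/people.csv") // illustrative path

// A DataFrame loaded from one source type can be written out as another.
df.write.format("orc").save("people_orc")
```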
spark git commit: [MINOR][DOC] Add more built-in sources in sql-programming-guide.md
Repository: spark Updated Branches: refs/heads/branch-2.0 26e978a93 -> 6ef923137 [MINOR][DOC] Add more built-in sources in sql-programming-guide.md ## What changes were proposed in this pull request? Add more built-in sources in sql-programming-guide.md. ## How was this patch tested? Manually. Author: Weiqing Yang Closes #15522 from weiqingy/dsDoc. (cherry picked from commit 20dd11096cfda51e47b9dbe3b715a12ccbb4ce1d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ef92313 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ef92313 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ef92313 Branch: refs/heads/branch-2.0 Commit: 6ef9231377c7cce949dc7a988bb9d7a5cb3e458d Parents: 26e978a Author: Weiqing Yang Authored: Tue Oct 18 13:38:14 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:38:50 2016 -0700 -- docs/sql-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ef92313/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0bd0093..0a6bdb6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -387,8 +387,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config You can also manually specify the data source that will be used along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can also use their short -names (`json`, `parquet`, `jdbc`). DataFrames loaded from any data source type can be converted into other types -using this syntax. +names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data +source type can be converted into other types using this syntax.
spark git commit: [SPARK-17985][CORE] Bump commons-lang3 version to 3.5.
Repository: spark Updated Branches: refs/heads/master 4ef39c2f4 -> bfe7885ae [SPARK-17985][CORE] Bump commons-lang3 version to 3.5. ## What changes were proposed in this pull request? `SerializationUtils.clone()` in commons-lang3 (< 3.5) has a thread-safety bug: it can sometimes get stuck because of a race condition when initializing a hash map. See https://issues.apache.org/jira/browse/LANG-1251. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15525 from ueshin/issues/SPARK-17985. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfe7885a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfe7885a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfe7885a Branch: refs/heads/master Commit: bfe7885aee2f406c1bbde08e30809a0b4bb070d2 Parents: 4ef39c2 Author: Takuya UESHIN Authored: Tue Oct 18 13:36:00 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:36:00 2016 -0700 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index b30f8c3..525dcef 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 5b3a765..562fe64 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index e323efe..747521a 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 77d97e5..afd4502 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 572edfa..687b855 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++
b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/docs/streaming-flume-integration.md -- diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index 767e1f9..a5d36da 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -115,11 +115,11 @@ Configuring Flume on the chosen machine requires the following two steps. artifactId = scala-library version = {{site.SCALA_VERSION}} - (iii) *Commons Lang 3 JAR*
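A minimal sketch of the scenario LANG-1251 describes (illustrative only): several threads racing through their first `SerializationUtils.clone()` call, which in commons-lang3 < 3.5 could hang on the library's internal class-cache initialization; with 3.5 this should complete normally.

```scala
import org.apache.commons.lang3.SerializationUtils

case class Payload(id: Int, data: String) // case classes are Serializable

object CloneRace extends App {
  val threads = (1 to 8).map { i =>
    new Thread(new Runnable {
      override def run(): Unit = {
        // The racy path was the concurrent first use of clone().
        val copy = SerializationUtils.clone(Payload(i, "x"))
        assert(copy.id == i)
      }
    })
  }
  threads.foreach(_.start())
  threads.foreach(_.join())
}
```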
spark git commit: [SPARK-17974] try 2) Refactor FileCatalog classes to simplify the inheritance tree
Repository: spark Updated Branches: refs/heads/master 231f39e3f -> 4ef39c2f4 [SPARK-17974] try 2) Refactor FileCatalog classes to simplify the inheritance tree ## What changes were proposed in this pull request? This renames `BasicFileCatalog => FileCatalog`, combines `SessionFileCatalog` with `PartitioningAwareFileCatalog`, and removes the old `FileCatalog` trait. In summary, ``` MetadataLogFileCatalog extends PartitioningAwareFileCatalog ListingFileCatalog extends PartitioningAwareFileCatalog PartitioningAwareFileCatalog extends FileCatalog TableFileCatalog extends FileCatalog ``` (note that this is a re-submission of https://github.com/apache/spark/pull/15518 which got reverted) ## How was this patch tested? Existing tests Author: Eric Liang Closes #15533 from ericl/fix-scalastyle-revert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ef39c2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ef39c2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ef39c2f Branch: refs/heads/master Commit: 4ef39c2f4436fa22d0b957fe7ad477e4c4a16452 Parents: 231f39e Author: Eric Liang Authored: Tue Oct 18 13:33:46 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:33:46 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 2 +- .../sql/execution/DataSourceScanExec.scala | 4 +- .../sql/execution/datasources/FileCatalog.scala | 66 ++ .../sql/execution/datasources/FileFormat.scala | 61 - .../datasources/HadoopFsRelation.scala | 4 +- .../PartitioningAwareFileCatalog.scala | 217 +- .../datasources/PartitioningUtils.scala | 12 +- .../datasources/SessionFileCatalog.scala| 225 --- .../datasources/TableFileCatalog.scala | 11 +- .../datasources/FileCatalogSuite.scala | 10 + .../datasources/SessionFileCatalogSuite.scala | 34 --- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 13 files changed, 304 insertions(+), 354 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ef39c2f/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7dccbbd..073d2b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} http://git-wip-us.apache.org/repos/asf/spark/blob/4ef39c2f/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 623d2be..fdd1fa3 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -431,7 +431,7 @@ case class FileSourceScanExec( private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val bucketed = @@ -463,7 +463,7 @@ case class FileSourceScanExec( */ private def createNonBucketedReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { val defaultMaxSplitBytes = fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes http://git-wip-us.apache.org/repos/asf/spark/blo
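The hierarchy summarized in the PR description reduces to the shape below, shown as bare Scala stubs; all constructors and members are omitted, and only the inheritance edges come from the commit message:

```scala
// The single remaining base trait after the refactor.
trait FileCatalog

// Combines the old SessionFileCatalog behavior with partition awareness.
abstract class PartitioningAwareFileCatalog extends FileCatalog

class ListingFileCatalog extends PartitioningAwareFileCatalog
class MetadataLogFileCatalog extends PartitioningAwareFileCatalog

// Metastore-backed catalog, hanging directly off the base trait.
class TableFileCatalog extends FileCatalog
```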
spark git commit: [SPARK-17899][SQL][FOLLOW-UP] debug mode should work for corrupted table
Repository: spark Updated Branches: refs/heads/master a9e79a41e -> e59df62e6 [SPARK-17899][SQL][FOLLOW-UP] debug mode should work for corrupted table ## What changes were proposed in this pull request? Debug mode should also work for corrupted tables, so that we can actually debug them. ## How was this patch tested? new test in `MetastoreDataSourcesSuite` Author: Wenchen Fan Closes #15528 from cloud-fan/debug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e59df62e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e59df62e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e59df62e Branch: refs/heads/master Commit: e59df62e62ec4c5f8bd02a13f05fa3ec6f0fc694 Parents: a9e79a4 Author: Wenchen Fan Authored: Tue Oct 18 11:03:10 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 11:03:10 2016 -0700 -- .../spark/sql/hive/HiveExternalCatalog.scala | 9 ++--- .../sql/hive/MetastoreDataSourcesSuite.scala | 18 +++--- 2 files changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e59df62e/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ff59b54..2003ff4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -448,7 +448,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * properties, and filter out these special entries from table properties. */ private def restoreTableMetadata(table: CatalogTable): CatalogTable = { -val catalogTable = if (table.tableType == VIEW) { +val catalogTable = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) { table } else { getProviderFromTableProperties(table).map { provider => @@ -467,18 +467,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } else { table.storage } -val tableProps = if (conf.get(DEBUG_MODE)) { - table.properties -} else { - getOriginalTableProperties(table) -} table.copy( storage = storage, schema = getSchemaFromTableProperties(table), provider = Some(provider), partitionColumnNames = getPartitionColumnsFromTableProperties(table), bucketSpec = getBucketSpecFromTableProperties(table), - properties = tableProps) + properties = getOriginalTableProperties(table)) } getOrElse { table.copy(provider = Some("hive")) } http://git-wip-us.apache.org/repos/asf/spark/blob/e59df62e/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 7cc6179..eaa67d3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -1321,20 +1321,32 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv sharedState.externalCatalog.getTable("default", "t") }.getMessage assert(e.contains(s"Could not read schema from the hive metastore because it is corrupted")) + + withDebugMode { +val tableMeta = sharedState.externalCatalog.getTable("default", "t") +assert(tableMeta.identifier == TableIdentifier("t", Some("default")))
+assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") + } } finally { hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true) } } test("should keep data source entries in table properties when debug mode is on") { -val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) -try { - sparkSession.sparkContext.conf.set(DEBUG_MODE, true) +withDebugMode { val newSession = sparkSession.newSession() newSession.sql("CREATE TABLE abc(i int) USING json") val tableMeta = newSession.sessionState.catalog.getTableMetadata(TableIdentifier("abc")) assert(tableMeta.properties(DATASOURCE_SCHEMA_NUMPARTS).toInt == 1) assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") +} + } + + private def withDebugMode(f: => Unit): Unit = { +val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) +try { + sparkSession.sparkContext.conf.set(DEBUG_MODE, true) + f } finally { sparkSession.sparkContext.conf.set(DEBUG_MODE, previousValue) } }