spark git commit: [SPARK-17183][SPARK-17983][SPARK-18101][SQL] put hive serde table schema to table properties like data source table
Repository: spark Updated Branches: refs/heads/master 6e2701815 -> 95ec4e25b [SPARK-17183][SPARK-17983][SPARK-18101][SQL] put hive serde table schema to table properties like data source table ## What changes were proposed in this pull request? For data source tables, we will put its table schema, partition columns, etc. to table properties, to work around some hive metastore issues, e.g. not case-preserving, bad decimal type support, etc. We should also do this for hive serde tables, to reduce the difference between hive serde tables and data source tables, e.g. column names should be case preserving. ## How was this patch tested? existing tests, and a new test in `HiveExternalCatalog` Author: Wenchen Fan Closes #14750 from cloud-fan/minor1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95ec4e25 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95ec4e25 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95ec4e25 Branch: refs/heads/master Commit: 95ec4e25bb65f37f80222ffe70a95993a9149f80 Parents: 6e27018 Author: Wenchen Fan Authored: Sat Nov 5 00:58:50 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:58:50 2016 -0700 -- .../sql/catalyst/catalog/ExternalCatalog.scala | 8 +- .../sql/catalyst/catalog/InMemoryCatalog.scala | 6 - .../org/apache/spark/sql/types/DataType.scala | 24 ++ .../catalyst/catalog/ExternalCatalogSuite.scala | 20 ++ .../org/apache/spark/sql/DataFrameWriter.scala | 10 +- .../spark/sql/execution/SparkSqlParser.scala| 4 +- .../spark/sql/execution/SparkStrategies.scala | 6 +- .../spark/sql/execution/command/ddl.scala | 4 +- .../spark/sql/execution/datasources/rules.scala | 5 +- .../spark/sql/hive/HiveExternalCatalog.scala| 218 ++- .../input1-2-d3aa54d5436b7b59ff5c7091b7ca6145 | 4 +- .../input2-1-e0efeda558cd0194f4764a5735147b16 | 4 +- .../input2-2-aa9ab0598e0cb7a12c719f9b3d98dbfd | 4 +- .../input2-4-235f92683416fab031e6e7490487b15b | 6 +- ...w_columns-2-b74990316ec4245fd8a7011e684b39da | 6 +- .../hive/PartitionedTablePerfStatsSuite.scala | 9 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 17 files changed, 245 insertions(+), 97 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95ec4e25/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index a5e0252..14dd707 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.catalog -import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException} +import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException, NoSuchTableException} import org.apache.spark.sql.catalyst.expressions.Expression @@ -39,6 +39,12 @@ abstract class ExternalCatalog { } } + protected def requireTableExists(db: String, table: String): Unit = { +if (!tableExists(db, table)) { + throw new NoSuchTableException(db = db, table = table) +} + } + protected def requireFunctionExists(db: String, funcName: String): Unit = { if (!functionExists(db, funcName)) { throw new NoSuchFunctionException(db = db, func = funcName) 
http://git-wip-us.apache.org/repos/asf/spark/blob/95ec4e25/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index ea675b7..bc39688 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -64,12 +64,6 @@ class InMemoryCatalog( catalog(db).tables(table).partitions.contains(spec) } - private def requireTableExists(db: String, table: String): Unit = { -if (!tableExists(db, table)) { - throw new NoSuchTableException(db = db, table = table) -} - } - private def requireTableNotExists(db: String, table: String): Unit = { if (tableExists(db, table)) { throw new TableAlreadyE
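A minimal sketch of the mechanism this commit relies on: a Spark SQL schema can be round-tripped through string-valued table properties with column-name case intact, which the Hive metastore alone would lose. The property key below is illustrative, not necessarily the exact key `HiveExternalCatalog` uses.

```
import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("userId", LongType),
  StructField("eventTime", TimestampType)))

// Serialize the schema into a table property, then restore it.
val props = Map("spark.sql.sources.schema" -> schema.json)
val restored = DataType.fromJson(props("spark.sql.sources.schema"))
  .asInstanceOf[StructType]

assert(restored == schema)  // "userId" keeps its case; Hive would lowercase it
```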
spark git commit: [SPARK-18260] Make from_json null safe
Repository: spark Updated Branches: refs/heads/master 8a9ca1924 -> 6e2701815 [SPARK-18260] Make from_json null safe ## What changes were proposed in this pull request? `from_json` is currently not safe against `null` rows. This PR adds a fix and a regression test for it. ## How was this patch tested? Regression test Author: Burak Yavuz Closes #15771 from brkyvz/json_fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e270181 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e270181 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e270181 Branch: refs/heads/master Commit: 6e2701815761d5870111cb56300e30d3059b39ed Parents: 8a9ca19 Author: Burak Yavuz Authored: Sat Nov 5 00:07:51 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:07:51 2016 -0700 -- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 +++- .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 8 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e270181/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index e034735..89fe7c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -498,7 +498,9 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: override def children: Seq[Expression] = child :: Nil override def eval(input: InternalRow): Any = { -try parser.parse(child.eval(input).toString).head catch { +val json = child.eval(input) +if (json == null) return null +try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } } http://git-wip-us.apache.org/repos/asf/spark/blob/6e270181/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f9db649..3bfa0bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -344,6 +344,14 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + test("from_json null input column") { +val schema = StructType(StructField("a", IntegerType) :: Nil) +checkEvaluation( + JsonToStruct(schema, Map.empty, Literal(null)), + null +) + } + test("to_json") { val schema = StructType(StructField("a", IntegerType) :: Nil) val struct = Literal.create(create_row(1), schema) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18260] Make from_json null safe
Repository: spark Updated Branches: refs/heads/branch-2.1 707630147 -> 42386e796 [SPARK-18260] Make from_json null safe ## What changes were proposed in this pull request? `from_json` is currently not safe against `null` rows. This PR adds a fix and a regression test for it. ## How was this patch tested? Regression test Author: Burak Yavuz Closes #15771 from brkyvz/json_fix. (cherry picked from commit 6e2701815761d5870111cb56300e30d3059b39ed) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42386e79 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42386e79 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42386e79 Branch: refs/heads/branch-2.1 Commit: 42386e796f6519d22092fba88a8c42cba6511d7c Parents: 7076301 Author: Burak Yavuz Authored: Sat Nov 5 00:07:51 2016 -0700 Committer: Reynold Xin Committed: Sat Nov 5 00:08:00 2016 -0700 -- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 +++- .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 8 2 files changed, 11 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42386e79/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index e034735..89fe7c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -498,7 +498,9 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: override def children: Seq[Expression] = child :: Nil override def eval(input: InternalRow): Any = { -try parser.parse(child.eval(input).toString).head catch { +val json = child.eval(input) +if (json == null) return null +try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } } http://git-wip-us.apache.org/repos/asf/spark/blob/42386e79/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f9db649..3bfa0bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -344,6 +344,14 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + test("from_json null input column") { +val schema = StructType(StructField("a", IntegerType) :: Nil) +checkEvaluation( + JsonToStruct(schema, Map.empty, Literal(null)), + null +) + } + test("to_json") { val schema = StructType(StructField("a", IntegerType) :: Nil) val struct = Literal.create(create_row(1), schema) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
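A usage sketch of the fixed behavior, assuming a SparkSession named `spark`: a null value in the input column now yields a null result instead of a NullPointerException from calling `toString` on null.

```
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{IntegerType, StructType}
import spark.implicits._

val schema = new StructType().add("a", IntegerType)
val df = Seq("""{"a": 1}""", null).toDF("json")

// First row parses to a struct [1]; the null row maps to null, not an NPE.
df.select(from_json($"json", schema)).collect()
```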
spark git commit: [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used
Repository: spark Updated Branches: refs/heads/branch-2.1 491db67a5 -> 707630147 [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used ## What changes were proposed in this pull request? Add comments. ## How was this patch tested? Build passed. Author: Weiqing Yang Closes #15776 from weiqingy/SPARK-17710. (cherry picked from commit 8a9ca1924792d1a7c733bdfd757996b3ade0d63d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70763014 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70763014 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70763014 Branch: refs/heads/branch-2.1 Commit: 707630147e51114aa90f58f375df43bb2b5f7fb4 Parents: 491db67 Author: Weiqing Yang Authored: Fri Nov 4 23:44:46 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:44:53 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70763014/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 22c28fb..1de66af 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2539,6 +2539,8 @@ private[util] object CallerContext extends Logging { val callerContextSupported: Boolean = { SparkHadoopUtil.get.conf.getBoolean("hadoop.caller.context.enabled", false) && { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname Class.forName("org.apache.hadoop.ipc.CallerContext") Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") @@ -2604,6 +2606,8 @@ private[spark] class CallerContext( def setCurrentContext(): Unit = { if (CallerContext.callerContextSupported) { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") val builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used
Repository: spark Updated Branches: refs/heads/master 0f7c9e84e -> 8a9ca1924 [SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForName' is not used ## What changes were proposed in this pull request? Add comments. ## How was this patch tested? Build passed. Author: Weiqing Yang Closes #15776 from weiqingy/SPARK-17710. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a9ca192 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a9ca192 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a9ca192 Branch: refs/heads/master Commit: 8a9ca1924792d1a7c733bdfd757996b3ade0d63d Parents: 0f7c9e8 Author: Weiqing Yang Authored: Fri Nov 4 23:44:46 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:44:46 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a9ca192/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 22c28fb..1de66af 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2539,6 +2539,8 @@ private[util] object CallerContext extends Logging { val callerContextSupported: Boolean = { SparkHadoopUtil.get.conf.getBoolean("hadoop.caller.context.enabled", false) && { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname Class.forName("org.apache.hadoop.ipc.CallerContext") Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") @@ -2604,6 +2606,8 @@ private[spark] class CallerContext( def setCurrentContext(): Unit = { if (CallerContext.callerContextSupported) { try { +// `Utils.classForName` will make `ReplSuite` fail with `ClassCircularityError` in +// master Maven build, so do not use it before resolving SPARK-17714. // scalastyle:off classforname val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") val builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
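For reference, the reflective probe isolated as a standalone sketch: it detects an optional Hadoop class at runtime without a compile-time dependency. `Class.forName` is used directly because `Utils.classForName` trips the `ClassCircularityError` described in SPARK-17714.

```
val callerContextSupported: Boolean =
  try {
    // Present only on newer Hadoop versions, so probe reflectively.
    Class.forName("org.apache.hadoop.ipc.CallerContext")
    Class.forName("org.apache.hadoop.ipc.CallerContext$Builder")
    true
  } catch {
    case _: ClassNotFoundException | _: NoClassDefFoundError => false
  }
```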
spark git commit: [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/branch-2.1 0a303a694 -> 491db67a5 [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError closes #15774 (cherry picked from commit 0f7c9e84e0d00813bf56712097677add5657f19f) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/491db67a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/491db67a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/491db67a Branch: refs/heads/branch-2.1 Commit: 491db67a5fd067ef5e767ac4a07144722302d95a Parents: 0a303a6 Author: Reynold Xin Authored: Fri Nov 4 23:34:29 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:35:04 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 12 2 files changed, 12 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/491db67a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 96d2dfc..9262e93 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/491db67a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 55f0487..6fa7b04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -923,6 +923,18 @@ class DatasetSuite extends QueryTest with SharedSQLContext { .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } + Seq(true, false).foreach { eager => def testCheckpointing(testName: String)(f: => Unit): Unit = { test(s"Dataset.checkpoint() - $testName (eager = $eager)") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, 
e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/master 0e3312ee7 -> 0f7c9e84e [SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent java.lang.ClassCircularityError closes #15774 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f7c9e84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f7c9e84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f7c9e84 Branch: refs/heads/master Commit: 0f7c9e84e0d00813bf56712097677add5657f19f Parents: 0e3312e Author: Reynold Xin Authored: Fri Nov 4 23:34:29 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:34:29 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 12 2 files changed, 12 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f7c9e84/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 96d2dfc..9262e93 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/0f7c9e84/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 55f0487..6fa7b04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -923,6 +923,18 @@ class DatasetSuite extends QueryTest with SharedSQLContext { .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } + Seq(true, false).foreach { eager => def testCheckpointing(testName: String)(f: => Unit): Unit = { test(s"Dataset.checkpoint() - $testName (eager = $eager)") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18189][SQL][FOLLOWUP] Move test from ReplSuite to prevent java.lang.ClassCircularityError
Repository: spark Updated Branches: refs/heads/branch-2.0 399597b04 -> 8b99e204a [SPARK-18189][SQL][FOLLOWUP] Move test from ReplSuite to prevent java.lang.ClassCircularityError ## What changes were proposed in this pull request? Move the test which is causing java.lang.ClassCircularityError from ReplSuite to DatasetSuite. ## How was this patch tested? > build/mvn -DskipTests -Phadoop-2.3 -Pyarn -Phive -Phive-thriftserver > -Pkinesis-asl -Pmesos clean package > build/mvn -Dtest=none -DwildcardSuites=org.apache.spark.repl.ReplSuite test Author: Ergin Seyfe Closes #15774 from seyfe/fix_replsuite_test_error_branch2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b99e204 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b99e204 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b99e204 Branch: refs/heads/branch-2.0 Commit: 8b99e204a9a056fd071f9bd75f3e0a29f90bccc0 Parents: 399597b Author: Ergin Seyfe Authored: Fri Nov 4 23:29:20 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 23:29:20 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 - .../scala/org/apache/spark/sql/DatasetSuite.scala | 13 + 2 files changed, 13 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8b99e204/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 8deafe3..f7d7a4f 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,21 +473,4 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } - - test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { -val resultValue = 12345 -val output = runInterpreter("local", - s""" - |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) - |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) - |val broadcasted = sc.broadcast($resultValue) - | - |// Using broadcast triggers serialization issue in KeyValueGroupedDataset - |val dataset = mapGroups.map(_ => broadcasted.value) - |dataset.collect() - """.stripMargin) -assertDoesNotContain("error:", output) -assertDoesNotContain("Exception", output) -assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) - } } http://git-wip-us.apache.org/repos/asf/spark/blob/8b99e204/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index f897cfb..6113e5d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -882,6 +882,19 @@ class DatasetSuite extends QueryTest with SharedSQLContext { df.withColumn("b", expr("0")).as[ClassData] .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) } + + // This is moved from ReplSuite to prevent java.lang.ClassCircularityError. 
+ test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) +val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) +val broadcasted = spark.sparkContext.broadcast(resultValue) + +// Using broadcast triggers serialization issue in KeyValueGroupedDataset +val dataset = mapGroups.map(_ => broadcasted.value) + +assert(dataset.collect() sameElements Array(resultValue, resultValue)) + } } case class Generic[T](id: T, value: Double) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17337][SPARK-16804][SQL][BRANCH-2.0] Backport subquery related PRs
Repository: spark Updated Branches: refs/heads/branch-2.0 c864e8a80 -> 399597b04 [SPARK-17337][SPARK-16804][SQL][BRANCH-2.0] Backport subquery related PRs ## What changes were proposed in this pull request? This PR backports two subquery related PRs to branch-2.0: - https://github.com/apache/spark/pull/14411 - https://github.com/apache/spark/pull/15761 ## How was this patch tested? Added a tests to `SubquerySuite`. Author: Nattavut Sutyanyong Author: Herman van Hovell Closes #15772 from hvanhovell/SPARK-17337-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/399597b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/399597b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/399597b0 Branch: refs/heads/branch-2.0 Commit: 399597b04a83bbe3cc748c21446de0d808d08155 Parents: c864e8a Author: Herman van Hovell Authored: Fri Nov 4 15:54:58 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 15:54:58 2016 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 13 ++ .../sql/catalyst/optimizer/Optimizer.scala | 16 ++- .../catalyst/analysis/AnalysisErrorSuite.scala | 17 .../org/apache/spark/sql/SubquerySuite.scala| 44 4 files changed, 89 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 617f3e0..6332f92 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1025,6 +1025,19 @@ class Analyzer( case e: Expand => failOnOuterReferenceInSubTree(e, "an EXPAND") e +case l : LocalLimit => + failOnOuterReferenceInSubTree(l, "a LIMIT") + l +// Since LIMIT is represented as GlobalLimit(, (LocalLimit (, child)) +// and we are walking bottom up, we will fail on LocalLimit before +// reaching GlobalLimit. +// The code below is just a safety net. +case g : GlobalLimit => + failOnOuterReferenceInSubTree(g, "a LIMIT") + g +case s : Sample => + failOnOuterReferenceInSubTree(s, "a TABLESAMPLE") + s case p => failOnOuterReference(p) p http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 4c06038..f0992b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1020,7 +1020,7 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // state and all the input rows processed before. In another word, the order of input rows // matters for non-deterministic expressions, while pushing down predicates changes the order. case filter @ Filter(condition, project @ Project(fields, grandChild)) - if fields.forall(_.deterministic) => + if fields.forall(_.deterministic) && canPushThroughCondition(grandChild, condition) => // Create a map of Aliases to their values from the child projection. // e.g., 'SELECT a + b AS c, d ...' produces Map(c -> a + b). 
@@ -1161,6 +1161,20 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper {
       filter
     }
   }
+
+  /**
+   * Check if we can safely push a filter through a projection, by making sure that predicate
+   * subqueries in the condition do not contain the same attributes as the plan they are moved
+   * into. This can happen when the plan and predicate subquery have the same source.
+   */
+  private def canPushThroughCondition(plan: LogicalPlan, condition: Expression): Boolean = {
+    val attributes = plan.outputSet
+    val matched = condition.find {
+      case PredicateSubquery(p, _, _, _) => p.outputSet.intersect(attributes).nonEmpty
+      case _ => false
+    }
+    matched.isEmpty
+  }
 }

 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/399597b0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
--
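A hedged sketch of the two behaviors these backports change, assuming a SparkSession `spark` with hypothetical tables t1(a INT) and t2(a INT) registered:

```
// 1. A correlated subquery under a LIMIT (or TABLESAMPLE) is now rejected
//    at analysis time with an AnalysisException instead of being planned:
// spark.sql(
//   "SELECT * FROM t1 WHERE a IN (SELECT a FROM t2 WHERE t2.a = t1.a LIMIT 1)")

// 2. With canPushThroughCondition, a filter whose predicate subquery shares
//    its source with the plan under the projection is no longer pushed
//    through the Project, avoiding an incorrect self-referencing plan:
spark.sql("SELECT b FROM (SELECT a AS b FROM t1) v WHERE b IN (SELECT a FROM t1)")
```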
spark git commit: [SPARK-18197][CORE] Optimise AppendOnlyMap implementation
Repository: spark Updated Branches: refs/heads/branch-2.1 cfe76028b -> a2d7e25e7 [SPARK-18197][CORE] Optimise AppendOnlyMap implementation ## What changes were proposed in this pull request? This improvement works by using the fastest comparison test first and we observed a 1% throughput performance improvement on PageRank (HiBench large profile) with this change. We used tprof and before the change in AppendOnlyMap.changeValue (where the optimisation occurs) this method was being used for 8053 profiling ticks representing 0.72% of the overall application time. After this change we observed this method only occurring for 2786 ticks and for 0.25% of the overall time. ## How was this patch tested? Existing unit tests and for performance we used HiBench large, profiling with tprof and IBM Healthcenter. Author: Adam Roberts Closes #15714 from a-roberts/patch-9. (cherry picked from commit a42d738c5de08bd395a7c220c487146173c6c163) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2d7e25e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2d7e25e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2d7e25e Branch: refs/heads/branch-2.1 Commit: a2d7e25e7c85ce17c8ceac5e1806afe96d3acc14 Parents: cfe7602 Author: Adam Roberts Authored: Fri Nov 4 12:06:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 12:06:12 2016 -0700 -- .../org/apache/spark/util/collection/AppendOnlyMap.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2d7e25e/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala index 6b74a29..bcb95b4 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala @@ -140,16 +140,16 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) var i = 1 while (true) { val curKey = data(2 * pos) - if (k.eq(curKey) || k.equals(curKey)) { -val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) -data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] -return newValue - } else if (curKey.eq(null)) { + if (curKey.eq(null)) { val newValue = updateFunc(false, null.asInstanceOf[V]) data(2 * pos) = k data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] incrementSize() return newValue + } else if (k.eq(curKey) || k.equals(curKey)) { +val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) +data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] +return newValue } else { val delta = i pos = (pos + delta) & mask - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18197][CORE] Optimise AppendOnlyMap implementation
Repository: spark Updated Branches: refs/heads/master 14f235d56 -> a42d738c5 [SPARK-18197][CORE] Optimise AppendOnlyMap implementation ## What changes were proposed in this pull request? This improvement works by using the fastest comparison test first and we observed a 1% throughput performance improvement on PageRank (HiBench large profile) with this change. We used tprof and before the change in AppendOnlyMap.changeValue (where the optimisation occurs) this method was being used for 8053 profiling ticks representing 0.72% of the overall application time. After this change we observed this method only occurring for 2786 ticks and for 0.25% of the overall time. ## How was this patch tested? Existing unit tests and for performance we used HiBench large, profiling with tprof and IBM Healthcenter. Author: Adam Roberts Closes #15714 from a-roberts/patch-9. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a42d738c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a42d738c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a42d738c Branch: refs/heads/master Commit: a42d738c5de08bd395a7c220c487146173c6c163 Parents: 14f235d Author: Adam Roberts Authored: Fri Nov 4 12:06:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 12:06:06 2016 -0700 -- .../org/apache/spark/util/collection/AppendOnlyMap.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a42d738c/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala index 6b74a29..bcb95b4 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala @@ -140,16 +140,16 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) var i = 1 while (true) { val curKey = data(2 * pos) - if (k.eq(curKey) || k.equals(curKey)) { -val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) -data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] -return newValue - } else if (curKey.eq(null)) { + if (curKey.eq(null)) { val newValue = updateFunc(false, null.asInstanceOf[V]) data(2 * pos) = k data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] incrementSize() return newValue + } else if (k.eq(curKey) || k.equals(curKey)) { +val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V]) +data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] +return newValue } else { val delta = i pos = (pos + delta) & mask - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
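A self-contained sketch of the reordered probe (not the Spark class itself): testing the cheap `eq null` empty-slot case before the potentially costly `equals` favors the common path, consistent with the drop in profiling ticks reported above.

```
// Lookup in an open-addressed table laid out as [key0, value0, key1, ...],
// with quadratic probing as in AppendOnlyMap. Returns the slot index or -1.
def probe(data: Array[AnyRef], mask: Int, k: AnyRef): Int = {
  var pos = k.hashCode & mask
  var i = 1
  while (true) {
    val curKey = data(2 * pos)
    if (curKey eq null) {
      return -1                                    // empty slot: key is absent
    } else if ((k eq curKey) || k.equals(curKey)) {
      return pos                                   // found the key
    } else {
      pos = (pos + i) & mask                       // quadratic probing
      i += 1
    }
  }
  -1 // unreachable
}
```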
spark git commit: Closing some stale/invalid pull requests
Repository: spark Updated Branches: refs/heads/master 27602c337 -> 14f235d56 Closing some stale/invalid pull requests Closes #15758 Closes #15753 Closes #12708 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14f235d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14f235d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14f235d5 Branch: refs/heads/master Commit: 14f235d5643bca75e270652c15154d86e57a7a70 Parents: 27602c3 Author: Reynold Xin Authored: Fri Nov 4 01:27:06 2016 -0700 Committer: Reynold Xin Committed: Fri Nov 4 01:27:06 2016 -0700 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.0 dae1581d9 -> c864e8a80 [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. (cherry picked from commit 27602c33751cebf6cd173c0de103454608cf6625) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c864e8a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c864e8a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c864e8a8 Branch: refs/heads/branch-2.0 Commit: c864e8a8020f4890f1839766851e7f4917da5c70 Parents: dae1581 Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:17:15 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c864e8a8/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.1 8e145a94b -> cfe76028b [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. (cherry picked from commit 27602c33751cebf6cd173c0de103454608cf6625) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cfe76028 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cfe76028 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cfe76028 Branch: refs/heads/branch-2.1 Commit: cfe76028bb116d72eab6601bff3b2a1856597370 Parents: 8e145a9 Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:17:07 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cfe76028/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/master a08463b1d -> 27602c337 [SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? This is a follow-up PR of #15741 in order to keep `nextPowerOf2` consistent. **Before** ``` nextPowerOf2(0) => 2 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` **After** ``` nextPowerOf2(0) => 1 nextPowerOf2(1) => 1 nextPowerOf2(2) => 2 nextPowerOf2(3) => 4 nextPowerOf2(4) => 4 nextPowerOf2(5) => 8 ``` ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #15754 from dongjoon-hyun/SPARK-18200-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27602c33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27602c33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27602c33 Branch: refs/heads/master Commit: 27602c33751cebf6cd173c0de103454608cf6625 Parents: a08463b Author: Dongjoon Hyun Authored: Thu Nov 3 23:15:33 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 23:15:33 2016 -0700 -- .../main/scala/org/apache/spark/util/collection/OpenHashSet.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27602c33/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 7a1be85..60f6f53 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -272,7 +272,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def nextPowerOf2(n: Int): Int = { if (n == 0) { - 2 + 1 } else { val highBit = Integer.highestOneBit(n) if (highBit == n) n else highBit << 1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
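The corrected helper, isolated with a quick sanity check matching the table in the commit message:

```
def nextPowerOf2(n: Int): Int =
  if (n == 0) {
    1
  } else {
    val highBit = Integer.highestOneBit(n)
    if (highBit == n) n else highBit << 1
  }

// 0 -> 1 (was 2 before this fix), 1 -> 1, 2 -> 2, 3 -> 4, 4 -> 4, 5 -> 8
assert((0 to 5).map(nextPowerOf2) == Seq(1, 1, 2, 4, 4, 8))
```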
spark git commit: [SPARK-18259][SQL] Do not capture Throwable in QueryExecution
Repository: spark Updated Branches: refs/heads/branch-2.1 37550c492 -> 91d567150 [SPARK-18259][SQL] Do not capture Throwable in QueryExecution ## What changes were proposed in this pull request? `QueryExecution.toString` currently captures `java.lang.Throwable`s; this is far from a best practice and can lead to confusing situation or invalid application states. This PR fixes this by only capturing `AnalysisException`s. ## How was this patch tested? Added a `QueryExecutionSuite`. Author: Herman van Hovell Closes #15760 from hvanhovell/SPARK-18259. (cherry picked from commit aa412c55e31e61419d3de57ef4b13e50f9b38af0) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91d56715 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91d56715 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91d56715 Branch: refs/heads/branch-2.1 Commit: 91d567150b305d05acb8543da5cbf21df244352d Parents: 37550c4 Author: Herman van Hovell Authored: Thu Nov 3 21:59:59 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 22:00:23 2016 -0700 -- .../spark/sql/execution/QueryExecution.scala| 2 +- .../sql/execution/QueryExecutionSuite.scala | 50 2 files changed, 51 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91d56715/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index cb45a6d..b3ef29f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -104,7 +104,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { ReuseSubquery(sparkSession.sessionState.conf)) protected def stringOrError[A](f: => A): String = -try f.toString catch { case e: Throwable => e.toString } +try f.toString catch { case e: AnalysisException => e.toString } /** http://git-wip-us.apache.org/repos/asf/spark/blob/91d56715/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala new file mode 100644 index 000..8bceab3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.test.SharedSQLContext
+
+class QueryExecutionSuite extends SharedSQLContext {
+  test("toString() exception/error handling") {
+    val badRule = new SparkStrategy {
+      var mode: String = ""
+      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
+        case "exception" => throw new AnalysisException(mode)
+        case "error" => throw new Error(mode)
+        case _ => Nil
+      }
+    }
+    spark.experimental.extraStrategies = badRule :: Nil
+
+    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)
+
+    // Nothing!
+    badRule.mode = ""
+    assert(qe.toString.contains("OneRowRelation"))
+
+    // Throw an AnalysisException - this should be captured.
+    badRule.mode = "exception"
+    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))
+
+    // Throw an Error - this should not be captured.
+    badRule.mode = "error"
+    val error = intercept[Error](qe.toString)
+    assert(error.getMessage.contains("error"))
+  }
+}
--
spark git commit: [SPARK-18259][SQL] Do not capture Throwable in QueryExecution
Repository: spark Updated Branches: refs/heads/master dc4c60098 -> aa412c55e [SPARK-18259][SQL] Do not capture Throwable in QueryExecution ## What changes were proposed in this pull request? `QueryExecution.toString` currently captures `java.lang.Throwable`s; this is far from a best practice and can lead to confusing situation or invalid application states. This PR fixes this by only capturing `AnalysisException`s. ## How was this patch tested? Added a `QueryExecutionSuite`. Author: Herman van Hovell Closes #15760 from hvanhovell/SPARK-18259. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aa412c55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aa412c55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aa412c55 Branch: refs/heads/master Commit: aa412c55e31e61419d3de57ef4b13e50f9b38af0 Parents: dc4c600 Author: Herman van Hovell Authored: Thu Nov 3 21:59:59 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 21:59:59 2016 -0700 -- .../spark/sql/execution/QueryExecution.scala| 2 +- .../sql/execution/QueryExecutionSuite.scala | 50 2 files changed, 51 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aa412c55/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index cb45a6d..b3ef29f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -104,7 +104,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { ReuseSubquery(sparkSession.sessionState.conf)) protected def stringOrError[A](f: => A): String = -try f.toString catch { case e: Throwable => e.toString } +try f.toString catch { case e: AnalysisException => e.toString } /** http://git-wip-us.apache.org/repos/asf/spark/blob/aa412c55/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala new file mode 100644 index 000..8bceab3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.test.SharedSQLContext
+
+class QueryExecutionSuite extends SharedSQLContext {
+  test("toString() exception/error handling") {
+    val badRule = new SparkStrategy {
+      var mode: String = ""
+      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
+        case "exception" => throw new AnalysisException(mode)
+        case "error" => throw new Error(mode)
+        case _ => Nil
+      }
+    }
+    spark.experimental.extraStrategies = badRule :: Nil
+
+    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)
+
+    // Nothing!
+    badRule.mode = ""
+    assert(qe.toString.contains("OneRowRelation"))
+
+    // Throw an AnalysisException - this should be captured.
+    badRule.mode = "exception"
+    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))
+
+    // Throw an Error - this should not be captured.
+    badRule.mode = "error"
+    val error = intercept[Error](qe.toString)
+    assert(error.getMessage.contains("error"))
+  }
+}

-
To unsubscribe, e-mail: commits-unsubscr..
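A standalone analogue of why the narrowing matters: the old handler folded even fatal errors into the plan string, while the fixed one lets them propagate.

```
import org.apache.spark.sql.AnalysisException

def old[A](f: => A): String =
  try f.toString catch { case e: Throwable => e.toString }

def fixed[A](f: => A): String =
  try f.toString catch { case e: AnalysisException => e.toString }

old(throw new OutOfMemoryError("heap"))    // swallowed: returns the message
// fixed(throw new OutOfMemoryError("heap")) would now throw, as it should
```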
spark git commit: [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0
Repository: spark Updated Branches: refs/heads/branch-2.1 af60b1ebb -> 37550c492 [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0 ## What changes were proposed in this pull request? Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0. This does not actually implement any of the change in SPARK-18138, just peppers the documentation with notices about it. ## How was this patch tested? Doc build Author: Sean Owen Closes #15733 from srowen/SPARK-18138. (cherry picked from commit dc4c60098641cf64007e2f0e36378f000ad5f6b1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37550c49 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37550c49 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37550c49 Branch: refs/heads/branch-2.1 Commit: 37550c49218e1890f8adc10c9549a23dc072e21f Parents: af60b1e Author: Sean Owen Authored: Thu Nov 3 17:27:23 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 17:27:44 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 12 docs/building-spark.md | 6 ++ docs/index.md | 4 docs/programming-guide.md | 4 python/pyspark/context.py | 4 5 files changed, 30 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 63478c8..9f0f607 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging { // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") + warnDeprecatedVersions() + /* - * | Private variables. These variables keep the internal state of the context, and are| | not accessible by the outside world. They're mutable since we want to initialize all | @@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging { value } + private def warnDeprecatedVersions(): Unit = { +val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3) +if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) { + logWarning("Support for Java 7 is deprecated as of Spark 2.0.0") +} +if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) { + logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0") +} + } + /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index ebe46a4..2b404bd 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -13,6 +13,7 @@ redirect_from: "building-with-maven.html" The Maven-based build is the build of reference for Apache Spark. Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. +Note that support for Java 7 is deprecated as of Spark 2.0.0 and may be removed in Spark 2.2.0. 
### Setting up Maven's Memory Usage @@ -79,6 +80,9 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro +Note that support for versions of Hadoop before 2.6 is deprecated as of Spark 2.1.0 and may be +removed in Spark 2.2.0. + You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later. @@ -129,6 +133,8 @@ To produce a Spark package compiled with Scala 2.10, use the `-Dscala-2.10` prop ./dev/change-scala-version.sh 2.10 ./build/mvn -Pyarn -Phadoop-2.4 -Dscala-2.10 -DskipTests clean package + +Note that support for Scala 2.10 is deprecated as of Spark 2.1.0 and may be removed in Spark 2.2.0. ## Building submodules individually http://git-wip-us.apache.org/repos/asf/spark/blob/37550c49/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index
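To see what the new warnDeprecatedVersions() check keys on, here is a small standalone illustration (the version strings are assumed examples; the split pattern is copied from the diff above): java.version values such as 1.7.0_80 split on the regex so that index 1 holds the major version for Java 7/8-era strings.

// Scala sketch, runnable in a REPL; inputs are illustrative.
Seq("1.7.0_80", "1.8.0_102").foreach { v =>
  val parts = v.split("[+.\\-]+", 3)  // e.g. Array("1", "7", "0_80")
  if (parts.length >= 2 && parts(1).toInt == 7) {
    println(s"$v -> Support for Java 7 is deprecated as of Spark 2.0.0")
  }
}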
spark git commit: [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0
Repository: spark Updated Branches: refs/heads/master f22954ad4 -> dc4c60098 [SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0 ## What changes were proposed in this pull request? Document that Java 7, Python 2.6, Scala 2.10, Hadoop < 2.6 are deprecated in Spark 2.1.0. This does not actually implement any of the changes in SPARK-18138; it just peppers the documentation with notices about them. ## How was this patch tested? Doc build Author: Sean Owen Closes #15733 from srowen/SPARK-18138. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c6009 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c6009 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c6009 Branch: refs/heads/master Commit: dc4c60098641cf64007e2f0e36378f000ad5f6b1 Parents: f22954a Author: Sean Owen Authored: Thu Nov 3 17:27:23 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 17:27:23 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 12 docs/building-spark.md | 6 ++ docs/index.md | 4 docs/programming-guide.md | 4 python/pyspark/context.py | 4 5 files changed, 30 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 63478c8..9f0f607 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging { // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") + warnDeprecatedVersions() + /* - * | Private variables. These variables keep the internal state of the context, and are| | not accessible by the outside world. They're mutable since we want to initialize all | @@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging { value } + private def warnDeprecatedVersions(): Unit = { +val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3) +if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) { + logWarning("Support for Java 7 is deprecated as of Spark 2.0.0") +} +if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) { + logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0") +} + } + /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index ebe46a4..2b404bd 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -13,6 +13,7 @@ redirect_from: "building-with-maven.html" The Maven-based build is the build of reference for Apache Spark. Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. +Note that support for Java 7 is deprecated as of Spark 2.0.0 and may be removed in Spark 2.2.0. ### Setting up Maven's Memory Usage @@ -79,6 +80,9 @@ Because HDFS is not protocol-compatible across versions, if you want to read fro +Note that support for versions of Hadoop before 2.6 is deprecated as of Spark 2.1.0 and may be +removed in Spark 2.2.0.
+ You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later. @@ -129,6 +133,8 @@ To produce a Spark package compiled with Scala 2.10, use the `-Dscala-2.10` prop ./dev/change-scala-version.sh 2.10 ./build/mvn -Pyarn -Phadoop-2.4 -Dscala-2.10 -DskipTests clean package + +Note that support for Scala 2.10 is deprecated as of Spark 2.1.0 and may be removed in Spark 2.2.0. ## Building submodules individually http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c6009/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index a7a92f6..fe51439 100644 --- a/docs/index.md +++ b/docs/index.md @@ -28,6 +28,10 @@ Spark runs on Java 7+,
spark git commit: [SPARK-18257][SS] Improve error reporting for FileStressSuite
Repository: spark Updated Branches: refs/heads/branch-2.1 2daca62cd -> af60b1ebb [SPARK-18257][SS] Improve error reporting for FileStressSuite ## What changes were proposed in this pull request? This patch improves error reporting for FileStressSuite, when there is an error in Spark itself (not user code). This works by simply tightening the exception verification, and gets rid of the unnecessary thread for starting the stream. Also renamed the class to FileStreamStressSuite to make it more obvious it is a streaming suite. ## How was this patch tested? This is a test only change and I manually verified error reporting by injecting some bug in the addBatch code for FileStreamSink. Author: Reynold Xin Closes #15757 from rxin/SPARK-18257. (cherry picked from commit f22954ad49bf5a32c7b6d8487cd38ffe0da904ca) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af60b1eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af60b1eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af60b1eb Branch: refs/heads/branch-2.1 Commit: af60b1ebbf5cb91dc724aad9d3d7476ce9085ac9 Parents: 2daca62 Author: Reynold Xin Authored: Thu Nov 3 15:30:45 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 15:30:55 2016 -0700 -- .../sql/streaming/FileStreamStressSuite.scala | 156 +++ .../spark/sql/streaming/FileStressSuite.scala | 153 -- 2 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af60b1eb/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala new file mode 100644 index 000..28412ea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.io.File +import java.util.UUID + +import scala.util.Random +import scala.util.control.NonFatal + +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.util.Utils + +/** + * A stress test for streaming queries that read and write files. This test consists of + * two threads: + * - one that writes out `numRecords` distinct integers to files of random sizes (the total + *number of records is fixed but each file's size / creation time is random). + * - another that continually restarts a buggy streaming query (i.e. fails with 5% probability on + *any partition). + * + * At the end, the resulting files are loaded and the answer is checked.
+ */ +class FileStreamStressSuite extends StreamTest { + import testImplicits._ + + // Error message thrown in the streaming job for testing recovery. + private val injectedErrorMsg = "test suite injected failure!" + + testQuietly("fault tolerance stress test - unpartitioned output") { +stressTest(partitionWrites = false) + } + + testQuietly("fault tolerance stress test - partitioned output") { +stressTest(partitionWrites = true) + } + + def stressTest(partitionWrites: Boolean): Unit = { +val numRecords = 1 +val inputDir = Utils.createTempDir(namePrefix = "stream.input").getCanonicalPath +val stagingDir = Utils.createTempDir(namePrefix = "stream.staging").getCanonicalPath +val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath +val checkpoint = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + +@volatile +var continue = true +@volatile +var stream: StreamingQuery = null + +val writer = new Thread("stream writer") { + override def run(): Unit = { +var
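To make the "buggy streaming query" concrete, here is a hedged sketch of the kind of per-record fault injection the suite performs (the names and placement are illustrative; the real suite raises the failure inside the query that writes the output files):

import scala.util.Random

// Fail roughly 5% of the time so that restarts exercise recovery.
val injectedErrorMsg = "test suite injected failure!"
def maybeFail[T](value: T): T = {
  if (Random.nextDouble() < 0.05) sys.error(injectedErrorMsg)
  value
}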
spark git commit: [SPARK-18257][SS] Improve error reporting for FileStressSuite
Repository: spark Updated Branches: refs/heads/master e89202523 -> f22954ad4 [SPARK-18257][SS] Improve error reporting for FileStressSuite ## What changes were proposed in this pull request? This patch improves error reporting for FileStressSuite, when there is an error in Spark itself (not user code). This works by simply tightening the exception verification, and gets rid of the unnecessary thread for starting the stream. Also renamed the class to FileStreamStressSuite to make it more obvious it is a streaming suite. ## How was this patch tested? This is a test only change and I manually verified error reporting by injecting some bug in the addBatch code for FileStreamSink. Author: Reynold Xin Closes #15757 from rxin/SPARK-18257. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f22954ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f22954ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f22954ad Branch: refs/heads/master Commit: f22954ad49bf5a32c7b6d8487cd38ffe0da904ca Parents: e892025 Author: Reynold Xin Authored: Thu Nov 3 15:30:45 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 15:30:45 2016 -0700 -- .../sql/streaming/FileStreamStressSuite.scala | 156 +++ .../spark/sql/streaming/FileStressSuite.scala | 153 -- 2 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f22954ad/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala new file mode 100644 index 000..28412ea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamStressSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.io.File +import java.util.UUID + +import scala.util.Random +import scala.util.control.NonFatal + +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.util.Utils + +/** + * A stress test for streaming queries that read and write files. This test consists of + * two threads: + * - one that writes out `numRecords` distinct integers to files of random sizes (the total + *number of records is fixed but each file's size / creation time is random). + * - another that continually restarts a buggy streaming query (i.e. fails with 5% probability on + *any partition). + * + * At the end, the resulting files are loaded and the answer is checked. + */ +class FileStreamStressSuite extends StreamTest { + import testImplicits._ + + // Error message thrown in the streaming job for testing recovery.
+ private val injectedErrorMsg = "test suite injected failure!" + + testQuietly("fault tolerance stress test - unpartitioned output") { +stressTest(partitionWrites = false) + } + + testQuietly("fault tolerance stress test - partitioned output") { +stressTest(partitionWrites = true) + } + + def stressTest(partitionWrites: Boolean): Unit = { +val numRecords = 1 +val inputDir = Utils.createTempDir(namePrefix = "stream.input").getCanonicalPath +val stagingDir = Utils.createTempDir(namePrefix = "stream.staging").getCanonicalPath +val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath +val checkpoint = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + +@volatile +var continue = true +@volatile +var stream: StreamingQuery = null + +val writer = new Thread("stream writer") { + override def run(): Unit = { +var i = numRecords +while (i > 0) { + val count = Random.nextInt(100) + var j = 0 +
spark git commit: [SPARK-18237][HIVE] hive.exec.stagingdir has no effect
Repository: spark Updated Branches: refs/heads/master b17057c0a -> 16293311c [SPARK-18237][HIVE] hive.exec.stagingdir has no effect `hive.exec.stagingdir` has no effect in Spark 2.0.1. Hive confs in hive-site.xml will be loaded into `hadoopConf`, so we should use `hadoopConf` in `InsertIntoHiveTable` instead of `SessionState.conf` Author: 福星 Closes #15744 from ClassNotFoundExp/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16293311 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16293311 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16293311 Branch: refs/heads/master Commit: 16293311cdb25a62733a9aae4355659b971a3ce1 Parents: b17057c Author: 福星 Authored: Thu Nov 3 12:02:01 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 12:02:01 2016 -0700 -- .../apache/spark/sql/hive/execution/InsertIntoHiveTable.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16293311/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 15be12c..e333fc7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -76,7 +76,8 @@ case class InsertIntoHiveTable( def output: Seq[Attribute] = Seq.empty - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") + val hadoopConf = sessionState.newHadoopConf() + val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") private def executionId: String = { val rand: Random = new Random @@ -163,7 +164,6 @@ case class InsertIntoHiveTable( // instances within the closure, since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc val tableLocation = table.hiveQlTable.getDataLocation -val hadoopConf = sessionState.newHadoopConf() val tmpLocation = getExternalTmpPath(tableLocation, hadoopConf) val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean
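A brief usage sketch of the behavior being fixed (hedged: the staging path below is an arbitrary example, and this assumes a Hive-enabled build): because hive-site.xml entries and session confs are folded into the Hadoop configuration, reading `hive.exec.stagingdir` through `hadoopConf` is what makes the user's setting take effect.

import org.apache.spark.sql.SparkSession

// Illustrative only: set a custom staging dir and run an insert; with the
// fix, intermediate ".hive-staging" dirs land under /tmp/spark-staging.
val spark = SparkSession.builder()
  .appName("staging-dir-demo")
  .config("hive.exec.stagingdir", "/tmp/spark-staging/.hive-staging")
  .enableHiveSupport()
  .getOrCreate()
spark.sql("CREATE TABLE IF NOT EXISTS t (a INT)")
spark.sql("INSERT INTO t VALUES (1)")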
spark git commit: [SPARK-18237][HIVE] hive.exec.stagingdir has no effect
Repository: spark Updated Branches: refs/heads/branch-2.1 4f91630c8 -> 3e139e239 [SPARK-18237][HIVE] hive.exec.stagingdir has no effect `hive.exec.stagingdir` has no effect in Spark 2.0.1. Hive confs in hive-site.xml will be loaded into `hadoopConf`, so we should use `hadoopConf` in `InsertIntoHiveTable` instead of `SessionState.conf` Author: 福星 Closes #15744 from ClassNotFoundExp/master. (cherry picked from commit 16293311cdb25a62733a9aae4355659b971a3ce1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e139e23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e139e23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e139e23 Branch: refs/heads/branch-2.1 Commit: 3e139e2390085cfb42f7136f150b0fa08c14eb61 Parents: 4f91630 Author: 福星 Authored: Thu Nov 3 12:02:01 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 12:02:08 2016 -0700 -- .../apache/spark/sql/hive/execution/InsertIntoHiveTable.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e139e23/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 15be12c..e333fc7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -76,7 +76,8 @@ case class InsertIntoHiveTable( def output: Seq[Attribute] = Seq.empty - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") + val hadoopConf = sessionState.newHadoopConf() + val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") private def executionId: String = { val rand: Random = new Random @@ -163,7 +164,6 @@ case class InsertIntoHiveTable( // instances within the closure, since Serializer is not serializable while TableDesc is. val tableDesc = table.tableDesc val tableLocation = table.hiveQlTable.getDataLocation -val hadoopConf = sessionState.newHadoopConf() val tmpLocation = getExternalTmpPath(tableLocation, hadoopConf) val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean
spark git commit: [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog
Repository: spark Updated Branches: refs/heads/branch-2.1 c2876bfbf -> 4f91630c8 [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog ## What changes were proposed in this pull request? This patch renames partitionProviderIsHive to tracksPartitionsInCatalog, as the old name was too Hive specific. ## How was this patch tested? Should be covered by existing tests. Author: Reynold Xin Closes #15750 from rxin/SPARK-18244. (cherry picked from commit b17057c0a69b9c56e503483d97f5dc209eef0884) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f91630c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f91630c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f91630c Branch: refs/heads/branch-2.1 Commit: 4f91630c8100ee3a6fd168bc4247ca6fadd0a736 Parents: c2876bf Author: Reynold Xin Authored: Thu Nov 3 11:48:05 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 11:48:17 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 9 + .../sql/catalyst/trees/TreeNodeSuite.scala | 2 +- .../command/createDataSourceTables.scala| 2 +- .../spark/sql/execution/command/ddl.scala | 4 ++-- .../spark/sql/execution/command/tables.scala| 2 +- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 7 --- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../spark/sql/execution/command/DDLSuite.scala | 2 +- .../spark/sql/hive/HiveExternalCatalog.scala| 21 10 files changed, 30 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f91630c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 7c3bec8..34748a0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -138,8 +138,9 @@ case class BucketSpec( * Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the *underlying table but not supported by Spark SQL yet. - * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive - *metastore. + * @param tracksPartitionsInCatalog whether this table's partition metadata is stored in the + * catalog. If false, it is inferred automatically based on file + * structure. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -158,7 +159,7 @@ case class CatalogTable( viewText: Option[String] = None, comment: Option[String] = None, unsupportedFeatures: Seq[String] = Seq.empty, -partitionProviderIsHive: Boolean = false) { +tracksPartitionsInCatalog: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -217,7 +218,7 @@ case class CatalogTable( if (properties.nonEmpty) s"Properties: $tableProperties" else "", if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "", s"$storage", -if (partitionProviderIsHive) "Partition Provider: Hive" else "") +if (tracksPartitionsInCatalog) "Partition Provider: Catalog" else "") output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")") } http://git-wip-us.apache.org/repos/asf/spark/blob/4f91630c/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 3eff12f..af1eaa1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -489,7 +489,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -&
spark git commit: [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog
Repository: spark Updated Branches: refs/heads/master 27daf6bcd -> b17057c0a [SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitionsInCatalog ## What changes were proposed in this pull request? This patch renames partitionProviderIsHive to tracksPartitionsInCatalog, as the old name was too Hive specific. ## How was this patch tested? Should be covered by existing tests. Author: Reynold Xin Closes #15750 from rxin/SPARK-18244. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b17057c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b17057c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b17057c0 Branch: refs/heads/master Commit: b17057c0a69b9c56e503483d97f5dc209eef0884 Parents: 27daf6b Author: Reynold Xin Authored: Thu Nov 3 11:48:05 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 11:48:05 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 9 + .../sql/catalyst/trees/TreeNodeSuite.scala | 2 +- .../command/createDataSourceTables.scala| 2 +- .../spark/sql/execution/command/ddl.scala | 4 ++-- .../spark/sql/execution/command/tables.scala| 2 +- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 7 --- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../spark/sql/execution/command/DDLSuite.scala | 2 +- .../spark/sql/hive/HiveExternalCatalog.scala| 21 10 files changed, 30 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b17057c0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 7c3bec8..34748a0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -138,8 +138,9 @@ case class BucketSpec( * Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the *underlying table but not supported by Spark SQL yet. - * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive - *metastore. + * @param tracksPartitionsInCatalog whether this table's partition metadata is stored in the + * catalog. If false, it is inferred automatically based on file + * structure. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -158,7 +159,7 @@ case class CatalogTable( viewText: Option[String] = None, comment: Option[String] = None, unsupportedFeatures: Seq[String] = Seq.empty, -partitionProviderIsHive: Boolean = false) { +tracksPartitionsInCatalog: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -217,7 +218,7 @@ case class CatalogTable( if (properties.nonEmpty) s"Properties: $tableProperties" else "", if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "", s"$storage", -if (partitionProviderIsHive) "Partition Provider: Hive" else "") +if (tracksPartitionsInCatalog) "Partition Provider: Catalog" else "") output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")") } http://git-wip-us.apache.org/repos/asf/spark/blob/b17057c0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 3eff12f..af1eaa1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -489,7 +489,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -> -1, -"partitionProviderIsHive" -> false, +"tracksPartitionsInCata
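To show the renamed flag in context, a hedged construction sketch (the identifier, schema, and storage values are placeholders; the parameter names follow the case class quoted above):

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

// Partition metadata is tracked by the catalog, so toString renders
// "Partition Provider: Catalog" for this table.
val table = CatalogTable(
  identifier = TableIdentifier("events", Some("db")),
  tableType = CatalogTableType.MANAGED,
  storage = CatalogStorageFormat.empty,
  schema = new StructType().add("id", IntegerType).add("day", StringType),
  partitionColumnNames = Seq("day"),
  tracksPartitionsInCatalog = true)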
spark git commit: [SQL] minor - internal doc improvement for InsertIntoTable.
Repository: spark Updated Branches: refs/heads/branch-2.1 bc7f05f5f -> 71104c9c9 [SQL] minor - internal doc improvement for InsertIntoTable. ## What changes were proposed in this pull request? I was reading this part of the code and was really confused by the "partition" parameter. This patch adds some documentation for it to reduce confusion in the future. I also looked around other logical plans but most of them are either already documented, or pretty self-evident to people that know Spark SQL. ## How was this patch tested? N/A - doc change only. Author: Reynold Xin Closes #15749 from rxin/doc-improvement. (cherry picked from commit 0ea5d5b24c1f7b29efeac0e72d271aba279523f7) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71104c9c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71104c9c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71104c9c Branch: refs/heads/branch-2.1 Commit: 71104c9c97a648c94e6619279ad49752c01c89c3 Parents: bc7f05f Author: Reynold Xin Authored: Thu Nov 3 02:45:54 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:46:01 2016 -0700 -- .../plans/logical/basicLogicalOperators.scala | 16 ++ .../hive/execution/InsertIntoHiveTable.scala| 31 2 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71104c9c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 7a15c22..65ceab2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -360,6 +360,22 @@ case class OverwriteOptions( } } +/** + * Insert some data into a table. + * + * @param table the logical plan representing the table. In the future this should be a + * [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] once we converge Hive tables + * and data source tables. + * @param partition a map from the partition key to the partition value (optional). If the partition + * value is optional, dynamic partition insert will be performed. + * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS ...` would have + * Map('a' -> Some('1'), 'b' -> Some('2')), + * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` + * would have Map('a' -> Some('1'), 'b' -> None). + * @param child the logical plan representing data to write to. + * @param overwrite overwrite existing table or partitions. + * @param ifNotExists If true, only write if the table or partition does not exist. 
+ */ case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], http://git-wip-us.apache.org/repos/asf/spark/blob/71104c9c/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 05164d7..15be12c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -35,13 +35,35 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException import org.apache.spark.util.SerializableJobConf +/** + * Command for writing data out to a Hive table. + * + * This class is mostly a mess, for legacy reasons (since it evolved in organic ways and had to + * follow Hive's internal implementations closely, which itself was a mess too). Please don't + * blame Reynold for this! He was just moving code arou
spark git commit: [SQL] minor - internal doc improvement for InsertIntoTable.
Repository: spark Updated Branches: refs/heads/master 937af592e -> 0ea5d5b24 [SQL] minor - internal doc improvement for InsertIntoTable. ## What changes were proposed in this pull request? I was reading this part of the code and was really confused by the "partition" parameter. This patch adds some documentation for it to reduce confusion in the future. I also looked around other logical plans but most of them are either already documented, or pretty self-evident to people that know Spark SQL. ## How was this patch tested? N/A - doc change only. Author: Reynold Xin Closes #15749 from rxin/doc-improvement. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0ea5d5b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0ea5d5b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0ea5d5b2 Branch: refs/heads/master Commit: 0ea5d5b24c1f7b29efeac0e72d271aba279523f7 Parents: 937af59 Author: Reynold Xin Authored: Thu Nov 3 02:45:54 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:45:54 2016 -0700 -- .../plans/logical/basicLogicalOperators.scala | 16 ++ .../hive/execution/InsertIntoHiveTable.scala| 31 2 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0ea5d5b2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 7a15c22..65ceab2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -360,6 +360,22 @@ case class OverwriteOptions( } } +/** + * Insert some data into a table. + * + * @param table the logical plan representing the table. In the future this should be a + * [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] once we converge Hive tables + * and data source tables. + * @param partition a map from the partition key to the partition value (optional). If the partition + * value is optional, dynamic partition insert will be performed. + * As an example, `INSERT INTO tbl PARTITION (a=1, b=2) AS ...` would have + * Map('a' -> Some('1'), 'b' -> Some('2')), + * and `INSERT INTO tbl PARTITION (a=1, b) AS ...` + * would have Map('a' -> Some('1'), 'b' -> None). + * @param child the logical plan representing data to write to. + * @param overwrite overwrite existing table or partitions. + * @param ifNotExists If true, only write if the table or partition does not exist. 
+ */ case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], http://git-wip-us.apache.org/repos/asf/spark/blob/0ea5d5b2/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 05164d7..15be12c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -35,13 +35,35 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException import org.apache.spark.util.SerializableJobConf +/** + * Command for writing data out to a Hive table. + * + * This class is mostly a mess, for legacy reasons (since it evolved in organic ways and had to + * follow Hive's internal implementations closely, which itself was a mess too). Please don't + * blame Reynold for this! He was just moving code around! + * + * In the future we should converge the write path for Hive with the normal data source write path,
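The new `partition` parameter doc is easiest to read with concrete values; the following is a plain-Scala illustration of the two cases it describes:

// INSERT INTO tbl PARTITION (a=1, b=2) ...  (fully static partition spec)
val staticSpec: Map[String, Option[String]] =
  Map("a" -> Some("1"), "b" -> Some("2"))

// INSERT INTO tbl PARTITION (a=1, b) ...  (b is dynamic, so its value is None)
val dynamicSpec: Map[String, Option[String]] =
  Map("a" -> Some("1"), "b" -> None)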
spark git commit: [SPARK-18219] Move commit protocol API (internal) from sql/core to core module
Repository: spark Updated Branches: refs/heads/branch-2.1 c4c5328f2 -> bc7f05f5f [SPARK-18219] Move commit protocol API (internal) from sql/core to core module ## What changes were proposed in this pull request? This patch moves the new commit protocol API from sql/core to core module, so we can use it in the future in the RDD API. As part of this patch, I also moved the specification of the random UUID for the write path out of the commit protocol, and instead pass in a job id. ## How was this patch tested? N/A Author: Reynold Xin Closes #15731 from rxin/SPARK-18219. (cherry picked from commit 937af592e65f4dd878aafcabf8fe2cfe7fa3d9b3) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc7f05f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc7f05f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc7f05f5 Branch: refs/heads/branch-2.1 Commit: bc7f05f5f03653c623190b8178bcbe981a41c2f3 Parents: c4c5328 Author: Reynold Xin Authored: Thu Nov 3 02:42:48 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:43:03 2016 -0700 -- .../spark/internal/io/FileCommitProtocol.scala | 126 + .../io/HadoopMapReduceCommitProtocol.scala | 111 .../datasources/FileCommitProtocol.scala| 257 --- .../datasources/FileFormatWriter.scala | 3 +- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../SQLHadoopMapReduceCommitProtocol.scala | 72 ++ .../execution/streaming/FileStreamSink.scala| 9 +- .../streaming/ManifestFileCommitProtocol.scala | 6 +- .../org/apache/spark/sql/internal/SQLConf.scala | 4 +- 9 files changed, 327 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc7f05f5/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala new file mode 100644 index 000..fb80205 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.util.Utils + + +/** + * An interface to define how a single Spark job commits its outputs. Three notes: + * + * 1. Implementations must be serializable, as the committer instance instantiated on the driver + *will be used for tasks on executors. + * 2. Implementations should have a constructor with either 2 or 3 arguments: +*(jobId: String, path: String) or (jobId: String, path: String, isAppend: Boolean). + * 3. A committer should not be reused across multiple Spark jobs.
+ * + * The proper call sequence is: + * + * 1. Driver calls setupJob. + * 2. As part of each task's execution, executor calls setupTask and then commitTask + *(or abortTask if task failed). + * 3. When all necessary tasks completed successfully, the driver calls commitJob. If the job + *failed to execute (e.g. too many failed tasks), the job should call abortJob. + */ +abstract class FileCommitProtocol { + import FileCommitProtocol._ + + /** + * Sets up a job. Must be called on the driver before any other methods can be invoked. + */ + def setupJob(jobContext: JobContext): Unit + + /** + * Commits a job after the writes succeed. Must be called on the driver. + */ + def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit + + /** + * Aborts a job after the writes fail. Must be called on the driver. + * + * Calling this function is a best-effort attempt, because it is possible that the driver + * just crashes (or is killed) before it can call abort. + */ + def abortJob(jobContext: JobContext): Unit + + /** + * Sets up a task within
spark git commit: [SPARK-18219] Move commit protocol API (internal) from sql/core to core module
Repository: spark Updated Branches: refs/heads/master 96cc1b567 -> 937af592e [SPARK-18219] Move commit protocol API (internal) from sql/core to core module ## What changes were proposed in this pull request? This patch moves the new commit protocol API from sql/core to core module, so we can use it in the future in the RDD API. As part of this patch, I also moved the specification of the random UUID for the write path out of the commit protocol, and instead pass in a job id. ## How was this patch tested? N/A Author: Reynold Xin Closes #15731 from rxin/SPARK-18219. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/937af592 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/937af592 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/937af592 Branch: refs/heads/master Commit: 937af592e65f4dd878aafcabf8fe2cfe7fa3d9b3 Parents: 96cc1b56 Author: Reynold Xin Authored: Thu Nov 3 02:42:48 2016 -0700 Committer: Reynold Xin Committed: Thu Nov 3 02:42:48 2016 -0700 -- .../spark/internal/io/FileCommitProtocol.scala | 126 + .../io/HadoopMapReduceCommitProtocol.scala | 111 .../datasources/FileCommitProtocol.scala| 257 --- .../datasources/FileFormatWriter.scala | 3 +- .../InsertIntoHadoopFsRelationCommand.scala | 6 +- .../SQLHadoopMapReduceCommitProtocol.scala | 72 ++ .../execution/streaming/FileStreamSink.scala| 9 +- .../streaming/ManifestFileCommitProtocol.scala | 6 +- .../org/apache/spark/sql/internal/SQLConf.scala | 4 +- 9 files changed, 327 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/937af592/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala new file mode 100644 index 000..fb80205 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.util.Utils + + +/** + * An interface to define how a single Spark job commits its outputs. Three notes: + * + * 1. Implementations must be serializable, as the committer instance instantiated on the driver + *will be used for tasks on executors. + * 2. Implementations should have a constructor with either 2 or 3 arguments: +*(jobId: String, path: String) or (jobId: String, path: String, isAppend: Boolean). + * 3. A committer should not be reused across multiple Spark jobs. + * + * The proper call sequence is: + * + * 1. Driver calls setupJob. + * 2. 
As part of each task's execution, executor calls setupTask and then commitTask + *(or abortTask if task failed). + * 3. When all necessary tasks completed successfully, the driver calls commitJob. If the job + *failed to execute (e.g. too many failed tasks), the job should call abortJob. + */ +abstract class FileCommitProtocol { + import FileCommitProtocol._ + + /** + * Sets up a job. Must be called on the driver before any other methods can be invoked. + */ + def setupJob(jobContext: JobContext): Unit + + /** + * Commits a job after the writes succeed. Must be called on the driver. + */ + def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit + + /** + * Aborts a job after the writes fail. Must be called on the driver. + * + * Calling this function is a best-effort attempt, because it is possible that the driver + * just crashes (or is killed) before it can call abort. + */ + def abortJob(jobContext: JobContext): Unit + + /** + * Sets up a task within a job. + * Must be called before any other task related methods can be invoked. + */
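A hedged driver-side sketch of that call sequence, using the concrete HadoopMapReduceCommitProtocol from this patch (the job id and output path are made-up values, and the task-side setupTask/commitTask calls are elided):

import org.apache.hadoop.mapreduce.Job
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol

// Hadoop's Job implements JobContext, which is what the protocol expects.
val committer = new HadoopMapReduceCommitProtocol(jobId = "job-0", path = "/tmp/out")
val job = Job.getInstance()
committer.setupJob(job)
try {
  // ... tasks run on executors: setupTask, write files, commitTask ...
  committer.commitJob(job, taskCommits = Seq.empty)
} catch {
  case e: Exception =>
    committer.abortJob(job)
    throw e
}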
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.1 2cf39d638 -> 965c964c2 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. (cherry picked from commit d24e736471f34ef8f2c12766393379c4213fe96e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/965c964c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/965c964c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/965c964c Branch: refs/heads/branch-2.1 Commit: 965c964c2657aaf575f0e00ce6b74a8f05172c06 Parents: 2cf39d6 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:51:16 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new 
OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/965c964c/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http:/
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/branch-2.0 3253ae7f7 -> dae1581d9 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. (cherry picked from commit d24e736471f34ef8f2c12766393379c4213fe96e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dae1581d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dae1581d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dae1581d Branch: refs/heads/branch-2.0 Commit: dae1581d9461346511098dc83938939a0f930048 Parents: 3253ae7 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:51:26 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new 
OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/dae1581d/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http:/
spark git commit: [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
Repository: spark Updated Branches: refs/heads/master 9ddec8636 -> d24e73647 [SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet ## What changes were proposed in this pull request? [SPARK-18200](https://issues.apache.org/jira/browse/SPARK-18200) reports that Apache Spark 2.x raises `java.lang.IllegalArgumentException: requirement failed: Invalid initial capacity` while running `triangleCount`. The root cause is that `VertexSet`, a type alias of `OpenHashSet`, does not allow zero as an initial size. This PR loosens the restriction to allow zero. ## How was this patch tested? Pass the Jenkins test with a new test case in `OpenHashSetSuite`. Author: Dongjoon Hyun Closes #15741 from dongjoon-hyun/SPARK-18200. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d24e7364 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d24e7364 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d24e7364 Branch: refs/heads/master Commit: d24e736471f34ef8f2c12766393379c4213fe96e Parents: 9ddec86 Author: Dongjoon Hyun Authored: Wed Nov 2 23:50:50 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 23:50:50 2016 -0700 -- .../org/apache/spark/util/collection/OpenHashSet.scala| 10 +++--- .../apache/spark/util/collection/OpenHashMapSuite.scala | 3 --- .../apache/spark/util/collection/OpenHashSetSuite.scala | 5 + .../util/collection/PrimitiveKeyOpenHashMapSuite.scala| 3 --- 4 files changed, 12 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 0f6a425..7a1be85 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -48,7 +48,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( require(initialCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't make capacity bigger than ${OpenHashSet.MAX_CAPACITY} elements") - require(initialCapacity >= 1, "Invalid initial capacity") + require(initialCapacity >= 0, "Invalid initial capacity") require(loadFactor < 1.0, "Load factor must be less than 1.0") require(loadFactor > 0.0, "Load factor must be greater than 0.0") @@ -271,8 +271,12 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag]( private def hashcode(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() private def nextPowerOf2(n: Int): Int = { -val highBit = Integer.highestOneBit(n) -if (highBit == n) n else highBit << 1 +if (n == 0) { + 2 +} else { + val highBit = Integer.highestOneBit(n) + if (highBit == n) n else highBit << 1 +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 3066e99..335ecb9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -49,9 +49,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { intercept[IllegalArgumentException] { new OpenHashMap[String, Int](-1) } -intercept[IllegalArgumentException] { - new OpenHashMap[String, 
String](0) -} } test("primitive value") { http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 2607a54..210bc5c 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -176,4 +176,9 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(set.size === 1000) assert(set.capacity > 1000) } + + test("SPARK-18200 Support zero as an initial set size") { +val set = new OpenHashSet[Long](0) +assert(set.size === 0) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/d24e7364/core/src/test/scala/org/apache/spark/util/collection/
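For readers tracing the fix, here is a minimal standalone sketch of the patched rounding logic — a copy of the `nextPowerOf2` change above, for illustration only, not the `OpenHashSet` internals verbatim:

```scala
// Standalone copy of the patched capacity rounding.
object NextPowerOf2Sketch {
  def nextPowerOf2(n: Int): Int = {
    if (n == 0) {
      2 // zero now rounds up to the smallest usable capacity instead of failing
    } else {
      val highBit = Integer.highestOneBit(n)
      if (highBit == n) n else highBit << 1
    }
  }

  def main(args: Array[String]): Unit = {
    // Before the patch, a zero initial capacity tripped the
    // `require(initialCapacity >= 1, ...)` check shown in the diff above.
    assert(nextPowerOf2(0) == 2)
    assert(nextPowerOf2(3) == 4)
    assert(nextPowerOf2(8) == 8)
  }
}
```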
spark git commit: [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion
Repository: spark Updated Branches: refs/heads/branch-2.1 1eef8e5cd -> 2aff2ea81 [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion ## What changes were proposed in this pull request? RuntimeReplaceable is used to create aliases for expressions, but the way it deals with type coercion is pretty weird (each expression is responsible for how to handle type coercion, which does not obey the normal implicit type cast rules). This patch simplifies its handling by allowing the analyzer to traverse into the actual expression of a RuntimeReplaceable. ## How was this patch tested? - Correctness should be guaranteed by existing unit tests already - Removed SQLCompatibilityFunctionSuite and moved it to sql-compatibility-functions.sql - Added a new test case in sql-compatibility-functions.sql for verifying explain behavior. Author: Reynold Xin Closes #15723 from rxin/SPARK-18214. (cherry picked from commit fd90541c35af2bccf0155467bec8cea7c8865046) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2aff2ea8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2aff2ea8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2aff2ea8 Branch: refs/heads/branch-2.1 Commit: 2aff2ea81d260a47e7762b2990ed62a91e5d0198 Parents: 1eef8e5 Author: Reynold Xin Authored: Wed Nov 2 15:53:02 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 15:53:09 2016 -0700 -- .../sql/catalyst/analysis/TypeCoercion.scala| 2 - .../sql/catalyst/expressions/Expression.scala | 30 ++--- .../expressions/datetimeExpressions.scala | 2 - .../catalyst/expressions/nullExpressions.scala | 75 --- .../sql/catalyst/optimizer/finishAnalysis.scala | 2 +- .../expressions/NullFunctionsSuite.scala| 19 ++- .../inputs/sql-compatibility-functions.sql | 25 .../resources/sql-tests/results/array.sql.out | 5 +- .../results/sql-compatibility-functions.sql.out | 124 +++ .../sql/SQLCompatibilityFunctionSuite.scala | 98 --- .../apache/spark/sql/SQLQueryTestSuite.scala| 4 +- 11 files changed, 204 insertions(+), 182 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2aff2ea8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 01b04c0..6662a9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -528,8 +528,6 @@ object TypeCoercion { NaNvl(l, Cast(r, DoubleType)) case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType => NaNvl(Cast(l, DoubleType), r) - - case e: RuntimeReplaceable => e.replaceForTypeCoercion() } } http://git-wip-us.apache.org/repos/asf/spark/blob/2aff2ea8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 726a231..221f830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -186,7 +186,7 @@ abstract class Expression extends TreeNode[Expression] { */ def prettyName: String =
nodeName.toLowerCase - protected def flatArguments = productIterator.flatMap { + protected def flatArguments: Iterator[Any] = productIterator.flatMap { case t: Traversable[_] => t case single => single :: Nil } @@ -229,26 +229,16 @@ trait Unevaluable extends Expression { * An expression that gets replaced at runtime (currently by the optimizer) into a different * expression for evaluation. This is mainly used to provide compatibility with other databases. * For example, we use this to support "nvl" by replacing it with "coalesce". + * + * A RuntimeReplaceable should have the original parameters along with a "child" expression in the + * case class constructor, and define a normal constructor that accepts only the original + * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL + * works correctly, the implementation should also override flatArguments
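For reference, a hedged sketch of the constructor pattern the new scaladoc describes, modeled on the `Nvl` rewrite in `nullExpressions.scala` (member signatures here are illustrative, not the full patch):

```scala
// The case class keeps the original arguments plus a pre-built `child`;
// the auxiliary constructor is what callers actually use, and the analyzer
// then type-coerces `child` like any ordinary expression tree.
case class Nvl(left: Expression, right: Expression, child: Expression)
  extends RuntimeReplaceable {

  def this(left: Expression, right: Expression) =
    this(left, right, Coalesce(Seq(left, right)))

  // Overridden so EXPLAIN output and generated SQL show the user-facing
  // arguments rather than the expanded `child`.
  override def flatArguments: Iterator[Any] = Iterator(left, right)
  override def sql: String = s"$prettyName(${left.sql}, ${right.sql})"
}
```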
spark git commit: [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion
Repository: spark Updated Branches: refs/heads/master 37d95227a -> fd90541c3 [SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion ## What changes were proposed in this pull request? RuntimeReplaceable is used to create aliases for expressions, but the way it deals with type coercion is pretty weird (each expression is responsible for how to handle type coercion, which does not obey the normal implicit type cast rules). This patch simplifies its handling by allowing the analyzer to traverse into the actual expression of a RuntimeReplaceable. ## How was this patch tested? - Correctness should be guaranteed by existing unit tests already - Removed SQLCompatibilityFunctionSuite and moved it to sql-compatibility-functions.sql - Added a new test case in sql-compatibility-functions.sql for verifying explain behavior. Author: Reynold Xin Closes #15723 from rxin/SPARK-18214. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd90541c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd90541c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd90541c Branch: refs/heads/master Commit: fd90541c35af2bccf0155467bec8cea7c8865046 Parents: 37d9522 Author: Reynold Xin Authored: Wed Nov 2 15:53:02 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 15:53:02 2016 -0700 -- .../sql/catalyst/analysis/TypeCoercion.scala| 2 - .../sql/catalyst/expressions/Expression.scala | 30 ++--- .../expressions/datetimeExpressions.scala | 2 - .../catalyst/expressions/nullExpressions.scala | 75 --- .../sql/catalyst/optimizer/finishAnalysis.scala | 2 +- .../expressions/NullFunctionsSuite.scala| 19 ++- .../inputs/sql-compatibility-functions.sql | 25 .../resources/sql-tests/results/array.sql.out | 5 +- .../results/sql-compatibility-functions.sql.out | 124 +++ .../sql/SQLCompatibilityFunctionSuite.scala | 98 --- .../apache/spark/sql/SQLQueryTestSuite.scala| 4 +- 11 files changed, 204 insertions(+), 182 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fd90541c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 01b04c0..6662a9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -528,8 +528,6 @@ object TypeCoercion { NaNvl(l, Cast(r, DoubleType)) case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType => NaNvl(Cast(l, DoubleType), r) - - case e: RuntimeReplaceable => e.replaceForTypeCoercion() } } http://git-wip-us.apache.org/repos/asf/spark/blob/fd90541c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 726a231..221f830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -186,7 +186,7 @@ abstract class Expression extends TreeNode[Expression] { */ def prettyName: String = nodeName.toLowerCase - protected def
flatArguments: Iterator[Any] = productIterator.flatMap { case t: Traversable[_] => t case single => single :: Nil } @@ -229,26 +229,16 @@ trait Unevaluable extends Expression { * An expression that gets replaced at runtime (currently by the optimizer) into a different * expression for evaluation. This is mainly used to provide compatibility with other databases. * For example, we use this to support "nvl" by replacing it with "coalesce". + * + * A RuntimeReplaceable should have the original parameters along with a "child" expression in the + * case class constructor, and define a normal constructor that accepts only the original + * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL + * works correctly, the implementation should also override flatArguments method and sql method. */ -trait RuntimeReplaceable extends Unevaluable { - /** - * Method for concr
spark git commit: [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts
Repository: spark Updated Branches: refs/heads/branch-2.1 bd3ea6595 -> 1eef8e5cd [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts ## What changes were proposed in this pull request? Adds a `snapshots-and-staging` profile so that RCs of projects like Hadoop and HBase can be used in developer-only build and test runs. There's a comment above the profile telling people not to use this in production. There's no attempt to do the same for SBT, as Ivy is different. ## How was this patch tested? Tested by building against the Hadoop 2.7.3 RC 1 JARs without the profile (and without any local copy of the 2.7.3 artifacts), the build failed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive -Dhadoop.version=2.7.3 ... [INFO] [INFO] Building Spark Project Launcher 2.1.0-SNAPSHOT [INFO] Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.pom [WARNING] The POM for org.apache.hadoop:hadoop-client:jar:2.7.3 is missing, no dependency information available Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar [INFO] [INFO] Reactor Summary: [INFO] [INFO] Spark Project Parent POM ... SUCCESS [ 4.482 s] [INFO] Spark Project Tags . SUCCESS [ 17.402 s] [INFO] Spark Project Sketch ... SUCCESS [ 11.252 s] [INFO] Spark Project Networking ... SUCCESS [ 13.458 s] [INFO] Spark Project Shuffle Streaming Service SUCCESS [ 9.043 s] [INFO] Spark Project Unsafe ... SUCCESS [ 16.027 s] [INFO] Spark Project Launcher . FAILURE [ 1.653 s] [INFO] Spark Project Core . SKIPPED ... ``` With the profile, the build completed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive,snapshots-and-staging -Dhadoop.version=2.7.3 ``` Author: Steve Loughran Closes #14646 from steveloughran/stevel/SPARK-17058-support-asf-snapshots. (cherry picked from commit 37d95227a21de602b939dae84943ba007f434513) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1eef8e5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1eef8e5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1eef8e5c Branch: refs/heads/branch-2.1 Commit: 1eef8e5cd09dfb8b77044ef9864321618e8ea8c8 Parents: bd3ea65 Author: Steve Loughran Authored: Wed Nov 2 11:52:29 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:52:38 2016 -0700 -- pom.xml | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1eef8e5c/pom.xml -- diff --git a/pom.xml b/pom.xml index aaf7cfa..04d2eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -2694,6 +2694,54 @@ + <profile> + <id>snapshots-and-staging</id> + <properties> + <asf.staging>https://repository.apache.org/content/groups/staging/</asf.staging> + <asf.snapshots>https://repository.apache.org/content/repositories/snapshots/</asf.snapshots> + </properties> + <repositories> + <repository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </repository> + <repository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </pluginRepository> + <pluginRepository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </pluginRepository> + </pluginRepositories> + </profile>
spark git commit: [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts
Repository: spark Updated Branches: refs/heads/master 3c24299b7 -> 37d95227a [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts ## What changes were proposed in this pull request? Adds a `snapshots-and-staging` profile so that RCs of projects like Hadoop and HBase can be used in developer-only build and test runs. There's a comment above the profile telling people not to use this in production. There's no attempt to do the same for SBT, as Ivy is different. ## How was this patch tested? Tested by building against the Hadoop 2.7.3 RC 1 JARs without the profile (and without any local copy of the 2.7.3 artifacts), the build failed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive -Dhadoop.version=2.7.3 ... [INFO] [INFO] Building Spark Project Launcher 2.1.0-SNAPSHOT [INFO] Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.pom [WARNING] The POM for org.apache.hadoop:hadoop-client:jar:2.7.3 is missing, no dependency information available Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar [INFO] [INFO] Reactor Summary: [INFO] [INFO] Spark Project Parent POM ... SUCCESS [ 4.482 s] [INFO] Spark Project Tags . SUCCESS [ 17.402 s] [INFO] Spark Project Sketch ... SUCCESS [ 11.252 s] [INFO] Spark Project Networking ... SUCCESS [ 13.458 s] [INFO] Spark Project Shuffle Streaming Service SUCCESS [ 9.043 s] [INFO] Spark Project Unsafe ... SUCCESS [ 16.027 s] [INFO] Spark Project Launcher . FAILURE [ 1.653 s] [INFO] Spark Project Core . SKIPPED ... ``` With the profile, the build completed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive,snapshots-and-staging -Dhadoop.version=2.7.3 ``` Author: Steve Loughran Closes #14646 from steveloughran/stevel/SPARK-17058-support-asf-snapshots. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37d95227 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37d95227 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37d95227 Branch: refs/heads/master Commit: 37d95227a21de602b939dae84943ba007f434513 Parents: 3c24299 Author: Steve Loughran Authored: Wed Nov 2 11:52:29 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:52:29 2016 -0700 -- pom.xml | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37d95227/pom.xml -- diff --git a/pom.xml b/pom.xml index aaf7cfa..04d2eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -2694,6 +2694,54 @@ + <profile> + <id>snapshots-and-staging</id> + <properties> + <asf.staging>https://repository.apache.org/content/groups/staging/</asf.staging> + <asf.snapshots>https://repository.apache.org/content/repositories/snapshots/</asf.snapshots> + </properties> + <repositories> + <repository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </repository> + <repository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <id>ASF Staging</id> + <url>${asf.staging}</url> + </pluginRepository> + <pluginRepository> + <id>ASF Snapshots</id> + <url>${asf.snapshots}</url> + <snapshots> + <enabled>true</enabled> + </snapshots> + <releases> + <enabled>false</enabled> + </releases> + </pluginRepository> + </pluginRepositories> + </profile>
spark git commit: [SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value (for branch 2.0)
Repository: spark Updated Branches: refs/heads/branch-2.0 1696bcfad -> 3253ae7f7 [SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value (for branch 2.0) ## What changes were proposed in this pull request? When multiple records have the minimum value, the answer of `StatFunctions.multipleApproxQuantiles` is wrong. ## How was this patch tested? add a test case Author: wangzhenhua Closes #15732 from wzhfy/percentile2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3253ae7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3253ae7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3253ae7f Branch: refs/heads/branch-2.0 Commit: 3253ae7f722a996cf0af21608e1a27d5d2a12004 Parents: 1696bcf Author: wangzhenhua Authored: Wed Nov 2 11:49:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:49:30 2016 -0700 -- .../spark/sql/execution/stat/StatFunctions.scala | 4 +++- .../org/apache/spark/sql/DataFrameStatSuite.scala | 13 + 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 7e2ebe8..acc42a0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -337,7 +337,9 @@ object StatFunctions extends Logging { res.prepend(head) // If necessary, add the minimum element: val currHead = currentSamples.head - if (currHead.value < head.value) { + // don't add the minimum element if `currentSamples` has only one element (both `currHead` and + // `head` point to the same element) + if (currHead.value <= head.value && currentSamples.length > 1) { res.prepend(currentSamples.head) } res.toArray http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 73026c7..571e2ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -152,6 +152,19 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } } + test("approximate quantile, multiple records with the minimum value in a partition") { +val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5) +val df = spark.sparkContext.makeRDD(data, 4).toDF("col") +val epsilons = List(0.1, 0.05, 0.001) +val quantile = 0.5 +val expected = 1 +for (epsilon <- epsilons) { + val Array(answer) = df.stat.approxQuantile("col", Array(quantile), epsilon) + val error = 2 * data.length * epsilon + assert(math.abs(answer - expected) < error) +} + } + test("crosstab") { val rng = new Random() val data = Seq.tabulate(25)(i => (rng.nextInt(5), rng.nextInt(10)))
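For context, a minimal sketch of the behavior the new test pins down, assuming a live `SparkSession` named `spark` (the tolerance mirrors the suite above, not a new API):

```scala
import spark.implicits._

// Four partitions, so several partitions begin with the duplicated minimum.
val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5)
val df = spark.sparkContext.makeRDD(data, 4).toDF("col")

val epsilon = 0.05
val Array(answer) = df.stat.approxQuantile("col", Array(0.5), epsilon)
// The rank error is bounded by epsilon * n per side, so the suite accepts
// any answer within 2 * n * epsilon of the expected median (here, 1).
assert(math.abs(answer - 1) < 2 * data.length * epsilon)
```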
spark git commit: [SPARK-17895] Improve doc for rangeBetween and rowsBetween
Repository: spark Updated Branches: refs/heads/master 4af0ce2d9 -> 742e0fea5 [SPARK-17895] Improve doc for rangeBetween and rowsBetween ## What changes were proposed in this pull request? Copied description for row and range based frame boundary from https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala#L56 Added examples to show different behavior of rangeBetween and rowsBetween when involving duplicate values. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: buzhihuojie Closes #15727 from david-weiluo-ren/improveDocForRangeAndRowsBetween. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/742e0fea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/742e0fea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/742e0fea Branch: refs/heads/master Commit: 742e0fea5391857964e90d396641ecf95cac4248 Parents: 4af0ce2 Author: buzhihuojie Authored: Wed Nov 2 11:36:20 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:36:20 2016 -0700 -- .../apache/spark/sql/expressions/Window.scala | 55 .../spark/sql/expressions/WindowSpec.scala | 55 2 files changed, 110 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/742e0fea/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 0b26d86..327bc37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -121,6 +121,32 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -144,6 +170,35 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. 
This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unboun
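To see the two frame types side by side, here is a hedged sketch combining the two doc examples above into one query (assumes `spark.implicits._` is imported so the `'id` symbol syntax resolves):

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b"))
  .toDF("id", "category")
val w = Window.partitionBy('category).orderBy('id)

// For the duplicate id = 1 rows in category "a", the row frame sums differ
// per row (2 and 3) while the range frame gives both rows the same sum (4),
// because a range boundary is keyed on the ORDER BY value, not row position.
df.withColumn("rows_sum", sum('id) over w.rowsBetween(0, 1))
  .withColumn("range_sum", sum('id) over w.rangeBetween(0, 1))
  .show()
```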
spark git commit: [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union
Repository: spark Updated Branches: refs/heads/branch-2.1 a885d5bbc -> 0093257ea [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union ## What changes were proposed in this pull request? When a user appended a column using a "nondeterministic" function to a DataFrame, e.g., `rand`, `randn`, and `monotonically_increasing_id`, the expected semantic is the following: - The value in each row should remain unchanged, as if we materialize the column immediately, regardless of later DataFrame operations. However, since we use `TaskContext.getPartitionId` to get the partition index from the current thread, the values from nondeterministic columns might change if we call `union` or `coalesce` after. `TaskContext.getPartitionId` returns the partition index of the current Spark task, which might not be the corresponding partition index of the DataFrame where we defined the column. See the unit tests below or JIRA for examples. This PR uses the partition index from `RDD.mapPartitionWithIndex` instead of `TaskContext` and fixes the partition initialization logic in whole-stage codegen, normal codegen, and codegen fallback. `initializeStatesForPartition(partitionIndex: Int)` was added to `Projection`, `Nondeterministic`, and `Predicate` (codegen) and initialized right after object creation in `mapPartitionWithIndex`. `newPredicate` now returns a `Predicate` instance rather than a function for proper initialization. ## How was this patch tested? Unit tests. (Actually I'm not very confident that this PR fixed all issues without introducing new ones ...) cc: rxin davies Author: Xiangrui Meng Closes #15567 from mengxr/SPARK-14393. (cherry picked from commit 02f203107b8eda1f1576e36c4f12b0e3bc5e910e) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0093257e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0093257e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0093257e Branch: refs/heads/branch-2.1 Commit: 0093257ea94d3a197ca061b54c04685d7c1f616a Parents: a885d5b Author: Xiangrui Meng Authored: Wed Nov 2 11:41:49 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:42:01 2016 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 16 +- .../sql/catalyst/expressions/Expression.scala | 19 +-- .../catalyst/expressions/InputFileName.scala| 2 +- .../expressions/MonotonicallyIncreasingID.scala | 11 +++-- .../sql/catalyst/expressions/Projection.scala | 22 ++--- .../catalyst/expressions/SparkPartitionID.scala | 13 +++-- .../expressions/codegen/CodeGenerator.scala | 14 ++ .../expressions/codegen/CodegenFallback.scala | 18 +-- .../codegen/GenerateMutableProjection.scala | 4 ++ .../expressions/codegen/GeneratePredicate.scala | 18 +-- .../codegen/GenerateSafeProjection.scala| 4 ++ .../codegen/GenerateUnsafeProjection.scala | 4 ++ .../sql/catalyst/expressions/package.scala | 10 +++- .../sql/catalyst/expressions/predicates.scala | 4 -- .../expressions/randomExpressions.scala | 14 +++--- .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../expressions/ExpressionEvalHelper.scala | 5 +- .../codegen/CodegenExpressionCachingSuite.scala | 13 +++-- .../sql/execution/DataSourceScanExec.scala | 6 ++- .../spark/sql/execution/ExistingRDD.scala | 3 +- .../spark/sql/execution/GenerateExec.scala | 3 +- .../apache/spark/sql/execution/SparkPlan.scala | 4 +- .../sql/execution/WholeStageCodegenExec.scala | 8 ++- .../sql/execution/basicPhysicalOperators.scala | 8 +-- 
.../columnar/InMemoryTableScanExec.scala| 5 +- .../joins/BroadcastNestedLoopJoinExec.scala | 7 +-- .../execution/joins/CartesianProductExec.scala | 8 +-- .../spark/sql/execution/joins/HashJoin.scala| 2 +- .../sql/execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala| 6 ++- .../spark/sql/DataFrameFunctionsSuite.scala | 52 .../sql/hive/execution/HiveTableScanExec.scala | 3 +- 32 files changed, 231 insertions(+), 78 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0093257e/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de..e018af3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -788,14 +788,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method w
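For context, a small sketch of the user-visible expectation this patch restores, based on the JIRA discussion rather than the patch itself (assumes a `SparkSession` named `spark`):

```scala
import org.apache.spark.sql.functions.rand

val df = spark.range(10).withColumn("r", rand(42))

// The "r" values should behave as if materialized where the column was
// defined. Before this fix, coalesce could shift the partition index seen
// by the generator and silently re-draw different values.
val direct = df.collect().map(_.getDouble(1)).toSeq.sorted
val coalesced = df.coalesce(1).collect().map(_.getDouble(1)).toSeq.sorted
assert(direct == coalesced)
```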
spark git commit: [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union
Repository: spark Updated Branches: refs/heads/master 742e0fea5 -> 02f203107 [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union ## What changes were proposed in this pull request? When a user appended a column using a "nondeterministic" function to a DataFrame, e.g., `rand`, `randn`, and `monotonically_increasing_id`, the expected semantic is the following: - The value in each row should remain unchanged, as if we materialize the column immediately, regardless of later DataFrame operations. However, since we use `TaskContext.getPartitionId` to get the partition index from the current thread, the values from nondeterministic columns might change if we call `union` or `coalesce` after. `TaskContext.getPartitionId` returns the partition index of the current Spark task, which might not be the corresponding partition index of the DataFrame where we defined the column. See the unit tests below or JIRA for examples. This PR uses the partition index from `RDD.mapPartitionWithIndex` instead of `TaskContext` and fixes the partition initialization logic in whole-stage codegen, normal codegen, and codegen fallback. `initializeStatesForPartition(partitionIndex: Int)` was added to `Projection`, `Nondeterministic`, and `Predicate` (codegen) and initialized right after object creation in `mapPartitionWithIndex`. `newPredicate` now returns a `Predicate` instance rather than a function for proper initialization. ## How was this patch tested? Unit tests. (Actually I'm not very confident that this PR fixed all issues without introducing new ones ...) cc: rxin davies Author: Xiangrui Meng Closes #15567 from mengxr/SPARK-14393. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02f20310 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02f20310 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02f20310 Branch: refs/heads/master Commit: 02f203107b8eda1f1576e36c4f12b0e3bc5e910e Parents: 742e0fe Author: Xiangrui Meng Authored: Wed Nov 2 11:41:49 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:41:49 2016 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 16 +- .../sql/catalyst/expressions/Expression.scala | 19 +-- .../catalyst/expressions/InputFileName.scala| 2 +- .../expressions/MonotonicallyIncreasingID.scala | 11 +++-- .../sql/catalyst/expressions/Projection.scala | 22 ++--- .../catalyst/expressions/SparkPartitionID.scala | 13 +++-- .../expressions/codegen/CodeGenerator.scala | 14 ++ .../expressions/codegen/CodegenFallback.scala | 18 +-- .../codegen/GenerateMutableProjection.scala | 4 ++ .../expressions/codegen/GeneratePredicate.scala | 18 +-- .../codegen/GenerateSafeProjection.scala| 4 ++ .../codegen/GenerateUnsafeProjection.scala | 4 ++ .../sql/catalyst/expressions/package.scala | 10 +++- .../sql/catalyst/expressions/predicates.scala | 4 -- .../expressions/randomExpressions.scala | 14 +++--- .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../expressions/ExpressionEvalHelper.scala | 5 +- .../codegen/CodegenExpressionCachingSuite.scala | 13 +++-- .../sql/execution/DataSourceScanExec.scala | 6 ++- .../spark/sql/execution/ExistingRDD.scala | 3 +- .../spark/sql/execution/GenerateExec.scala | 3 +- .../apache/spark/sql/execution/SparkPlan.scala | 4 +- .../sql/execution/WholeStageCodegenExec.scala | 8 ++- .../sql/execution/basicPhysicalOperators.scala | 8 +-- .../columnar/InMemoryTableScanExec.scala| 5 +- .../joins/BroadcastNestedLoopJoinExec.scala | 7 +-- 
.../execution/joins/CartesianProductExec.scala | 8 +-- .../spark/sql/execution/joins/HashJoin.scala| 2 +- .../sql/execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala| 6 ++- .../spark/sql/DataFrameFunctionsSuite.scala | 52 .../sql/hive/execution/HiveTableScanExec.scala | 3 +- 32 files changed, 231 insertions(+), 78 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02f20310/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de..e018af3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -788,14 +788,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method which skips closure cleaning. It is a - * performance API to be used carefully only if we are sur
spark git commit: [SPARK-17895] Improve doc for rangeBetween and rowsBetween
Repository: spark Updated Branches: refs/heads/branch-2.1 9be069125 -> a885d5bbc [SPARK-17895] Improve doc for rangeBetween and rowsBetween ## What changes were proposed in this pull request? Copied description for row and range based frame boundary from https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala#L56 Added examples to show different behavior of rangeBetween and rowsBetween when involving duplicate values. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: buzhihuojie Closes #15727 from david-weiluo-ren/improveDocForRangeAndRowsBetween. (cherry picked from commit 742e0fea5391857964e90d396641ecf95cac4248) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a885d5bb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a885d5bb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a885d5bb Branch: refs/heads/branch-2.1 Commit: a885d5bbce9dba66b394850b3aac51ae97cb18dd Parents: 9be0691 Author: buzhihuojie Authored: Wed Nov 2 11:36:20 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:36:26 2016 -0700 -- .../apache/spark/sql/expressions/Window.scala | 55 .../spark/sql/expressions/WindowSpec.scala | 55 2 files changed, 110 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a885d5bb/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 0b26d86..327bc37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -121,6 +121,32 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---++---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -144,6 +170,35 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). 
An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---++---+ + * | id|category|sum| + * +---++---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| +
spark git commit: [SPARK-17683][SQL] Support ArrayType in Literal.apply
Repository: spark Updated Branches: refs/heads/branch-2.1 41491e540 -> 9be069125 [SPARK-17683][SQL] Support ArrayType in Literal.apply ## What changes were proposed in this pull request? This pr is to add pattern-matching entries for array data in `Literal.apply`. ## How was this patch tested? Added tests in `LiteralExpressionSuite`. Author: Takeshi YAMAMURO Closes #15257 from maropu/SPARK-17683. (cherry picked from commit 4af0ce2d96de3397c9bc05684cad290a52486577) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9be06912 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9be06912 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9be06912 Branch: refs/heads/branch-2.1 Commit: 9be069125f7e94df9d862f307b87965baf9416e3 Parents: 41491e5 Author: Takeshi YAMAMURO Authored: Wed Nov 2 11:29:26 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:29:39 2016 -0700 -- .../sql/catalyst/expressions/literals.scala | 57 +++- .../expressions/LiteralExpressionSuite.scala| 27 +- 2 files changed, 82 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9be06912/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a597a17..1985e68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -17,14 +17,25 @@ package org.apache.spark.sql.catalyst.expressions +import java.lang.{Boolean => JavaBoolean} +import java.lang.{Byte => JavaByte} +import java.lang.{Double => JavaDouble} +import java.lang.{Float => JavaFloat} +import java.lang.{Integer => JavaInteger} +import java.lang.{Long => JavaLong} +import java.lang.{Short => JavaShort} +import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter +import scala.math.{BigDecimal, BigInt} + import org.json4s.JsonAST._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -46,12 +57,17 @@ object Literal { case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) -case d: java.math.BigDecimal => +case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) +case a: Array[_] => + val elementType = componentTypeToDataType(a.getClass.getComponentType()) + val dataType = ArrayType(elementType) + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + Literal(convert(a), dataType) case i: CalendarInterval => Literal(i, CalendarIntervalType) case null => Literal(null, NullType) case v: Literal 
=> v @@ -60,6 +76,45 @@ object Literal { } /** + * Returns the Spark SQL DataType for a given class object. Since this type needs to be resolved + * in runtime, we use match-case idioms for class objects here. However, there are similar + * functions in other files (e.g., HiveInspectors), so these functions need to merged into one. + */ + private[this] def componentTypeToDataType(clz: Class[_]): DataType = clz match { +// primitive types +case JavaShort.TYPE => ShortType +case JavaInteger.TYPE => IntegerType +case JavaLong.TYPE => LongType +case JavaDouble.TYPE => DoubleType +case JavaByte.TYPE => ByteType +case JavaFloat.TYPE => FloatType +case JavaBoolean.TYPE => BooleanType + +// java classes +case _ if clz == classOf[Date] => DateType +case _ if clz == classOf[Timestamp] => TimestampType +case _ if clz == classOf[JavaBigDecimal] => DecimalType.SYSTEM_DEFAULT +case _ if clz == classOf[Array[Byte]] => BinaryType +case _ if clz == classOf[JavaShort] => ShortType +case _ if
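As a usage note (hedged; grounded in the new `LiteralExpressionSuite` cases), the user-visible effect is that array values now flow through `Literal.apply`, for example via `functions.lit`:

```scala
import org.apache.spark.sql.functions.lit

// Previously only Array[Byte] matched (as BinaryType) and other arrays hit
// Literal.apply's unsupported-type error; now primitive and supported boxed
// arrays map to an ArrayType column of the corresponding element type.
val arrayCol = lit(Array(1, 2, 3)) // column of ArrayType(IntegerType)
```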
spark git commit: [SPARK-17683][SQL] Support ArrayType in Literal.apply
Repository: spark Updated Branches: refs/heads/master f151bd1af -> 4af0ce2d9 [SPARK-17683][SQL] Support ArrayType in Literal.apply ## What changes were proposed in this pull request? This pr is to add pattern-matching entries for array data in `Literal.apply`. ## How was this patch tested? Added tests in `LiteralExpressionSuite`. Author: Takeshi YAMAMURO Closes #15257 from maropu/SPARK-17683. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4af0ce2d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4af0ce2d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4af0ce2d Branch: refs/heads/master Commit: 4af0ce2d96de3397c9bc05684cad290a52486577 Parents: f151bd1 Author: Takeshi YAMAMURO Authored: Wed Nov 2 11:29:26 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 11:29:26 2016 -0700 -- .../sql/catalyst/expressions/literals.scala | 57 +++- .../expressions/LiteralExpressionSuite.scala| 27 +- 2 files changed, 82 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4af0ce2d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a597a17..1985e68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -17,14 +17,25 @@ package org.apache.spark.sql.catalyst.expressions +import java.lang.{Boolean => JavaBoolean} +import java.lang.{Byte => JavaByte} +import java.lang.{Double => JavaDouble} +import java.lang.{Float => JavaFloat} +import java.lang.{Integer => JavaInteger} +import java.lang.{Long => JavaLong} +import java.lang.{Short => JavaShort} +import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter +import scala.math.{BigDecimal, BigInt} + import org.json4s.JsonAST._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -46,12 +57,17 @@ object Literal { case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) -case d: java.math.BigDecimal => +case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) +case a: Array[_] => + val elementType = componentTypeToDataType(a.getClass.getComponentType()) + val dataType = ArrayType(elementType) + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + Literal(convert(a), dataType) case i: CalendarInterval => Literal(i, CalendarIntervalType) case null => Literal(null, NullType) case v: Literal => v @@ -60,6 +76,45 @@ object Literal { } /** + * Returns the Spark SQL DataType for a given class 
object. Since this type needs to be resolved + * in runtime, we use match-case idioms for class objects here. However, there are similar + * functions in other files (e.g., HiveInspectors), so these functions need to merged into one. + */ + private[this] def componentTypeToDataType(clz: Class[_]): DataType = clz match { +// primitive types +case JavaShort.TYPE => ShortType +case JavaInteger.TYPE => IntegerType +case JavaLong.TYPE => LongType +case JavaDouble.TYPE => DoubleType +case JavaByte.TYPE => ByteType +case JavaFloat.TYPE => FloatType +case JavaBoolean.TYPE => BooleanType + +// java classes +case _ if clz == classOf[Date] => DateType +case _ if clz == classOf[Timestamp] => TimestampType +case _ if clz == classOf[JavaBigDecimal] => DecimalType.SYSTEM_DEFAULT +case _ if clz == classOf[Array[Byte]] => BinaryType +case _ if clz == classOf[JavaShort] => ShortType +case _ if clz == classOf[JavaInteger] => IntegerType +case _ if clz == classOf[JavaLong] => LongType +case
spark git commit: [SPARK-17532] Add lock debugging info to thread dumps.
Repository: spark Updated Branches: refs/heads/master 85c5424d4 -> 2dc048081 [SPARK-17532] Add lock debugging info to thread dumps. ## What changes were proposed in this pull request? This adds information to the web UI thread dump page about the JVM locks held by threads and the locks that threads are blocked waiting to acquire. This should help find cases where lock contention is causing Spark applications to run slowly. ## How was this patch tested? Tested by applying this patch and viewing the change in the web UI. ![thread-lock-info](https://cloud.githubusercontent.com/assets/87915/18493057/6e5da870-79c3-11e6-8c20-f54c18a37544.png) Additions: - A "Thread Locking" column with the locks held by the thread or that are blocking the thread - Links from a blocked thread to the thread holding the lock - Stack frames show where threads are inside `synchronized` blocks, "holding Monitor(...)" Author: Ryan Blue Closes #15088 from rdblue/SPARK-17532-add-thread-lock-info. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2dc04808 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2dc04808 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2dc04808 Branch: refs/heads/master Commit: 2dc048081668665f85623839d5f663b402e42555 Parents: 85c5424 Author: Ryan Blue Authored: Wed Nov 2 00:08:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 00:08:30 2016 -0700 -- .../org/apache/spark/ui/static/table.js | 3 +- .../spark/ui/exec/ExecutorThreadDumpPage.scala | 12 +++ .../apache/spark/util/ThreadStackTrace.scala| 6 +++- .../scala/org/apache/spark/util/Utils.scala | 34 +--- 4 files changed, 49 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/resources/org/apache/spark/ui/static/table.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bf..0315ebf 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") -threadCell.after("" + +threadCell.after("" + stackTraceText + "") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); +$("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a0ef80d..c6a0744 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -48,6 +48,16 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage } }.map { thread => val threadId = thread.threadId +val blockedBy = thread.blockedByThreadId match { + case Some(blockedByThreadId) =>
+ + Blocked by + Thread {thread.blockedByThreadId} {thread.blockedByLock} + + case None => Text("") +} +val heldLocks = thread.holdingLocks.mkString(", ") + {threadId} {thread.threadName} {thread.threadState} + {blockedBy}{heldLocks} {thread.stackTrace} } @@ -86,6 +97,7 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage Thread ID Thread Name Thread State + Thread Locks {dumpRows} http://git-wip-us.apache.org/repos/asf/spark/blob/2dc04808/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala b/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.s
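For the curious, the lock details shown in the new column come from the JVM's `ThreadMXBean`; a rough standalone sketch of that approach (not Spark's exact `Utils` code) looks like:

```scala
import java.lang.management.ManagementFactory

val bean = ManagementFactory.getThreadMXBean
// Ask the JVM to report owned monitors and ownable synchronizers too.
val infos = bean.dumpAllThreads(true, true)
for (ti <- infos) {
  val held = (ti.getLockedMonitors ++ ti.getLockedSynchronizers).mkString(", ")
  val blockedOn = Option(ti.getLockName).getOrElse("-")
  println(s"${ti.getThreadId} ${ti.getThreadName} [${ti.getThreadState}] " +
    s"waiting on: $blockedOn; holding: $held")
}
```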
spark git commit: [SPARK-17532] Add lock debugging info to thread dumps.
Repository: spark Updated Branches: refs/heads/branch-2.1 4c4bf87ac -> 3b624bedf [SPARK-17532] Add lock debugging info to thread dumps. ## What changes were proposed in this pull request? This adds information to the web UI thread dump page about the JVM locks held by threads and the locks that threads are blocked waiting to acquire. This should help find cases where lock contention is causing Spark applications to run slowly. ## How was this patch tested? Tested by applying this patch and viewing the change in the web UI. ![thread-lock-info](https://cloud.githubusercontent.com/assets/87915/18493057/6e5da870-79c3-11e6-8c20-f54c18a37544.png) Additions: - A "Thread Locking" column with the locks held by the thread or that are blocking the thread - Links from a blocked thread to the thread holding the lock - Stack frames show where threads are inside `synchronized` blocks, "holding Monitor(...)" Author: Ryan Blue Closes #15088 from rdblue/SPARK-17532-add-thread-lock-info. (cherry picked from commit 2dc048081668665f85623839d5f663b402e42555) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b624bed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b624bed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b624bed Branch: refs/heads/branch-2.1 Commit: 3b624bedf0f0ecd5dcfcc262a3ca8b4e33662533 Parents: 4c4bf87 Author: Ryan Blue Authored: Wed Nov 2 00:08:30 2016 -0700 Committer: Reynold Xin Committed: Wed Nov 2 00:08:37 2016 -0700 -- .../org/apache/spark/ui/static/table.js | 3 +- .../spark/ui/exec/ExecutorThreadDumpPage.scala | 12 +++ .../apache/spark/util/ThreadStackTrace.scala| 6 +++- .../scala/org/apache/spark/util/Utils.scala | 34 +--- 4 files changed, 49 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/resources/org/apache/spark/ui/static/table.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bf..0315ebf 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") -threadCell.after("" + +threadCell.after("" + stackTraceText + "") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); +$("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a0ef80d..c6a0744 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -48,6 +48,16 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage } }.map { thread => val
threadId = thread.threadId +val blockedBy = thread.blockedByThreadId match { + case Some(blockedByThreadId) => + + Blocked by + Thread {thread.blockedByThreadId} {thread.blockedByLock} + + case None => Text("") +} +val heldLocks = thread.holdingLocks.mkString(", ") + {threadId} {thread.threadName} {thread.threadState} + {blockedBy}{heldLocks} {thread.stackTrace} } @@ -86,6 +97,7 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage Thread ID Thread Name Thread State + Thread Locks {dumpRows} http://git-wip-us.apache.org/repos/asf/spark/blob/3b624bed/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala -- diff --git a/core/src/main/scala/or
spark git commit: [SPARK-18192] Support all file formats in structured streaming
Repository: spark Updated Branches: refs/heads/branch-2.1 e6509c245 -> 85dd07374 [SPARK-18192] Support all file formats in structured streaming ## What changes were proposed in this pull request? This patch adds support for all file formats in structured streaming sinks. This is actually a very small change thanks to all the previous refactoring done using the new internal commit protocol API. ## How was this patch tested? Updated FileStreamSinkSuite to add test cases for json, text, and parquet. Author: Reynold Xin Closes #15711 from rxin/SPARK-18192. (cherry picked from commit a36653c5b7b2719f8bfddf4ddfc6e1b828ac9af1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/85dd0737 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/85dd0737 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/85dd0737 Branch: refs/heads/branch-2.1 Commit: 85dd073743946383438aabb9f1281e6075f25cc5 Parents: e6509c2 Author: Reynold Xin Authored: Tue Nov 1 23:37:03 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:37:11 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 8 +-- .../sql/streaming/FileStreamSinkSuite.scala | 62 +--- 2 files changed, 32 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/85dd0737/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index d980e6a..3f956c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -37,7 +36,6 @@ import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -292,7 +290,7 @@ case class DataSource( case s: StreamSinkProvider => s.createSink(sparkSession.sqlContext, options, partitionColumns, outputMode) - case parquet: parquet.ParquetFileFormat => + case fileFormat: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val path = caseInsensitiveOptions.getOrElse("path", { throw new IllegalArgumentException("'path' is not specified") @@ -301,7 +299,7 @@ case class DataSource( throw new IllegalArgumentException( s"Data source $className does not support $outputMode output mode") } -new FileStreamSink(sparkSession, path, parquet, partitionColumns, options) +new FileStreamSink(sparkSession, path, fileFormat, partitionColumns, options) case _ => throw new UnsupportedOperationException( @@ -516,7 +514,7 @@ case 
class DataSource( val plan = data.logicalPlan plan.resolve(name :: Nil, data.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( - s"Unable to resolve ${name} given [${plan.output.map(_.name).mkString(", ")}]") + s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]") }.asInstanceOf[Attribute] } // For partitioned relation r, r.schema's column ordering can be different from the column http://git-wip-us.apache.org/repos/asf/spark/blob/85dd0737/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 902cf05..0f
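For context, a minimal sketch of what this change enables from the user's side: any built-in `FileFormat` can now back a streaming file sink, not just Parquet. The paths, app name, and checkpoint location below are hypothetical, and the sketch assumes Spark 2.1.

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

val spark = SparkSession.builder().appName("file-sink-formats").getOrCreate()
import spark.implicits._

val lines = spark.readStream.format("text").load("/tmp/stream-in")

// Before this patch only format("parquet") reached the FileStreamSink branch;
// "json", "text" and "csv" fell through to UnsupportedOperationException.
val query = lines.as[String].map(_.toUpperCase).writeStream
  .format("json") // likewise "text", "csv" or "parquet"
  .option("path", "/tmp/stream-out")
  .option("checkpointLocation", "/tmp/stream-checkpoint")
  .outputMode(OutputMode.Append) // FileStreamSink still supports only Append mode
  .start()
```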
spark git commit: [SPARK-18192] Support all file formats in structured streaming
Repository: spark Updated Branches: refs/heads/master abefe2ec4 -> a36653c5b [SPARK-18192] Support all file formats in structured streaming ## What changes were proposed in this pull request? This patch adds support for all file formats in structured streaming sinks. This is actually a very small change thanks to all the previous refactoring done using the new internal commit protocol API. ## How was this patch tested? Updated FileStreamSinkSuite to add test cases for json, text, and parquet. Author: Reynold Xin Closes #15711 from rxin/SPARK-18192. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a36653c5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a36653c5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a36653c5 Branch: refs/heads/master Commit: a36653c5b7b2719f8bfddf4ddfc6e1b828ac9af1 Parents: abefe2e Author: Reynold Xin Authored: Tue Nov 1 23:37:03 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:37:03 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 8 +-- .../sql/streaming/FileStreamSinkSuite.scala | 62 +--- 2 files changed, 32 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a36653c5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index d980e6a..3f956c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -37,7 +36,6 @@ import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -292,7 +290,7 @@ case class DataSource( case s: StreamSinkProvider => s.createSink(sparkSession.sqlContext, options, partitionColumns, outputMode) - case parquet: parquet.ParquetFileFormat => + case fileFormat: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val path = caseInsensitiveOptions.getOrElse("path", { throw new IllegalArgumentException("'path' is not specified") @@ -301,7 +299,7 @@ case class DataSource( throw new IllegalArgumentException( s"Data source $className does not support $outputMode output mode") } -new FileStreamSink(sparkSession, path, parquet, partitionColumns, options) +new FileStreamSink(sparkSession, path, fileFormat, partitionColumns, options) case _ => throw new UnsupportedOperationException( @@ -516,7 +514,7 @@ case class DataSource( val plan = data.logicalPlan plan.resolve(name :: Nil, 
data.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( - s"Unable to resolve ${name} given [${plan.output.map(_.name).mkString(", ")}]") + s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]") }.asInstanceOf[Attribute] } // For partitioned relation r, r.schema's column ordering can be different from the column http://git-wip-us.apache.org/repos/asf/spark/blob/a36653c5/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 902cf05..0f140f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala ++
spark git commit: [SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PARTITION for Datasource tables
Repository: spark Updated Branches: refs/heads/branch-2.1 39d2fdb51 -> e6509c245 [SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PARTITION for Datasource tables There are a couple issues with the current 2.1 behavior when inserting into Datasource tables with partitions managed by Hive. (1) OVERWRITE TABLE ... PARTITION will actually overwrite the entire table instead of just the specified partition. (2) INSERT|OVERWRITE does not work with partitions that have custom locations. This PR fixes both of these issues for Datasource tables managed by Hive. The behavior for legacy tables or when `manageFilesourcePartitions = false` is unchanged. There is one other issue in that INSERT OVERWRITE with dynamic partitions will overwrite the entire table instead of just the updated partitions, but this behavior is pretty complicated to implement for Datasource tables. We should address that in a future release. Unit tests. Author: Eric Liang Closes #15705 from ericl/sc-4942. (cherry picked from commit abefe2ec428dc24a4112c623fb6fbe4b2ca60a2b) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6509c24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6509c24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6509c24 Branch: refs/heads/branch-2.1 Commit: e6509c2459e7ece3c3c6bcd143b8cc71f8f4d5c8 Parents: 39d2fdb Author: Eric Liang Authored: Wed Nov 2 14:15:10 2016 +0800 Committer: Reynold Xin Committed: Tue Nov 1 23:23:55 2016 -0700 -- .../apache/spark/sql/catalyst/dsl/package.scala | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 9 +++- .../plans/logical/basicLogicalOperators.scala | 19 ++- .../sql/catalyst/parser/PlanParserSuite.scala | 15 -- .../org/apache/spark/sql/DataFrameWriter.scala | 4 +- .../datasources/CatalogFileIndex.scala | 5 +- .../datasources/DataSourceStrategy.scala| 30 +-- .../InsertIntoDataSourceCommand.scala | 6 +-- .../apache/spark/sql/hive/HiveStrategies.scala | 3 +- .../CreateHiveTableAsSelectCommand.scala| 5 +- .../PartitionProviderCompatibilitySuite.scala | 52 11 files changed, 129 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 66e52ca..e901683 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -367,7 +367,7 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan = InsertIntoTable( analysis.UnresolvedRelation(TableIdentifier(tableName)), - Map.empty, logicalPlan, overwrite, false) + Map.empty, logicalPlan, OverwriteOptions(overwrite), false) def as(alias: String): LogicalPlan = logicalPlan match { case UnresolvedRelation(tbl, _) => UnresolvedRelation(tbl, Option(alias)) http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 38e9bb6..ac1577b 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -177,12 +177,19 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. Specified " + "partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) } +val overwrite = ctx.OVERWRITE != null +val overwritePartition = + if (overwrite && partitionKeys.nonEmpty && dynamicPartitionKeys.isEmpty) { +Some(partitionKeys.map(t => (t._1, t._2.get))) + } else { +None + } InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, query, - ctx.OVERWRITE != null, + OverwriteOptions(overwrite, overwritePartition), ctx.EXISTS != null) } http://git-wip-us.apache.org/repos/asf/spark/blob/e6509c24/sql/catalyst/src/main/scala/org/apache/spark/s
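To make the fixed semantics concrete, a hedged sketch follows; the table names `logs` and `staging` are hypothetical, and it assumes a Datasource table whose partitions are tracked in the metastore (`spark.sql.hive.manageFilesourcePartitions` left at its 2.1 default of true):

```
spark.sql("CREATE TABLE logs (msg STRING, day STRING) USING parquet PARTITIONED BY (day)")

// Static partition spec: with this fix only day='2016-11-01' is replaced;
// previously the entire table's contents were overwritten.
spark.sql("INSERT OVERWRITE TABLE logs PARTITION (day = '2016-11-01') SELECT 'hello'")

// Dynamic partition spec: still overwrites the whole table in 2.1, the
// limitation the commit message defers to a future release.
spark.sql("INSERT OVERWRITE TABLE logs PARTITION (day) SELECT msg, day FROM staging")
```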
spark git commit: [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files
Repository: spark Updated Branches: refs/heads/branch-2.1 1bbf9ff63 -> 39d2fdb51 [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files ## What changes were proposed in this pull request? When the metadata logs for various parts of Structured Streaming are stored on non-HDFS filesystems such as NFS or ext4, the HDFSMetadataLog class leaves hidden HDFS-style checksum (CRC) files in the log directory, one file per batch. This PR modifies HDFSMetadataLog so that it detects the use of a filesystem that doesn't use CRC files and removes the CRC files. ## How was this patch tested? Modified an existing test case in HDFSMetadataLogSuite to check whether HDFSMetadataLog correctly removes CRC files on the local POSIX filesystem. Ran the entire regression suite. Author: frreiss Closes #15027 from frreiss/fred-17475. (cherry picked from commit 620da3b4828b3580c7ed7339b2a07938e6be1bb1) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39d2fdb5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39d2fdb5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39d2fdb5 Branch: refs/heads/branch-2.1 Commit: 39d2fdb51233ed9b1aaf3adaa3267853f5e58c0f Parents: 1bbf9ff Author: frreiss Authored: Tue Nov 1 23:00:17 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:00:28 2016 -0700 -- .../apache/spark/sql/execution/streaming/HDFSMetadataLog.scala | 5 + .../spark/sql/execution/streaming/HDFSMetadataLogSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39d2fdb5/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c7235320..9a0f87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -148,6 +148,11 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String) // It will fail if there is an existing file (someone has committed the batch) logDebug(s"Attempting to write log #${batchIdToPath(batchId)}") fileManager.rename(tempPath, batchIdToPath(batchId)) + + // SPARK-17475: HDFSMetadataLog should not leak CRC files + // If the underlying filesystem didn't rename the CRC file, delete it. 
+ val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc") + if (fileManager.exists(crcPath)) fileManager.delete(crcPath) return } catch { case e: IOException if isFileAlreadyExistsException(e) => http://git-wip-us.apache.org/repos/asf/spark/blob/39d2fdb5/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 9c1d26d..d03e08d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -119,6 +119,12 @@ class HDFSMetadataLogSuite extends SparkFunSuite with SharedSQLContext { assert(metadataLog.get(1).isEmpty) assert(metadataLog.get(2).isDefined) assert(metadataLog.getLatest().get._1 == 2) + + // There should be exactly one file, called "2", in the metadata directory. + // This check also tests for regressions of SPARK-17475 + val allFiles = new File(metadataLog.metadataPath.toString).listFiles().toSeq + assert(allFiles.size == 1) + assert(allFiles(0).getName() == "2") } }
spark git commit: [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files
Repository: spark Updated Branches: refs/heads/master 1bbf9ff63 -> 620da3b48 [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files ## What changes were proposed in this pull request? When the metadata logs for various parts of Structured Streaming are stored on non-HDFS filesystems such as NFS or ext4, the HDFSMetadataLog class leaves hidden HDFS-style checksum (CRC) files in the log directory, one file per batch. This PR modifies HDFSMetadataLog so that it detects the use of a filesystem that doesn't use CRC files and removes the CRC files. ## How was this patch tested? Modified an existing test case in HDFSMetadataLogSuite to check whether HDFSMetadataLog correctly removes CRC files on the local POSIX filesystem. Ran the entire regression suite. Author: frreiss Closes #15027 from frreiss/fred-17475. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/620da3b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/620da3b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/620da3b4 Branch: refs/heads/master Commit: 620da3b4828b3580c7ed7339b2a07938e6be1bb1 Parents: 1bbf9ff Author: frreiss Authored: Tue Nov 1 23:00:17 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 23:00:17 2016 -0700 -- .../apache/spark/sql/execution/streaming/HDFSMetadataLog.scala | 5 + .../spark/sql/execution/streaming/HDFSMetadataLogSuite.scala | 6 ++ 2 files changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/620da3b4/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c7235320..9a0f87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -148,6 +148,11 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String) // It will fail if there is an existing file (someone has committed the batch) logDebug(s"Attempting to write log #${batchIdToPath(batchId)}") fileManager.rename(tempPath, batchIdToPath(batchId)) + + // SPARK-17475: HDFSMetadataLog should not leak CRC files + // If the underlying filesystem didn't rename the CRC file, delete it. 
+ val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc") + if (fileManager.exists(crcPath)) fileManager.delete(crcPath) return } catch { case e: IOException if isFileAlreadyExistsException(e) => http://git-wip-us.apache.org/repos/asf/spark/blob/620da3b4/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 9c1d26d..d03e08d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -119,6 +119,12 @@ class HDFSMetadataLogSuite extends SparkFunSuite with SharedSQLContext { assert(metadataLog.get(1).isEmpty) assert(metadataLog.get(2).isDefined) assert(metadataLog.getLatest().get._1 == 2) + + // There should be exactly one file, called "2", in the metadata directory. + // This check also tests for regressions of SPARK-17475 + val allFiles = new File(metadataLog.metadataPath.toString).listFiles().toSeq + assert(allFiles.size == 1) + assert(allFiles(0).getName() == "2") } }
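The naming convention the fix relies on: Hadoop's `ChecksumFileSystem` shadows every file `name` with a hidden `.name.crc` twin, so the stale checksum's path can be derived from the temp file's path. A small illustration (the path is hypothetical):

```
import org.apache.hadoop.fs.Path

val tempPath = new Path("/metadata/.batch-2.tmp") // hypothetical temp log file
val crcPath = new Path(tempPath.getParent, s".${tempPath.getName}.crc")
println(crcPath) // prints /metadata/..batch-2.tmp.crc — the hidden checksum twin
```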
[spark] Git Push Summary
Repository: spark Updated Branches: refs/heads/branch-2.1 [created] 1bbf9ff63
spark git commit: [SPARK-17992][SQL] Return all partitions from HiveShim when Hive throws a metastore exception when attempting to fetch partitions by filter
Repository: spark Updated Branches: refs/heads/master 1ecfafa08 -> 1bbf9ff63 [SPARK-17992][SQL] Return all partitions from HiveShim when Hive throws a metastore exception when attempting to fetch partitions by filter (Link to Jira issue: https://issues.apache.org/jira/browse/SPARK-17992) ## What changes were proposed in this pull request? We recently added table partition pruning for partitioned Hive tables converted to using `TableFileCatalog`. When the Hive configuration option `hive.metastore.try.direct.sql` is set to `false`, Hive will throw an exception for unsupported filter expressions. For example, attempting to filter on an integer partition column will throw a `org.apache.hadoop.hive.metastore.api.MetaException`. I discovered this behavior because VideoAmp uses the CDH version of Hive with a Postgresql metastore DB. In this configuration, CDH sets `hive.metastore.try.direct.sql` to `false` by default, and queries that filter on a non-string partition column will fail. Rather than throw an exception in query planning, this patch catches this exception, logs a warning and returns all table partitions instead. Clients of this method are already expected to handle the possibility that the filters will not be honored. ## How was this patch tested? A unit test was added. Author: Michael Allman Closes #15673 from mallman/spark-17992-catch_hive_partition_filter_exception. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bbf9ff6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bbf9ff6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bbf9ff6 Branch: refs/heads/master Commit: 1bbf9ff634745148e782370009aa31d3a042638c Parents: 1ecfafa Author: Michael Allman Authored: Tue Nov 1 22:20:19 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 22:20:19 2016 -0700 -- .../apache/spark/sql/hive/client/HiveShim.scala | 31 ++-- .../sql/hive/client/HiveClientBuilder.scala | 56 ++ .../spark/sql/hive/client/HiveClientSuite.scala | 61 .../spark/sql/hive/client/VersionsSuite.scala | 77 +--- 4 files changed, 160 insertions(+), 65 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1bbf9ff6/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 85edaf6..3d9642d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -29,7 +29,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hadoop.hive.metastore.api.{Function => HiveFunction, FunctionType, NoSuchObjectException, PrincipalType, ResourceType, ResourceUri} +import org.apache.hadoop.hive.metastore.api.{Function => HiveFunction, FunctionType, MetaException, PrincipalType, ResourceType, ResourceUri} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.metadata.{Hive, HiveException, Partition, Table} import org.apache.hadoop.hive.ql.plan.AddPartitionDesc @@ -43,6 +43,7 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, FunctionResource, FunctionResourceType} import 
org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegralType, StringType} import org.apache.spark.util.Utils @@ -586,17 +587,31 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] } else { logDebug(s"Hive metastore filter is '$filter'.") +val tryDirectSqlConfVar = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL +val tryDirectSql = + hive.getConf.getBoolean(tryDirectSqlConfVar.varname, tryDirectSqlConfVar.defaultBoolVal) try { + // Hive may throw an exception when calling this method in some circumstances, such as + // when filtering on a non-string partition column when the hive config key + // hive.metastore.try.direct.sql is false getPartitionsByFilterMethod.invoke(hive, table, filter) .asInstanceOf[JArrayList[Partition]] } catch { - case e: InvocationTargetException => -// SPARK-18167 retry to investigate the flaky test. This should be revert
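In outline, the fallback looks like the sketch below. This is a simplified, self-contained rendering rather than the actual `HiveShim` code, which goes through reflection and Hive's `MetaException`; all names here are stand-ins:

```
import scala.util.control.NonFatal

case class Partition(spec: Map[String, String]) // stand-in for Hive's partition type

// tryFilter may throw (e.g. when hive.metastore.try.direct.sql=false);
// getAll is the safe but slower fallback.
def getPartitionsByFilter(
    tryFilter: () => Seq[Partition],
    getAll: () => Seq[Partition]): Seq[Partition] = {
  try {
    tryFilter()
  } catch {
    case NonFatal(e) =>
      // Over-returning is safe: callers re-apply the predicate to whatever
      // partitions come back, so only performance is affected.
      Console.err.println(s"Caught exception fetching partitions by filter; " +
        s"returning all partitions instead: ${e.getMessage}")
      getAll()
  }
}
```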
spark git commit: [SPARK-18216][SQL] Make Column.expr public
Repository: spark Updated Branches: refs/heads/master 77a98162d -> ad4832a9f [SPARK-18216][SQL] Make Column.expr public ## What changes were proposed in this pull request? Column.expr is private[sql], but it's actually a really useful field to have for debugging. We should open it up, similar to how we use QueryExecution. ## How was this patch tested? N/A - this is a simple visibility change. Author: Reynold Xin Closes #15724 from rxin/SPARK-18216. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad4832a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad4832a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad4832a9 Branch: refs/heads/master Commit: ad4832a9faf2c0c869bbcad9d71afe1cecbd3ec8 Parents: 77a9816 Author: Reynold Xin Authored: Tue Nov 1 21:20:53 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 21:20:53 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/Column.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad4832a9/sql/core/src/main/scala/org/apache/spark/sql/Column.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 05e867b..249408e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -118,6 +118,9 @@ class TypedColumn[-T, U]( * $"a" === $"b" * }}} * + * Note that the internal Catalyst expression can be accessed via "expr", but this method is for + * debugging purposes only and can change in any future Spark releases. + * * @groupname java_expr_ops Java-specific expression operators * @groupname expr_ops Expression operators * @groupname df_ops DataFrame functions @@ -126,7 +129,7 @@ class TypedColumn[-T, U]( * @since 1.3.0 */ @InterfaceStability.Stable -class Column(protected[sql] val expr: Expression) extends Logging { +class Column(val expr: Expression) extends Logging { def this(name: String) = this(name match { case "*" => UnresolvedStar(None)
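With the field public, the Catalyst tree behind a `Column` can be inspected directly; the printed output below is illustrative:

```
import org.apache.spark.sql.functions.col

val c = col("a") + 1
println(c.expr)          // e.g. ('a + 1), the unresolved Add expression
println(c.expr.getClass) // the Catalyst Expression node backing the Column
```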
spark git commit: [SPARK-18182] Expose ReplayListenerBus.read() overload which takes string iterator
Repository: spark Updated Branches: refs/heads/master 6e6298154 -> b929537b6 [SPARK-18182] Expose ReplayListenerBus.read() overload which takes string iterator The `ReplayListenerBus.read()` method is used when implementing a custom `ApplicationHistoryProvider`. The current interface only exposes a `read()` method which takes an `InputStream` and performs stream-to-lines conversion itself, but it would also be useful to expose an overloaded method which accepts an iterator of strings, thereby enabling events to be provided from non-`InputStream` sources. Author: Josh Rosen Closes #15698 from JoshRosen/replay-listener-bus-interface. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b929537b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b929537b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b929537b Branch: refs/heads/master Commit: b929537b6eb0f8f34497c3dbceea8045bf5dffdb Parents: 6e62981 Author: Josh Rosen Authored: Tue Nov 1 16:49:41 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 16:49:41 2016 -0700 -- .../apache/spark/scheduler/ReplayListenerBus.scala | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b929537b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 2424586..0bd5a6b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -53,13 +53,24 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { sourceName: String, maybeTruncated: Boolean = false, eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { +val lines = Source.fromInputStream(logData).getLines() +replay(lines, sourceName, maybeTruncated, eventsFilter) + } + /** + * Overloaded variant of [[replay()]] which accepts an iterator of lines instead of an + * [[InputStream]]. Exposed for use by custom ApplicationHistoryProvider implementations. + */ + def replay( + lines: Iterator[String], + sourceName: String, + maybeTruncated: Boolean, + eventsFilter: ReplayEventsFilter): Unit = { var currentLine: String = null var lineNumber: Int = 0 try { - val lineEntries = Source.fromInputStream(logData) -.getLines() + val lineEntries = lines .zipWithIndex .filter { case (line, _) => eventsFilter(line) }
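A hedged sketch of how the new overload could be driven from a non-`InputStream` source such as a database cursor. `ReplayListenerBus` is `private[spark]`, so real code has to live under an `org.apache.spark` package; the object name and source name below are hypothetical:

```
package org.apache.spark.deploy.history // hypothetical location inside the spark namespace

import org.apache.spark.scheduler.ReplayListenerBus
import org.apache.spark.scheduler.ReplayListenerBus.SELECT_ALL_FILTER

object StringSourceReplay {
  // eventJsonLines: JSON-encoded SparkListenerEvents, one per element,
  // e.g. rows streamed out of an event store rather than a log file.
  def replay(eventJsonLines: Iterator[String]): Unit = {
    val bus = new ReplayListenerBus()
    // listeners would be registered on the bus before replaying
    bus.replay(
      eventJsonLines,
      sourceName = "my-event-store", // used only in log/error messages
      maybeTruncated = false,
      eventsFilter = SELECT_ALL_FILTER)
  }
}
```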
spark git commit: [SPARK-17350][SQL] Disable default use of KryoSerializer in Thrift Server
Repository: spark Updated Branches: refs/heads/master 01dd00830 -> 6e6298154 [SPARK-17350][SQL] Disable default use of KryoSerializer in Thrift Server In SPARK-4761 / #3621 (December 2014) we enabled Kryo serialization by default in the Spark Thrift Server. However, I don't think that the original rationale for doing this still holds now that most Spark SQL serialization is now performed via encoders and our UnsafeRow format. In addition, the use of Kryo as the default serializer can introduce performance problems because the creation of new KryoSerializer instances is expensive and we haven't performed instance-reuse optimizations in several code paths (including DirectTaskResult deserialization). Given all of this, I propose to revert back to using JavaSerializer as the default serializer in the Thrift Server. /cc liancheng Author: Josh Rosen Closes #14906 from JoshRosen/disable-kryo-in-thriftserver. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e629815 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e629815 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e629815 Branch: refs/heads/master Commit: 6e6298154aba63831a292117797798131a646869 Parents: 01dd008 Author: Josh Rosen Authored: Tue Nov 1 16:23:47 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 16:23:47 2016 -0700 -- docs/configuration.md | 5 ++--- .../apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala | 10 -- 2 files changed, 2 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e629815/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 780fc94..0017219 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -767,7 +767,7 @@ Apart from these, the following properties are also available, and may be useful spark.kryo.referenceTracking - true (false when using Spark SQL Thrift Server) + true Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple @@ -838,8 +838,7 @@ Apart from these, the following properties are also available, and may be useful spark.serializer -org.apache.spark.serializer.JavaSerializer (org.apache.spark.serializer. 
-KryoSerializer when using Spark SQL Thrift Server) +org.apache.spark.serializer.JavaSerializer Class to use for serializing objects that will be sent over the network or need to be cached http://git-wip-us.apache.org/repos/asf/spark/blob/6e629815/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 6389115..78a3094 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream -import scala.collection.JavaConverters._ - import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} @@ -37,8 +35,6 @@ private[hive] object SparkSQLEnv extends Logging { def init() { if (sqlContext == null) { val sparkConf = new SparkConf(loadDefaults = true) - val maybeSerializer = sparkConf.getOption("spark.serializer") - val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf @@ -47,12 +43,6 @@ private[hive] object SparkSQLEnv extends Logging { sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) -.set( - "spark.serializer", - maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) -.set( - "spark.kryo.referenceTracking", - maybeKryoReferenceTracking.getOrElse("false")) val sparkSession = SparkSession.builder.config(sparkConf).enableHiveSupport().getOrCreate() sparkContext = sparkSession.sparkContext
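Deployments that benefited from the old defaults can restore them explicitly; a sketch using the two keys removed above, which can equally be set in `spark-defaults.conf`:

```
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.referenceTracking", "false")
```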
spark git commit: [SPARK-18114][HOTFIX] Fix line-too-long style error from backport of SPARK-18114
Repository: spark Updated Branches: refs/heads/branch-2.0 4176da8be -> a01b95060 [SPARK-18114][HOTFIX] Fix line-too-long style error from backport of SPARK-18114 ## What changes were proposed in this pull request? Fix style error introduced in cherry-pick of https://github.com/apache/spark/pull/15643 to branch-2.0. ## How was this patch tested? Existing tests Author: Sean Owen Closes #15719 from srowen/SPARK-18114.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a01b9506 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a01b9506 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a01b9506 Branch: refs/heads/branch-2.0 Commit: a01b950602c4bb56c5a7d6213cdf6b7515ff36ec Parents: 4176da8 Author: Sean Owen Authored: Tue Nov 1 12:43:50 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 12:43:50 2016 -0700 -- .../spark/scheduler/cluster/mesos/MesosClusterScheduler.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a01b9506/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index cbf97c3..94827e4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -448,7 +448,8 @@ private[spark] class MesosClusterScheduler( } desc.schedulerProperties .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) } - .foreach { case (key, value) => options ++= Seq("--conf", s$key=${shellEscape(value)}.stripMargin) } + .foreach { case (key, value) => +options ++= Seq("--conf", s$key=${shellEscape(value)}.stripMargin) } options }
spark git commit: [SPARK-18167] Disable flaky SQLQuerySuite test
Repository: spark Updated Branches: refs/heads/master d0272b436 -> cfac17ee1 [SPARK-18167] Disable flaky SQLQuerySuite test We now know it's a persistent environmental issue that is causing this test to sometimes fail. One hypothesis is that some configuration is leaked from another suite, and depending on suite ordering this can cause this test to fail. I am planning on mining the jenkins logs to try to narrow down which suite could be causing this. For now, disable the test. Author: Eric Liang Closes #15720 from ericl/disable-flaky-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cfac17ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cfac17ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cfac17ee Branch: refs/heads/master Commit: cfac17ee1cec414663b957228e469869eb7673c1 Parents: d0272b4 Author: Eric Liang Authored: Tue Nov 1 12:35:34 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 12:35:34 2016 -0700 -- .../scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cfac17ee/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8b91693..b9353b5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1565,7 +1565,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } - test("SPARK-10562: partition by column with mixed case name") { + ignore("SPARK-10562: partition by column with mixed case name") { def runOnce() { withTable("tbl10562") { val df = Seq(2012 -> "a").toDF("Year", "val")
spark git commit: [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy
Repository: spark Updated Branches: refs/heads/master 8a538c97b -> d0272b436 [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy ## What changes were proposed in this pull request? Aggregation without window/group-by expressions will fail in `checkAnalysis`, but the error message is misleading; we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query (actually it's added by the function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually tested. Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also added new test SQL statements in `group-by.sql`. Author: jiangxingbo Closes #15672 from jiangxb1987/groupBy-empty. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0272b43 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0272b43 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0272b43 Branch: refs/heads/master Commit: d0272b436512b71f04313e109d3d21a6e9deefca Parents: 8a538c9 Author: jiangxingbo Authored: Tue Nov 1 11:25:11 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:25:11 2016 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 ++ .../resources/sql-tests/inputs/group-by.sql | 41 +-- .../sql-tests/results/group-by.sql.out | 116 --- .../org/apache/spark/sql/SQLQuerySuite.scala| 35 -- 4 files changed, 140 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d0272b43/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9a7c2a9..3455a56 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper { s"appear in the arguments of an aggregate function.") } } + case e: Attribute if groupingExprs.isEmpty => +// Collect all [[AggregateExpressions]]s.
+val aggExprs = aggregateExprs.filter(_.collect { + case a: AggregateExpression => a +}.nonEmpty) +failAnalysis( + s"grouping expressions sequence is empty, " + +s"and '${e.sql}' is not an aggregate function. " + +s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " + +s"function(s) or wrap '${e.sql}' in first() (or first_value) " + +s"if you don't care which value you get." +) case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) => failAnalysis( s"expression '${e.sql}' is neither present in the group by, " + http://git-wip-us.apache.org/repos/asf/spark/blob/d0272b43/sql/core/src/test/resources/sql-tests/inputs/group-by.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 6741703..d950ec8 100644 --- a/sql/core/src/test/reso
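The two remedies the new message suggests, as a runnable sketch (assumes an active `SparkSession` named `spark`; the table and column names are hypothetical):

```
import spark.implicits._

Seq(("a", 1), ("b", 2)).toDF("col", "n").createOrReplaceTempView("tbl")

// spark.sql("select col, count(col) from tbl") // fails analysis as shown above

// Either group by the non-aggregate column...
spark.sql("select col, count(col) from tbl group by col").show()
// ...or wrap it in first() when any value per group will do:
spark.sql("select first(col), count(col) from tbl").show()
```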
spark git commit: [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy
Repository: spark Updated Branches: refs/heads/branch-2.0 58655f51f -> 4176da8be [SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy ## What changes were proposed in this pull request? Aggregation without window/group-by expressions will fail in `checkAnalysis`, but the error message is misleading; we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query (actually it's added by the function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually tested. Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also added new test SQL statements in `group-by.sql`. Author: jiangxingbo Closes #15672 from jiangxb1987/groupBy-empty. (cherry picked from commit d0272b436512b71f04313e109d3d21a6e9deefca) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4176da8b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4176da8b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4176da8b Branch: refs/heads/branch-2.0 Commit: 4176da8be57bb0b36b9f2c580a547713c2048d17 Parents: 58655f5 Author: jiangxingbo Authored: Tue Nov 1 11:25:11 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:25:18 2016 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 ++ .../resources/sql-tests/inputs/group-by.sql | 41 +-- .../sql-tests/results/group-by.sql.out | 116 --- .../org/apache/spark/sql/SQLQuerySuite.scala| 35 -- 4 files changed, 140 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4176da8b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 790566c..10e0eef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper { s"appear in the arguments of an aggregate function.") } } + case e: Attribute if groupingExprs.isEmpty => +// Collect all [[AggregateExpressions]]s.
+val aggExprs = aggregateExprs.filter(_.collect { + case a: AggregateExpression => a +}.nonEmpty) +failAnalysis( + s"grouping expressions sequence is empty, " + +s"and '${e.sql}' is not an aggregate function. " + +s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " + +s"function(s) or wrap '${e.sql}' in first() (or first_value) " + +s"if you don't care which value you get." +) case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) => failAnalysis( s"expression '${e.sql}' is neither present in the group by, " + http://git-wip-us.apache.org/repos/asf/spark/blob/4176da8b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/s
spark git commit: [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/branch-2.0 4d2672a40 -> 58655f51f [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset ## What changes were proposed in this pull request? Likewise [DataSet.scala](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala#L156) KeyValueGroupedDataset should mark the queryExecution as transient. As mentioned in the Jira ticket, without transient we saw serialization issues like ``` Caused by: java.io.NotSerializableException: org.apache.spark.sql.execution.QueryExecution Serialization stack: - object not serializable (class: org.apache.spark.sql.execution.QueryExecution, value: == ``` ## How was this patch tested? Run the query which is specified in the Jira ticket before and after: ``` val a = spark.createDataFrame(sc.parallelize(Seq((1,2),(3,4.as[(Int,Int)] val grouped = a.groupByKey( {x:(Int,Int)=>x._1} ) val mappedGroups = grouped.mapGroups((k,x)=> {(k,1)} ) val yyy = sc.broadcast(1) val last = mappedGroups.rdd.map(xx=> { val simpley = yyy.value 1 } ) ``` Author: Ergin Seyfe Closes #15706 from seyfe/keyvaluegrouped_serialization. (cherry picked from commit 8a538c97b556f80f67c80519af0ce879557050d5) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58655f51 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58655f51 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58655f51 Branch: refs/heads/branch-2.0 Commit: 58655f51f65d852ec65a65b54f26b3c8eac8cc60 Parents: 4d2672a Author: Ergin Seyfe Authored: Tue Nov 1 11:18:42 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:18:50 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 + .../apache/spark/sql/KeyValueGroupedDataset.scala | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/58655f51/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index f7d7a4f..8deafe3 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,4 +473,21 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } + + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val output = runInterpreter("local", + s""" + |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) + |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) + |val broadcasted = sc.broadcast($resultValue) + | + |// Using broadcast triggers serialization issue in KeyValueGroupedDataset + |val dataset = mapGroups.map(_ => broadcasted.value) + |dataset.collect() + """.stripMargin) +assertDoesNotContain("error:", output) +assertDoesNotContain("Exception", output) +assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/58655f51/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 8eec42a..407d036 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.expressions.ReduceAggregator class KeyValueGroupedDataset[K, V] private[sql]( kEncoder: Encoder[K], vEncoder: Encoder[V], -val queryExecution: QueryExecution, +@transient val queryExecution: QueryExecution, private val dataAttributes: Seq[Attribute], private val groupingAttributes: Seq[Attribute]) extends Serializable {
spark git commit: [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/master 8cdf143f4 -> 8a538c97b [SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset ## What changes were proposed in this pull request? Likewise [DataSet.scala](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala#L156) KeyValueGroupedDataset should mark the queryExecution as transient. As mentioned in the Jira ticket, without transient we saw serialization issues like ``` Caused by: java.io.NotSerializableException: org.apache.spark.sql.execution.QueryExecution Serialization stack: - object not serializable (class: org.apache.spark.sql.execution.QueryExecution, value: == ``` ## How was this patch tested? Run the query which is specified in the Jira ticket before and after: ``` val a = spark.createDataFrame(sc.parallelize(Seq((1,2),(3,4.as[(Int,Int)] val grouped = a.groupByKey( {x:(Int,Int)=>x._1} ) val mappedGroups = grouped.mapGroups((k,x)=> {(k,1)} ) val yyy = sc.broadcast(1) val last = mappedGroups.rdd.map(xx=> { val simpley = yyy.value 1 } ) ``` Author: Ergin Seyfe Closes #15706 from seyfe/keyvaluegrouped_serialization. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a538c97 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a538c97 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a538c97 Branch: refs/heads/master Commit: 8a538c97b556f80f67c80519af0ce879557050d5 Parents: 8cdf143 Author: Ergin Seyfe Authored: Tue Nov 1 11:18:42 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:18:42 2016 -0700 -- .../scala/org/apache/spark/repl/ReplSuite.scala| 17 + .../apache/spark/sql/KeyValueGroupedDataset.scala | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a538c97/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala -- diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 9262e93..96d2dfc 100644 --- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -473,4 +473,21 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("AssertionError", output) assertDoesNotContain("Exception", output) } + + test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { +val resultValue = 12345 +val output = runInterpreter("local", + s""" + |val keyValueGrouped = Seq((1, 2), (3, 4)).toDS().groupByKey(_._1) + |val mapGroups = keyValueGrouped.mapGroups((k, v) => (k, 1)) + |val broadcasted = sc.broadcast($resultValue) + | + |// Using broadcast triggers serialization issue in KeyValueGroupedDataset + |val dataset = mapGroups.map(_ => broadcasted.value) + |dataset.collect() + """.stripMargin) +assertDoesNotContain("error:", output) +assertDoesNotContain("Exception", output) +assertContains(s": Array[Int] = Array($resultValue, $resultValue)", output) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/8a538c97/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 4cb0313..31ce8eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.expressions.ReduceAggregator class KeyValueGroupedDataset[K, V] private[sql]( kEncoder: Encoder[K], vEncoder: Encoder[V], -val queryExecution: QueryExecution, +@transient val queryExecution: QueryExecution, private val dataAttributes: Seq[Attribute], private val groupingAttributes: Seq[Attribute]) extends Serializable {
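The mechanism behind the one-line fix, as a self-contained illustration: a field marked `@transient` is skipped by Java serialization, so an enclosing `Serializable` object can cross the closure boundary even when the field's type (here a stand-in for `QueryExecution`) is not serializable:

```
class NotSer // no Serializable marker, like QueryExecution

class Holder(@transient val heavy: NotSer) extends Serializable

val out = new java.io.ObjectOutputStream(new java.io.ByteArrayOutputStream())
out.writeObject(new Holder(new NotSer)) // succeeds: `heavy` is skipped
// Without @transient this line would throw java.io.NotSerializableException.
```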
spark git commit: [SPARK-18103][FOLLOW-UP][SQL][MINOR] Rename `MetadataLogFileCatalog` to `MetadataLogFileIndex`
Repository: spark Updated Branches: refs/heads/master 8ac09108f -> 8cdf143f4 [SPARK-18103][FOLLOW-UP][SQL][MINOR] Rename `MetadataLogFileCatalog` to `MetadataLogFileIndex` ## What changes were proposed in this pull request? This is a follow-up to https://github.com/apache/spark/pull/15634. ## How was this patch tested? N/A Author: Liwei Lin Closes #15712 from lw-lin/18103. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8cdf143f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8cdf143f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8cdf143f Branch: refs/heads/master Commit: 8cdf143f4b1ca5c6bc0256808e6f42d9ef299cbd Parents: 8ac0910 Author: Liwei Lin Authored: Tue Nov 1 11:17:35 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 11:17:35 2016 -0700 -- .../streaming/MetadataLogFileCatalog.scala | 60 .../streaming/MetadataLogFileIndex.scala| 60 2 files changed, 60 insertions(+), 60 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8cdf143f/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala deleted file mode 100644 index aeaa134..000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.streaming - -import scala.collection.mutable - -import org.apache.hadoop.fs.{FileStatus, Path} - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.datasources._ - - -/** - * A [[FileIndex]] that generates the list of files to processing by reading them from the - * metadata log files generated by the [[FileStreamSink]]. 
- */ -class MetadataLogFileIndex(sparkSession: SparkSession, path: Path) - extends PartitioningAwareFileIndex(sparkSession, Map.empty, None) { - - private val metadataDirectory = new Path(path, FileStreamSink.metadataDir) - logInfo(s"Reading streaming file log from $metadataDirectory") - private val metadataLog = -new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString) - private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory) - private var cachedPartitionSpec: PartitionSpec = _ - - override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = { -new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f) - } - - override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = { -allFilesFromLog.toArray.groupBy(_.getPath.getParent) - } - - override def rootPaths: Seq[Path] = path :: Nil - - override def refresh(): Unit = { } - - override def partitionSpec(): PartitionSpec = { -if (cachedPartitionSpec == null) { - cachedPartitionSpec = inferPartitioning() -} -cachedPartitionSpec - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/8cdf143f/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala new file mode 100644 index 000..aeaa134 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding
spark git commit: [SPARK-18107][SQL] Insert overwrite statement runs much slower in spark-sql than it does in hive-client
Repository: spark Updated Branches: refs/heads/master d9d146500 -> dd85eb544 [SPARK-18107][SQL] Insert overwrite statement runs much slower in spark-sql than it does in hive-client ## What changes were proposed in this pull request? As reported on the JIRA, an insert overwrite statement runs much slower in Spark than in hive-client. There is a patch [HIVE-11940](https://github.com/apache/hive/commit/ba21806b77287e237e1aa68fa169d2a81e07346d) which largely improves insert overwrite performance in Hive, but it landed after Hive 2.0.0, and because Spark SQL uses an older Hive library we cannot benefit from that improvement. The reporter verified that there is also a big performance gap between Hive 1.2.1 (520.037 secs) and Hive 2.0.1 (35.975 secs) on insert overwrite execution. Instead of upgrading to Hive 2.0 in Spark SQL, which might not be a trivial task, this patch deletes the partition before asking Hive to load data files into it. Note: the case reported on the JIRA is insert overwrite into a partition. Since `Hive.loadTable` also uses the same function to replace files, insert overwrite into a table should have the same issue; we can take the same approach and delete the table's data first. A follow-up will extend this patch to cover that case. ## How was this patch tested? Jenkins tests. There are existing tests using insert overwrite statements; those should pass. I also added a new test specifically for insert overwrite into a partition. As for the performance improvement, I don't have a Hive 2.0 environment, so verification is left to the reporter; please refer to the JIRA. Author: Liang-Chi Hsieh Closes #15667 from viirya/improve-hive-insertoverwrite.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd85eb54 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd85eb54 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd85eb54 Branch: refs/heads/master Commit: dd85eb5448c8f2672260b57e94c0da0eaac12616 Parents: d9d1465 Author: Liang-Chi Hsieh Authored: Tue Nov 1 00:24:08 2016 -0700 Committer: Reynold Xin Committed: Tue Nov 1 00:24:08 2016 -0700 -- .../hive/execution/InsertIntoHiveTable.scala| 24 +- .../sql/hive/execution/SQLQuerySuite.scala | 33 2 files changed, 56 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dd85eb54/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index c3c4e29..2843100 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.SparkException @@ -257,7 +258,28 @@ case class InsertIntoHiveTable( table.catalogTable.identifier.table, partitionSpec) +var doHiveOverwrite = overwrite + if (oldPart.isEmpty || !ifNotExists) { + // SPARK-18107: Insert overwrite runs much slower than hive-client. + // Newer Hive largely improves insert overwrite performance. As Spark uses older Hive + // version and we may not want to catch up new Hive version every time. We delete the + // Hive partition first and then load data file into the Hive partition. + if (oldPart.nonEmpty && overwrite) { +oldPart.get.storage.locationUri.map { uri => + val partitionPath = new Path(uri) + val fs = partitionPath.getFileSystem(hadoopConf) + if (fs.exists(partitionPath)) { +if (!fs.delete(partitionPath, true)) { + throw new RuntimeException( +"Cannot remove partition directory '" + partitionPath.toString) +} +// Don't let Hive do overwrite operation since it is slower. +doHiveOverwrite = false +
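A minimal, self-contained sketch of the delete-before-load idea in the hunk above, using only the Hadoop `FileSystem` API; the object and method names are illustrative, not part of the patch. A caller that successfully deletes the directory can then skip Hive's slow overwrite path, which is exactly what the `doHiveOverwrite` flag tracks.

```
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object DeleteBeforeLoad {
  /** Returns true if the partition directory existed and was removed. */
  def deletePartitionDir(locationUri: String, hadoopConf: Configuration): Boolean = {
    val partitionPath = new Path(locationUri)
    val fs = partitionPath.getFileSystem(hadoopConf)
    if (fs.exists(partitionPath)) {
      if (!fs.delete(partitionPath, true)) { // recursive delete
        throw new RuntimeException(s"Cannot remove partition directory '$partitionPath'")
      }
      true // caller may now ask Hive to load without its own overwrite step
    } else {
      false
    }
  }
}
```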
spark git commit: [SPARK-18024][SQL] Introduce an internal commit protocol API
Repository: spark Updated Branches: refs/heads/master 7d6c87155 -> d9d146500 [SPARK-18024][SQL] Introduce an internal commit protocol API ## What changes were proposed in this pull request? This patch introduces an internal commit protocol API that is used by the batch data source to do write commits. It currently has only one implementation that uses Hadoop MapReduce's OutputCommitter API. In the future, this commit API can be used to unify streaming and batch commits. ## How was this patch tested? Should be covered by existing write tests. Author: Reynold Xin Author: Eric Liang Closes #15707 from rxin/SPARK-18024-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9d14650 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9d14650 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9d14650 Branch: refs/heads/master Commit: d9d1465009fb40550467089ede315496552374c5 Parents: 7d6c871 Author: Reynold Xin Authored: Mon Oct 31 22:23:38 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 22:23:38 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 17 +- .../datasources/FileCommitProtocol.scala| 254 +++ .../execution/datasources/OutputWriter.scala| 26 +- .../sql/execution/datasources/WriteOutput.scala | 167 .../execution/datasources/csv/CSVRelation.scala | 17 +- .../datasources/json/JsonFileFormat.scala | 17 +- .../datasources/parquet/ParquetFileFormat.scala | 8 +- .../parquet/ParquetOutputWriter.scala | 19 +- .../datasources/text/TextFileFormat.scala | 17 +- .../org/apache/spark/sql/internal/SQLConf.scala | 29 ++- .../spark/sql/hive/orc/OrcFileFormat.scala | 28 +- .../sql/sources/CommitFailureTestSource.scala | 10 +- .../spark/sql/sources/SimpleTextRelation.scala | 19 +- 13 files changed, 387 insertions(+), 241 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9d14650/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 5e9e6ff..cb3ca1b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -41,17 +41,11 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration private[libsvm] class LibSVMOutputWriter( -stagingDir: String, -fileNamePrefix: String, +path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { - override val path: String = { -val compressionExtension = TextOutputWriter.getCompressionExtension(context) -new Path(stagingDir, fileNamePrefix + ".libsvm" + compressionExtension).toString - } - private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { @@ -135,11 +129,14 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour dataSchema: StructType): OutputWriterFactory = { new OutputWriterFactory { override def newInstance( - stagingDir: String, - fileNamePrefix: String, + path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { -new LibSVMOutputWriter(stagingDir, fileNamePrefix, dataSchema, context) +new LibSVMOutputWriter(path, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { +".libsvm" + TextOutputWriter.getCompressionExtension(context) } } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/d9d14650/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala new file mode 100644 index 000..1ce9ae4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCommitProtocol.scala @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"
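The new `FileCommitProtocol.scala` is truncated above. As a rough orientation only, the shape of such a commit protocol can be sketched as the trait below; this is an illustrative distillation, not the actual API — the real trait threads Hadoop job/task contexts through these calls.

```
// Illustrative distillation of a file commit protocol; NOT the actual API.
trait SimpleCommitProtocol {
  def setupJob(): Unit
  def setupTask(): Unit
  /** Returns a temp path for the task to write to; made visible on commit. */
  def newTaskTempFile(dir: Option[String], ext: String): String
  def commitTask(): Unit
  def abortTask(): Unit
  /** Driver-side: publish all task outputs once every task has committed. */
  def commitJob(): Unit
  def abortJob(): Unit
}
```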
spark git commit: [SPARK-18167][SQL] Retry when the SQLQuerySuite test flakes
Repository: spark Updated Branches: refs/heads/master efc254a82 -> 7d6c87155 [SPARK-18167][SQL] Retry when the SQLQuerySuite test flakes ## What changes were proposed in this pull request? This will re-run the flaky test a few times after it fails. This will help determine if it's due to nondeterministic test setup, or because of some environment issue (e.g. leaked config from another test). cc yhuai Author: Eric Liang Closes #15708 from ericl/spark-18167-3. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d6c8715 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d6c8715 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d6c8715 Branch: refs/heads/master Commit: 7d6c87155c740cf622c2c600a8ca64154d24c422 Parents: efc254a Author: Eric Liang Authored: Mon Oct 31 20:23:22 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 20:23:22 2016 -0700 -- .../sql/hive/execution/SQLQuerySuite.scala | 28 ++-- 1 file changed, 20 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d6c8715/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 2735d3a..f64010a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1566,14 +1566,26 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-10562: partition by column with mixed case name") { -withTable("tbl10562") { - val df = Seq(2012 -> "a").toDF("Year", "val") - df.write.partitionBy("Year").saveAsTable("tbl10562") - checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) - checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) - checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) +def runOnce() { + withTable("tbl10562") { +val df = Seq(2012 -> "a").toDF("Year", "val") +df.write.partitionBy("Year").saveAsTable("tbl10562") +checkAnswer(sql("SELECT year FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT Year FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT yEAr FROM tbl10562"), Row(2012)) +checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year > 2015"), Nil) +checkAnswer(sql("SELECT val FROM tbl10562 WHERE Year == 2012"), Row("a")) + } +} +try { + runOnce() +} catch { + case t: Throwable => +// Retry to gather more test data. TODO(ekl) revert this once we deflake this test. +runOnce() +runOnce() +runOnce() +throw t } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
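The retry wrapper above is inlined into one test. A generic version of the same gather-more-signal pattern could look like the sketch below — illustrative only, and slightly different from the patch in that failures of the extra runs are swallowed so the original error is always the one rethrown.

```
object RetryForDiagnostics {
  /** Run `body`; on failure, rerun it `extraRuns` times for signal, then rethrow. */
  def withRetries(extraRuns: Int)(body: => Unit): Unit = {
    try body
    catch {
      case t: Throwable =>
        (1 to extraRuns).foreach(_ => scala.util.Try(body))
        throw t
    }
  }

  def main(args: Array[String]): Unit =
    withRetries(3) { assert(math.random < 2) } // stand-in for the flaky suite body
}
```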
spark git commit: [SPARK-18087][SQL] Optimize insert to not require REPAIR TABLE
Repository: spark Updated Branches: refs/heads/master 6633b97b5 -> efc254a82 [SPARK-18087][SQL] Optimize insert to not require REPAIR TABLE ## What changes were proposed in this pull request? When inserting into datasource tables with partitions managed by the hive metastore, we need to notify the metastore of newly added partitions. Previously this was implemented via `msck repair table`, but this is more expensive than needed. This optimizes the insertion path to add only the updated partitions. ## How was this patch tested? Existing tests (I verified manually that tests fail if the repair operation is omitted). Author: Eric Liang Closes #15633 from ericl/spark-18087. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/efc254a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/efc254a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/efc254a8 Branch: refs/heads/master Commit: efc254a82bc3331d78023f00d29d4c4318dfb734 Parents: 6633b97 Author: Eric Liang Authored: Mon Oct 31 19:46:55 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 19:46:55 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 2 +- .../datasources/DataSourceStrategy.scala| 27 +++--- .../InsertIntoHadoopFsRelationCommand.scala | 3 +- .../datasources/PartitioningUtils.scala | 12 .../sql/execution/datasources/WriteOutput.scala | 29 ++-- 5 files changed, 52 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/efc254a8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 9961098..d980e6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -528,7 +528,7 @@ case class DataSource( columns, bucketSpec, format, -() => Unit, // No existing table needs to be refreshed. +_ => Unit, // No existing table needs to be refreshed. 
options, data.logicalPlan, mode) http://git-wip-us.apache.org/repos/asf/spark/blob/efc254a8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index f0bcf94..34b77ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, Inte import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SimpleCatalogRelation} +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation @@ -34,7 +35,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} -import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, DDLUtils, ExecutedCommandExec} +import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, DDLUtils, ExecutedCommandExec} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -179,24 +180,30 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { "Cannot overwrite a path that is also being read from.") } + def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = { +if (l.catalogTable.isDefined && +l.catalogTable.get.partitionColumnNames.nonEmpty && +l.catalogTable.get.partitionProviderIsHive) { + val metastoreUpdater = AlterTableAddPartition
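The callback above issues an `AlterTableAddPartitionCommand` for just the partitions an insert touched, instead of a full `MSCK REPAIR TABLE`. As a hedged sketch of what that amounts to at the SQL level — the object and method names are illustrative, and value escaping is omitted:

```
object AddPartitionsSql {
  type TablePartitionSpec = Map[String, String] // column -> value, as in the patch

  def render(table: String, specs: Seq[TablePartitionSpec]): String = {
    val parts = specs.map { spec =>
      spec.map { case (k, v) => s"$k='$v'" }.mkString("PARTITION (", ", ", ")")
    }
    s"ALTER TABLE $table ADD IF NOT EXISTS ${parts.mkString(" ")}"
  }

  def main(args: Array[String]): Unit =
    println(render("t", Seq(Map("ds" -> "2016-10-31"), Map("ds" -> "2016-11-01"))))
}
```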
spark git commit: [SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid breaking history server (branch 2.0)
Repository: spark Updated Branches: refs/heads/branch-2.0 9f924747d -> 300d596a5 [SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid breaking history server (branch 2.0) ## What changes were proposed in this pull request? Backport #15663 to branch-2.0 and fixed conflicts in `ReplayListenerBus`. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15695 from zsxwing/fix-event-log-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/300d596a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/300d596a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/300d596a Branch: refs/heads/branch-2.0 Commit: 300d596a5177ae372194f73f717174b7ff7acd36 Parents: 9f92474 Author: Shixiong Zhu Authored: Mon Oct 31 16:03:44 2016 -0700 Committer: Reynold Xin Committed: Mon Oct 31 16:03:44 2016 -0700 -- .../spark/scheduler/ReplayListenerBus.scala | 17 .../query-event-logs-version-2.0.0.txt | 4 ++ .../query-event-logs-version-2.0.1.txt | 4 ++ .../streaming/StreamingQueryListenerSuite.scala | 42 4 files changed, 67 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/300d596a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index d32f5eb..c65e7a2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -25,6 +25,7 @@ import com.fasterxml.jackson.core.JsonParseException import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.util.JsonProtocol /** @@ -57,6 +58,10 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { try { postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { + case e: ClassNotFoundException if KNOWN_REMOVED_CLASSES.contains(e.getMessage) => +// Ignore events generated by Structured Streaming in Spark 2.0.0 and 2.0.1. +// It's safe since no place uses them. +logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated if (!maybeTruncated || lines.hasNext) { @@ -78,3 +83,15 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { } } + +private[spark] object ReplayListenerBus { + + /** + * Classes that were removed. Structured Streaming doesn't use them any more. However, parsing + * old json may fail and we can just ignore these failures. 
+ */ + val KNOWN_REMOVED_CLASSES = Set( +"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress", +"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated" + ) +} http://git-wip-us.apache.org/repos/asf/spark/blob/300d596a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt -- diff --git a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt new file mode 100644 index 000..aa7e9a8 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt @@ -0,0 +1,4 @@ +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}}} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}},"exception":null,"stackTrace":[]} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@514502dc","offsetDesc":"[-]"}},"exception":"Quer
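A compact sketch of the replay-tolerance pattern introduced here: when a log line deserializes to a class that no longer exists, warn and skip instead of failing the whole replay. `parseEvent` is a hypothetical stand-in for `JsonProtocol.sparkEventFromJson`.

```
object TolerantReplaySketch {
  // Same class names as KNOWN_REMOVED_CLASSES in the patch above.
  val knownRemovedClasses = Set(
    "org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress",
    "org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated")

  // Stand-in parser: pretend every line refers to a removed class.
  def parseEvent(line: String): Any =
    throw new ClassNotFoundException(knownRemovedClasses.head)

  def replay(lines: Iterator[String]): Unit = lines.foreach { line =>
    try {
      parseEvent(line) // in Spark this result would be posted to listeners
    } catch {
      case e: ClassNotFoundException if knownRemovedClasses.contains(e.getMessage) =>
        Console.err.println(s"Dropped incompatible Structured Streaming log: $line")
    }
  }

  def main(args: Array[String]): Unit = replay(Iterator("""{"Event":"..."}"""))
}
```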
[1/2] spark git commit: [SPARK-18103][SQL] Rename *FileCatalog to *FileIndex
Repository: spark Updated Branches: refs/heads/master 3ad99f166 -> 90d3b91f4 http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index d1de863..624ab74 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -200,7 +200,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val rootPaths: Seq[Path] = if (lazyPruningEnabled) { Seq(metastoreRelation.hiveQlTable.getDataLocation) } else { -// By convention (for example, see TableFileCatalog), the definition of a +// By convention (for example, see CatalogFileIndex), the definition of a // partitioned table's paths depends on whether that table has any actual partitions. // Partitioned tables without partitions use the location of the table's base path. // Partitioned tables with partitions use the locations of those partitions' data @@ -227,7 +227,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val logicalRelation = cached.getOrElse { val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong val fileCatalog = { - val catalog = new TableFileCatalog( + val catalog = new CatalogFileIndex( sparkSession, metastoreRelation.catalogTable, sizeInBytes) if (lazyPruningEnabled) { catalog http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index ecdf4f1..fc35304 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, SaveMode} import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils @@ -321,17 +321,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("DROP TABLE cachedTable") } - test("cache a table using TableFileCatalog") { + test("cache a table using CatalogFileIndex") { withTable("test") { sql("CREATE TABLE test(i int) PARTITIONED BY (p int) STORED AS parquet") val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") - val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) + val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0) val dataSchema = StructType(tableMeta.schema.filterNot { f => tableMeta.partitionColumnNames.contains(f.name) }) val relation = HadoopFsRelation( -location = tableFileCatalog, +location = catalogFileIndex, partitionSchema = tableMeta.partitionSchema, 
dataSchema = dataSchema, bucketSpec = None, @@ -343,7 +343,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) - val sameCatalog = new TableFileCatalog(spark, tableMeta, 0) + val sameCatalog = new CatalogFileIndex(spark, tableMeta, 0) val sameRelation = HadoopFsRelation( location = sameCatalog, partitionSchema = tableMeta.partitionSchema, http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 476383a..d8e31c4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scal
[2/2] spark git commit: [SPARK-18103][SQL] Rename *FileCatalog to *FileIndex
[SPARK-18103][SQL] Rename *FileCatalog to *FileIndex ## What changes were proposed in this pull request? To reduce the number of components in SQL named *Catalog, rename *FileCatalog to *FileIndex. A FileIndex is responsible for returning the list of partitions / files to scan given a filtering expression. ``` TableFileCatalog => CatalogFileIndex FileCatalog => FileIndex ListingFileCatalog => InMemoryFileIndex MetadataLogFileCatalog => MetadataLogFileIndex PrunedTableFileCatalog => PrunedInMemoryFileIndex ``` cc yhuai marmbrus ## How was this patch tested? N/A Author: Eric Liang Author: Eric Liang Closes #15634 from ericl/rename-file-provider. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90d3b91f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90d3b91f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90d3b91f Branch: refs/heads/master Commit: 90d3b91f4cb59d84fea7105d54ef8c87a7d5c6a2 Parents: 3ad99f1 Author: Eric Liang Authored: Sun Oct 30 13:14:45 2016 -0700 Committer: Reynold Xin Committed: Sun Oct 30 13:14:45 2016 -0700 -- .../spark/metrics/source/StaticSources.scala| 2 +- .../spark/sql/execution/CacheManager.scala | 2 +- .../datasources/CatalogFileIndex.scala | 110 + .../sql/execution/datasources/DataSource.scala | 10 +- .../sql/execution/datasources/FileCatalog.scala | 70 --- .../sql/execution/datasources/FileIndex.scala | 70 +++ .../datasources/HadoopFsRelation.scala | 4 +- .../datasources/InMemoryFileIndex.scala | 87 .../datasources/ListingFileCatalog.scala| 87 .../PartitioningAwareFileCatalog.scala | 437 --- .../PartitioningAwareFileIndex.scala| 437 +++ .../datasources/PruneFileSourcePartitions.scala | 6 +- .../datasources/TableFileCatalog.scala | 110 - .../streaming/CompactibleFileStreamLog.scala| 4 +- .../execution/streaming/FileStreamSource.scala | 4 +- .../streaming/MetadataLogFileCatalog.scala | 6 +- .../datasources/FileCatalogSuite.scala | 36 +- .../datasources/FileSourceStrategySuite.scala | 2 +- .../ParquetPartitionDiscoverySuite.scala| 2 +- .../sql/streaming/FileStreamSinkSuite.scala | 6 +- .../sql/streaming/FileStreamSourceSuite.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +- .../spark/sql/hive/CachedTableSuite.scala | 10 +- .../hive/PartitionedTablePerfStatsSuite.scala | 2 +- .../PruneFileSourcePartitionsSuite.scala| 6 +- 25 files changed, 758 insertions(+), 758 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala -- diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index b54885b..3f7cfd9 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -76,7 +76,7 @@ object HiveCatalogMetrics extends Source { val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched")) /** - * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog. + * Tracks the total number of files discovered off of the filesystem by InMemoryFileIndex. 
*/ val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered")) http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index fb72c67..526623a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -177,7 +177,7 @@ class CacheManager extends Logging { /** * Traverses a given `plan` and searches for the occurrences of `qualifiedPath` in the - * [[org.apache.spark.sql.execution.datasources.FileCatalog]] of any [[HadoopFsRelation]] nodes + * [[org.apache.spark.sql.execution.datasources.FileIndex]] of any [[HadoopFsRelation]] nodes * in the plan. If found, we refresh the metadata and return true. Otherwise, this method returns * false. */ http://git-wip-us.apache.org/repos/asf/spark/blob/90d3b91f/sql/core/src/main/scala/org/apache/spark/sql/
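The commit message defines a FileIndex as the component "responsible for returning the list of partitions / files to scan given a filtering expression". A toy distillation of that contract, for orientation only — the real trait works with Catalyst `Expression`s and Hadoop `FileStatus`es, not the simplified types below:

```
// Toy contract, not the real trait.
trait ToyFileIndex {
  /** Root directories backing this index. */
  def rootPaths: Seq[String]
  /** Files to scan, pruned by a predicate over partition values. */
  def listFiles(partitionFilter: Map[String, String] => Boolean): Seq[String]
}
```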
spark git commit: [SPARK-18167][SQL] Add debug code for SQLQuerySuite flakiness when metastore partition pruning is enabled
Repository: spark Updated Branches: refs/heads/master 59cccbda4 -> d2d438d1d [SPARK-18167][SQL] Add debug code for SQLQuerySuite flakiness when metastore partition pruning is enabled ## What changes were proposed in this pull request? org.apache.spark.sql.hive.execution.SQLQuerySuite is flaking when hive partition pruning is enabled. Based on the stack traces, it seems to be an old issue where Hive fails to cast a numeric partition column ("Invalid character string format for type DECIMAL"). There are two possibilities here: either we are somehow corrupting the partition table to have non-decimal values in that column, or there is a transient issue with Derby. This PR logs the result of the retry when this exception is encountered, so we can confirm what is going on. ## How was this patch tested? n/a cc yhuai Author: Eric Liang Closes #15676 from ericl/spark-18167. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2d438d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2d438d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2d438d1 Branch: refs/heads/master Commit: d2d438d1d549628a0183e468ed11d6e85b5d6061 Parents: 59cccbd Author: Eric Liang Authored: Sat Oct 29 06:49:57 2016 +0200 Committer: Reynold Xin Committed: Sat Oct 29 06:49:57 2016 +0200 -- .../org/apache/spark/sql/hive/client/HiveShim.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d2d438d1/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 3238770..4bbbd66 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -24,6 +24,7 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Set => JS import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ +import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} @@ -585,7 +586,19 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] } else { logDebug(s"Hive metastore filter is '$filter'.") -getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]] +try { + getPartitionsByFilterMethod.invoke(hive, table, filter) +.asInstanceOf[JArrayList[Partition]] +} catch { + case e: InvocationTargetException => +// SPARK-18167 retry to investigate the flaky test. This should be reverted before +// the release is cut. +val retry = Try(getPartitionsByFilterMethod.invoke(hive, table, filter)) +val full = Try(getAllPartitionsMethod.invoke(hive, table)) +logError("getPartitionsByFilter failed, retry success = " + retry.isSuccess) +logError("getPartitionsByFilter failed, full fetch success = " + full.isSuccess) +throw e +} } partitions.asScala.toSeq - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
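Generalizing the temporary debug code above: when the primary call fails, probe alternative code paths with `Try`, log whether each succeeds, and rethrow the original error. A hedged sketch, with all names illustrative:

```
import scala.util.Try

object ProbeOnFailure {
  def callWithProbes[T](primary: => T)(probes: (String, () => Any)*): T = {
    try primary
    catch {
      case e: Throwable =>
        probes.foreach { case (name, probe) =>
          Console.err.println(s"$name success = ${Try(probe()).isSuccess}")
        }
        throw e
    }
  }
}

// Usage, mirroring the patch (the two probe calls stand in for the reflective
// getPartitionsByFilter / getAllPartitions invocations):
//   callWithProbes(getPartitionsByFilter(table, filter))(
//     "retry" -> (() => getPartitionsByFilter(table, filter)),
//     "full fetch" -> (() => getAllPartitions(table)))
```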
spark git commit: [SPARK-18126][SPARK-CORE] getIteratorZipWithIndex accepts negative value as index
Repository: spark Updated Branches: refs/heads/master 29cea8f33 -> a76846cfb [SPARK-18126][SPARK-CORE] getIteratorZipWithIndex accepts negative value as index ## What changes were proposed in this pull request? `Utils.getIteratorZipWithIndex` was added to deal with more than 2147483647 records in one partition. The method accepted `startIndex` < 0, which leads to a negative index. This PR just adds a defensive check on `startIndex` to make sure it is >= 0. ## How was this patch tested? Added a new unit test. Author: Miao Wang Closes #15639 from wangmiao1981/zip. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a76846cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a76846cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a76846cf Branch: refs/heads/master Commit: a76846cfb1c2d6c8f4d647426030b59de20d9433 Parents: 29cea8f Author: Miao Wang Authored: Thu Oct 27 01:17:32 2016 +0200 Committer: Reynold Xin Committed: Thu Oct 27 01:17:32 2016 +0200 -- core/src/main/scala/org/apache/spark/util/Utils.scala | 1 + core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 3 +++ 2 files changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a76846cf/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index e57eb0d..6027b07 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1765,6 +1765,7 @@ private[spark] object Utils extends Logging { */ def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { new Iterator[(T, Long)] { + require(startIndex >= 0, "startIndex should be >= 0.") var index: Long = startIndex - 1L def hasNext: Boolean = iterator.hasNext def next(): (T, Long) = { http://git-wip-us.apache.org/repos/asf/spark/blob/a76846cf/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index aeb2969..15ef32f 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -401,6 +401,9 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(iterator.toArray === Array( (0, -1L + Int.MaxValue), (1, 0L + Int.MaxValue), (2, 1L + Int.MaxValue) )) +intercept[IllegalArgumentException] { + Utils.getIteratorZipWithIndex(Iterator(0, 1, 2), -1L) +} } test("doesDirectoryContainFilesNewerThan") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
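Putting the two hunks together, the patched utility and the boundary it protects look like this self-contained version; the `next()` body, elided in the diff, increments the index and pairs it with the element.

```
object ZipWithIndexDemo {
  def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = {
    new Iterator[(T, Long)] {
      require(startIndex >= 0, "startIndex should be >= 0.")
      var index: Long = startIndex - 1L
      def hasNext: Boolean = iterator.hasNext
      def next(): (T, Long) = {
        index += 1
        (iterator.next(), index)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    // Indices can exceed Int.MaxValue, which is the point of using Long.
    val it = getIteratorZipWithIndex(Iterator(0, 1, 2), Int.MaxValue.toLong - 1L)
    println(it.toList) // List((0,2147483646), (1,2147483647), (2,2147483648))
  }
}
```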
spark git commit: [SPARK-18094][SQL][TESTS] Move group analytics test cases from `SQLQuerySuite` into a query file test.
Repository: spark Updated Branches: refs/heads/master dcdda1978 -> 5b7d403c1 [SPARK-18094][SQL][TESTS] Move group analytics test cases from `SQLQuerySuite` into a query file test. ## What changes were proposed in this pull request? Currently we have several test cases for group analytics(ROLLUP/CUBE/GROUPING SETS) in `SQLQuerySuite`, should better move them into a query file test. The following test cases are moved to `group-analytics.sql`: ``` test("rollup") test("grouping sets when aggregate functions containing groupBy columns") test("cube") test("grouping sets") test("grouping and grouping_id") test("grouping and grouping_id in having") test("grouping and grouping_id in sort") ``` This is followup work of #15582 ## How was this patch tested? Modified query file `group-analytics.sql`, which will be tested by `SQLQueryTestSuite`. Author: jiangxingbo Closes #15624 from jiangxb1987/group-analytics-test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b7d403c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b7d403c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b7d403c Branch: refs/heads/master Commit: 5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 Parents: dcdda19 Author: jiangxingbo Authored: Wed Oct 26 23:51:16 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 23:51:16 2016 +0200 -- .../sql-tests/inputs/group-analytics.sql| 46 +++- .../sql-tests/results/group-analytics.sql.out | 247 ++- .../org/apache/spark/sql/SQLQuerySuite.scala| 189 -- 3 files changed, 290 insertions(+), 192 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b7d403c/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql index 2f78349..f813538 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -10,4 +10,48 @@ SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE; -- ROLLUP on overlapping columns SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP; -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; \ No newline at end of file +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; + +CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES +("dotNET", 2012, 1), ("Java", 2012, 2), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 3) +AS courseSales(course, year, earnings); + +-- ROLLUP +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; + +-- CUBE +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year; + +-- GROUPING SETS +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year); + +-- GROUPING SETS with aggregate functions containing groupBy columns +SELECT course, SUM(earnings) AS sum FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum; +SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER 
BY course, sum; + +-- GROUPING/GROUPING_ID +SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales +GROUP BY CUBE(course, year); +SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year; +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year); + +-- GROUPING/GROUPING_ID in having clause +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0; +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0; + +-- GROUPING/GROUPING_ID in orderBy clause +SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY
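For the GROUPING/GROUPING_ID queries above, it helps to recall the encoding: with n grouping columns, GROUPING_ID(c1, ..., cn) is the integer whose bits are GROUPING(c1)...GROUPING(cn), most significant first, where GROUPING(c) = 1 iff column c is aggregated away in that grouping set. A small worked example (illustrative code, not part of the test suite):

```
object GroupingIdDemo {
  def groupingId(groupingBits: Seq[Int]): Int =
    groupingBits.foldLeft(0)((acc, b) => acc * 2 + b)

  def main(args: Array[String]): Unit = {
    // (GROUPING(course), GROUPING(year)) for the four CUBE(course, year) grouping sets:
    Seq(Seq(0, 0), Seq(0, 1), Seq(1, 0), Seq(1, 1)).foreach { bits =>
      println(s"$bits -> GROUPING_ID = ${groupingId(bits)}") // 0, 1, 2, 3
    }
  }
}
```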
spark git commit: [SPARK-18063][SQL] Failed to infer constraints over multiple aliases
Repository: spark Updated Branches: refs/heads/branch-2.0 773fbfef1 -> 5b81b0102 [SPARK-18063][SQL] Failed to infer constraints over multiple aliases ## What changes were proposed in this pull request? The `UnaryNode.getAliasedConstraints` function fails to replace all expressions by their alias where constraints contains more than one expression to be replaced. For example: ``` val tr = LocalRelation('a.int, 'b.string, 'c.int) val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) multiAlias.analyze.constraints ``` currently outputs: ``` ExpressionSet(Seq( IsNotNull(resolveColumn(multiAlias.analyze, "x")), IsNotNull(resolveColumn(multiAlias.analyze, "y")) ) ``` The constraint `resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)` is missing. ## How was this patch tested? Add new test cases in `ConstraintPropagationSuite`. Author: jiangxingbo Closes #15597 from jiangxb1987/alias-constraints. (cherry picked from commit fa7d9d70825a6816495d239da925d0087f7cb94f) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b81b010 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b81b010 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b81b010 Branch: refs/heads/branch-2.0 Commit: 5b81b01026bc215c7982a640a794cd36ea720959 Parents: 773fbfe Author: jiangxingbo Authored: Wed Oct 26 20:12:20 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 20:12:44 2016 +0200 -- .../sql/catalyst/plans/logical/LogicalPlan.scala| 16 ++-- .../catalyst/plans/ConstraintPropagationSuite.scala | 8 2 files changed, 18 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b81b010/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 6d77991..9c152fb88 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -293,15 +293,19 @@ abstract class UnaryNode extends LogicalPlan { * expressions with the corresponding alias */ protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { -projectList.flatMap { +var allConstraints = child.constraints.asInstanceOf[Set[Expression]] +projectList.foreach { case a @ Alias(e, _) => -child.constraints.map(_ transform { +// For every alias in `projectList`, replace the reference in constraints by its attribute. +allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute -}).union(Set(EqualNullSafe(e, a.toAttribute))) - case _ => -Set.empty[Expression] -}.toSet +}) +allConstraints += EqualNullSafe(e, a.toAttribute) + case _ => // Don't change. 
+} + +allConstraints -- child.constraints } override protected def validConstraints: Set[Expression] = child.constraints http://git-wip-us.apache.org/repos/asf/spark/blob/5b81b010/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 8d6a49a..8068ce9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -128,8 +128,16 @@ class ConstraintPropagationSuite extends SparkFunSuite { ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), +resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z") + +val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) +verifyConstraints(multiAlias.analyze.constraints, + ExpressionSet(Seq(IsNotNull(resolveColumn(multiAlias.a
spark git commit: [SPARK-18063][SQL] Failed to infer constraints over multiple aliases
Repository: spark Updated Branches: refs/heads/master 7ac70e7ba -> fa7d9d708 [SPARK-18063][SQL] Failed to infer constraints over multiple aliases ## What changes were proposed in this pull request? The `UnaryNode.getAliasedConstraints` function fails to replace all expressions by their alias where constraints contains more than one expression to be replaced. For example: ``` val tr = LocalRelation('a.int, 'b.string, 'c.int) val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) multiAlias.analyze.constraints ``` currently outputs: ``` ExpressionSet(Seq( IsNotNull(resolveColumn(multiAlias.analyze, "x")), IsNotNull(resolveColumn(multiAlias.analyze, "y")) ) ``` The constraint `resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)` is missing. ## How was this patch tested? Add new test cases in `ConstraintPropagationSuite`. Author: jiangxingbo Closes #15597 from jiangxb1987/alias-constraints. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa7d9d70 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa7d9d70 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa7d9d70 Branch: refs/heads/master Commit: fa7d9d70825a6816495d239da925d0087f7cb94f Parents: 7ac70e7 Author: jiangxingbo Authored: Wed Oct 26 20:12:20 2016 +0200 Committer: Reynold Xin Committed: Wed Oct 26 20:12:20 2016 +0200 -- .../sql/catalyst/plans/logical/LogicalPlan.scala| 16 ++-- .../catalyst/plans/ConstraintPropagationSuite.scala | 8 2 files changed, 18 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa7d9d70/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0972547..b0a4145 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -293,15 +293,19 @@ abstract class UnaryNode extends LogicalPlan { * expressions with the corresponding alias */ protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { -projectList.flatMap { +var allConstraints = child.constraints.asInstanceOf[Set[Expression]] +projectList.foreach { case a @ Alias(e, _) => -child.constraints.map(_ transform { +// For every alias in `projectList`, replace the reference in constraints by its attribute. +allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute -}).union(Set(EqualNullSafe(e, a.toAttribute))) - case _ => -Set.empty[Expression] -}.toSet +}) +allConstraints += EqualNullSafe(e, a.toAttribute) + case _ => // Don't change. 
+} + +allConstraints -- child.constraints } override protected def validConstraints: Set[Expression] = child.constraints http://git-wip-us.apache.org/repos/asf/spark/blob/fa7d9d70/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 8d6a49a..8068ce9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -128,8 +128,16 @@ class ConstraintPropagationSuite extends SparkFunSuite { ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), +resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z") + +val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) +verifyConstraints(multiAlias.analyze.constraints, + ExpressionSet(Seq(IsNotNull(resolveColumn(multiAlias.analyze, "x")), +IsNotNull(resolveColumn(multiAlias.analyze, "y")), +resolveColumn(multiAlias
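A toy reconstruction of the fixed logic, run on the example from the commit message. Expressions are modeled as plain strings purely for illustration; the real code transforms Catalyst expression trees. The pre-fix code rewrote the child's constraints against one alias at a time, so a constraint mentioning two aliased expressions, like `a = c + 10`, never ended up fully rewritten.

```
object AliasedConstraintsDemo {
  // Start from the child's constraints, substitute every aliased expression
  // with its attribute across *all* constraints, and add an EqualNullSafe
  // fact per alias -- mirroring the fixed getAliasedConstraints.
  def aliased(childConstraints: Set[String], aliases: Seq[(String, String)]): Set[String] = {
    var all = childConstraints
    aliases.foreach { case (expr, attr) =>
      all = all.map(_.replace(expr, attr)) // rewrite references in every constraint
      all += s"($expr <=> $attr)"          // the alias fact itself
    }
    all -- childConstraints
  }

  def main(args: Array[String]): Unit = {
    // tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) from the message:
    val out = aliased(
      Set("(a = c + 10)", "isnotnull(a)", "isnotnull(c)"),
      Seq("a" -> "x", "c" -> "y"))
    out.foreach(println)
    // Includes "(x = y + 10)", the constraint the pre-fix code dropped.
  }
}
```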
spark git commit: [SPARK-17698][SQL] Join predicates should not contain filter clauses
Repository: spark Updated Branches: refs/heads/branch-2.0 b959dab32 -> 3d5878751 [SPARK-17698][SQL] Join predicates should not contain filter clauses

## What changes were proposed in this pull request?

This is a backport of https://github.com/apache/spark/pull/15272 to the 2.0 branch. JIRA: https://issues.apache.org/jira/browse/SPARK-17698

`ExtractEquiJoinKeys` is incorrectly using filter predicates as the join condition for joins. `canEvaluate` [0] tries to see if an `Expression` can be evaluated using the output of a given `Plan`. In the case of filter predicates (e.g. `a.id='1'`), the `Expression` passed for the right hand side (i.e. '1') is a `Literal`, which does not have any attribute references. Thus `expr.references` is an empty set, which is trivially a subset of any set. This leads to `canEvaluate` returning `true`, and `a.id='1'` being treated as a join predicate. While this does not lead to incorrect results, in the case of bucketed + sorted tables we might miss out on avoiding an unnecessary shuffle + sort. See the example below.

[0] : https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala#L91

```
val df = (1 until 10).toDF("id").coalesce(1)
hc.sql("DROP TABLE IF EXISTS table1").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table1")
hc.sql("DROP TABLE IF EXISTS table2").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table2")

sqlContext.sql("""
  SELECT a.id, b.id
  FROM table1 a
  FULL OUTER JOIN table2 b
  ON a.id = b.id AND a.id='1' AND b.id='1'
""").explain(true)
```

BEFORE: This is doing shuffle + sort over the table scan outputs, which is not needed, as both tables are bucketed and sorted on the same columns and have the same number of buckets. This should be a single-stage job.

```
SortMergeJoin [id#38, cast(id#38 as double), 1.0], [id#39, 1.0, cast(id#39 as double)], FullOuter
:- *Sort [id#38 ASC NULLS FIRST, cast(id#38 as double) ASC NULLS FIRST, 1.0 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(id#38, cast(id#38 as double), 1.0, 200)
:     +- *FileScan parquet default.table1[id#38] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *Sort [id#39 ASC NULLS FIRST, 1.0 ASC NULLS FIRST, cast(id#39 as double) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(id#39, 1.0, cast(id#39 as double), 200)
      +- *FileScan parquet default.table2[id#39] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

AFTER:

```
SortMergeJoin [id#32], [id#33], FullOuter, ((cast(id#32 as double) = 1.0) && (cast(id#33 as double) = 1.0))
:- *FileScan parquet default.table1[id#32] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *FileScan parquet default.table2[id#33] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

## How was this patch tested?

- Added a new test case for this scenario: `SPARK-17698 Join predicates should not contain filter clauses`
- Ran all the tests in `BucketedReadSuite`

Author: Tejas Patil Closes #15600 from tejasapatil/SPARK-17698_2.0_backport.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d587875 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d587875 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d587875 Branch: refs/heads/branch-2.0 Commit: 3d587875102fc2f10f03956ef50457203cb4a840 Parents: b959dab Author: Tejas Patil Authored: Sat Oct 22 16:32:49 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 16:32:49 2016 -0700 -- .../sql/catalyst/expressions/predicates.scala | 5 +- .../spark/sql/catalyst/optimizer/joins.scala| 4 +- .../spark/sql/catalyst/planning/patterns.scala | 2 + .../spark/sql/sources/BucketedReadSuite.scala | 82 +--- 4 files changed, 79 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3d587875/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 100087e..abe0f08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -84,8 +84
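A toy model of the `canEvaluate` behavior described above (illustrative types only): an expression is considered evaluable against a plan iff the attributes it references are a subset of the plan's output, and a `Literal` references nothing, so it slips through for any plan.

```
object CanEvaluateDemo {
  sealed trait Expr { def references: Set[String] }
  case class AttributeRef(name: String) extends Expr { val references = Set(name) }
  case class Literal(value: String) extends Expr { val references = Set.empty[String] }

  def canEvaluate(expr: Expr, planOutput: Set[String]): Boolean =
    expr.references.subsetOf(planOutput)

  def main(args: Array[String]): Unit = {
    println(canEvaluate(AttributeRef("b.id"), Set("a.id"))) // false, as expected
    println(canEvaluate(Literal("1"), Set("a.id")))         // true -- the surprising case
  }
}
```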
spark git commit: [SPARK-928][CORE] Add support for Unsafe-based serializer in Kryo
Repository: spark Updated Branches: refs/heads/master 4f1dcd3dc -> bc167a2a5 [SPARK-928][CORE] Add support for Unsafe-based serializer in Kryo ## What changes were proposed in this pull request? Now that we have migrated to Kryo 3.0.0 in https://issues.apache.org/jira/browse/SPARK-11416, we can give users the option to use an Unsafe-based SerDe. It can be turned on by setting `spark.kryo.unsafe` to `true` (this is the configuration key read by the diff and documented in `docs/configuration.md` below). ## How was this patch tested? Ran existing tests
```
Benchmark Kryo Unsafe vs safe Serialization:  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-----------------------------------------------------------------------------------------------------
basicTypes: Int unsafe:true                         160 /  178         98.5          10.1       1.0X
basicTypes: Long unsafe:true                        210 /  218         74.9          13.4       0.8X
basicTypes: Float unsafe:true                       203 /  213         77.5          12.9       0.8X
basicTypes: Double unsafe:true                      226 /  235         69.5          14.4       0.7X
Array: Int unsafe:true                             1087 / 1101         14.5          69.1       0.1X
Array: Long unsafe:true                            2758 / 2844          5.7         175.4       0.1X
Array: Float unsafe:true                           1511 / 1552         10.4          96.1       0.1X
Array: Double unsafe:true                          2942 / 2972          5.3         187.0       0.1X
Map of string->Double unsafe:true                  2645 / 2739          5.9         168.2       0.1X
basicTypes: Int unsafe:false                        211 /  218         74.7          13.4       0.8X
basicTypes: Long unsafe:false                       247 /  253         63.6          15.7       0.6X
basicTypes: Float unsafe:false                      211 /  216         74.5          13.4       0.8X
basicTypes: Double unsafe:false                     227 /  233         69.2          14.4       0.7X
Array: Int unsafe:false                            3012 / 3032          5.2         191.5       0.1X
Array: Long unsafe:false                           4463 / 4515          3.5         283.8       0.0X
Array: Float unsafe:false                          2788 / 2868          5.6         177.2       0.1X
Array: Double unsafe:false                         3558 / 3752          4.4         226.2       0.0X
Map of string->Double unsafe:false                 2806 / 2933          5.6         178.4       0.1X
```
Author: Sandeep Singh Author: Sandeep Singh Closes #12913 from techaddict/SPARK-928. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc167a2a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc167a2a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc167a2a Branch: refs/heads/master Commit: bc167a2a53f5a795d089e8a884569b1b3e2cd439 Parents: 4f1dcd3 Author: Sandeep Singh Authored: Sat Oct 22 12:03:37 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 12:03:37 2016 -0700 -- .../spark/serializer/KryoSerializer.scala | 36 +++-- .../apache/spark/serializer/KryoBenchmark.scala | 139 +++ .../spark/serializer/KryoSerializerSuite.scala | 1 + .../serializer/UnsafeKryoSerializerSuite.scala | 33 + docs/configuration.md | 8 ++ 5 files changed, 206 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc167a2a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 1fba552..0d26281 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -27,6 +27,7 @@ import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, KryoException, Serializer => KryoClassSerializer} import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput} +import com.esotericsoftware.kryo.io.{UnsafeInput => KryoUnsafeInput, UnsafeOutput => KryoUnsafeOutput} import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.avro.generic.{GenericData, GenericRecord} @@ -78,8 +79,15 @@ class KryoSerializer(conf: SparkConf) .filter(!_.isEmpty) private val avroSchemas = conf.getAvroSchema + // whether to use unsafe based IO for
serialization + private val useUnsafe = conf.getBoolean("spark.kryo.unsafe", false) - def newKryoOutput(): KryoOutput = new KryoOut
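For readers wanting to try this out, a minimal sketch of opting in from user code, assuming a plain SparkConf-based setup (note the key the code actually reads is `spark.kryo.unsafe`):
```scala
import org.apache.spark.SparkConf

// Switch the serializer to Kryo and opt in to the Unsafe-based
// Input/Output streams added by this patch.
val conf = new SparkConf()
  .setAppName("kryo-unsafe-demo")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.unsafe", "true")
```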
spark git commit: [SPARK-18051][SPARK CORE] fix bug of custom PartitionCoalescer causing serialization exception
Repository: spark Updated Branches: refs/heads/master 5fa9f8795 -> 4f1dcd3dc [SPARK-18051][SPARK CORE] fix bug of custom PartitionCoalescer causing serialization exception ## What changes were proposed in this pull request? Add a require check in `CoalescedRDD` to make sure the passed-in `partitionCoalescer` is `Serializable`, and update the documentation for the `RDD.coalesce` API. ## How was this patch tested? Manual. (Test code is in JIRA [SPARK-18051].) Author: WeichenXu Closes #15587 from WeichenXu123/fix_coalescer_bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f1dcd3d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f1dcd3d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f1dcd3d Branch: refs/heads/master Commit: 4f1dcd3dce270268b42fbe59409790364fa5c5df Parents: 5fa9f87 Author: WeichenXu Authored: Sat Oct 22 11:59:28 2016 -0700 Committer: Reynold Xin Committed: Sat Oct 22 11:59:28 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala | 4 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f1dcd3d/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index 9c198a6..2cba1fe 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -80,6 +80,10 @@ private[spark] class CoalescedRDD[T: ClassTag]( require(maxPartitions > 0 || maxPartitions == prev.partitions.length, s"Number of partitions ($maxPartitions) must be positive.") + if (partitionCoalescer.isDefined) { +require(partitionCoalescer.get.isInstanceOf[Serializable], + "The partition coalescer passed in must be serializable.") + } override def getPartitions: Array[Partition] = { val pc = partitionCoalescer.getOrElse(new DefaultPartitionCoalescer()) http://git-wip-us.apache.org/repos/asf/spark/blob/4f1dcd3d/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index be11957..db535de 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -432,7 +432,8 @@ abstract class RDD[T: ClassTag]( * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the - * data distributed using a hash partitioner. + * data distributed using a hash partitioner. The optional partition coalescer + * passed in must be serializable. */ def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
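For users hitting the new check, a minimal sketch of a custom coalescer that satisfies it (the round-robin strategy and all names below are illustrative, not part of the patch):
```scala
import org.apache.spark.rdd.{PartitionCoalescer, PartitionGroup, RDD}

// Mixing in Serializable is what the new require() in CoalescedRDD checks;
// without it the job now fails fast with a clear message instead of a
// serialization exception at runtime.
class RoundRobinCoalescer extends PartitionCoalescer with Serializable {
  override def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup] = {
    // Pack the parent's partitions round-robin into maxPartitions groups.
    val groups = Array.fill(maxPartitions)(new PartitionGroup())
    parent.partitions.zipWithIndex.foreach { case (p, i) =>
      groups(i % maxPartitions).partitions += p
    }
    groups
  }
}

// rdd is any existing RDD; shuffle = false keeps this a narrow dependency:
// val coalesced = rdd.coalesce(4, shuffle = false, Some(new RoundRobinCoalescer))
```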
spark git commit: [SPARK-16606][MINOR] Tiny follow-up to SPARK-16606, to correct more instances of the same log message typo
Repository: spark Updated Branches: refs/heads/branch-2.0 d3c78c4f3 -> a0c03c925 [SPARK-16606][MINOR] Tiny follow-up to , to correct more instances of the same log message typo ## What changes were proposed in this pull request? Tiny follow-up to SPARK-16606 / https://github.com/apache/spark/pull/14533 , to correct more instances of the same log message typo ## How was this patch tested? Existing tests (no functional change anyway) Author: Sean Owen Closes #15586 from srowen/SPARK-16606.2. (cherry picked from commit 7178c56433cd138dae53db9194c55e3f4fa0fa69) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0c03c92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0c03c92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0c03c92 Branch: refs/heads/branch-2.0 Commit: a0c03c92545c147015308cce195dfc2e8a3074fb Parents: d3c78c4 Author: Sean Owen Authored: Fri Oct 21 22:20:52 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 22:21:07 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a0c03c92/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index a7de115..13d3e75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -802,7 +802,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.conf.set(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } @@ -814,7 +814,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.conf.set(k, v) } if (options.nonEmpty) { -logWarning("Use an existing SparkSession, some configuration may not take effect.") +logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16606][MINOR] Tiny follow-up to SPARK-16606, to correct more instances of the same log message typo
Repository: spark Updated Branches: refs/heads/master 3fbf5a58c -> 7178c5643 [SPARK-16606][MINOR] Tiny follow-up to , to correct more instances of the same log message typo ## What changes were proposed in this pull request? Tiny follow-up to SPARK-16606 / https://github.com/apache/spark/pull/14533 , to correct more instances of the same log message typo ## How was this patch tested? Existing tests (no functional change anyway) Author: Sean Owen Closes #15586 from srowen/SPARK-16606.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7178c564 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7178c564 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7178c564 Branch: refs/heads/master Commit: 7178c56433cd138dae53db9194c55e3f4fa0fa69 Parents: 3fbf5a5 Author: Sean Owen Authored: Fri Oct 21 22:20:52 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 22:20:52 2016 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7178c564/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index baae550..3045eb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -814,7 +814,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } @@ -826,7 +826,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { -logWarning("Use an existing SparkSession, some configuration may not take effect.") +logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
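For reference, a small scenario that triggers the corrected warning (names are arbitrary; assumes no session exists yet):
```scala
import org.apache.spark.sql.SparkSession

val first = SparkSession.builder().appName("first").getOrCreate()

// The builder finds the existing session, applies the option to it, and
// logs the corrected warning, since some settings can no longer change.
val second = SparkSession.builder()
  .config("spark.sql.shuffle.partitions", "10")
  .getOrCreate()

assert(first eq second)
```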
spark git commit: [SPARK-18042][SQL] OutputWriter should expose file path written
Repository: spark Updated Branches: refs/heads/master c9720b219 -> 3fbf5a58c [SPARK-18042][SQL] OutputWriter should expose file path written ## What changes were proposed in this pull request? This patch adds a new "path" method on OutputWriter that returns the path of the file written by the OutputWriter. This is part of the necessary work to consolidate structured streaming and batch write paths. The batch write path has a nice feature that each data source can define the extension of the files, and allow Spark to specify the staging directory and the prefix for the files. However, in the streaming path we need to collect the list of files written, and there is no interface right now to do that. ## How was this patch tested? N/A - there is no behavior change and this should be covered by existing tests. Author: Reynold Xin Closes #15580 from rxin/SPARK-18042. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3fbf5a58 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3fbf5a58 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3fbf5a58 Branch: refs/heads/master Commit: 3fbf5a58c236fc5d5fee39cb29e7f5c7e01c0ee7 Parents: c9720b2 Author: Reynold Xin Authored: Fri Oct 21 17:27:18 2016 -0700 Committer: Reynold Xin Committed: Fri Oct 21 17:27:18 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 8 +- .../execution/datasources/OutputWriter.scala| 17 +++- .../execution/datasources/csv/CSVRelation.scala | 8 +- .../datasources/json/JsonFileFormat.scala | 8 +- .../datasources/parquet/ParquetFileFormat.scala | 2 +- .../datasources/parquet/ParquetOptions.scala| 2 +- .../parquet/ParquetOutputWriter.scala | 24 +--- .../datasources/text/TextFileFormat.scala | 25 +++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 29 ++-- .../sql/sources/CommitFailureTestSource.scala | 3 ++ .../spark/sql/sources/SimpleTextRelation.scala | 3 ++ 11 files changed, 90 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3fbf5a58/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index fff8668..5e9e6ff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.text.TextOutputWriter import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -46,12 +47,17 @@ private[libsvm] class LibSVMOutputWriter( context: TaskAttemptContext) extends OutputWriter { + override val path: String = { +val compressionExtension = TextOutputWriter.getCompressionExtension(context) +new Path(stagingDir, fileNamePrefix + ".libsvm" + compressionExtension).toString + } + private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { -new Path(stagingDir, fileNamePrefix + 
extension) +new Path(path) } }.getRecordWriter(context) } http://git-wip-us.apache.org/repos/asf/spark/blob/3fbf5a58/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala index f4cefda..fbf6e96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -42,11 +42,12 @@ abstract class OutputWriterFactory extends Serializable { * @param fileNamePrefix Prefix of the file name. The returned OutputWriter must make sure this * prefix is used in the actual file name. For example, if the prefix is * &quo
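For data source authors, a rough sketch of what honoring the new contract could look like (the class name, constructor shape, and `.echo` extension are invented for illustration; this is not code from the patch):
```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter

private class EchoOutputWriter(
    stagingDir: String,
    fileNamePrefix: String,
    context: TaskAttemptContext) extends OutputWriter {

  // The full path of the file this writer will produce. Exposing it lets
  // callers (e.g. the streaming sink) collect the list of files written.
  override val path: String =
    new Path(stagingDir, fileNamePrefix + ".echo").toString

  override def write(row: Row): Unit = {
    // write the row to `path` using the format's record writer
  }

  override def close(): Unit = {
    // flush and close the underlying record writer
  }
}
```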
spark git commit: [SPARK-18021][SQL] Refactor file name specification for data sources
Repository: spark Updated Branches: refs/heads/master 947f4f252 -> 7f9ec19ea [SPARK-18021][SQL] Refactor file name specification for data sources ## What changes were proposed in this pull request? Currently each data source OutputWriter is responsible for specifying the entire file name for each file output. This, however, does not make any sense because we rely on file naming schemes for certain behaviors in Spark SQL, e.g. bucket id. The current approach allows individual data sources to break the implementation of bucketing. On the flip side, we also don't want to move file naming entirely out of data sources, because different data sources do want to specify different extensions. This patch divides file name specification into two parts: the first part is a prefix specified by the caller of OutputWriter (in WriteOutput), and the second part is the suffix that can be specified by the OutputWriter itself. Note that a side effect of this change is that now all file based data sources also support bucketing automatically. There are also some other minor cleanups: - Removed the UUID passed through generic Configuration string - Some minor rewrites for better clarity - Renamed "path" in multiple places to "stagingDir", to more accurately reflect its meaning ## How was this patch tested? This should be covered by existing data source tests. Author: Reynold Xin Closes #15562 from rxin/SPARK-18021. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f9ec19e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f9ec19e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f9ec19e Branch: refs/heads/master Commit: 7f9ec19eae60abe589ffd22259a9065e7e353a57 Parents: 947f4f2 Author: Reynold Xin Authored: Thu Oct 20 12:18:56 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 12:18:56 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 16 +++--- .../execution/datasources/OutputWriter.scala| 17 +++--- .../sql/execution/datasources/WriteOutput.scala | 56 +--- .../execution/datasources/csv/CSVRelation.scala | 18 +++ .../datasources/json/JsonFileFormat.scala | 17 +++--- .../datasources/parquet/ParquetFileFormat.scala | 7 ++- .../parquet/ParquetOutputWriter.scala | 32 +++ .../datasources/text/TextFileFormat.scala | 21 .../spark/sql/hive/orc/OrcFileFormat.scala | 21 .../spark/sql/sources/BucketedWriteSuite.scala | 5 -- .../sql/sources/CommitFailureTestSource.scala | 6 +-- .../spark/sql/sources/SimpleTextRelation.scala | 26 + 12 files changed, 99 insertions(+), 143 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f9ec19e/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 8577803..fff8668 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -40,7 +40,8 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration private[libsvm] class LibSVMOutputWriter( -path: String, +stagingDir: String, +fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { @@ -50,11 +51,7 @@ private[libsvm] class LibSVMOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { 
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { -val configuration = context.getConfiguration -val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) -val taskAttemptId = context.getTaskAttemptID -val split = taskAttemptId.getTaskID.getId -new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") +new Path(stagingDir, fileNamePrefix + extension) } }.getRecordWriter(context) } @@ -132,12 +129,11 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour dataSchema: StructType): OutputWriterFactory = { new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { -if (bucketId.isDefined) { sys.er
spark git commit: [SPARK-15780][SQL] Support mapValues on KeyValueGroupedDataset
Repository: spark Updated Branches: refs/heads/master fb0894b3a -> 84b245f2d [SPARK-15780][SQL] Support mapValues on KeyValueGroupedDataset ## What changes were proposed in this pull request? Add mapValues to KeyValueGroupedDataset ## How was this patch tested? New test in DatasetSuite for groupBy function, mapValues, flatMap Author: Koert Kuipers Closes #13526 from koertkuipers/feat-keyvaluegroupeddataset-mapvalues. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84b245f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84b245f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84b245f2 Branch: refs/heads/master Commit: 84b245f2dd31c1cebbf12458bf11f67e287e93f4 Parents: fb0894b Author: Koert Kuipers Authored: Thu Oct 20 10:08:12 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 10:08:12 2016 -0700 -- .../sql/catalyst/plans/logical/object.scala | 13 ++ .../spark/sql/KeyValueGroupedDataset.scala | 42 .../org/apache/spark/sql/DatasetSuite.scala | 11 + 3 files changed, 66 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index fefe5a3..0ab4c90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -230,6 +230,19 @@ object AppendColumns { encoderFor[U].namedExpressions, child) } + + def apply[T : Encoder, U : Encoder]( + func: T => U, + inputAttributes: Seq[Attribute], + child: LogicalPlan): AppendColumns = { +new AppendColumns( + func.asInstanceOf[Any => Any], + implicitly[Encoder[T]].clsTag.runtimeClass, + implicitly[Encoder[T]].schema, + UnresolvedDeserializer(encoderFor[T].deserializer, inputAttributes), + encoderFor[U].namedExpressions, + child) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 828eb94..4cb0313 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -67,6 +67,48 @@ class KeyValueGroupedDataset[K, V] private[sql]( groupingAttributes) /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. The grouping key is unchanged by this. + * + * {{{ + * // Create values grouped by key from a Dataset[(K, V)] + * ds.groupByKey(_._1).mapValues(_._2) // Scala + * }}} + * + * @since 2.1.0 + */ + def mapValues[W : Encoder](func: V => W): KeyValueGroupedDataset[K, W] = { +val withNewData = AppendColumns(func, dataAttributes, logicalPlan) +val projected = Project(withNewData.newColumns ++ groupingAttributes, withNewData) +val executed = sparkSession.sessionState.executePlan(projected) + +new KeyValueGroupedDataset( + encoderFor[K], + encoderFor[W], + executed, + withNewData.newColumns, + groupingAttributes) + } + + /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. 
The grouping key is unchanged by this. + * + * {{{ + * // Create Integer values grouped by String key from a Dataset<Tuple2<String, Integer>> + * Dataset<Tuple2<String, Integer>> ds = ...; + * KeyValueGroupedDataset<String, Integer> grouped = + * ds.groupByKey(t -> t._1, Encoders.STRING()).mapValues(t -> t._2, Encoders.INT()); // Java 8 + * }}} + * + * @since 2.1.0 + */ + def mapValues[W](func: MapFunction[V, W], encoder: Encoder[W]): KeyValueGroupedDataset[K, W] = { +implicit val uEnc = encoder +mapValues { (v: V) => func.call(v) } + } + + /** * Returns a [[Dataset]] that contains each unique key. This is equivalent to doing mapping * over the Dataset to extract the keys and then running a distinct operation on those. * http://git-wip-us.apache.org/repos/asf/spark/blob/84b245f2/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala -- diff --git a/sql/core/src/test/scala/org/
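In user code the new Scala overload reads as follows (a minimal sketch, assuming a SparkSession in scope as `spark`):
```scala
import spark.implicits._

val ds = Seq(("a", 1), ("a", 2), ("b", 3)).toDS()

// Project each value to its Int payload; the String grouping key is
// untouched, so no re-grouping is needed.
val perKeySum = ds.groupByKey(_._1)
  .mapValues(_._2)
  .reduceGroups(_ + _)   // Dataset[(String, Int)]: ("a", 3), ("b", 3)
```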
spark git commit: [SPARK-17698][SQL] Join predicates should not contain filter clauses
Repository: spark Updated Branches: refs/heads/master e895bc254 -> fb0894b3a [SPARK-17698][SQL] Join predicates should not contain filter clauses ## What changes were proposed in this pull request? Jira: https://issues.apache.org/jira/browse/SPARK-17698 `ExtractEquiJoinKeys` is incorrectly using filter predicates as the join condition for joins. `canEvaluate` [0] tries to see if an `Expression` can be evaluated using the output of a given `Plan`. In the case of filter predicates (e.g. `a.id='1'`), the `Expression` passed for the right hand side (i.e. '1') is a `Literal` which does not have any attribute references. Thus `expr.references` is an empty set, which theoretically is a subset of any set. This leads to `canEvaluate` returning `true`, and `a.id='1'` is treated as a join predicate. While this does not lead to incorrect results, in the case of bucketed + sorted tables we might miss out on avoiding an unnecessary shuffle + sort. See the example below. [0]: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala#L91
```
val df = (1 until 10).toDF("id").coalesce(1)
hc.sql("DROP TABLE IF EXISTS table1").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table1")
hc.sql("DROP TABLE IF EXISTS table2").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table2")
sqlContext.sql("""
  SELECT a.id, b.id
  FROM table1 a
  FULL OUTER JOIN table2 b
  ON a.id = b.id AND a.id='1' AND b.id='1'
""").explain(true)
```
BEFORE: This is doing shuffle + sort over the table scan outputs, which is not needed as both tables are bucketed and sorted on the same columns and have the same number of buckets. This should be a single-stage job.
```
SortMergeJoin [id#38, cast(id#38 as double), 1.0], [id#39, 1.0, cast(id#39 as double)], FullOuter
:- *Sort [id#38 ASC NULLS FIRST, cast(id#38 as double) ASC NULLS FIRST, 1.0 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(id#38, cast(id#38 as double), 1.0, 200)
:     +- *FileScan parquet default.table1[id#38] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
+- *Sort [id#39 ASC NULLS FIRST, 1.0 ASC NULLS FIRST, cast(id#39 as double) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(id#39, 1.0, cast(id#39 as double), 200)
      +- *FileScan parquet default.table2[id#39] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
```
AFTER:
```
SortMergeJoin [id#32], [id#33], FullOuter, ((cast(id#32 as double) = 1.0) && (cast(id#33 as double) = 1.0))
:- *FileScan parquet default.table1[id#32] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
+- *FileScan parquet default.table2[id#33] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:int>
```
## How was this patch tested? - Added a new test case for this scenario: `SPARK-17698 Join predicates should not contain filter clauses` - Ran all the tests in `BucketedReadSuite` Author: Tejas Patil Closes #15272 from tejasapatil/SPARK-17698_join_predicate_filter_clause.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb0894b3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb0894b3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb0894b3 Branch: refs/heads/master Commit: fb0894b3a87331a731129ad3fc7ebe598d90a6ee Parents: e895bc2 Author: Tejas Patil Authored: Thu Oct 20 09:50:55 2016 -0700 Committer: Reynold Xin Committed: Thu Oct 20 09:50:55 2016 -0700 -- .../sql/catalyst/expressions/predicates.scala | 5 +- .../spark/sql/catalyst/optimizer/joins.scala| 4 +- .../spark/sql/catalyst/planning/patterns.scala | 2 + .../spark/sql/sources/BucketedReadSuite.scala | 124 +++ 4 files changed, 109 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb0894b3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 799858a..9394e39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -84,8 +84,9 @@ trait PredicateHelper { * * For example consider a join
spark git commit: [SPARK-17991][SQL] Enable metastore partition pruning by default.
Repository: spark Updated Branches: refs/heads/master 39755169f -> 4bd17c460 [SPARK-17991][SQL] Enable metastore partition pruning by default. ## What changes were proposed in this pull request? This should apply to non-converted metastore relations. WIP to see if this causes any test failures. ## How was this patch tested? Existing tests. Author: Eric Liang Closes #15475 from ericl/try-enabling-pruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4bd17c46 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4bd17c46 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4bd17c46 Branch: refs/heads/master Commit: 4bd17c4606764242bc29888b8eedc8e4b5a00f46 Parents: 3975516 Author: Eric Liang Authored: Wed Oct 19 23:55:05 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:55:05 2016 -0700 -- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4bd17c46/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9061b1b..ebf4fad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -267,7 +267,7 @@ object SQLConf { .doc("When true, some predicates will be pushed down into the Hive metastore so that " + "unmatching partitions can be eliminated earlier.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_FILESOURCE_PARTITION_PRUNING = SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
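Users who need the old behavior (for example, a metastore that cannot evaluate the pushed-down predicates) can flip the flag back per session; the key below is the one defined in SQLConf:
```scala
// The new default is true; opt out at runtime if needed.
spark.conf.set("spark.sql.hive.metastorePartitionPruning", "false")
```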
spark git commit: [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing
Repository: spark Updated Branches: refs/heads/branch-2.0 995f602d2 -> 4131623a8 [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing ## What changes were proposed in this pull request? - Fix bug of RDD `zipWithIndex` generating wrong result when one partition contains more than 2147483647 records. - Fix bug of RDD `zipWithUniqueId` generating wrong result when one partition contains more than 2147483647 records. ## How was this patch tested? test added. Author: WeichenXu Closes #15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow. (cherry picked from commit 39755169fb5bb07332eef263b4c18ede1528812d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4131623a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4131623a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4131623a Branch: refs/heads/branch-2.0 Commit: 4131623a8585fe99f79d82c24ab3b8b506d0d616 Parents: 995f602 Author: WeichenXu Authored: Wed Oct 19 23:41:38 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:41:46 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../org/apache/spark/rdd/ZippedWithIndexRDD.scala| 5 ++--- .../src/main/scala/org/apache/spark/util/Utils.scala | 15 +++ .../scala/org/apache/spark/util/UtilsSuite.scala | 7 +++ 4 files changed, 25 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 34d32aa..7013396 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag]( def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => - iter.zipWithIndex.map { case (item, i) => + Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index 32931d5..dff6737 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] -firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => - (x._1, split.startIndex + x._2) -} +val parentIter = firstParent[T].iterator(split.prev, context) +Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } } http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 3d862f4..1686edb 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1768,6 +1768,21 @@ private[spark] object Utils extends Logging { } /** + * 
Generate a zipWithIndex iterator, avoid index value overflowing problem + * in scala's zipWithIndex + */ + def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { +new Iterator[(T, Long)] { + var index: Long = startIndex - 1L + def hasNext: Boolean = iterator.hasNext + def next(): (T, Long) = { +index += 1L +(iterator.next(), index) + } +} + } + + /** * Creates a symlink. * * @param src absolute path to the source http://git-wip-us.apache.org/repos/asf/spark/blob/4131623a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 2741ad7..b67482a 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/sr
spark git commit: [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing
Repository: spark Updated Branches: refs/heads/master f313117bc -> 39755169f [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing ## What changes were proposed in this pull request? - Fix bug of RDD `zipWithIndex` generating wrong result when one partition contains more than 2147483647 records. - Fix bug of RDD `zipWithUniqueId` generating wrong result when one partition contains more than 2147483647 records. ## How was this patch tested? test added. Author: WeichenXu Closes #15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39755169 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39755169 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39755169 Branch: refs/heads/master Commit: 39755169fb5bb07332eef263b4c18ede1528812d Parents: f313117 Author: WeichenXu Authored: Wed Oct 19 23:41:38 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 23:41:38 2016 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../org/apache/spark/rdd/ZippedWithIndexRDD.scala| 5 ++--- .../src/main/scala/org/apache/spark/util/Utils.scala | 15 +++ .../scala/org/apache/spark/util/UtilsSuite.scala | 7 +++ 4 files changed, 25 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 6dc334c..be11957 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag]( def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => - iter.zipWithIndex.map { case (item, i) => + Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index b5738b9..b0e5ba0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] -firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => - (x._1, split.startIndex + x._2) -} +val parentIter = firstParent[T].iterator(split.prev, context) +Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } } http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7fba901..bfc6094 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1760,6 +1760,21 @@ private[spark] object Utils extends Logging { } /** + * Generate a zipWithIndex iterator, avoid index value overflowing problem + * in scala's zipWithIndex + */ + 
def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { +new Iterator[(T, Long)] { + var index: Long = startIndex - 1L + def hasNext: Boolean = iterator.hasNext + def next(): (T, Long) = { +index += 1L +(iterator.next(), index) + } +} + } + + /** * Creates a symlink. * * @param src absolute path to the source http://git-wip-us.apache.org/repos/asf/spark/blob/39755169/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index b427f7f..4dda80f 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -396,6 +396,13 @@ class UtilsSuite extends SparkFun
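The user-facing API is unchanged; the fix only matters once a single partition crosses Int.MaxValue records. A quick sketch of the affected operators (assumes an existing SparkContext `sc`):
```scala
val rdd = sc.parallelize(Seq("a", "b", "c"), numSlices = 1)

// Indices are now accumulated as Longs inside each partition, so they
// no longer wrap around after 2147483647 records.
rdd.zipWithIndex().collect()    // Array((a,0), (b,1), (c,2))
rdd.zipWithUniqueId().collect() // ids are k, n+k, 2n+k, ... for partition k
```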
spark git commit: [SPARK-16078][SQL] Backport: from_utc_timestamp/to_utc_timestamp should not depend on local timezone
Repository: spark Updated Branches: refs/heads/branch-1.6 b95ac0d00 -> 82e98f126 [SPARK-16078][SQL] Backport: from_utc_timestamp/to_utc_timestamp should not depend on local timezone ## What changes were proposed in this pull request? Back-port of https://github.com/apache/spark/pull/13784 to `branch-1.6` ## How was this patch tested? Existing tests. Author: Davies Liu Closes #15554 from srowen/SPARK-16078. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82e98f12 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82e98f12 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82e98f12 Branch: refs/heads/branch-1.6 Commit: 82e98f1265f98b49893e04590989b623169d66d9 Parents: b95ac0d Author: Davies Liu Authored: Wed Oct 19 22:55:30 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 22:55:30 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 35 +-- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 74 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/82e98f12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 03c39f8..91eca24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -658,16 +658,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.gen(ctx) s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin } @@ -783,16 +784,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.gen(ctx) s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin } http://git-wip-us.apache.org/repos/asf/spark/blob/82e98f12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 157ac2b..36fe11c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -55,6 +55,7 @@ object DateTimeUtils { // this is year -17999, calculation: 50 * daysIn400Year final val YearZero = -17999 final val toYearZero = to2001 + 7304850 + final val TimeZoneGMT = TimeZone.getTimeZone("GMT") @transient lazy val defaultTimeZone = TimeZone.getDefault @@ -855,13 +856,37 @@ object DateTimeUtils { } /** + * Convert the timestamp `ts` from one timezone to another. + * + * TODO: Because of DST, the conversion between UTC and human time is not exactly one-to-one + * mapping, the conversion here may return wrong result, we should make the timestamp + * timezone-aware. + *
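For context, the user-facing functions whose codegen this touches are `from_utc_timestamp` and `to_utc_timestamp`; a small sketch against the DataFrame API (a DataFrame `df` with a timestamp column `ts` is assumed):
```scala
import org.apache.spark.sql.functions.{col, from_utc_timestamp, to_utc_timestamp}

// Both conversions now go through DateTimeUtils.convertTz with explicit
// time zones, so results no longer depend on the JVM's default zone.
val converted = df.select(
  from_utc_timestamp(col("ts"), "Asia/Seoul").as("seoul_time"),
  to_utc_timestamp(col("ts"), "PST").as("utc_from_pst"))
```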
spark git commit: [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException
Repository: spark Updated Branches: refs/heads/branch-2.0 cdd2570e6 -> 995f602d2 [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException ## What changes were proposed in this pull request? This PR proposes to type-check the second argument, `ascendingOrder`, and fail with a clear analysis error rather than throwing a `ClassCastException`.
```sql
select sort_array(array('b', 'd'), '1');
```
**Before**
```
16/10/19 13:16:08 ERROR SparkSQLDriver: Failed in [select sort_array(array('b', 'd'), '1')]
java.lang.ClassCastException: org.apache.spark.unsafe.types.UTF8String cannot be cast to java.lang.Boolean
at scala.runtime.BoxesRunTime.unboxToBoolean(BoxesRunTime.java:85)
at org.apache.spark.sql.catalyst.expressions.SortArray.nullSafeEval(collectionOperations.scala:185)
at org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:416)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:297)
```
**After**
```
Error in query: cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7;
```
## How was this patch tested? Unit test in `DataFrameFunctionsSuite`. Author: hyukjinkwon Closes #15532 from HyukjinKwon/SPARK-17989.
(cherry picked from commit 4b2011ec9da1245923b5cbd883240fef0dbf3ef0) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/995f602d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/995f602d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/995f602d Branch: refs/heads/branch-2.0 Commit: 995f602d27bdcf9e6787d93dbea2357e6dc6ccaa Parents: cdd2570 Author: hyukjinkwon Authored: Wed Oct 19 19:36:21 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 19:36:53 2016 -0700 -- .../expressions/collectionOperations.scala | 8 +++- .../test/resources/sql-tests/inputs/array.sql | 6 ++ .../resources/sql-tests/results/array.sql.out | 21 +--- 3 files changed, 31 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/995f602d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 2e8ea11..1efe2cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -112,7 +112,13 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - TypeCheckResult.TypeCheckSuccess + ascendingOrder match { +case Literal(_: Boolean, BooleanType) => + TypeCheckResult.TypeCheckSuccess +case _ => + TypeCheckResult.TypeCheckFailure( +"Sort order in second argument requires a boolean literal.") + } case ArrayType(dt, _) => TypeCheckResult.TypeCheckFailure( s"$prettyName does not support sorting array of type ${dt.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/995f602d/sql/core/src/test/resources/sql-tests/inputs/array.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 4038a0d..984321a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -71,6 +71,12 @@ select sort_array(timestamp_array) from primitive_arrays; +-- sort_array with an invalid string literal for the argument of sort order.
spark git commit: [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException
Repository: spark Updated Branches: refs/heads/master 444c2d22e -> 4b2011ec9 [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException ## What changes were proposed in this pull request? This PR proposes to type-check the second argument, `ascendingOrder`, and fail with a clear analysis error rather than throwing a `ClassCastException`.
```sql
select sort_array(array('b', 'd'), '1');
```
**Before**
```
16/10/19 13:16:08 ERROR SparkSQLDriver: Failed in [select sort_array(array('b', 'd'), '1')]
java.lang.ClassCastException: org.apache.spark.unsafe.types.UTF8String cannot be cast to java.lang.Boolean
at scala.runtime.BoxesRunTime.unboxToBoolean(BoxesRunTime.java:85)
at org.apache.spark.sql.catalyst.expressions.SortArray.nullSafeEval(collectionOperations.scala:185)
at org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:416)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50)
at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:297)
```
**After**
```
Error in query: cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7;
```
## How was this patch tested? Unit test in `DataFrameFunctionsSuite`. Author: hyukjinkwon Closes #15532 from HyukjinKwon/SPARK-17989.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b2011ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b2011ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b2011ec Branch: refs/heads/master Commit: 4b2011ec9da1245923b5cbd883240fef0dbf3ef0 Parents: 444c2d2 Author: hyukjinkwon Authored: Wed Oct 19 19:36:21 2016 -0700 Committer: Reynold Xin Committed: Wed Oct 19 19:36:21 2016 -0700 -- .../expressions/collectionOperations.scala | 8 +++- .../test/resources/sql-tests/inputs/array.sql | 6 ++ .../resources/sql-tests/results/array.sql.out | 21 +--- 3 files changed, 31 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b2011ec/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index c020029..f56bb39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -124,7 +124,13 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - TypeCheckResult.TypeCheckSuccess + ascendingOrder match { +case Literal(_: Boolean, BooleanType) => + TypeCheckResult.TypeCheckSuccess +case _ => + TypeCheckResult.TypeCheckFailure( +"Sort order in second argument requires a boolean literal.") + } case ArrayType(dt, _) => TypeCheckResult.TypeCheckFailure( s"$prettyName does not support sorting array of type ${dt.simpleString}") http://git-wip-us.apache.org/repos/asf/spark/blob/4b2011ec/sql/core/src/test/resources/sql-tests/inputs/array.sql -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 4038a0d..984321a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -71,6 +71,12 @@ select sort_array(timestamp_array) from primitive_arrays; +-- sort_array with an invalid string literal for the argument of sort order. +select sort_array(array('b', 'd'), '1'); + +-- sort_array with an invalid null literal casted as boolean
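For contrast with the rejected call above, the happy path (a sketch assuming a SparkSession named `spark`):
```scala
// The second argument must be a boolean literal; anything else now yields
// a clean analysis error instead of a ClassCastException.
spark.sql("SELECT sort_array(array('b', 'd', 'a'), true)").show()  // ["a","b","d"]
spark.sql("SELECT sort_array(array('b', 'd', 'a'), false)").show() // ["d","b","a"]
```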
spark git commit: [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame
Repository: spark Updated Branches: refs/heads/branch-2.0 3796a98cf -> cdd2570e6 [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame ## What changes were proposed in this pull request? In http://spark.apache.org/docs/latest/sql-programming-guide.html, in the section "Untyped Dataset Operations (aka DataFrame Operations)", the link to the R DataFrame docs is broken; it returns "The requested URL /docs/latest/api/R/DataFrame.html was not found on this server." The correct link for Spark 2.0 is SparkDataFrame.html. ## How was this patch tested? Manually checked. Author: Tommy YU Closes #15543 from Wenpei/spark-18001. (cherry picked from commit f39852e59883c214b0d007faffb406570ea3084b) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cdd2570e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cdd2570e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cdd2570e Branch: refs/heads/branch-2.0 Commit: cdd2570e6dbfc5af68d0c9a49e4493e4e5e53020 Parents: 3796a98 Author: Tommy YU Authored: Tue Oct 18 21:15:32 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 21:15:40 2016 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cdd2570e/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0a6bdb6..3a90323 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -140,7 +140,7 @@ As an example, the following creates a DataFrame based on the content of a JSON ## Untyped Dataset Operations (aka DataFrame Operations) -DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/DataFrame.html). +DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/SparkDataFrame.html). As mentioned above, in Spark 2.0, DataFrames are just Dataset of `Row`s in Scala and Java API. These operations are also referred as "untyped transformations" in contrast to "typed transformations" come with strongly typed Scala/Java Datasets. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame
Repository: spark Updated Branches: refs/heads/master 4329c5cea -> f39852e59 [SPARK-18001][DOCUMENT] fix broken link to SparkDataFrame ## What changes were proposed in this pull request? In http://spark.apache.org/docs/latest/sql-programming-guide.html, in the section "Untyped Dataset Operations (aka DataFrame Operations)", the link to the R DataFrame docs is broken; it returns "The requested URL /docs/latest/api/R/DataFrame.html was not found on this server." The correct link for Spark 2.0 is SparkDataFrame.html. ## How was this patch tested? Manually checked. Author: Tommy YU Closes #15543 from Wenpei/spark-18001. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f39852e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f39852e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f39852e5 Branch: refs/heads/master Commit: f39852e59883c214b0d007faffb406570ea3084b Parents: 4329c5c Author: Tommy YU Authored: Tue Oct 18 21:15:32 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 21:15:32 2016 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f39852e5/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3f1b73a..d334a86 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -140,7 +140,7 @@ As an example, the following creates a DataFrame based on the content of a JSON ## Untyped Dataset Operations (aka DataFrame Operations) -DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/DataFrame.html). +DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/SparkDataFrame.html). As mentioned above, in Spark 2.0, DataFrames are just Dataset of `Row`s in Scala and Java API. These operations are also referred as "untyped transformations" in contrast to "typed transformations" come with strongly typed Scala/Java Datasets. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17841][STREAMING][KAFKA] drain commitQueue
Repository: spark Updated Branches: refs/heads/branch-2.0 6ef923137 -> f6b87939c [SPARK-17841][STREAMING][KAFKA] drain commitQueue ## What changes were proposed in this pull request? Actually drain commit queue rather than just iterating it. iterator() on a concurrent linked queue won't remove items from the queue, poll() will. ## How was this patch tested? Unit tests Author: cody koeninger Closes #15407 from koeninger/SPARK-17841. (cherry picked from commit cd106b050ff789b6de539956a7f01159ab15c820) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6b87939 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6b87939 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6b87939 Branch: refs/heads/branch-2.0 Commit: f6b87939cb90bf4a0996b3728c1bccdf5e24dd4e Parents: 6ef9231 Author: cody koeninger Authored: Tue Oct 18 14:01:49 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 14:01:59 2016 -0700 -- .../spark/streaming/kafka010/DirectKafkaInputDStream.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f6b87939/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala -- diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 432537e..7e57bb1 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -282,13 +282,13 @@ private[spark] class DirectKafkaInputDStream[K, V]( protected def commitAll(): Unit = { val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]() -val it = commitQueue.iterator() -while (it.hasNext) { - val osr = it.next +var osr = commitQueue.poll() +while (null != osr) { val tp = osr.topicPartition val x = m.get(tp) val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) } m.put(tp, new OffsetAndMetadata(offset)) + osr = commitQueue.poll() } if (!m.isEmpty) { consumer.commitAsync(m, commitCallback.get) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17841][STREAMING][KAFKA] drain commitQueue
Repository: spark Updated Branches: refs/heads/master cd662bc7a -> cd106b050 [SPARK-17841][STREAMING][KAFKA] drain commitQueue ## What changes were proposed in this pull request? Actually drain the commit queue rather than just iterating over it: iterator() on a ConcurrentLinkedQueue won't remove items from the queue, while poll() will. ## How was this patch tested? Unit tests Author: cody koeninger Closes #15407 from koeninger/SPARK-17841. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd106b05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd106b05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd106b05 Branch: refs/heads/master Commit: cd106b050ff789b6de539956a7f01159ab15c820 Parents: cd662bc Author: cody koeninger Authored: Tue Oct 18 14:01:49 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 14:01:49 2016 -0700 -- .../spark/streaming/kafka010/DirectKafkaInputDStream.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd106b05/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala -- diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 432537e..7e57bb1 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -282,13 +282,13 @@ private[spark] class DirectKafkaInputDStream[K, V]( protected def commitAll(): Unit = { val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]() -val it = commitQueue.iterator() -while (it.hasNext) { - val osr = it.next +var osr = commitQueue.poll() +while (null != osr) { val tp = osr.topicPartition val x = m.get(tp) val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) } m.put(tp, new OffsetAndMetadata(offset)) + osr = commitQueue.poll() } if (!m.isEmpty) { consumer.commitAsync(m, commitCallback.get)
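As a self-contained illustration of the fix (a sketch, not Spark's code): iterating a `ConcurrentLinkedQueue` leaves every element in place, while a `poll()` loop removes the head on each pass and therefore actually drains the queue.

```scala
import java.util.concurrent.ConcurrentLinkedQueue

object DrainDemo extends App {
  val queue = new ConcurrentLinkedQueue[String]()
  Seq("a", "b", "c").foreach(queue.add)

  // iterator() only reads; afterwards all three elements are still queued.
  val it = queue.iterator()
  while (it.hasNext) it.next()
  assert(queue.size == 3)

  // poll() removes and returns the head (or null when empty), draining the queue.
  var elem = queue.poll()
  while (elem != null) {
    // process elem here
    elem = queue.poll()
  }
  assert(queue.isEmpty)
}
```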
spark git commit: Revert "[SPARK-17985][CORE] Bump commons-lang3 version to 3.5."
Repository: spark Updated Branches: refs/heads/master b3130c7b6 -> cd662bc7a Revert "[SPARK-17985][CORE] Bump commons-lang3 version to 3.5." This reverts commit bfe7885aee2f406c1bbde08e30809a0b4bb070d2. The commit caused build failures on Hadoop 2.2 profile: ``` [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1489: value read is not a member of object org.apache.commons.io.IOUtils [error] var numBytes = IOUtils.read(gzInputStream, buf) [error] ^ [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1492: value read is not a member of object org.apache.commons.io.IOUtils [error] numBytes = IOUtils.read(gzInputStream, buf) [error]^ ``` Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd662bc7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd662bc7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd662bc7 Branch: refs/heads/master Commit: cd662bc7a2050264f40650442858a85c4827b608 Parents: b3130c7 Author: Reynold Xin Authored: Tue Oct 18 13:56:35 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:56:35 2016 -0700 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 525dcef..b30f8c3 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 562fe64..5b3a765 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 747521a..e323efe 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index afd4502..77d97e5 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar 
http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 687b855..572edfa 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/cd662bc7/docs/streaming-flume-integration.md -- diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index a5d36da..767e1f9 100644 --- a/docs/streaming-flume-integration.
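For context on the build failure quoted above: `IOUtils.read` only exists in newer commons-io releases than the one the Hadoop 2.2 profile resolves to. A hedged sketch of an equivalent read loop that avoids the helper entirely — this mirrors the intent, not Spark's actual replacement code:

```scala
import java.io.InputStream

// Read until `buf` is full or EOF, returning the number of bytes read --
// roughly what commons-io's IOUtils.read(in, buf) provides.
def readFully(in: InputStream, buf: Array[Byte]): Int = {
  var offset = 0
  var eof = false
  while (offset < buf.length && !eof) {
    val n = in.read(buf, offset, buf.length - offset)
    if (n == -1) eof = true else offset += n
  }
  offset
}
```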
spark git commit: [SPARK-17955][SQL] Make DataFrameReader.jdbc call DataFrameReader.format("jdbc").load
Repository: spark Updated Branches: refs/heads/master 4518642ab -> b3130c7b6 [SPARK-17955][SQL] Make DataFrameReader.jdbc call DataFrameReader.format("jdbc").load ## What changes were proposed in this pull request? This PR proposes to make `DataFrameReader.jdbc` call `DataFrameReader.format("jdbc").load` consistently with other APIs in `DataFrameReader`/`DataFrameWriter` and avoid calling `sparkSession.baseRelationToDataFrame(..)` here and there. The changes were mostly copied from `DataFrameWriter.jdbc()` which was recently updated. ```diff -val params = extraOptions.toMap ++ connectionProperties.asScala.toMap -val options = new JDBCOptions(url, table, params) -val relation = JDBCRelation(parts, options)(sparkSession) -sparkSession.baseRelationToDataFrame(relation) +this.extraOptions = this.extraOptions ++ connectionProperties.asScala +// explicit url and dbtable should override all +this.extraOptions += ("url" -> url, "dbtable" -> table) +format("jdbc").load() ``` ## How was this patch tested? Existing tests should cover this. Author: hyukjinkwon Closes #15499 from HyukjinKwon/SPARK-17955. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3130c7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3130c7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3130c7b Branch: refs/heads/master Commit: b3130c7b6a1ab4975023f08c3ab02ee8d2c7e995 Parents: 4518642 Author: hyukjinkwon Authored: Tue Oct 18 13:49:02 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:49:02 2016 -0700 -- .../main/scala/org/apache/spark/sql/DataFrameReader.scala| 8 .../main/scala/org/apache/spark/sql/DataFrameWriter.scala| 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3130c7b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index ac33585..b7b2203 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -232,10 +232,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { parts: Array[Partition], connectionProperties: Properties): DataFrame = { // connectionProperties should override settings in extraOptions. 
-val params = extraOptions.toMap ++ connectionProperties.asScala.toMap -val options = new JDBCOptions(url, table, params) -val relation = JDBCRelation(parts, options)(sparkSession) -sparkSession.baseRelationToDataFrame(relation) +this.extraOptions = this.extraOptions ++ connectionProperties.asScala +// explicit url and dbtable should override all +this.extraOptions += ("url" -> url, "dbtable" -> table) +format("jdbc").load() } /** http://git-wip-us.apache.org/repos/asf/spark/blob/b3130c7b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 35ef050..5be3277 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -426,8 +426,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { assertNotPartitioned("jdbc") assertNotBucketed("jdbc") -// connectionProperties should override settings in extraOptions -this.extraOptions = this.extraOptions ++ (connectionProperties.asScala) +// connectionProperties should override settings in extraOptions. +this.extraOptions = this.extraOptions ++ connectionProperties.asScala // explicit url and dbtable should override all this.extraOptions += ("url" -> url, "dbtable" -> table) format("jdbc").save()
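With this change, `DataFrameReader.jdbc` becomes thin sugar over the generic source API, so the two reads below should be equivalent; the URL, table name, credentials, and the `spark` session are placeholders for illustration:

```scala
import java.util.Properties

val url = "jdbc:postgresql://localhost/testdb" // placeholder
val props = new Properties()
props.setProperty("user", "test")              // placeholder

// Convenience API:
val df1 = spark.read.jdbc(url, "people", props)

// The generic path it now delegates to:
val df2 = spark.read
  .format("jdbc")
  .option("url", url)
  .option("dbtable", "people")
  .option("user", "test")
  .load()
```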
spark git commit: [MINOR][DOC] Add more built-in sources in sql-programming-guide.md
Repository: spark Updated Branches: refs/heads/master bfe7885ae -> 20dd11096 [MINOR][DOC] Add more built-in sources in sql-programming-guide.md ## What changes were proposed in this pull request? Add more built-in sources in sql-programming-guide.md. ## How was this patch tested? Manually. Author: Weiqing Yang Closes #15522 from weiqingy/dsDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20dd1109 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20dd1109 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20dd1109 Branch: refs/heads/master Commit: 20dd11096cfda51e47b9dbe3b715a12ccbb4ce1d Parents: bfe7885 Author: Weiqing Yang Authored: Tue Oct 18 13:38:14 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:38:14 2016 -0700 -- docs/sql-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20dd1109/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dcc828c..3f1b73a 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -422,8 +422,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config You can also manually specify the data source that will be used along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can also use their short -names (`json`, `parquet`, `jdbc`). DataFrames loaded from any data source type can be converted into other types -using this syntax. +names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data +source type can be converted into other types using this syntax.
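A short Scala sketch of what the amended paragraph describes — loading with one built-in short name and converting to another; the paths and an active `spark` session are assumed for illustration:

```scala
// Load with the "csv" short name instead of the fully qualified class name.
val df = spark.read
  .format("csv")
  .option("header", "true")
  .load("examples/src/main/resources/people.csv") // illustrative path

// A DataFrame loaded from one source type can be written out as another.
df.write.format("orc").save("people_orc")
```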
spark git commit: [MINOR][DOC] Add more built-in sources in sql-programming-guide.md
Repository: spark Updated Branches: refs/heads/branch-2.0 26e978a93 -> 6ef923137 [MINOR][DOC] Add more built-in sources in sql-programming-guide.md ## What changes were proposed in this pull request? Add more built-in sources in sql-programming-guide.md. ## How was this patch tested? Manually. Author: Weiqing Yang Closes #15522 from weiqingy/dsDoc. (cherry picked from commit 20dd11096cfda51e47b9dbe3b715a12ccbb4ce1d) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ef92313 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ef92313 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ef92313 Branch: refs/heads/branch-2.0 Commit: 6ef9231377c7cce949dc7a988bb9d7a5cb3e458d Parents: 26e978a Author: Weiqing Yang Authored: Tue Oct 18 13:38:14 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:38:50 2016 -0700 -- docs/sql-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ef92313/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0bd0093..0a6bdb6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -387,8 +387,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config You can also manually specify the data source that will be used along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can also use their short -names (`json`, `parquet`, `jdbc`). DataFrames loaded from any data source type can be converted into other types -using this syntax. +names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data +source type can be converted into other types using this syntax.
spark git commit: [SPARK-17985][CORE] Bump commons-lang3 version to 3.5.
Repository: spark Updated Branches: refs/heads/master 4ef39c2f4 -> bfe7885ae [SPARK-17985][CORE] Bump commons-lang3 version to 3.5. ## What changes were proposed in this pull request? `SerializationUtils.clone()` in commons-lang3 (< 3.5) has a thread-safety bug: it can sometimes get stuck because of a race condition when initializing a hash map. See https://issues.apache.org/jira/browse/LANG-1251. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15525 from ueshin/issues/SPARK-17985. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfe7885a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfe7885a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfe7885a Branch: refs/heads/master Commit: bfe7885aee2f406c1bbde08e30809a0b4bb070d2 Parents: 4ef39c2 Author: Takuya UESHIN Authored: Tue Oct 18 13:36:00 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:36:00 2016 -0700 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index b30f8c3..525dcef 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 5b3a765..562fe64 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index e323efe..747521a 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 77d97e5..afd4502 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/dev/deps/spark-deps-hadoop-2.7 -- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 572edfa..687b855 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++
b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar http://git-wip-us.apache.org/repos/asf/spark/blob/bfe7885a/docs/streaming-flume-integration.md -- diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index 767e1f9..a5d36da 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -115,11 +115,11 @@ Configuring Flume on the chosen machine requires the following two steps. artifactId = scala-library version = {{site.SCALA_VERSION}} - (iii) *Commons Lang 3 JAR*
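A minimal sketch of the scenario LANG-1251 describes (illustrative only): several threads racing through their first `SerializationUtils.clone()` call, which in commons-lang3 < 3.5 could hang on the library's internal class-cache initialization; with 3.5 this should complete normally.

```scala
import org.apache.commons.lang3.SerializationUtils

case class Payload(id: Int, data: String) // case classes are Serializable

object CloneRace extends App {
  val threads = (1 to 8).map { i =>
    new Thread(new Runnable {
      override def run(): Unit = {
        // The racy path was the concurrent first use of clone().
        val copy = SerializationUtils.clone(Payload(i, "x"))
        assert(copy.id == i)
      }
    })
  }
  threads.foreach(_.start())
  threads.foreach(_.join())
}
```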
spark git commit: [SPARK-17974] try 2) Refactor FileCatalog classes to simplify the inheritance tree
Repository: spark Updated Branches: refs/heads/master 231f39e3f -> 4ef39c2f4 [SPARK-17974] try 2) Refactor FileCatalog classes to simplify the inheritance tree ## What changes were proposed in this pull request? This renames `BasicFileCatalog => FileCatalog`, combines `SessionFileCatalog` with `PartitioningAwareFileCatalog`, and removes the old `FileCatalog` trait. In summary, ``` MetadataLogFileCatalog extends PartitioningAwareFileCatalog ListingFileCatalog extends PartitioningAwareFileCatalog PartitioningAwareFileCatalog extends FileCatalog TableFileCatalog extends FileCatalog ``` (note that this is a re-submission of https://github.com/apache/spark/pull/15518 which got reverted) ## How was this patch tested? Existing tests Author: Eric Liang Closes #15533 from ericl/fix-scalastyle-revert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ef39c2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ef39c2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ef39c2f Branch: refs/heads/master Commit: 4ef39c2f4436fa22d0b957fe7ad477e4c4a16452 Parents: 231f39e Author: Eric Liang Authored: Tue Oct 18 13:33:46 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 13:33:46 2016 -0700 -- .../scala/org/apache/spark/sql/Dataset.scala| 2 +- .../sql/execution/DataSourceScanExec.scala | 4 +- .../sql/execution/datasources/FileCatalog.scala | 66 ++ .../sql/execution/datasources/FileFormat.scala | 61 - .../datasources/HadoopFsRelation.scala | 4 +- .../PartitioningAwareFileCatalog.scala | 217 +- .../datasources/PartitioningUtils.scala | 12 +- .../datasources/SessionFileCatalog.scala| 225 --- .../datasources/TableFileCatalog.scala | 11 +- .../datasources/FileCatalogSuite.scala | 10 + .../datasources/SessionFileCatalogSuite.scala | 34 --- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 13 files changed, 304 insertions(+), 354 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ef39c2f/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7dccbbd..073d2b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} http://git-wip-us.apache.org/repos/asf/spark/blob/4ef39c2f/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 623d2be..fdd1fa3 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -431,7 +431,7 @@ case class FileSourceScanExec( private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val bucketed = @@ -463,7 +463,7 @@ case class FileSourceScanExec( */ private def createNonBucketedReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { val defaultMaxSplitBytes = fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes http://git-wip-us.apache.org/repos/asf/spark/blo
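The hierarchy summarized in the PR description reduces to the shape below, shown as bare Scala stubs; all constructors and members are omitted, and only the inheritance edges come from the commit message:

```scala
// The single remaining base trait after the refactor.
trait FileCatalog

// Combines the old SessionFileCatalog behavior with partition awareness.
abstract class PartitioningAwareFileCatalog extends FileCatalog

class ListingFileCatalog extends PartitioningAwareFileCatalog
class MetadataLogFileCatalog extends PartitioningAwareFileCatalog

// Metastore-backed catalog, hanging directly off the base trait.
class TableFileCatalog extends FileCatalog
```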
spark git commit: [SPARK-17899][SQL][FOLLOW-UP] debug mode should work for corrupted table
Repository: spark Updated Branches: refs/heads/master a9e79a41e -> e59df62e6 [SPARK-17899][SQL][FOLLOW-UP] debug mode should work for corrupted table ## What changes were proposed in this pull request? Debug mode should also work for corrupted tables, so that we can actually debug them. ## How was this patch tested? new test in `MetastoreDataSourcesSuite` Author: Wenchen Fan Closes #15528 from cloud-fan/debug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e59df62e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e59df62e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e59df62e Branch: refs/heads/master Commit: e59df62e62ec4c5f8bd02a13f05fa3ec6f0fc694 Parents: a9e79a4 Author: Wenchen Fan Authored: Tue Oct 18 11:03:10 2016 -0700 Committer: Reynold Xin Committed: Tue Oct 18 11:03:10 2016 -0700 -- .../spark/sql/hive/HiveExternalCatalog.scala | 9 ++--- .../sql/hive/MetastoreDataSourcesSuite.scala | 18 +++--- 2 files changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e59df62e/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ff59b54..2003ff4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -448,7 +448,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * properties, and filter out these special entries from table properties. */ private def restoreTableMetadata(table: CatalogTable): CatalogTable = { -val catalogTable = if (table.tableType == VIEW) { +val catalogTable = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) { table } else { getProviderFromTableProperties(table).map { provider => @@ -467,18 +467,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } else { table.storage } -val tableProps = if (conf.get(DEBUG_MODE)) { - table.properties -} else { - getOriginalTableProperties(table) -} table.copy( storage = storage, schema = getSchemaFromTableProperties(table), provider = Some(provider), partitionColumnNames = getPartitionColumnsFromTableProperties(table), bucketSpec = getBucketSpecFromTableProperties(table), - properties = tableProps) + properties = getOriginalTableProperties(table)) } getOrElse { table.copy(provider = Some("hive")) } http://git-wip-us.apache.org/repos/asf/spark/blob/e59df62e/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 7cc6179..eaa67d3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -1321,20 +1321,32 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv sharedState.externalCatalog.getTable("default", "t") }.getMessage assert(e.contains(s"Could not read schema from the hive metastore because it is corrupted")) + + withDebugMode { +val tableMeta = sharedState.externalCatalog.getTable("default", "t") +assert(tableMeta.identifier == TableIdentifier("t", Some("default")))
+assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") + } } finally { hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true) } } test("should keep data source entries in table properties when debug mode is on") { -val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) -try { - sparkSession.sparkContext.conf.set(DEBUG_MODE, true) +withDebugMode { val newSession = sparkSession.newSession() newSession.sql("CREATE TABLE abc(i int) USING json") val tableMeta = newSession.sessionState.catalog.getTableMetadata(TableIdentifier("abc")) assert(tableMeta.properties(DATASOURCE_SCHEMA_NUMPARTS).toInt == 1) assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") +} + } + + private def withDebugMode(f: => Unit): Unit = { +val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) +try { + sparkSession.sparkContext.conf.set(DEBUG_MODE, true) + f } finally { sparkSession.sparkContext.conf.set(DEBUG_MODE, previousValue) } }