spark git commit: [SPARK-13866] [SQL] Handle decimal type in CSV inference at CSV data source.

2016-05-12 Thread davies
Repository: spark
Updated Branches:
  refs/heads/master eda2800d4 -> 51841d77d


[SPARK-13866] [SQL] Handle decimal type in CSV inference at CSV data source.

## What changes were proposed in this pull request?

https://issues.apache.org/jira/browse/SPARK-13866

This PR adds support for inferring `DecimalType`.
Here are the rules for choosing among `IntegerType`, `LongType` and `DecimalType`.

#### Inferring Types

1. `IntegerType` and then `LongType` are tried first.

  ```scala
  Int.MaxValue => IntegerType
  Long.MaxValue => LongType
  ```

2. If it fails, try `DecimalType`.

  ```scala
  (Long.MaxValue + 1) => DecimalType(20, 0)
  ```
  This does not infer `DecimalType` when the scale is greater than 0 (that is, when the value has a fractional part).

3. If it fails, try `DoubleType`.

  ```scala
  0.1 => DoubleType // Not inferred as `DecimalType` because it has a scale of 1.
  ```
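
A minimal, self-contained sketch of the three rules above (hypothetical helper, not the actual `CSVInferSchema` code):

```scala
import java.math.BigDecimal
import scala.util.Try

// Illustrative inference order: Integer -> Long -> Decimal (scale <= 0) -> Double.
def inferNumericType(field: String): String = {
  if (Try(field.toInt).isSuccess) "IntegerType"
  else if (Try(field.toLong).isSuccess) "LongType"
  else {
    Try(new BigDecimal(field)).toOption match {
      // Only values without a fractional part (scale <= 0) that fit in 38 digits
      // of precision become DecimalType.
      case Some(d) if d.scale <= 0 && d.precision <= 38 =>
        s"DecimalType(${d.precision}, ${d.scale})"
      case _ if Try(field.toDouble).isSuccess => "DoubleType"
      case _ => "StringType"
    }
  }
}

inferNumericType("12")                   // IntegerType
inferNumericType("9223372036854775807")  // LongType (Long.MaxValue)
inferNumericType("9223372036854775808")  // a DecimalType with scale 0
inferNumericType("0.1")                  // DoubleType (scale is 1)
```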

#### Compatible Types (Merging Types)

For merging types, the behavior is the same as in the JSON data source: if a `DecimalType` cannot represent both values, the merged type falls back to `DoubleType`.
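
A hedged sketch of that merge rule (illustrative only, with made-up names): two decimal types widen to a common decimal when the result fits in 38 digits of precision, otherwise the merged type falls back to double.

```scala
// Hypothetical helper mirroring the merge behavior described above.
case class Dec(precision: Int, scale: Int)

def mergeDecimals(t1: Dec, t2: Dec): Either[String, Dec] = {
  val scale = math.max(t1.scale, t2.scale)                               // keep the larger scale
  val range = math.max(t1.precision - t1.scale, t2.precision - t2.scale) // integer digits needed
  if (range + scale > 38) Left("DoubleType")                             // too wide for a decimal
  else Right(Dec(range + scale, scale))
}

mergeDecimals(Dec(20, 0), Dec(10, 0))   // Right(Dec(20, 0))
mergeDecimals(Dec(38, 0), Dec(38, 38))  // Left("DoubleType")
```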

## How was this patch tested?

Unit tests were added, and `./dev/run_tests` was run for the code style checks.

Author: hyukjinkwon 
Author: Hyukjin Kwon 

Closes #11724 from HyukjinKwon/SPARK-13866.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/51841d77
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/51841d77
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/51841d77

Branch: refs/heads/master
Commit: 51841d77d99a858f8fa1256e923b0364b9b28fa0
Parents: eda2800
Author: hyukjinkwon 
Authored: Thu May 12 22:31:14 2016 -0700
Committer: Davies Liu 
Committed: Thu May 12 22:31:14 2016 -0700

--
 .../datasources/csv/CSVInferSchema.scala| 50 +++-
 sql/core/src/test/resources/decimal.csv |  7 +++
 .../datasources/csv/CSVInferSchemaSuite.scala   | 13 -
 .../execution/datasources/csv/CSVSuite.scala| 15 ++
 4 files changed, 81 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/51841d77/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
index cfd66af..05c8d8e 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.datasources.csv
 
 import java.math.BigDecimal
-import java.text.{NumberFormat, SimpleDateFormat}
+import java.text.NumberFormat
 import java.util.Locale
 
 import scala.util.control.Exception._
@@ -85,6 +85,7 @@ private[csv] object CSVInferSchema {
 case NullType => tryParseInteger(field, options)
 case IntegerType => tryParseInteger(field, options)
 case LongType => tryParseLong(field, options)
+case _: DecimalType => tryParseDecimal(field, options)
 case DoubleType => tryParseDouble(field, options)
 case TimestampType => tryParseTimestamp(field, options)
 case BooleanType => tryParseBoolean(field, options)
@@ -107,10 +108,28 @@ private[csv] object CSVInferSchema {
 if ((allCatch opt field.toLong).isDefined) {
   LongType
 } else {
-  tryParseDouble(field, options)
+  tryParseDecimal(field, options)
 }
   }
 
+  private def tryParseDecimal(field: String, options: CSVOptions): DataType = {
+val decimalTry = allCatch opt {
+  // `BigDecimal` conversion can fail when the `field` is not a form of number.
+  val bigDecimal = new BigDecimal(field)
+  // Because many other formats do not support decimal, it reduces the cases for
+  // decimals by disallowing values having scale (eg. `1.1`).
+  if (bigDecimal.scale <= 0) {
+// `DecimalType` conversion can fail when
+//   1. The precision is bigger than 38.
+//   2. scale is bigger than precision.
+DecimalType(bigDecimal.precision, bigDecimal.scale)
+  } else {
+tryParseDouble(field, options)
+  }
+}
+decimalTry.getOrElse(tryParseDouble(field, options))
+  }
+
   private def tryParseDouble(field: String, options: CSVOptions): DataType = {
 if ((allCatch opt field.toDouble).isDefined) {
   DoubleType
@@ -170,6 +189,33 @@ private[csv] object CSVInferSchema {
   val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2)
   Some(numericPrecedence(index))
 
+// These two cases below deal with when 

spark git commit: [SPARK-14541][SQL] Support IFNULL, NULLIF, NVL and NVL2

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 d73ce364e -> 51706f8a4


[SPARK-14541][SQL] Support IFNULL, NULLIF, NVL and NVL2

## What changes were proposed in this pull request?
This patch adds support for a few SQL functions to improve compatibility with 
other databases: IFNULL, NULLIF, NVL and NVL2. In order to do this, this patch 
introduced a RuntimeReplaceable expression trait that allows replacing an 
unevaluable expression in the optimizer before evaluation.

Note that the semantics are not completely identical to other databases in 
esoteric cases.
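
For reference, a hedged sketch of the usual semantics of these functions, written as plain Scala over `Option` rather than the Spark implementation:

```scala
// IFNULL/NVL: return the first argument unless it is null, otherwise the second.
def nvl[T](a: Option[T], b: Option[T]): Option[T] = a.orElse(b)
// NULLIF: return null when both arguments are equal, otherwise the first argument.
def nullIf[T](a: Option[T], b: Option[T]): Option[T] = if (a == b) None else a
// NVL2: return the second argument when the first is not null, otherwise the third.
def nvl2[A, T](a: Option[A], b: Option[T], c: Option[T]): Option[T] = if (a.isDefined) b else c

nvl(Option.empty[String], Some("a"))   // Some("a")
nullIf(Some(1), Some(1))               // None
nvl2(Some(0), Some("x"), Some("y"))    // Some("x")
```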

## How was this patch tested?
Added a new test suite SQLCompatibilityFunctionSuite.

Closes #12373.

Author: Reynold Xin 

Closes #13084 from rxin/SPARK-14541.

(cherry picked from commit eda2800d44843b6478e22d2c99bca4af7e9c9613)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/51706f8a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/51706f8a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/51706f8a

Branch: refs/heads/branch-2.0
Commit: 51706f8a4dd94e235cf4e2c0627bc3788fec8251
Parents: d73ce36
Author: Reynold Xin 
Authored: Thu May 12 22:18:39 2016 -0700
Committer: Yin Huai 
Committed: Thu May 12 22:19:03 2016 -0700

--
 .../catalyst/analysis/FunctionRegistry.scala|  5 +-
 .../catalyst/analysis/HiveTypeCoercion.scala|  2 +
 .../sql/catalyst/expressions/Expression.scala   | 27 +++
 .../catalyst/expressions/nullExpressions.scala  | 78 +++-
 .../sql/catalyst/optimizer/Optimizer.scala  | 12 +++
 .../spark/sql/DataFrameFunctionsSuite.scala |  6 --
 .../sql/SQLCompatibilityFunctionSuite.scala | 72 ++
 .../sql/catalyst/ExpressionToSQLSuite.scala |  1 -
 8 files changed, 194 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/51706f8a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index c459fe5..eca837c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -165,13 +165,16 @@ object FunctionRegistry {
 expression[Greatest]("greatest"),
 expression[If]("if"),
 expression[IsNaN]("isnan"),
+expression[IfNull]("ifnull"),
 expression[IsNull]("isnull"),
 expression[IsNotNull]("isnotnull"),
 expression[Least]("least"),
 expression[CreateMap]("map"),
 expression[CreateNamedStruct]("named_struct"),
 expression[NaNvl]("nanvl"),
-expression[Coalesce]("nvl"),
+expression[NullIf]("nullif"),
+expression[Nvl]("nvl"),
+expression[Nvl2]("nvl2"),
 expression[Rand]("rand"),
 expression[Randn]("randn"),
 expression[CreateStruct]("struct"),

http://git-wip-us.apache.org/repos/asf/spark/blob/51706f8a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 8319ec0..537dda6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -521,6 +521,8 @@ object HiveTypeCoercion {
 NaNvl(l, Cast(r, DoubleType))
   case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType 
=>
 NaNvl(Cast(l, DoubleType), r)
+
+  case e: RuntimeReplaceable => e.replaceForTypeCoercion()
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/51706f8a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index c26faee..fab1634 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -222,6 +222,33 @@ trait Unevaluable extends Expression 

spark git commit: [SPARK-14541][SQL] Support IFNULL, NULLIF, NVL and NVL2

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master ba169c323 -> eda2800d4


[SPARK-14541][SQL] Support IFNULL, NULLIF, NVL and NVL2

## What changes were proposed in this pull request?
This patch adds support for a few SQL functions to improve compatibility with 
other databases: IFNULL, NULLIF, NVL and NVL2. In order to do this, this patch 
introduced a RuntimeReplaceable expression trait that allows replacing an 
unevaluable expression in the optimizer before evaluation.

Note that the semantics are not completely identical to other databases in 
esoteric cases.
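
A simplified, hypothetical sketch of the idea behind such a trait (not Spark's actual `RuntimeReplaceable` API): the new expression is never evaluated itself; it only exposes an equivalent expression that a rewrite pass substitutes before execution.

```scala
// Toy expression tree for illustration only.
trait Expr { def eval(): Any }
case class Lit(v: Any) extends Expr { def eval(): Any = v }
case class Coalesce2(a: Expr, b: Expr) extends Expr {
  def eval(): Any = Option(a.eval()).getOrElse(b.eval())
}

// Marker for expressions that must be rewritten into `replacement` before evaluation.
trait Replaceable extends Expr {
  def replacement: Expr
  final def eval(): Any = sys.error("should have been replaced before evaluation")
}
case class Nvl(a: Expr, b: Expr) extends Replaceable { def replacement: Expr = Coalesce2(a, b) }

// A tiny "optimizer" pass that performs the substitution at the root of the tree.
def replaceRuntimeExpressions(e: Expr): Expr = e match {
  case r: Replaceable => replaceRuntimeExpressions(r.replacement)
  case other          => other
}

replaceRuntimeExpressions(Nvl(Lit(null), Lit("fallback"))).eval()  // "fallback"
```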

## How was this patch tested?
Added a new test suite SQLCompatibilityFunctionSuite.

Closes #12373.

Author: Reynold Xin 

Closes #13084 from rxin/SPARK-14541.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eda2800d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eda2800d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eda2800d

Branch: refs/heads/master
Commit: eda2800d44843b6478e22d2c99bca4af7e9c9613
Parents: ba169c3
Author: Reynold Xin 
Authored: Thu May 12 22:18:39 2016 -0700
Committer: Yin Huai 
Committed: Thu May 12 22:18:39 2016 -0700

--
 .../catalyst/analysis/FunctionRegistry.scala|  5 +-
 .../catalyst/analysis/HiveTypeCoercion.scala|  2 +
 .../sql/catalyst/expressions/Expression.scala   | 27 +++
 .../catalyst/expressions/nullExpressions.scala  | 78 +++-
 .../sql/catalyst/optimizer/Optimizer.scala  | 12 +++
 .../spark/sql/DataFrameFunctionsSuite.scala |  6 --
 .../sql/SQLCompatibilityFunctionSuite.scala | 72 ++
 .../sql/catalyst/ExpressionToSQLSuite.scala |  1 -
 8 files changed, 194 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/eda2800d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index c459fe5..eca837c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -165,13 +165,16 @@ object FunctionRegistry {
 expression[Greatest]("greatest"),
 expression[If]("if"),
 expression[IsNaN]("isnan"),
+expression[IfNull]("ifnull"),
 expression[IsNull]("isnull"),
 expression[IsNotNull]("isnotnull"),
 expression[Least]("least"),
 expression[CreateMap]("map"),
 expression[CreateNamedStruct]("named_struct"),
 expression[NaNvl]("nanvl"),
-expression[Coalesce]("nvl"),
+expression[NullIf]("nullif"),
+expression[Nvl]("nvl"),
+expression[Nvl2]("nvl2"),
 expression[Rand]("rand"),
 expression[Randn]("randn"),
 expression[CreateStruct]("struct"),

http://git-wip-us.apache.org/repos/asf/spark/blob/eda2800d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 8319ec0..537dda6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -521,6 +521,8 @@ object HiveTypeCoercion {
 NaNvl(l, Cast(r, DoubleType))
   case NaNvl(l, r) if l.dataType == FloatType && r.dataType == DoubleType 
=>
 NaNvl(Cast(l, DoubleType), r)
+
+  case e: RuntimeReplaceable => e.replaceForTypeCoercion()
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eda2800d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index c26faee..fab1634 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -222,6 +222,33 @@ trait Unevaluable extends Expression {
 
 
 /**
+ * An expression that gets replaced at runtime (currently by the optimizer) 
into a different
+ * expression for 

spark git commit: [SPARK-15306][SQL] Move object expressions into expressions.objects package

2016-05-12 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master b3930f74a -> ba169c323


[SPARK-15306][SQL] Move object expressions into expressions.objects package

## What changes were proposed in this pull request?
This patch moves all the object-related expressions into the expressions.objects package, for better code organization.

## How was this patch tested?
N/A

Author: Reynold Xin 

Closes #13085 from rxin/SPARK-15306.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba169c32
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba169c32
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba169c32

Branch: refs/heads/master
Commit: ba169c3230e7d6cb192ec4bd567a1fef7b93b29f
Parents: b3930f7
Author: Reynold Xin 
Authored: Thu May 12 21:35:14 2016 -0700
Committer: Reynold Xin 
Committed: Thu May 12 21:35:14 2016 -0700

--
 .../scala/org/apache/spark/sql/Encoders.scala   |   3 +-
 .../spark/sql/catalyst/JavaTypeInference.scala  |   1 +
 .../spark/sql/catalyst/ScalaReflection.scala|   1 +
 .../spark/sql/catalyst/analysis/Analyzer.scala  |   1 +
 .../apache/spark/sql/catalyst/dsl/package.scala |   1 +
 .../catalyst/encoders/ExpressionEncoder.scala   |   1 +
 .../sql/catalyst/encoders/RowEncoder.scala  |   1 +
 .../expressions/ReferenceToExpressions.scala|   1 +
 .../sql/catalyst/expressions/objects.scala  | 732 --
 .../catalyst/expressions/objects/objects.scala  | 733 +++
 .../sql/catalyst/ScalaReflectionSuite.scala |   3 +-
 .../scala/org/apache/spark/sql/Dataset.scala|   1 +
 .../aggregate/TypedAggregateExpression.scala|   1 +
 .../apache/spark/sql/execution/objects.scala|   1 +
 14 files changed, 747 insertions(+), 734 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ba169c32/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
index 3f4df70..fa96f82 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
@@ -24,7 +24,8 @@ import scala.reflect.runtime.universe.TypeTag
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder}
-import org.apache.spark.sql.catalyst.expressions.{BoundReference, 
DecodeUsingSerializer, EncodeUsingSerializer}
+import 
org.apache.spark.sql.catalyst.expressions.objects.{DecodeUsingSerializer, 
EncodeUsingSerializer}
+import org.apache.spark.sql.catalyst.expressions.BoundReference
 import org.apache.spark.sql.types._
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/ba169c32/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 92caf8f..6907582 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -27,6 +27,7 @@ import com.google.common.reflect.TypeToken
 
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, 
UnresolvedExtractValue}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, 
GenericArrayData}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

http://git-wip-us.apache.org/repos/asf/spark/blob/ba169c32/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index 79bb7a7..cb9a62d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, 
UnresolvedExtractValue}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.objects._
 import 

spark git commit: [SPARK-15306][SQL] Move object expressions into expressions.objects package

2016-05-12 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 54c04aa5d -> d73ce364e


[SPARK-15306][SQL] Move object expressions into expressions.objects package

## What changes were proposed in this pull request?
This patch moves all the object-related expressions into the expressions.objects package, for better code organization.

## How was this patch tested?
N/A

Author: Reynold Xin 

Closes #13085 from rxin/SPARK-15306.

(cherry picked from commit ba169c3230e7d6cb192ec4bd567a1fef7b93b29f)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d73ce364
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d73ce364
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d73ce364

Branch: refs/heads/branch-2.0
Commit: d73ce364e301a54e60b336f0fd2d3836d5d0f25a
Parents: 54c04aa
Author: Reynold Xin 
Authored: Thu May 12 21:35:14 2016 -0700
Committer: Reynold Xin 
Committed: Thu May 12 21:35:22 2016 -0700

--
 .../scala/org/apache/spark/sql/Encoders.scala   |   3 +-
 .../spark/sql/catalyst/JavaTypeInference.scala  |   1 +
 .../spark/sql/catalyst/ScalaReflection.scala|   1 +
 .../spark/sql/catalyst/analysis/Analyzer.scala  |   1 +
 .../apache/spark/sql/catalyst/dsl/package.scala |   1 +
 .../catalyst/encoders/ExpressionEncoder.scala   |   1 +
 .../sql/catalyst/encoders/RowEncoder.scala  |   1 +
 .../expressions/ReferenceToExpressions.scala|   1 +
 .../sql/catalyst/expressions/objects.scala  | 732 --
 .../catalyst/expressions/objects/objects.scala  | 733 +++
 .../sql/catalyst/ScalaReflectionSuite.scala |   3 +-
 .../scala/org/apache/spark/sql/Dataset.scala|   1 +
 .../aggregate/TypedAggregateExpression.scala|   1 +
 .../apache/spark/sql/execution/objects.scala|   1 +
 14 files changed, 747 insertions(+), 734 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d73ce364/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
index 3f4df70..fa96f82 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala
@@ -24,7 +24,8 @@ import scala.reflect.runtime.universe.TypeTag
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder}
-import org.apache.spark.sql.catalyst.expressions.{BoundReference, 
DecodeUsingSerializer, EncodeUsingSerializer}
+import 
org.apache.spark.sql.catalyst.expressions.objects.{DecodeUsingSerializer, 
EncodeUsingSerializer}
+import org.apache.spark.sql.catalyst.expressions.BoundReference
 import org.apache.spark.sql.types._
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/d73ce364/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 92caf8f..6907582 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -27,6 +27,7 @@ import com.google.common.reflect.TypeToken
 
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, 
UnresolvedExtractValue}
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, 
GenericArrayData}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

http://git-wip-us.apache.org/repos/asf/spark/blob/d73ce364/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index 79bb7a7..cb9a62d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, 
UnresolvedExtractValue}
 import 

spark git commit: [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.

2016-05-12 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 0d24fe09a -> 54c04aa5d


[SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.

## What changes were proposed in this pull request?

dapplyCollect() applies an R function on each partition of a SparkDataFrame and 
collects the result back to R as a data.frame.
```
dapplyCollect(df, function(ldf) {...})
```

## How was this patch tested?
SparkR unit tests.

Author: Sun Rui 

Closes #12989 from sun-rui/SPARK-15202.

(cherry picked from commit b3930f74a0929b2cdcbbe5cbe34f0b1d35eb01cc)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54c04aa5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54c04aa5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54c04aa5

Branch: refs/heads/branch-2.0
Commit: 54c04aa5d0a6012eb58efd0e7cf6d1d287818fa8
Parents: 0d24fe0
Author: Sun Rui 
Authored: Thu May 12 17:50:55 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Thu May 12 17:51:02 2016 -0700

--
 R/pkg/NAMESPACE   |  1 +
 R/pkg/R/DataFrame.R   | 86 +-
 R/pkg/R/generics.R|  4 ++
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 ++-
 4 files changed, 95 insertions(+), 17 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/54c04aa5/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1432ab8..239ad06 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -47,6 +47,7 @@ exportMethods("arrange",
   "covar_pop",
   "crosstab",
   "dapply",
+  "dapplyCollect",
   "describe",
   "dim",
   "distinct",

http://git-wip-us.apache.org/repos/asf/spark/blob/54c04aa5/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 43c46b8..0c2a194 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1153,9 +1153,27 @@ setMethod("summarize",
 agg(x, ...)
   })
 
+dapplyInternal <- function(x, func, schema) {
+  packageNamesArr <- serialize(.sparkREnv[[".packages"]],
+   connection = NULL)
+
+  broadcastArr <- lapply(ls(.broadcastNames),
+ function(name) { get(name, .broadcastNames) })
+
+  sdf <- callJStatic(
+   "org.apache.spark.sql.api.r.SQLUtils",
+   "dapply",
+   x@sdf,
+   serialize(cleanClosure(func), connection = NULL),
+   packageNamesArr,
+   broadcastArr,
+   if (is.null(schema)) { schema } else { schema$jobj })
+  dataFrame(sdf)
+}
+
 #' dapply
 #'
-#' Apply a function to each partition of a DataFrame.
+#' Apply a function to each partition of a SparkDataFrame.
 #'
 #' @param x A SparkDataFrame
 #' @param func A function to be applied to each partition of the 
SparkDataFrame.
@@ -1197,21 +1215,57 @@ setMethod("summarize",
 setMethod("dapply",
   signature(x = "SparkDataFrame", func = "function", schema = 
"structType"),
   function(x, func, schema) {
-packageNamesArr <- serialize(.sparkREnv[[".packages"]],
- connection = NULL)
-
-broadcastArr <- lapply(ls(.broadcastNames),
-   function(name) { get(name, .broadcastNames) 
})
-
-sdf <- callJStatic(
- "org.apache.spark.sql.api.r.SQLUtils",
- "dapply",
- x@sdf,
- serialize(cleanClosure(func), connection = NULL),
- packageNamesArr,
- broadcastArr,
- schema$jobj)
-dataFrame(sdf)
+dapplyInternal(x, func, schema)
+  })
+
+#' dapplyCollect
+#'
+#' Apply a function to each partition of a SparkDataFrame and collect the result back
+#' to R as a data.frame.
+#'
+#' @param x A SparkDataFrame
+#' @param func A function to be applied to each partition of the SparkDataFrame.
+#' func should have only one parameter, to which a data.frame corresponds
+#' to each partition will be passed.
+#' The output of func should be a data.frame.
+#' @family SparkDataFrame functions
+#' @rdname dapply
+#' @name dapplyCollect
+#' @export
+#' @examples
+#' \dontrun{
+#'   df <- createDataFrame (sqlContext, iris)
+#'   ldf <- dapplyCollect(df, function(x) { x })
+#'
+#'   # filter and add a column
+#'   df <- createDataFrame (
+#'   sqlContext, 

spark git commit: [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.

2016-05-12 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master bb1362eb3 -> b3930f74a


[SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.

## What changes were proposed in this pull request?

dapplyCollect() applies an R function on each partition of a SparkDataFrame and 
collects the result back to R as a data.frame.
```
dapplyCollect(df, function(ldf) {...})
```

## How was this patch tested?
SparkR unit tests.

Author: Sun Rui 

Closes #12989 from sun-rui/SPARK-15202.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3930f74
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3930f74
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3930f74

Branch: refs/heads/master
Commit: b3930f74a0929b2cdcbbe5cbe34f0b1d35eb01cc
Parents: bb1362e
Author: Sun Rui 
Authored: Thu May 12 17:50:55 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Thu May 12 17:50:55 2016 -0700

--
 R/pkg/NAMESPACE   |  1 +
 R/pkg/R/DataFrame.R   | 86 +-
 R/pkg/R/generics.R|  4 ++
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 ++-
 4 files changed, 95 insertions(+), 17 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b3930f74/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1432ab8..239ad06 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -47,6 +47,7 @@ exportMethods("arrange",
   "covar_pop",
   "crosstab",
   "dapply",
+  "dapplyCollect",
   "describe",
   "dim",
   "distinct",

http://git-wip-us.apache.org/repos/asf/spark/blob/b3930f74/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 43c46b8..0c2a194 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1153,9 +1153,27 @@ setMethod("summarize",
 agg(x, ...)
   })
 
+dapplyInternal <- function(x, func, schema) {
+  packageNamesArr <- serialize(.sparkREnv[[".packages"]],
+   connection = NULL)
+
+  broadcastArr <- lapply(ls(.broadcastNames),
+ function(name) { get(name, .broadcastNames) })
+
+  sdf <- callJStatic(
+   "org.apache.spark.sql.api.r.SQLUtils",
+   "dapply",
+   x@sdf,
+   serialize(cleanClosure(func), connection = NULL),
+   packageNamesArr,
+   broadcastArr,
+   if (is.null(schema)) { schema } else { schema$jobj })
+  dataFrame(sdf)
+}
+
 #' dapply
 #'
-#' Apply a function to each partition of a DataFrame.
+#' Apply a function to each partition of a SparkDataFrame.
 #'
 #' @param x A SparkDataFrame
 #' @param func A function to be applied to each partition of the 
SparkDataFrame.
@@ -1197,21 +1215,57 @@ setMethod("summarize",
 setMethod("dapply",
   signature(x = "SparkDataFrame", func = "function", schema = 
"structType"),
   function(x, func, schema) {
-packageNamesArr <- serialize(.sparkREnv[[".packages"]],
- connection = NULL)
-
-broadcastArr <- lapply(ls(.broadcastNames),
-   function(name) { get(name, .broadcastNames) 
})
-
-sdf <- callJStatic(
- "org.apache.spark.sql.api.r.SQLUtils",
- "dapply",
- x@sdf,
- serialize(cleanClosure(func), connection = NULL),
- packageNamesArr,
- broadcastArr,
- schema$jobj)
-dataFrame(sdf)
+dapplyInternal(x, func, schema)
+  })
+
+#' dapplyCollect
+#'
+#' Apply a function to each partition of a SparkDataFrame and collect the result back
+#' to R as a data.frame.
+#'
+#' @param x A SparkDataFrame
+#' @param func A function to be applied to each partition of the SparkDataFrame.
+#' func should have only one parameter, to which a data.frame corresponds
+#' to each partition will be passed.
+#' The output of func should be a data.frame.
+#' @family SparkDataFrame functions
+#' @rdname dapply
+#' @name dapplyCollect
+#' @export
+#' @examples
+#' \dontrun{
+#'   df <- createDataFrame (sqlContext, iris)
+#'   ldf <- dapplyCollect(df, function(x) { x })
+#'
+#'   # filter and add a column
+#'   df <- createDataFrame (
+#'   sqlContext, 
+#'   list(list(1L, 1, "1"), list(2L, 2, "2"), list(3L, 3, "3")),
+#'   c("a", "b", "c"))
+#'   ldf <- dapplyCollect(
+#'

spark git commit: [SPARK-10605][SQL] Create native collect_list/collect_set aggregates

2016-05-12 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 ac6e9a8d9 -> 31ea3c7bd


[SPARK-10605][SQL] Create native collect_list/collect_set aggregates

## What changes were proposed in this pull request?
We currently use the Hive implementations for the collect_list/collect_set 
aggregate functions. This has a few major drawbacks: the use of HiveUDAF (which 
has quite a bit of overhead) and the lack of support for struct datatypes. This 
PR adds native implementation of these functions to Spark.

The size of the collected list/set may vary; this means we cannot use the fast Tungsten aggregation path to perform the aggregation, and we fall back to the slower sort-based path. Another big issue with these operators is that when the size of the collected list/set grows too large, we can start experiencing long GC pauses and OOMEs.

The `collect*` aggregates implemented in this PR rely on the sort-based aggregate path for correctness. They maintain their own internal buffer which holds the rows for one group at a time. The sort-based aggregation path is triggered by disabling `partialAggregation` for these aggregates (which is kinda funny); this technique is also employed in `org.apache.spark.sql.hive.HiveUDAFFunction`.

I have done some performance testing:
```scala
import org.apache.spark.sql.{Dataset, Row}

sql("create function collect_list2 as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectList'")

val df = range(0, 1000).select($"id", (rand(213123L) * 10).cast("int").as("grp"))
df.select(countDistinct($"grp")).show

def benchmark(name: String, plan: Dataset[Row], maxItr: Int = 5): Unit = {
   // Do not measure planning.
   plan.queryExecution.executedPlan

   // Execute the plan a number of times and average the result.
   val start = System.nanoTime
   var i = 0
   while (i < maxItr) {
 plan.rdd.foreach(row => Unit)
 i += 1
   }
   val time = (System.nanoTime - start) / (maxItr * 1000000L)
   println(s"[$name] $maxItr iterations completed in an average time of $time ms.")
}

val plan1 = df.groupBy($"grp").agg(collect_list($"id"))
val plan2 = df.groupBy($"grp").agg(callUDF("collect_list2", $"id"))

benchmark("Spark collect_list", plan1)
...
> [Spark collect_list] 5 iterations completed in an average time of 3371 ms.

benchmark("Hive collect_list", plan2)
...
> [Hive collect_list] 5 iterations completed in an average time of 9109 ms.
```
Performance is improved by a factor of 2-3.
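
A hedged usage sketch of the new native aggregates through the DataFrame API (assumes a `SparkSession` named `spark`; the column names are made up):

```scala
import org.apache.spark.sql.functions.{collect_list, collect_set}

// Group 100 ids into 3 buckets and collect each bucket's ids as a list and a set.
val df = spark.range(0, 100).selectExpr("id", "id % 3 AS grp")
df.groupBy("grp")
  .agg(collect_list("id").as("ids"), collect_set("id").as("distinct_ids"))
  .show(false)
```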

## How was this patch tested?
Added tests to `DataFrameAggregateSuite`.

Author: Herman van Hovell 

Closes #12874 from hvanhovell/implode.

(cherry picked from commit bb1362eb3b36b553dca246b95f59ba7fd8adcc8a)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31ea3c7b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31ea3c7b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31ea3c7b

Branch: refs/heads/branch-2.0
Commit: 31ea3c7bde94f5bcca1db601f9c16c36c56cef73
Parents: ac6e9a8
Author: Herman van Hovell 
Authored: Thu May 12 13:56:00 2016 -0700
Committer: Reynold Xin 
Committed: Thu May 12 13:56:15 2016 -0700

--
 .../catalyst/analysis/FunctionRegistry.scala|   2 +
 .../expressions/aggregate/collect.scala | 119 +++
 .../scala/org/apache/spark/sql/functions.scala  |  12 +-
 .../spark/sql/DataFrameAggregateSuite.scala |  26 
 .../spark/sql/hive/HiveSessionCatalog.scala |  16 ---
 .../sql/hive/HiveDataFrameAnalyticsSuite.scala  |  11 --
 6 files changed, 149 insertions(+), 37 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/31ea3c7b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ac05dd3..c459fe5 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -252,6 +252,8 @@ object FunctionRegistry {
 expression[VarianceSamp]("variance"),
 expression[VariancePop]("var_pop"),
 expression[VarianceSamp]("var_samp"),
+expression[CollectList]("collect_list"),
+expression[CollectSet]("collect_set"),
 
 // string functions
 expression[Ascii]("ascii"),

http://git-wip-us.apache.org/repos/asf/spark/blob/31ea3c7b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala

spark git commit: [SPARK-13902][SCHEDULER] Make DAGScheduler not to create duplicate stage.

2016-05-12 Thread kayousterhout
Repository: spark
Updated Branches:
  refs/heads/master 81e3bfc16 -> a57aadae8


[SPARK-13902][SCHEDULER] Make DAGScheduler not to create duplicate stage.

## What changes were proposed in this pull request?

`DAGScheduler` sometimes generates an incorrect stage graph.

Suppose you have the following DAG:

```
[A] <--(s_A)-- [B] <--(s_B)-- [C] <--(s_C)-- [D]
  \                           /
    <------------------------
```

Note: [] means an RDD, () means a shuffle dependency.

Here, RDD `B` has a shuffle dependency on RDD `A`, and RDD `C` has shuffle 
dependency on both `B` and `A`. The shuffle dependency IDs are numbers in the 
`DAGScheduler`, but to make the example easier to understand, let's call the 
shuffled data from `A` shuffle dependency ID `s_A` and the shuffled data from 
`B` shuffle dependency ID `s_B`.
The `getAncestorShuffleDependencies` method in `DAGScheduler` (incorrectly) 
does not check for duplicates when it's adding ShuffleDependencies to the 
parents data structure, so for this DAG, when `getAncestorShuffleDependencies` 
gets called on `C` (previous of the final RDD), 
`getAncestorShuffleDependencies` will return `s_A`, `s_B`, `s_A` (`s_A` gets 
added twice: once when the method "visit"s RDD `C`, and once when the method 
"visit"s RDD `B`). This is problematic because this line of code: 
https://github.com/apache/spark/blob/8ef3399/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala#L289
 then generates a new shuffle stage for each dependency returned by 
`getAncestorShuffleDependencies`, resulting in duplicate map stages that 
compute the map output from RDD `A`.

As a result, `DAGScheduler` generates the following stages and their parents 
for each shuffle:

| shuffle | stage | parents |
|---|---|---|
| s_A | ShuffleMapStage 2 | List() |
| s_B | ShuffleMapStage 1 | List(ShuffleMapStage 0) |
| s_C | ShuffleMapStage 3 | List(ShuffleMapStage 1, ShuffleMapStage 2) |
| - | ResultStage 4 | List(ShuffleMapStage 3) |

The stage for `s_A` should be `ShuffleMapStage 0`, but the stage for `s_A` is generated twice: `ShuffleMapStage 0` is overwritten by `ShuffleMapStage 2`, while `ShuffleMapStage 1` keeps referring to the old `ShuffleMapStage 0`.

This patch fixes it.
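
A minimal sketch of the guard this patch adds (hypothetical names, not the `DAGScheduler` code): an ancestor shuffle dependency is only registered if its shuffle id has not been seen yet, so a duplicate `s_A` can no longer overwrite the stage that was already created for it.

```scala
import scala.collection.mutable

val shuffleToMapStage = mutable.Map.empty[Int, String]

def registerAncestorShuffles(ancestorShuffleIds: Seq[Int]): Unit = {
  ancestorShuffleIds.foreach { id =>
    // Without this containment check, a duplicate id would create a brand-new
    // stage and overwrite the one that earlier stages already refer to.
    if (!shuffleToMapStage.contains(id)) {
      shuffleToMapStage(id) = s"ShuffleMapStage ${shuffleToMapStage.size}"
    }
  }
}

registerAncestorShuffles(Seq(0, 1, 0))  // the duplicate 0 is ignored the second time
// shuffleToMapStage == Map(0 -> "ShuffleMapStage 0", 1 -> "ShuffleMapStage 1")
```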

## How was this patch tested?

I added the sample RDD graph that produces the illegal stage graph to `DAGSchedulerSuite`.

Author: Takuya UESHIN 

Closes #12655 from ueshin/issues/SPARK-13902.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a57aadae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a57aadae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a57aadae

Branch: refs/heads/master
Commit: a57aadae84aca27e5f02ac0bd64fd0ea34a64b61
Parents: 81e3bfc
Author: Takuya UESHIN 
Authored: Thu May 12 12:36:18 2016 -0700
Committer: Kay Ousterhout 
Committed: Thu May 12 12:36:18 2016 -0700

--
 .../apache/spark/scheduler/DAGScheduler.scala   |  4 +-
 .../spark/scheduler/DAGSchedulerSuite.scala | 47 
 2 files changed, 50 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a57aadae/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 4dfd532..5291b66 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -286,7 +286,9 @@ class DAGScheduler(
   case None =>
 // We are going to register ancestor shuffle dependencies
 getAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
-  shuffleToMapStage(dep.shuffleId) = newOrUsedShuffleStage(dep, firstJobId)
+  if (!shuffleToMapStage.contains(dep.shuffleId)) {
+    shuffleToMapStage(dep.shuffleId) = newOrUsedShuffleStage(dep, firstJobId)
+  }
 }
 // Then register current shuffleDep
 val stage = newOrUsedShuffleStage(shuffleDep, firstJobId)

http://git-wip-us.apache.org/repos/asf/spark/blob/a57aadae/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index e3ed079..088a476 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -325,6 +325,53 @@ class 

spark git commit: [SPARK-14421] Upgrades protobuf dependency to 2.6.1 for the new version of KCL, and…

2016-05-12 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 81bf87084 -> 81e3bfc16


[SPARK-14421] Upgrades protobuf dependency to 2.6.1 for the new version of KCL, 
and…

## What changes were proposed in this pull request?

When running with Kinesis Consumer Library (KCL), against a stream that 
contains aggregated data, the KCL needs access to protobuf to de-aggregate the 
records.   Without this patch, that results in the following error message:

```
   Caused by: java.lang.ClassNotFoundException: 
com.google.protobuf.ProtocolStringList
```

This PR upgrades the protobuf dependency within the kinesis-asl-assembly, and 
relocates that package (so as not to conflict with Spark's use of 2.5.0), which
fixes the above CNFE.

## How was this patch tested?

Used kinesis word count example against a stream containing aggregated data.

See: SPARK-14421

Author: Brian O'Neill 

Closes #13054 from boneill42/protobuf-relocation-for-kcl.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81e3bfc1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81e3bfc1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81e3bfc1

Branch: refs/heads/master
Commit: 81e3bfc16c6cfbf2f9f2c6c32ed651b8450795ba
Parents: 81bf870
Author: Brian O'Neill 
Authored: Thu May 12 20:10:33 2016 +0100
Committer: Sean Owen 
Committed: Thu May 12 20:10:33 2016 +0100

--
 external/kinesis-asl-assembly/pom.xml | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/81e3bfc1/external/kinesis-asl-assembly/pom.xml
--
diff --git a/external/kinesis-asl-assembly/pom.xml 
b/external/kinesis-asl-assembly/pom.xml
index e057b78..6fb88eb 100644
--- a/external/kinesis-asl-assembly/pom.xml
+++ b/external/kinesis-asl-assembly/pom.xml
@@ -63,7 +63,12 @@
 
   com.google.protobuf
   protobuf-java
-  provided
+  2.6.1
+  
 
 
   org.glassfish.jersey.core
@@ -147,6 +152,15 @@
 *:*
   
 
+
+  
+com.google.protobuf
+kinesis.protobuf
+
+  com.google.protobuf.**
+
+  
+
 
   
 *:*





spark git commit: [SPARK-14421] Upgrades protobuf dependency to 2.6.1 for the new version of KCL, and…

2016-05-12 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 7a14d28cb -> ac6e9a8d9


[SPARK-14421] Upgrades protobuf dependency to 2.6.1 for the new version of KCL, 
and…

## What changes were proposed in this pull request?

When running with Kinesis Consumer Library (KCL), against a stream that 
contains aggregated data, the KCL needs access to protobuf to de-aggregate the 
records.   Without this patch, that results in the following error message:

```
   Caused by: java.lang.ClassNotFoundException: 
com.google.protobuf.ProtocolStringList
```

This PR upgrades the protobuf dependency within the kinesis-asl-assembly, and 
relocates that package (so as not to conflict with Spark's use of 2.5.0), which
fixes the above CNFE.

## How was this patch tested?

Used kinesis word count example against a stream containing aggregated data.

See: SPARK-14421

Author: Brian O'Neill 

Closes #13054 from boneill42/protobuf-relocation-for-kcl.

(cherry picked from commit 81e3bfc16c6cfbf2f9f2c6c32ed651b8450795ba)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ac6e9a8d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ac6e9a8d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ac6e9a8d

Branch: refs/heads/branch-2.0
Commit: ac6e9a8d9533d485ee3cbbb57a9835f92722e8fc
Parents: 7a14d28
Author: Brian O'Neill 
Authored: Thu May 12 20:10:33 2016 +0100
Committer: Sean Owen 
Committed: Thu May 12 20:10:40 2016 +0100

--
 external/kinesis-asl-assembly/pom.xml | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ac6e9a8d/external/kinesis-asl-assembly/pom.xml
--
diff --git a/external/kinesis-asl-assembly/pom.xml 
b/external/kinesis-asl-assembly/pom.xml
index e057b78..6fb88eb 100644
--- a/external/kinesis-asl-assembly/pom.xml
+++ b/external/kinesis-asl-assembly/pom.xml
@@ -63,7 +63,12 @@
 
   com.google.protobuf
   protobuf-java
-  provided
+  2.6.1
+  
 
 
   org.glassfish.jersey.core
@@ -147,6 +152,15 @@
 *:*
   
 
+
+  
+com.google.protobuf
+kinesis.protobuf
+
+  com.google.protobuf.**
+
+  
+
 
   
 *:*





spark git commit: [SPARK-14897][SQL] upgrade to jetty 9.2.16

2016-05-12 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9c5c9013d -> 7a14d28cb


[SPARK-14897][SQL] upgrade to jetty 9.2.16

## What changes were proposed in this pull request?

Since Jetty 8 is EOL (end of life) and has a critical security issue [http://www.securityweek.com/critical-vulnerability-found-jetty-web-server], I think upgrading to 9 is necessary. I am using the latest 9.2 since 9.3 requires Java 8+.

`javax.servlet` and `derby` were also upgraded since Jetty 9.2 needs 
corresponding version.

## How was this patch tested?

Manual test and current test cases should cover it.

Author: bomeng 

Closes #12916 from bomeng/SPARK-14897.

(cherry picked from commit 81bf870848cf9faeec5ab2d40acff27085466698)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a14d28c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a14d28c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a14d28c

Branch: refs/heads/branch-2.0
Commit: 7a14d28cbafa026ed19ce6e16a9feb5c26655f58
Parents: 9c5c901
Author: bomeng 
Authored: Thu May 12 20:07:44 2016 +0100
Committer: Sean Owen 
Committed: Thu May 12 20:07:51 2016 +0100

--
 core/pom.xml| 15 
 .../scala/org/apache/spark/SSLOptions.scala |  2 +-
 .../deploy/rest/RestSubmissionServer.scala  | 14 +---
 .../scala/org/apache/spark/ui/JettyUtils.scala  | 37 ++--
 dev/deps/spark-deps-hadoop-2.2  |  4 +--
 dev/deps/spark-deps-hadoop-2.3  |  4 +--
 dev/deps/spark-deps-hadoop-2.4  |  4 +--
 dev/deps/spark-deps-hadoop-2.6  |  4 +--
 dev/deps/spark-deps-hadoop-2.7  |  4 +--
 pom.xml | 13 +--
 .../cli/thrift/ThriftHttpCLIService.java| 16 -
 streaming/pom.xml   |  4 +++
 yarn/pom.xml|  4 +++
 13 files changed, 72 insertions(+), 53 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7a14d28c/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index 07b5896..8584b62 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -125,12 +125,15 @@
   jetty-servlet
   compile
 
-
 
-  org.eclipse.jetty.orbit
-  javax.servlet
-  ${orbit.version}
+  org.eclipse.jetty
+  jetty-servlets
+  compile
+
+
+  javax.servlet
+  javax.servlet-api
+  ${javaxservlet.version}
 
 
 
@@ -356,7 +359,7 @@
   true
   true
   
-
guava,jetty-io,jetty-servlet,jetty-continuation,jetty-http,jetty-plus,jetty-util,jetty-server,jetty-security
+
guava,jetty-io,jetty-servlet,jetty-servlets,jetty-continuation,jetty-http,jetty-plus,jetty-util,jetty-server,jetty-security
   
   true
 

http://git-wip-us.apache.org/repos/asf/spark/blob/7a14d28c/core/src/main/scala/org/apache/spark/SSLOptions.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala 
b/core/src/main/scala/org/apache/spark/SSLOptions.scala
index 719905a..be19179 100644
--- a/core/src/main/scala/org/apache/spark/SSLOptions.scala
+++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala
@@ -71,7 +71,7 @@ private[spark] case class SSLOptions(
   keyPassword.foreach(sslContextFactory.setKeyManagerPassword)
   keyStoreType.foreach(sslContextFactory.setKeyStoreType)
   if (needClientAuth) {
-trustStore.foreach(file => 
sslContextFactory.setTrustStore(file.getAbsolutePath))
+trustStore.foreach(file => 
sslContextFactory.setTrustStorePath(file.getAbsolutePath))
 trustStorePassword.foreach(sslContextFactory.setTrustStorePassword)
 trustStoreType.foreach(sslContextFactory.setTrustStoreType)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/7a14d28c/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala 
b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
index 14244ea..7e93bfc 100644
--- 
a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
+++ 
b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.deploy.rest
 
-import java.net.InetSocketAddress
 import javax.servlet.http.{HttpServlet, 

spark git commit: [SPARK-14897][SQL] upgrade to jetty 9.2.16

2016-05-12 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master be617f3d0 -> 81bf87084


[SPARK-14897][SQL] upgrade to jetty 9.2.16

## What changes were proposed in this pull request?

Since Jetty 8 is EOL (end of life) and has a critical security issue [http://www.securityweek.com/critical-vulnerability-found-jetty-web-server], I think upgrading to 9 is necessary. I am using the latest 9.2 since 9.3 requires Java 8+.

`javax.servlet` and `derby` were also upgraded since Jetty 9.2 needs 
corresponding version.

## How was this patch tested?

Manual test and current test cases should cover it.

Author: bomeng 

Closes #12916 from bomeng/SPARK-14897.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81bf8708
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81bf8708
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81bf8708

Branch: refs/heads/master
Commit: 81bf870848cf9faeec5ab2d40acff27085466698
Parents: be617f3
Author: bomeng 
Authored: Thu May 12 20:07:44 2016 +0100
Committer: Sean Owen 
Committed: Thu May 12 20:07:44 2016 +0100

--
 core/pom.xml| 15 
 .../scala/org/apache/spark/SSLOptions.scala |  2 +-
 .../deploy/rest/RestSubmissionServer.scala  | 14 +---
 .../scala/org/apache/spark/ui/JettyUtils.scala  | 37 ++--
 dev/deps/spark-deps-hadoop-2.2  |  4 +--
 dev/deps/spark-deps-hadoop-2.3  |  4 +--
 dev/deps/spark-deps-hadoop-2.4  |  4 +--
 dev/deps/spark-deps-hadoop-2.6  |  4 +--
 dev/deps/spark-deps-hadoop-2.7  |  4 +--
 pom.xml | 13 +--
 .../cli/thrift/ThriftHttpCLIService.java| 16 -
 streaming/pom.xml   |  4 +++
 yarn/pom.xml|  4 +++
 13 files changed, 72 insertions(+), 53 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/81bf8708/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index 07b5896..8584b62 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -125,12 +125,15 @@
   jetty-servlet
   compile
 
-
 
-  org.eclipse.jetty.orbit
-  javax.servlet
-  ${orbit.version}
+  org.eclipse.jetty
+  jetty-servlets
+  compile
+
+
+  javax.servlet
+  javax.servlet-api
+  ${javaxservlet.version}
 
 
 
@@ -356,7 +359,7 @@
   true
   true
   
-
guava,jetty-io,jetty-servlet,jetty-continuation,jetty-http,jetty-plus,jetty-util,jetty-server,jetty-security
+
guava,jetty-io,jetty-servlet,jetty-servlets,jetty-continuation,jetty-http,jetty-plus,jetty-util,jetty-server,jetty-security
   
   true
 

http://git-wip-us.apache.org/repos/asf/spark/blob/81bf8708/core/src/main/scala/org/apache/spark/SSLOptions.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala 
b/core/src/main/scala/org/apache/spark/SSLOptions.scala
index 719905a..be19179 100644
--- a/core/src/main/scala/org/apache/spark/SSLOptions.scala
+++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala
@@ -71,7 +71,7 @@ private[spark] case class SSLOptions(
   keyPassword.foreach(sslContextFactory.setKeyManagerPassword)
   keyStoreType.foreach(sslContextFactory.setKeyStoreType)
   if (needClientAuth) {
-trustStore.foreach(file => 
sslContextFactory.setTrustStore(file.getAbsolutePath))
+trustStore.foreach(file => 
sslContextFactory.setTrustStorePath(file.getAbsolutePath))
 trustStorePassword.foreach(sslContextFactory.setTrustStorePassword)
 trustStoreType.foreach(sslContextFactory.setTrustStoreType)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/81bf8708/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala 
b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
index 14244ea..7e93bfc 100644
--- 
a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
+++ 
b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala
@@ -17,13 +17,12 @@
 
 package org.apache.spark.deploy.rest
 
-import java.net.InetSocketAddress
 import javax.servlet.http.{HttpServlet, HttpServletRequest, 
HttpServletResponse}
 
 import scala.io.Source
 
 import com.fasterxml.jackson.core.JsonProcessingException
-import 

spark git commit: [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping

2016-05-12 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 68617e1ad -> 9c5c9013d


[SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and 
Checking Partition Spec Existence Before Dropping

#### What changes were proposed in this pull request?
~~Currently, multiple partitions are allowed to drop by using a single DDL 
command: Alter Table Drop Partition. However, the internal implementation could 
break atomicity. That means, we could just drop a subset of qualified 
partitions, if hitting an exception when dropping one of qualified partitions~~

~~This PR contains the following behavior changes:~~
~~- disallow dropping multiple partitions by a single command ~~
~~- allow users to input predicates in partition specification and issue a 
nicer error message if the predicate's comparison operator is not `=`.~~
~~- verify the partition spec in SessionCatalog. This can ensure each partition 
spec in `Drop Partition` does not correspond to multiple partitions.~~

This PR has two major parts:
- Verify the partition spec in SessionCatalog for fixing the following issue:
  ```scala
  sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', 
unknownCol='12')")
  ```
  The above example uses an invalid partition spec. Without this PR, we would drop all the partitions, because the Hive metastore's getPartitions API returns all the partitions when given an invalid spec (see the validation sketch after this list).

- Re-implemented `dropPartitions` in `HiveClientImpl`. Now, we always check that all the user-specified partition specs exist before attempting to drop the partitions. Previously, we started dropping partitions before finishing the existence check of all the partition specs. If any failure happens after we start to drop the partitions, we log an error message indicating which partitions have been dropped and which have not.
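
A hedged sketch of the kind of validation added to `SessionCatalog` (hypothetical helper and names, not the actual implementation): a spec that names a column which is not a partition column of the table is rejected up front instead of being passed to the metastore.

```scala
// Hypothetical helper for illustration only.
def requireValidPartitionSpec(spec: Map[String, String], partitionColumns: Seq[String]): Unit = {
  val unknown = spec.keys.filterNot(partitionColumns.contains).toSeq
  require(unknown.isEmpty,
    s"Partition spec refers to non-partition column(s): ${unknown.mkString(", ")}")
}

requireValidPartitionSpec(Map("ds" -> "2008-04-09"), Seq("ds"))   // passes
// requireValidPartitionSpec(Map("ds" -> "2008-04-09", "unknownCol" -> "12"), Seq("ds"))  // throws
```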

#### How was this patch tested?
Modified the existing test cases and added new test cases.

Author: gatorsmile 
Author: xiaoli 
Author: Xiao Li 

Closes #12801 from gatorsmile/banDropMultiPart.

(cherry picked from commit be617f3d0695982f982006fdd79afe3e3730b4c4)
Signed-off-by: Andrew Or 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c5c9013
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c5c9013
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c5c9013

Branch: refs/heads/branch-2.0
Commit: 9c5c9013de1311b3175a6156fb90447f00c7a883
Parents: 68617e1
Author: gatorsmile 
Authored: Thu May 12 11:14:40 2016 -0700
Committer: Andrew Or 
Committed: Thu May 12 11:14:52 2016 -0700

--
 .../sql/catalyst/catalog/SessionCatalog.scala   |  47 +++-
 .../catalyst/catalog/ExternalCatalogSuite.scala |   6 +
 .../catalyst/catalog/SessionCatalogSuite.scala  | 116 ++-
 .../spark/sql/execution/command/DDLSuite.scala  |  78 ++---
 .../spark/sql/hive/client/HiveClientImpl.scala  |  50 +---
 .../spark/sql/hive/execution/HiveDDLSuite.scala |   9 +-
 6 files changed, 248 insertions(+), 58 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9c5c9013/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 0fc4ab5..54b30d3 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -510,6 +510,7 @@ class SessionCatalog(
   tableName: TableIdentifier,
   parts: Seq[CatalogTablePartition],
   ignoreIfExists: Boolean): Unit = {
+requireExactMatchedPartitionSpec(parts.map(_.spec), 
getTableMetadata(tableName))
 val db = 
formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase))
 val table = formatTableName(tableName.table)
 requireDbExists(db)
@@ -523,13 +524,14 @@ class SessionCatalog(
*/
   def dropPartitions(
   tableName: TableIdentifier,
-  parts: Seq[TablePartitionSpec],
+  specs: Seq[TablePartitionSpec],
   ignoreIfNotExists: Boolean): Unit = {
+requirePartialMatchedPartitionSpec(specs, getTableMetadata(tableName))
 val db = 
formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase))
 val table = formatTableName(tableName.table)
 requireDbExists(db)
 requireTableExists(TableIdentifier(table, Option(db)))
-externalCatalog.dropPartitions(db, table, 

spark git commit: [SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and Checking Partition Spec Existence Before Dropping

2016-05-12 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/master 470de743e -> be617f3d0


[SPARK-14684][SPARK-15277][SQL] Partition Spec Validation in SessionCatalog and 
Checking Partition Spec Existence Before Dropping

## What changes were proposed in this pull request?
~~Currently, multiple partitions can be dropped by a single DDL command: Alter Table Drop Partition. However, the internal implementation could break atomicity; that is, we could drop only a subset of the qualified partitions if an exception is hit while dropping one of them.~~

~~This PR contains the following behavior changes:~~
~~- disallow dropping multiple partitions by a single command~~
~~- allow users to input predicates in the partition specification and issue a nicer error message if the predicate's comparison operator is not `=`.~~
~~- verify the partition spec in SessionCatalog. This ensures each partition spec in `Drop Partition` does not correspond to multiple partitions.~~

This PR has two major parts:
- Verify the partition spec in SessionCatalog to fix the following issue:
  ```scala
  sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', unknownCol='12')")
  ```
  The example above uses an invalid partition spec. Without this PR, we would drop all the partitions, because the Hive metastore's getPartitions API returns all the partitions when given an invalid spec.

- Re-implement `dropPartitions` in `HiveClientImpl`. Now we always check that all the user-specified partition specs exist before attempting to drop the partitions. Previously, dropping started before the existence check of all the partition specs was complete. If a failure occurs after dropping has started, an error message is logged indicating which partitions have been dropped and which have not.

## How was this patch tested?
Modified the existing test cases and added new test cases.

Author: gatorsmile 
Author: xiaoli 
Author: Xiao Li 

Closes #12801 from gatorsmile/banDropMultiPart.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be617f3d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be617f3d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be617f3d

Branch: refs/heads/master
Commit: be617f3d0695982f982006fdd79afe3e3730b4c4
Parents: 470de74
Author: gatorsmile 
Authored: Thu May 12 11:14:40 2016 -0700
Committer: Andrew Or 
Committed: Thu May 12 11:14:40 2016 -0700

--
 .../sql/catalyst/catalog/SessionCatalog.scala   |  47 +++-
 .../catalyst/catalog/ExternalCatalogSuite.scala |   6 +
 .../catalyst/catalog/SessionCatalogSuite.scala  | 116 ++-
 .../spark/sql/execution/command/DDLSuite.scala  |  78 ++---
 .../spark/sql/hive/client/HiveClientImpl.scala  |  50 +---
 .../spark/sql/hive/execution/HiveDDLSuite.scala |   9 +-
 6 files changed, 248 insertions(+), 58 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/be617f3d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 0fc4ab5..54b30d3 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -510,6 +510,7 @@ class SessionCatalog(
   tableName: TableIdentifier,
   parts: Seq[CatalogTablePartition],
   ignoreIfExists: Boolean): Unit = {
+requireExactMatchedPartitionSpec(parts.map(_.spec), 
getTableMetadata(tableName))
 val db = 
formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase))
 val table = formatTableName(tableName.table)
 requireDbExists(db)
@@ -523,13 +524,14 @@ class SessionCatalog(
*/
   def dropPartitions(
   tableName: TableIdentifier,
-  parts: Seq[TablePartitionSpec],
+  specs: Seq[TablePartitionSpec],
   ignoreIfNotExists: Boolean): Unit = {
+requirePartialMatchedPartitionSpec(specs, getTableMetadata(tableName))
 val db = 
formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase))
 val table = formatTableName(tableName.table)
 requireDbExists(db)
 requireTableExists(TableIdentifier(table, Option(db)))
-externalCatalog.dropPartitions(db, table, parts, ignoreIfNotExists)
+externalCatalog.dropPartitions(db, table, specs, ignoreIfNotExists)
   }
 
   /**
@@ -542,6 +544,9 

spark git commit: [SPARK-15094][SPARK-14803][SQL] Remove extra Project added in EliminateSerialization

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 5bb62b893 -> 470de743e


[SPARK-15094][SPARK-14803][SQL] Remove extra Project added in 
EliminateSerialization

## What changes were proposed in this pull request?

The `Optimizer` eliminates a `DeserializeToObject`/`SerializeFromObject` pair and adds an extra `Project` in its place. However, when the `DeserializeToObject`'s `outputObjectType` is an `ObjectType` whose class cannot be handled by unsafe projection, the query fails.

To fix it, we can simply remove the extra `Project` and replace the output 
attribute of `DeserializeToObject` in another rule.
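
As an illustration (not part of the patch), here is a hedged usage sketch assuming a Spark 2.0 `SparkSession` named `spark`: chained typed operations create the `SerializeFromObject`/`DeserializeToObject` pair that `EliminateSerialization` targets, and after this change the optimized plan no longer carries a leftover alias-only `Project`:

```scala
import spark.implicits._

// Two chained typed maps put a DeserializeToObject directly on top of a
// SerializeFromObject in the analyzed plan. EliminateSerialization collapses
// the pair, and RemoveAliasOnlyProject removes the Project that the old
// workaround used to leave behind.
val ds = Seq(1, 2, 3).toDS()
val mapped = ds.map(_ + 1).map(_ * 2)

// Inspect the optimized plan; no alias-only Project should remain.
println(mapped.queryExecution.optimizedPlan.numberedTreeString)
```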

## How was this patch tested?
`DatasetSuite`.

Author: Liang-Chi Hsieh 

Closes #12926 from viirya/fix-eliminate-serialization-projection.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/470de743
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/470de743
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/470de743

Branch: refs/heads/master
Commit: 470de743ecf3617babd86f50ab203e85aa975d69
Parents: 5bb62b8
Author: Liang-Chi Hsieh 
Authored: Thu May 12 10:11:12 2016 -0700
Committer: Yin Huai 
Committed: Thu May 12 10:11:12 2016 -0700

--
 .../sql/catalyst/optimizer/Optimizer.scala  | 60 
 .../org/apache/spark/sql/DatasetSuite.scala | 12 
 2 files changed, 62 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/470de743/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 350b601..928ba21 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -102,7 +102,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, 
conf: CatalystConf)
   SimplifyCasts,
   SimplifyCaseConversionExpressions,
   RewriteCorrelatedScalarSubquery,
-  EliminateSerialization) ::
+  EliminateSerialization,
+  RemoveAliasOnlyProject) ::
 Batch("Decimal Optimizations", fixedPoint,
   DecimalAggregates) ::
 Batch("Typed Filter Optimization", fixedPoint,
@@ -156,6 +157,49 @@ object SamplePushDown extends Rule[LogicalPlan] {
 }
 
 /**
+ * Removes the Project only conducting Alias of its child node.
+ * It is created mainly for removing extra Project added in 
EliminateSerialization rule,
+ * but can also benefit other operators.
+ */
+object RemoveAliasOnlyProject extends Rule[LogicalPlan] {
+  // Check if projectList in the Project node has the same attribute names and 
ordering
+  // as its child node.
+  private def isAliasOnly(
+  projectList: Seq[NamedExpression],
+  childOutput: Seq[Attribute]): Boolean = {
+if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != 
childOutput.length) {
+  return false
+} else {
+  projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case 
(a, o) =>
+a.child match {
+  case attr: Attribute if a.name == attr.name && 
attr.semanticEquals(o) => true
+  case _ => false
+}
+  }
+}
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = {
+val aliasOnlyProject = plan.find { p =>
+  p match {
+case Project(pList, child) if isAliasOnly(pList, child.output) => true
+case _ => false
+  }
+}
+
+aliasOnlyProject.map { case p: Project =>
+  val aliases = p.projectList.map(_.asInstanceOf[Alias])
+  val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child)))
+  plan.transformAllExpressions {
+case a: Attribute if attrMap.contains(a) => attrMap(a)
+  }.transform {
+case op: Project if op.eq(p) => op.child
+  }
+}.getOrElse(plan)
+  }
+}
+
+/**
  * Removes cases where we are unnecessarily going between the object and 
serialized (InternalRow)
  * representation of data item.  For example back to back map operations.
  */
@@ -163,15 +207,11 @@ object EliminateSerialization extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
 case d @ DeserializeToObject(_, _, s: SerializeFromObject)
 if d.outputObjectType == s.inputObjectType =>
-  // A workaround for SPARK-14803. Remove this after it is fixed.
-  if (d.outputObjectType.isInstanceOf[ObjectType] &&
-  d.outputObjectType.asInstanceOf[ObjectType].cls == 
classOf[org.apache.spark.sql.Row]) {
-s.child
-  } else {
-// 

spark git commit: [SPARK-15094][SPARK-14803][SQL] Remove extra Project added in EliminateSerialization

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 b3f145442 -> 68617e1ad


[SPARK-15094][SPARK-14803][SQL] Remove extra Project added in 
EliminateSerialization

## What changes were proposed in this pull request?

The `Optimizer` eliminates a `DeserializeToObject`/`SerializeFromObject` pair and adds an extra `Project` in its place. However, when the `DeserializeToObject`'s `outputObjectType` is an `ObjectType` whose class cannot be handled by unsafe projection, the query fails.

To fix it, we can simply remove the extra `Project` and replace the output 
attribute of `DeserializeToObject` in another rule.

## How was this patch tested?
`DatasetSuite`.

Author: Liang-Chi Hsieh 

Closes #12926 from viirya/fix-eliminate-serialization-projection.

(cherry picked from commit 470de743ecf3617babd86f50ab203e85aa975d69)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/68617e1a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/68617e1a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/68617e1a

Branch: refs/heads/branch-2.0
Commit: 68617e1addc81805d6c27d37a84f5b50644c6a75
Parents: b3f1454
Author: Liang-Chi Hsieh 
Authored: Thu May 12 10:11:12 2016 -0700
Committer: Yin Huai 
Committed: Thu May 12 10:11:26 2016 -0700

--
 .../sql/catalyst/optimizer/Optimizer.scala  | 60 
 .../org/apache/spark/sql/DatasetSuite.scala | 12 
 2 files changed, 62 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/68617e1a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 350b601..928ba21 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -102,7 +102,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, 
conf: CatalystConf)
   SimplifyCasts,
   SimplifyCaseConversionExpressions,
   RewriteCorrelatedScalarSubquery,
-  EliminateSerialization) ::
+  EliminateSerialization,
+  RemoveAliasOnlyProject) ::
 Batch("Decimal Optimizations", fixedPoint,
   DecimalAggregates) ::
 Batch("Typed Filter Optimization", fixedPoint,
@@ -156,6 +157,49 @@ object SamplePushDown extends Rule[LogicalPlan] {
 }
 
 /**
+ * Removes the Project only conducting Alias of its child node.
+ * It is created mainly for removing extra Project added in 
EliminateSerialization rule,
+ * but can also benefit other operators.
+ */
+object RemoveAliasOnlyProject extends Rule[LogicalPlan] {
+  // Check if projectList in the Project node has the same attribute names and 
ordering
+  // as its child node.
+  private def isAliasOnly(
+  projectList: Seq[NamedExpression],
+  childOutput: Seq[Attribute]): Boolean = {
+if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != 
childOutput.length) {
+  return false
+} else {
+  projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case 
(a, o) =>
+a.child match {
+  case attr: Attribute if a.name == attr.name && 
attr.semanticEquals(o) => true
+  case _ => false
+}
+  }
+}
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = {
+val aliasOnlyProject = plan.find { p =>
+  p match {
+case Project(pList, child) if isAliasOnly(pList, child.output) => true
+case _ => false
+  }
+}
+
+aliasOnlyProject.map { case p: Project =>
+  val aliases = p.projectList.map(_.asInstanceOf[Alias])
+  val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child)))
+  plan.transformAllExpressions {
+case a: Attribute if attrMap.contains(a) => attrMap(a)
+  }.transform {
+case op: Project if op.eq(p) => op.child
+  }
+}.getOrElse(plan)
+  }
+}
+
+/**
  * Removes cases where we are unnecessarily going between the object and 
serialized (InternalRow)
  * representation of data item.  For example back to back map operations.
  */
@@ -163,15 +207,11 @@ object EliminateSerialization extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
 case d @ DeserializeToObject(_, _, s: SerializeFromObject)
 if d.outputObjectType == s.inputObjectType =>
-  // A workaround for SPARK-14803. Remove this after it is fixed.
-  if (d.outputObjectType.isInstanceOf[ObjectType] &&
-  

spark git commit: [HOTFIX] SQL test compilation error from merge conflict

2016-05-12 Thread andrewor14
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9098b1a17 -> b3f145442


[HOTFIX] SQL test compilation error from merge conflict


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3f14544
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3f14544
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3f14544

Branch: refs/heads/branch-2.0
Commit: b3f145442a4419a43a13960bb2a45d28ce41bfc4
Parents: 9098b1a
Author: Andrew Or 
Authored: Tue May 10 11:46:02 2016 -0700
Committer: Andrew Or 
Committed: Thu May 12 09:20:43 2016 -0700

--
 .../scala/org/apache/spark/sql/internal/CatalogSuite.scala   | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b3f14544/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
index 94f77bc..e4d4cec 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
@@ -279,10 +279,10 @@ class CatalogSuite
 assert(tableFields == Seq("nama", "databasa", "descripta", "typa", false))
 assert(functionFields == Seq("nama", "descripta", "classa", false))
 assert(columnFields == Seq("nama", "descripta", "typa", false, true, true))
-val dbString = CatalogImpl.makeDataset(Seq(db), 
sparkSession).showString(10)
-val tableString = CatalogImpl.makeDataset(Seq(table), 
sparkSession).showString(10)
-val functionString = CatalogImpl.makeDataset(Seq(function), 
sparkSession).showString(10)
-val columnString = CatalogImpl.makeDataset(Seq(column), 
sparkSession).showString(10)
+val dbString = CatalogImpl.makeDataset(Seq(db), spark).showString(10)
+val tableString = CatalogImpl.makeDataset(Seq(table), spark).showString(10)
+val functionString = CatalogImpl.makeDataset(Seq(function), 
spark).showString(10)
+val columnString = CatalogImpl.makeDataset(Seq(column), 
spark).showString(10)
 dbFields.foreach { f => assert(dbString.contains(f.toString)) }
 tableFields.foreach { f => assert(tableString.contains(f.toString)) }
 functionFields.foreach { f => assert(functionString.contains(f.toString)) }





spark git commit: [BUILD] Test closing stale PRs

2016-05-12 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 33c6eb521 -> 5bb62b893


[BUILD] Test closing stale PRs

## What changes were proposed in this pull request?

Here I'm seeing if we can close stale PRs via a PR message, as I'd expect.
See thread https://www.mail-archive.com/devspark.apache.org/msg14149.html

Closes #9354
Closes #9451
Closes #10507
Closes #10486
Closes #10460
Closes #10967
Closes #10681
Closes #11766
Closes #9907
Closes #10209
Closes #10379
Closes #10403
Closes #10842
Closes #11036
Closes #13003
Closes #10887

## How was this patch tested?

(No changes)

Author: Sean Owen 

Closes #13052 from srowen/TestClosingPRs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bb62b89
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bb62b89
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bb62b89

Branch: refs/heads/master
Commit: 5bb62b893bf13973de63ab28571e05501b84bfef
Parents: 33c6eb5
Author: Sean Owen 
Authored: Thu May 12 10:25:20 2016 +0100
Committer: Sean Owen 
Committed: Thu May 12 10:25:20 2016 +0100

--

--






spark git commit: [SPARK-15171][SQL] Deprecate registerTempTable and add dataset.createTempView

2016-05-12 Thread lian
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6b69b8c0c -> 9098b1a17


[SPARK-15171][SQL] Deprecate registerTempTable and add dataset.createTempView

## What changes were proposed in this pull request?

Deprecates `registerTempTable` and adds `Dataset.createTempView` and `Dataset.createOrReplaceTempView`.
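
A minimal usage sketch of the new API (assuming a Spark 2.0 `SparkSession` named `spark`; the sample data is illustrative):

```scala
import spark.implicits._

val people = Seq(("Alice", 29), ("Bob", 14)).toDF("name", "age")

// New in this change: explicit temp view creation on Dataset/DataFrame.
people.createOrReplaceTempView("people")   // replaces an existing view with the same name
// people.createTempView("people")         // fails if a view named "people" already exists

// Equivalent to the now-deprecated people.registerTempTable("people").
spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19").show()
```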

## How was this patch tested?

Unit tests.

Author: Sean Zhong 

Closes #12945 from clockfly/spark-15171.

(cherry picked from commit 33c6eb5218ce3c31cc9f632a67fd2c7057569683)
Signed-off-by: Cheng Lian 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9098b1a1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9098b1a1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9098b1a1

Branch: refs/heads/branch-2.0
Commit: 9098b1a1746d9affd894cb44ed169f3cf3566b14
Parents: 6b69b8c
Author: Sean Zhong 
Authored: Thu May 12 15:51:53 2016 +0800
Committer: Cheng Lian 
Committed: Thu May 12 15:54:33 2016 +0800

--
 .../sbt_app_sql/src/main/scala/SqlApp.scala |  4 +-
 .../apache/spark/examples/sql/JavaSparkSQL.java |  8 +--
 .../streaming/JavaSqlNetworkWordCount.java  |  2 +-
 examples/src/main/python/sql.py |  2 +-
 .../python/streaming/sql_network_wordcount.py   |  2 +-
 .../apache/spark/examples/sql/RDDRelation.scala |  4 +-
 .../streaming/SqlNetworkWordCount.scala |  2 +-
 .../spark/ml/feature/SQLTransformer.scala   |  5 +-
 python/pyspark/sql/catalog.py   | 26 +++---
 python/pyspark/sql/context.py   |  4 +-
 python/pyspark/sql/dataframe.py | 51 ++--
 python/pyspark/sql/session.py   |  6 +--
 .../analysis/DistinctAggregationRewriter.scala  |  2 +-
 .../sql/catalyst/catalog/SessionCatalog.scala   |  2 +-
 .../sql/catalyst/analysis/AnalysisTest.scala|  2 +-
 .../analysis/DecimalPrecisionSuite.scala|  2 +-
 .../catalyst/catalog/SessionCatalogSuite.scala  | 26 +-
 .../org/apache/spark/sql/DataFrameWriter.scala  |  2 +-
 .../scala/org/apache/spark/sql/Dataset.scala| 30 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  4 +-
 .../org/apache/spark/sql/SparkSession.scala | 17 +++
 .../org/apache/spark/sql/catalog/Catalog.scala  |  8 +--
 .../spark/sql/execution/SparkSqlParser.scala|  2 +-
 .../spark/sql/execution/command/cache.scala |  3 +-
 .../spark/sql/execution/command/views.scala |  2 +-
 .../spark/sql/execution/datasources/ddl.scala   |  4 +-
 .../apache/spark/sql/internal/CatalogImpl.scala | 12 ++---
 .../org/apache/spark/sql/CachedTableSuite.scala |  6 +--
 .../org/apache/spark/sql/DataFrameSuite.scala   | 19 
 .../spark/sql/DataFrameTimeWindowingSuite.scala |  2 +-
 .../org/apache/spark/sql/DatasetSuite.scala | 18 +++
 .../org/apache/spark/sql/ListTablesSuite.scala  |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala|  6 +--
 .../scala/org/apache/spark/sql/UDFSuite.scala   |  4 +-
 .../columnar/InMemoryColumnarQuerySuite.scala   |  2 +-
 .../parquet/ParquetReadBenchmark.scala  |  2 +-
 .../spark/sql/internal/CatalogSuite.scala   |  2 +-
 .../apache/spark/sql/test/SQLTestUtils.scala|  2 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +-
 .../spark/sql/hive/ErrorPositionSuite.scala |  6 +--
 .../spark/sql/hive/HiveDDLCommandSuite.scala|  2 +-
 .../apache/spark/sql/hive/ListTablesSuite.scala |  2 +-
 .../hive/execution/AggregationQuerySuite.scala  |  2 +-
 .../spark/sql/hive/execution/HiveUDFSuite.scala |  2 +-
 .../hive/execution/SQLWindowFunctionSuite.scala |  2 +-
 45 files changed, 197 insertions(+), 120 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9098b1a1/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
--
diff --git a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala 
b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
index 69c1154..1002631 100644
--- a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
+++ b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
@@ -41,7 +41,7 @@ object SparkSqlExample {
 import sqlContext._
 
 val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF()
-people.registerTempTable("people")
+people.createOrReplaceTempView("people")
 val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 
19")
 val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect()
 teenagerNames.foreach(println)
@@ -52,7 +52,7 @@ object SparkSqlExample {
 System.exit(-1)
   }
 }
-
+
 test(teenagerNames.size == 7, "Unexpected number of selected 

spark git commit: [SPARK-15171][SQL] Deprecate registerTempTable and add dataset.createTempView

2016-05-12 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master 5207a005c -> 33c6eb521


[SPARK-15171][SQL] Deprecate registerTempTable and add dataset.createTempView

## What changes were proposed in this pull request?

Deprecates `registerTempTable` and adds `Dataset.createTempView` and `Dataset.createOrReplaceTempView`.

## How was this patch tested?

Unit tests.

Author: Sean Zhong 

Closes #12945 from clockfly/spark-15171.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33c6eb52
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33c6eb52
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33c6eb52

Branch: refs/heads/master
Commit: 33c6eb5218ce3c31cc9f632a67fd2c7057569683
Parents: 5207a00
Author: Sean Zhong 
Authored: Thu May 12 15:51:53 2016 +0800
Committer: Cheng Lian 
Committed: Thu May 12 15:51:53 2016 +0800

--
 .../sbt_app_sql/src/main/scala/SqlApp.scala |  4 +-
 .../apache/spark/examples/sql/JavaSparkSQL.java |  8 +--
 .../streaming/JavaSqlNetworkWordCount.java  |  2 +-
 examples/src/main/python/sql.py |  2 +-
 .../python/streaming/sql_network_wordcount.py   |  2 +-
 .../apache/spark/examples/sql/RDDRelation.scala |  4 +-
 .../streaming/SqlNetworkWordCount.scala |  2 +-
 .../spark/ml/feature/SQLTransformer.scala   |  5 +-
 python/pyspark/sql/catalog.py   | 26 +++---
 python/pyspark/sql/context.py   |  4 +-
 python/pyspark/sql/dataframe.py | 51 ++--
 python/pyspark/sql/session.py   |  6 +--
 .../analysis/DistinctAggregationRewriter.scala  |  2 +-
 .../sql/catalyst/catalog/SessionCatalog.scala   |  2 +-
 .../sql/catalyst/analysis/AnalysisTest.scala|  2 +-
 .../analysis/DecimalPrecisionSuite.scala|  2 +-
 .../catalyst/catalog/SessionCatalogSuite.scala  | 26 +-
 .../org/apache/spark/sql/DataFrameWriter.scala  |  2 +-
 .../scala/org/apache/spark/sql/Dataset.scala| 30 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  4 +-
 .../org/apache/spark/sql/SparkSession.scala | 17 +++
 .../org/apache/spark/sql/catalog/Catalog.scala  |  8 +--
 .../spark/sql/execution/SparkSqlParser.scala|  2 +-
 .../spark/sql/execution/command/cache.scala |  3 +-
 .../spark/sql/execution/command/views.scala |  2 +-
 .../spark/sql/execution/datasources/ddl.scala   |  4 +-
 .../apache/spark/sql/internal/CatalogImpl.scala | 12 ++---
 .../org/apache/spark/sql/CachedTableSuite.scala |  6 +--
 .../org/apache/spark/sql/DataFrameSuite.scala   | 19 
 .../spark/sql/DataFrameTimeWindowingSuite.scala |  2 +-
 .../org/apache/spark/sql/DatasetSuite.scala | 18 +++
 .../org/apache/spark/sql/ListTablesSuite.scala  |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala|  6 +--
 .../scala/org/apache/spark/sql/UDFSuite.scala   |  4 +-
 .../columnar/InMemoryColumnarQuerySuite.scala   |  2 +-
 .../parquet/ParquetReadBenchmark.scala  |  2 +-
 .../spark/sql/internal/CatalogSuite.scala   |  2 +-
 .../apache/spark/sql/test/SQLTestUtils.scala|  2 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +-
 .../spark/sql/hive/ErrorPositionSuite.scala |  6 +--
 .../spark/sql/hive/HiveDDLCommandSuite.scala|  2 +-
 .../apache/spark/sql/hive/ListTablesSuite.scala |  2 +-
 .../hive/execution/AggregationQuerySuite.scala  |  2 +-
 .../spark/sql/hive/execution/HiveUDFSuite.scala |  2 +-
 .../hive/execution/SQLWindowFunctionSuite.scala |  2 +-
 45 files changed, 197 insertions(+), 120 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/33c6eb52/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
--
diff --git a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala 
b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
index 69c1154..1002631 100644
--- a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
+++ b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
@@ -41,7 +41,7 @@ object SparkSqlExample {
 import sqlContext._
 
 val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF()
-people.registerTempTable("people")
+people.createOrReplaceTempView("people")
 val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 
19")
 val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect()
 teenagerNames.foreach(println)
@@ -52,7 +52,7 @@ object SparkSqlExample {
 System.exit(-1)
   }
 }
-
+
 test(teenagerNames.size == 7, "Unexpected number of selected elements: " + 
teenagerNames)
 println("Test succeeded")
 sc.stop()


spark git commit: [SPARK-15281][PYSPARK][ML][TRIVIAL] Add impurity param to GBTRegressor & add experimental inside of regression.py

2016-05-12 Thread mlnick
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 beda3938c -> 6b69b8c0c


[SPARK-15281][PYSPARK][ML][TRIVIAL] Add impurity param to GBTRegressor & add 
experimental inside of regression.py

## What changes were proposed in this pull request?

Add the impurity param to GBTRegressor and mark the models & regressors in regression.py as experimental to match the Scaladoc.

## How was this patch tested?

Added default value to init, tested with unit/doc tests.

Author: Holden Karau 

Closes #13071 from holdenk/SPARK-15281-GBTRegressor-impurity.

(cherry picked from commit 5207a005cc86618907b8f467abc03eacef485ecd)
Signed-off-by: Nick Pentreath 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b69b8c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b69b8c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b69b8c0

Branch: refs/heads/branch-2.0
Commit: 6b69b8c0c778f4cba2b281fe3ad225dc922f82d6
Parents: beda393
Author: Holden Karau 
Authored: Thu May 12 09:19:27 2016 +0200
Committer: Nick Pentreath 
Committed: Thu May 12 09:20:13 2016 +0200

--
 python/pyspark/ml/regression.py | 52 ++--
 1 file changed, 44 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6b69b8c0/python/pyspark/ml/regression.py
--
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index a2300fa..0d0eb8a 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -40,6 +40,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
HasRegParam, HasTol, HasElasticNetParam, 
HasFitIntercept,
HasStandardization, HasSolver, HasWeightCol, 
JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 Linear regression.
 
 The learning objective is to minimize the squared error, with 
regularization.
@@ -123,6 +125,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
 
 class LinearRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by LinearRegression.
 
 .. versionadded:: 1.4.0
@@ -631,6 +635,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 DecisionTreeParams, TreeRegressorParams, 
HasCheckpointInterval,
 HasSeed, JavaMLWritable, JavaMLReadable, 
HasVarianceCol):
 """
+.. note:: Experimental
+
 `Decision tree `_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -713,7 +719,10 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 
 @inherit_doc
 class DecisionTreeModel(JavaModel):
-"""Abstraction for Decision Tree models.
+"""
+.. note:: Experimental
+
+Abstraction for Decision Tree models.
 
 .. versionadded:: 1.5.0
 """
@@ -736,7 +745,10 @@ class DecisionTreeModel(JavaModel):
 
 @inherit_doc
 class TreeEnsembleModels(JavaModel):
-"""Represents a tree ensemble model.
+"""
+.. note:: Experimental
+
+Represents a tree ensemble model.
 
 .. versionadded:: 1.5.0
 """
@@ -754,6 +766,8 @@ class TreeEnsembleModels(JavaModel):
 @inherit_doc
 class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, 
JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by DecisionTreeRegressor.
 
 .. versionadded:: 1.4.0
@@ -786,6 +800,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 RandomForestParams, TreeRegressorParams, 
HasCheckpointInterval,
 JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 `Random Forest `_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -868,6 +884,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 
 class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, 
JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by RandomForestRegressor.
 
 .. versionadded:: 1.4.0
@@ -892,8 +910,10 @@ class RandomForestRegressionModel(TreeEnsembleModels, 
JavaMLWritable, JavaMLRead
 @inherit_doc
 class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasMaxIter,
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, 

spark git commit: [SPARK-15281][PYSPARK][ML][TRIVIAL] Add impurity param to GBTRegressor & add experimental inside of regression.py

2016-05-12 Thread mlnick
Repository: spark
Updated Branches:
  refs/heads/master 46991448a -> 5207a005c


[SPARK-15281][PYSPARK][ML][TRIVIAL] Add impurity param to GBTRegressor & add 
experimental inside of regression.py

## What changes were proposed in this pull request?

Add the impurity param to GBTRegressor and mark the models & regressors in regression.py as experimental to match the Scaladoc.

## How was this patch tested?

Added default value to init, tested with unit/doc tests.

Author: Holden Karau 

Closes #13071 from holdenk/SPARK-15281-GBTRegressor-impurity.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5207a005
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5207a005
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5207a005

Branch: refs/heads/master
Commit: 5207a005cc86618907b8f467abc03eacef485ecd
Parents: 4699144
Author: Holden Karau 
Authored: Thu May 12 09:19:27 2016 +0200
Committer: Nick Pentreath 
Committed: Thu May 12 09:19:27 2016 +0200

--
 python/pyspark/ml/regression.py | 52 ++--
 1 file changed, 44 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5207a005/python/pyspark/ml/regression.py
--
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index a2300fa..0d0eb8a 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -40,6 +40,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
HasRegParam, HasTol, HasElasticNetParam, 
HasFitIntercept,
HasStandardization, HasSolver, HasWeightCol, 
JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 Linear regression.
 
 The learning objective is to minimize the squared error, with 
regularization.
@@ -123,6 +125,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPrediction
 
 class LinearRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by LinearRegression.
 
 .. versionadded:: 1.4.0
@@ -631,6 +635,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 DecisionTreeParams, TreeRegressorParams, 
HasCheckpointInterval,
 HasSeed, JavaMLWritable, JavaMLReadable, 
HasVarianceCol):
 """
+.. note:: Experimental
+
 `Decision tree `_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -713,7 +719,10 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 
 @inherit_doc
 class DecisionTreeModel(JavaModel):
-"""Abstraction for Decision Tree models.
+"""
+.. note:: Experimental
+
+Abstraction for Decision Tree models.
 
 .. versionadded:: 1.5.0
 """
@@ -736,7 +745,10 @@ class DecisionTreeModel(JavaModel):
 
 @inherit_doc
 class TreeEnsembleModels(JavaModel):
-"""Represents a tree ensemble model.
+"""
+.. note:: Experimental
+
+Represents a tree ensemble model.
 
 .. versionadded:: 1.5.0
 """
@@ -754,6 +766,8 @@ class TreeEnsembleModels(JavaModel):
 @inherit_doc
 class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, 
JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by DecisionTreeRegressor.
 
 .. versionadded:: 1.4.0
@@ -786,6 +800,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 RandomForestParams, TreeRegressorParams, 
HasCheckpointInterval,
 JavaMLWritable, JavaMLReadable):
 """
+.. note:: Experimental
+
 `Random Forest `_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -868,6 +884,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
 
 class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, 
JavaMLReadable):
 """
+.. note:: Experimental
+
 Model fitted by RandomForestRegressor.
 
 .. versionadded:: 1.4.0
@@ -892,8 +910,10 @@ class RandomForestRegressionModel(TreeEnsembleModels, 
JavaMLWritable, JavaMLRead
 @inherit_doc
 class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasMaxIter,
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, 
JavaMLWritable,
-   JavaMLReadable):
+   JavaMLReadable, TreeRegressorParams):
 """
+.. note:: 

spark git commit: [SPARK-15160][SQL] support data source table in InMemoryCatalog

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 86acb5efd -> beda3938c


[SPARK-15160][SQL] support data source table in InMemoryCatalog

## What changes were proposed in this pull request?

This PR adds a new rule that converts a `SimpleCatalogRelation` to a data source table if its table properties contain data source information.
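
A hedged usage sketch of what this enables (assuming a Spark 2.0 `SparkSession` named `spark` built without `enableHiveSupport()`, so the in-memory catalog is used; the table name and data are illustrative):

```scala
// With this change, a table whose catalog properties describe a data source
// can be resolved as a data source relation even under InMemoryCatalog.
spark.sql("CREATE TABLE src(id INT, name STRING) USING parquet")
spark.sql("INSERT INTO src VALUES (1, 'a'), (2, 'b')")
spark.sql("SELECT * FROM src").show()
```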

## How was this patch tested?

A new test in `SQLQuerySuite`.

Author: Wenchen Fan 

Closes #12935 from cloud-fan/ds-table.

(cherry picked from commit 46991448aa6f78f413a761059d7d7bb586f9d63e)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/beda3938
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/beda3938
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/beda3938

Branch: refs/heads/branch-2.0
Commit: beda3938c2901de81a1df9ed802b136b7abe29f4
Parents: 86acb5e
Author: Wenchen Fan 
Authored: Wed May 11 23:55:42 2016 -0700
Committer: Yin Huai 
Committed: Wed May 11 23:55:58 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  2 +
 .../command/createDataSourceTables.scala|  4 +-
 .../spark/sql/execution/command/ddl.scala   | 76 
 .../spark/sql/execution/command/tables.scala| 27 ---
 .../datasources/DataSourceStrategy.scala| 47 +++-
 .../spark/sql/internal/SessionState.scala   |  7 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala| 16 +
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +
 8 files changed, 114 insertions(+), 67 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/beda3938/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index fc2068c..d215655 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -185,6 +185,8 @@ case class SimpleCatalogRelation(
 
   override def catalogTable: CatalogTable = metadata
 
+  override lazy val resolved: Boolean = false
+
   override val output: Seq[Attribute] = {
 val cols = catalogTable.schema
   .filter { c => !catalogTable.partitionColumnNames.contains(c.name) }

http://git-wip-us.apache.org/repos/asf/spark/blob/beda3938/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index de3c868..7d3c5257 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -26,7 +26,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
-import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, 
CatalogStorageFormat, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.internal.HiveSerDe
@@ -200,6 +200,8 @@ case class CreateDataSourceTableAsSelectCommand(
 s"doesn't match the data schema[${query.schema}]'s")
   }
   existingSchema = Some(l.schema)
+case s: SimpleCatalogRelation if 
DDLUtils.isDatasourceTable(s.metadata) =>
+  existingSchema = 
DDLUtils.getSchemaFromTableProperties(s.metadata)
 case o =>
   throw new AnalysisException(s"Saving data in ${o.toString} is 
not supported.")
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/beda3938/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 0b0b618..1c1716f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ 

spark git commit: [SPARK-15160][SQL] support data source table in InMemoryCatalog

2016-05-12 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 9e266d07a -> 46991448a


[SPARK-15160][SQL] support data source table in InMemoryCatalog

## What changes were proposed in this pull request?

This PR adds a new rule that converts a `SimpleCatalogRelation` to a data source table if its table properties contain data source information.

## How was this patch tested?

A new test in `SQLQuerySuite`.

Author: Wenchen Fan 

Closes #12935 from cloud-fan/ds-table.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/46991448
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/46991448
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/46991448

Branch: refs/heads/master
Commit: 46991448aa6f78f413a761059d7d7bb586f9d63e
Parents: 9e266d0
Author: Wenchen Fan 
Authored: Wed May 11 23:55:42 2016 -0700
Committer: Yin Huai 
Committed: Wed May 11 23:55:42 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  2 +
 .../command/createDataSourceTables.scala|  4 +-
 .../spark/sql/execution/command/ddl.scala   | 76 
 .../spark/sql/execution/command/tables.scala| 27 ---
 .../datasources/DataSourceStrategy.scala| 47 +++-
 .../spark/sql/internal/SessionState.scala   |  7 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala| 16 +
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +
 8 files changed, 114 insertions(+), 67 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/46991448/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index fc2068c..d215655 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -185,6 +185,8 @@ case class SimpleCatalogRelation(
 
   override def catalogTable: CatalogTable = metadata
 
+  override lazy val resolved: Boolean = false
+
   override val output: Seq[Attribute] = {
 val cols = catalogTable.schema
   .filter { c => !catalogTable.partitionColumnNames.contains(c.name) }

http://git-wip-us.apache.org/repos/asf/spark/blob/46991448/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index de3c868..7d3c5257 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -26,7 +26,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
-import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, 
CatalogStorageFormat, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.internal.HiveSerDe
@@ -200,6 +200,8 @@ case class CreateDataSourceTableAsSelectCommand(
 s"doesn't match the data schema[${query.schema}]'s")
   }
   existingSchema = Some(l.schema)
+case s: SimpleCatalogRelation if 
DDLUtils.isDatasourceTable(s.metadata) =>
+  existingSchema = 
DDLUtils.getSchemaFromTableProperties(s.metadata)
 case o =>
   throw new AnalysisException(s"Saving data in ${o.toString} is 
not supported.")
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/46991448/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 0b0b618..1c1716f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -25,6 +25,7 @@ import 
org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable}
 import