[1/3] spark-website git commit: Add 1.6.3 release.
Repository: spark-website Updated Branches: refs/heads/asf-site 24d32b75d -> b9aa4c3ee http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/releases/spark-release-1-2-1.html -- diff --git a/site/releases/spark-release-1-2-1.html b/site/releases/spark-release-1-2-1.html index 5581c54..c9efc6a 100644 --- a/site/releases/spark-release-1-2-1.html +++ b/site/releases/spark-release-1-2-1.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/releases/spark-release-1-2-2.html -- diff --git a/site/releases/spark-release-1-2-2.html b/site/releases/spark-release-1-2-2.html index c8a859a..d76c619 100644 --- a/site/releases/spark-release-1-2-2.html +++ b/site/releases/spark-release-1-2-2.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/releases/spark-release-1-3-0.html -- diff --git a/site/releases/spark-release-1-3-0.html b/site/releases/spark-release-1-3-0.html index 382ef4d..435ed19 100644 --- a/site/releases/spark-release-1-3-0.html +++ b/site/releases/spark-release-1-3-0.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive @@ -191,7 +191,7 @@ To download Spark 1.3 visit the downloads page. Spark Core -Spark 1.3 sees a handful of usability improvements in the core engine. The core API now supports https://issues.apache.org/jira/browse/SPARK-5430";>multi level aggregation trees to help speed up expensive reduce operations. https://issues.apache.org/jira/browse/SPARK-5063";>Improved error reporting has been added for certain gotcha operations. Spark’s Jetty dependency is https://issues.apache.org/jira/browse/SPARK-3996";>now shaded to help avoid conflicts with user programs. Spark now supports https://issues.apache.org/jira/browse/SPARK-3883";>SSL encryption for some communication endpoints. Finaly, realtime https://issues.apache.org/jira/browse/SPARK-3428";>GC metrics and https://issues.apache.org/jira/browse/SPARK-4874";>record counts have been added to the UI. +Spark 1.3 sees a handful of usability improvements in the core engine. The core API now supports https://issues.apache.org/jira/browse/SPARK-5430";>multi level aggregation trees to help speed up expensive reduce operations. https://issues.apache.org/jira/browse/SPARK-5063";>Improved error reporting has been added for certain gotcha operations. Spark’s Jetty dependency is https://issues.apache.org/jira/browse/SPARK-3996";>now shaded to help avoid conflicts with user programs. Spark now supports https://issues.apache.org/jira/browse/SPARK-3883";>SSL encryption for some communication endpoints. Finaly, realtime https://issues.apache.org/jira/browse/SPARK-3428";>GC metrics and https://issues.apache.org/jira/browse/SPARK-4874";>record counts have been added to the UI. 
DataFrame API Spark 1.3 adds a new DataFrames API that provides powerful and convenient operators when working with structured datasets. The DataFrame is an evolution of the base RDD API that includes named fields along with schema information. It's easy to construct a DataFrame from sources such as Hive tables, JSON data, a JDBC database, or any implementation of Spark's new data source API. Data frames will become a common interchange format between Spark components and when importing and exporting data to other systems. Data frames are supported in Python, Scala, and Java. @@ -203,7 +203,7 @@ In this release Spark MLlib introduces several new algorithms: latent Dirichlet allocation (LDA) for https://issues.apache.org/jira/browse/SPARK-1405";>topic modeling, https://issues.apache.org/jira/browse/SPARK-2309";>multinomi
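For context, the DataFrame API described in the release notes above can be exercised with the 1.3-era SQLContext entry points. A minimal sketch, not part of this commit; the file name and column names are illustrative:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Spark 1.3-style entry points (SparkSession did not exist yet).
val sc = new SparkContext(new SparkConf().setAppName("dataframe-sketch"))
val sqlContext = new SQLContext(sc)

// Construct a DataFrame from one of the sources mentioned above (JSON here).
val people = sqlContext.jsonFile("people.json")

// Named fields and schema information travel with the DataFrame.
people.printSchema()
people.select("name", "age").filter(people("age") > 21).show()
```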
[3/3] spark-website git commit: Add 1.6.3 release.
Add 1.6.3 release. Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/b9aa4c3e Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/b9aa4c3e Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/b9aa4c3e Branch: refs/heads/asf-site Commit: b9aa4c3eefe4788fa97086ea87d92d8e3bfbc535 Parents: 24d32b7 Author: Reynold Xin Authored: Mon Nov 7 19:05:12 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 19:05:12 2016 -0800 -- documentation.md| 1 + js/downloads.js | 3 +- news/_posts/2016-11-07-spark-1-6-3-released.md | 16 ++ .../_posts/2016-11-07-spark-release-1-6-3.md| 18 ++ site/community.html | 6 +- site/documentation.html | 12 +- site/downloads.html | 6 +- site/examples.html | 6 +- site/faq.html | 6 +- site/graphx/index.html | 6 +- site/index.html | 6 +- site/js/downloads.js| 3 +- site/mailing-lists.html | 6 +- site/mllib/index.html | 6 +- site/news/amp-camp-2013-registration-ope.html | 6 +- .../news/announcing-the-first-spark-summit.html | 6 +- .../news/fourth-spark-screencast-published.html | 6 +- site/news/index.html| 30 ++- site/news/nsdi-paper.html | 6 +- site/news/one-month-to-spark-summit-2015.html | 6 +- .../proposals-open-for-spark-summit-east.html | 6 +- ...registration-open-for-spark-summit-east.html | 6 +- .../news/run-spark-and-shark-on-amazon-emr.html | 6 +- site/news/spark-0-6-1-and-0-5-2-released.html | 6 +- site/news/spark-0-6-2-released.html | 6 +- site/news/spark-0-7-0-released.html | 6 +- site/news/spark-0-7-2-released.html | 6 +- site/news/spark-0-7-3-released.html | 6 +- site/news/spark-0-8-0-released.html | 6 +- site/news/spark-0-8-1-released.html | 6 +- site/news/spark-0-9-0-released.html | 6 +- site/news/spark-0-9-1-released.html | 8 +- site/news/spark-0-9-2-released.html | 8 +- site/news/spark-1-0-0-released.html | 6 +- site/news/spark-1-0-1-released.html | 6 +- site/news/spark-1-0-2-released.html | 6 +- site/news/spark-1-1-0-released.html | 8 +- site/news/spark-1-1-1-released.html | 6 +- site/news/spark-1-2-0-released.html | 6 +- site/news/spark-1-2-1-released.html | 6 +- site/news/spark-1-2-2-released.html | 8 +- site/news/spark-1-3-0-released.html | 6 +- site/news/spark-1-4-0-released.html | 6 +- site/news/spark-1-4-1-released.html | 6 +- site/news/spark-1-5-0-released.html | 6 +- site/news/spark-1-5-1-released.html | 6 +- site/news/spark-1-5-2-released.html | 6 +- site/news/spark-1-6-0-released.html | 6 +- site/news/spark-1-6-1-released.html | 6 +- site/news/spark-1-6-2-released.html | 6 +- site/news/spark-1-6-3-released.html | 213 ++ site/news/spark-2-0-0-released.html | 6 +- site/news/spark-2-0-1-released.html | 6 +- site/news/spark-2.0.0-preview.html | 6 +- .../spark-accepted-into-apache-incubator.html | 6 +- site/news/spark-and-shark-in-the-news.html | 8 +- site/news/spark-becomes-tlp.html| 6 +- site/news/spark-featured-in-wired.html | 6 +- .../spark-mailing-lists-moving-to-apache.html | 6 +- site/news/spark-meetups.html| 6 +- site/news/spark-screencasts-published.html | 6 +- site/news/spark-summit-2013-is-a-wrap.html | 6 +- site/news/spark-summit-2014-videos-posted.html | 6 +- site/news/spark-summit-2015-videos-posted.html | 6 +- site/news/spark-summit-agenda-posted.html | 6 +- .../spark-summit-east-2015-videos-posted.html | 8 +- .../spark-summit-east-2016-cfp-closing.html | 6 +- site/news/spark-summit-east-agenda-posted.html | 6 +- .../news/spark-summit-europe-agenda-posted.html | 6 +- site/news/spark-summit-europe.html | 6 +- 
.../spark-summit-june-2016-agenda-posted.html | 6 +- site/news/spark-tips-from-quantifind.html | 6 +- .../spark-user-survey-and-powered-by-page.html | 6 +- site/news/spark-version-0-6-0-released.html | 6 +- ...-wins-daytona-gray-sort-100tb-benchmark.html | 6 +- .../strata-exercises-now-ava
[2/3] spark-website git commit: Add 1.6.3 release.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/news/spark-2-0-0-released.html -- diff --git a/site/news/spark-2-0-0-released.html b/site/news/spark-2-0-0-released.html index dd2f3e8..070d5ac 100644 --- a/site/news/spark-2-0-0-released.html +++ b/site/news/spark-2-0-0-released.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/news/spark-2-0-1-released.html -- diff --git a/site/news/spark-2-0-1-released.html b/site/news/spark-2-0-1-released.html index 8ee951f..f772398 100644 --- a/site/news/spark-2-0-1-released.html +++ b/site/news/spark-2-0-1-released.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/news/spark-2.0.0-preview.html -- diff --git a/site/news/spark-2.0.0-preview.html b/site/news/spark-2.0.0-preview.html index 87d446a..7e7f1a8 100644 --- a/site/news/spark-2.0.0-preview.html +++ b/site/news/spark-2.0.0-preview.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/news/spark-accepted-into-apache-incubator.html -- diff --git a/site/news/spark-accepted-into-apache-incubator.html b/site/news/spark-accepted-into-apache-incubator.html index bb6ed2e..e6330cf 100644 --- a/site/news/spark-accepted-into-apache-incubator.html +++ b/site/news/spark-accepted-into-apache-incubator.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive http://git-wip-us.apache.org/repos/asf/spark-website/blob/b9aa4c3e/site/news/spark-and-shark-in-the-news.html -- diff --git a/site/news/spark-and-shark-in-the-news.html b/site/news/spark-and-shark-in-the-news.html index 16375a5..d48af18 100644 --- a/site/news/spark-and-shark-in-the-news.html +++ b/site/news/spark-and-shark-in-the-news.html @@ -150,6 +150,9 @@ Latest News + Spark 1.6.3 released + (Nov 07, 2016) + Spark 2.0.1 released (Oct 03, 2016) @@ -159,9 +162,6 @@ Spark 1.6.2 released (Jun 25, 2016) - Call for Presentations for Spark Summit EU is Open - (Jun 16, 2016) - Archive @@ -196,7 +196,7 @@ http://data-informed.com/spark-an-open-source-engine-for-iterative-data-mining/";>DataInformed interviewed two Spark users and wrote about their applications in anomaly detection, predictive analytics and data mining. -In other news, there will be a full day of tutorials on Spark and Shark at the http://strataconf.com/strata2013";>O’Reilly Strata conference in February. They include a three-hour http://strataconf.com/strata2013/public/schedule/detail/27438";>introduction to Spark, Shark and BDAS Tuesday morning, and a three-hour http://strataconf.com/strata2013/public/schedule/detail/27440";>hands-on exercise session. 
+In other news, there will be a full day of tutorials on Spark and Shark at the http://strataconf.com/strata2013";>O’Reilly Strata conference in February. They include a three-hour http://strataconf.com/strata2013/public/schedule/detail/27438";>introduction to Spark, Shark and BDAS Tu
spark git commit: [SPARK-16575][CORE] partition calculation mismatch with sc.binaryFiles
Repository: spark Updated Branches: refs/heads/branch-2.1 4cb4e5ff0 -> c8879bf1e [SPARK-16575][CORE] partition calculation mismatch with sc.binaryFiles ## What changes were proposed in this pull request? This Pull request comprises of the critical bug SPARK-16575 changes. This change rectifies the issue with BinaryFileRDD partition calculations as upon creating an RDD with sc.binaryFiles, the resulting RDD always just consisted of two partitions only. ## How was this patch tested? The original issue ie. getNumPartitions on binary Files RDD (always having two partitions) was first replicated and then tested upon the changes. Also the unit tests have been checked and passed. This contribution is my original work and I licence the work to the project under the project's open source license srowen hvanhovell rxin vanzin skyluc kmader zsxwing datafarmer Please have a look . Author: fidato Closes #15327 from fidato13/SPARK-16575. (cherry picked from commit 6f3697136aa68dc39d3ce42f43a7af554d2a3bf9) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c8879bf1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c8879bf1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c8879bf1 Branch: refs/heads/branch-2.1 Commit: c8879bf1ee2af9ccd5d5656571d931d2fc1da024 Parents: 4cb4e5f Author: fidato Authored: Mon Nov 7 18:41:17 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 18:41:29 2016 -0800 -- .../org/apache/spark/input/PortableDataStream.scala | 14 +++--- .../org/apache/spark/internal/config/package.scala | 13 + .../scala/org/apache/spark/rdd/BinaryFileRDD.scala | 4 ++-- docs/configuration.md | 16 4 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c8879bf1/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala -- diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index f66510b..59404e0 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -27,6 +27,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} +import org.apache.spark.internal.config +import org.apache.spark.SparkContext + /** * A general format for reading whole files in as streams, byte arrays, * or other functions to be added @@ -40,9 +43,14 @@ private[spark] abstract class StreamFileInputFormat[T] * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API * which is set through setMaxSplitSize */ - def setMinPartitions(context: JobContext, minPartitions: Int) { -val totalLen = listStatus(context).asScala.filterNot(_.isDirectory).map(_.getLen).sum -val maxSplitSize = math.ceil(totalLen / math.max(minPartitions, 1.0)).toLong + def setMinPartitions(sc: SparkContext, context: JobContext, minPartitions: Int) { +val defaultMaxSplitBytes = sc.getConf.get(config.FILES_MAX_PARTITION_BYTES) +val openCostInBytes = sc.getConf.get(config.FILES_OPEN_COST_IN_BYTES) +val defaultParallelism = sc.defaultParallelism +val files = listStatus(context).asScala +val totalBytes = files.filterNot(_.isDirectory).map(_.getLen + openCostInBytes).sum +val bytesPerCore = totalBytes / 
defaultParallelism +val maxSplitSize = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore)) super.setMaxSplitSize(maxSplitSize) } http://git-wip-us.apache.org/repos/asf/spark/blob/c8879bf1/core/src/main/scala/org/apache/spark/internal/config/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 497ca92..4a3e3d5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -206,4 +206,17 @@ package object config { "encountering corrupt files and contents that have been read will still be returned.") .booleanConf .createWithDefault(false) + + private[spark] val FILES_MAX_PARTITION_BYTES = ConfigBuilder("spark.files.maxPartitionBytes") +.doc("The maximum number of bytes to pack into a single partition when reading files.") +
spark git commit: [SPARK-16575][CORE] partition calculation mismatch with sc.binaryFiles
Repository: spark Updated Branches: refs/heads/master 1da64e1fa -> 6f3697136 [SPARK-16575][CORE] partition calculation mismatch with sc.binaryFiles ## What changes were proposed in this pull request? This Pull request comprises of the critical bug SPARK-16575 changes. This change rectifies the issue with BinaryFileRDD partition calculations as upon creating an RDD with sc.binaryFiles, the resulting RDD always just consisted of two partitions only. ## How was this patch tested? The original issue ie. getNumPartitions on binary Files RDD (always having two partitions) was first replicated and then tested upon the changes. Also the unit tests have been checked and passed. This contribution is my original work and I licence the work to the project under the project's open source license srowen hvanhovell rxin vanzin skyluc kmader zsxwing datafarmer Please have a look . Author: fidato Closes #15327 from fidato13/SPARK-16575. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f369713 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f369713 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f369713 Branch: refs/heads/master Commit: 6f3697136aa68dc39d3ce42f43a7af554d2a3bf9 Parents: 1da64e1 Author: fidato Authored: Mon Nov 7 18:41:17 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 18:41:17 2016 -0800 -- .../org/apache/spark/input/PortableDataStream.scala | 14 +++--- .../org/apache/spark/internal/config/package.scala | 13 + .../scala/org/apache/spark/rdd/BinaryFileRDD.scala | 4 ++-- docs/configuration.md | 16 4 files changed, 42 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f369713/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala -- diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index f66510b..59404e0 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -27,6 +27,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} +import org.apache.spark.internal.config +import org.apache.spark.SparkContext + /** * A general format for reading whole files in as streams, byte arrays, * or other functions to be added @@ -40,9 +43,14 @@ private[spark] abstract class StreamFileInputFormat[T] * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API * which is set through setMaxSplitSize */ - def setMinPartitions(context: JobContext, minPartitions: Int) { -val totalLen = listStatus(context).asScala.filterNot(_.isDirectory).map(_.getLen).sum -val maxSplitSize = math.ceil(totalLen / math.max(minPartitions, 1.0)).toLong + def setMinPartitions(sc: SparkContext, context: JobContext, minPartitions: Int) { +val defaultMaxSplitBytes = sc.getConf.get(config.FILES_MAX_PARTITION_BYTES) +val openCostInBytes = sc.getConf.get(config.FILES_OPEN_COST_IN_BYTES) +val defaultParallelism = sc.defaultParallelism +val files = listStatus(context).asScala +val totalBytes = files.filterNot(_.isDirectory).map(_.getLen + openCostInBytes).sum +val bytesPerCore = totalBytes / defaultParallelism +val maxSplitSize = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, 
bytesPerCore)) super.setMaxSplitSize(maxSplitSize) } http://git-wip-us.apache.org/repos/asf/spark/blob/6f369713/core/src/main/scala/org/apache/spark/internal/config/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 497ca92..4a3e3d5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -206,4 +206,17 @@ package object config { "encountering corrupt files and contents that have been read will still be returned.") .booleanConf .createWithDefault(false) + + private[spark] val FILES_MAX_PARTITION_BYTES = ConfigBuilder("spark.files.maxPartitionBytes") +.doc("The maximum number of bytes to pack into a single partition when reading files.") +.longConf +.createWithDefault(128 * 1024 * 1024) + + private[spark] val FILES_OPEN_COST_IN_BYTES =
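The core of the fix is the new split-size rule in setMinPartitions: size partitions from the total input bytes (plus a per-file open cost) divided by the default parallelism, capped by spark.files.maxPartitionBytes. A standalone sketch of that rule follows; only the 128 MB maxPartitionBytes default is visible in the diff above, so the open-cost value used here is illustrative:

```scala
// Mirrors the calculation added to StreamFileInputFormat.setMinPartitions.
def maxSplitSize(
    fileSizes: Seq[Long],
    defaultMaxSplitBytes: Long, // spark.files.maxPartitionBytes (128 MB default per the diff)
    openCostInBytes: Long,      // spark.files.openCostInBytes (illustrative value below)
    defaultParallelism: Int): Long = {
  val totalBytes = fileSizes.map(_ + openCostInBytes).sum
  val bytesPerCore = totalBytes / defaultParallelism
  math.min(defaultMaxSplitBytes, math.max(openCostInBytes, bytesPerCore))
}

val mb = 1024L * 1024L
// Eight 64 MB binary files with default parallelism 16:
// (8 * 68 MB) / 16 = 34 MB per split, so sc.binaryFiles no longer collapses to two partitions.
maxSplitSize(Seq.fill(8)(64 * mb), 128 * mb, 4 * mb, 16)
```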
spark git commit: [SPARK-18217][SQL] Disallow creating permanent views based on temporary views or UDFs
Repository: spark Updated Branches: refs/heads/master c1a0c66bd -> 1da64e1fa [SPARK-18217][SQL] Disallow creating permanent views based on temporary views or UDFs ### What changes were proposed in this pull request? Based on the discussion in [SPARK-18209](https://issues.apache.org/jira/browse/SPARK-18209). It doesn't really make sense to create permanent views based on temporary views or temporary UDFs. To disallow the supports and issue the exceptions, this PR needs to detect whether a temporary view/UDF is being used when defining a permanent view. Basically, this PR can be split to two sub-tasks: **Task 1:** detecting a temporary view from the query plan of view definition. When finding an unresolved temporary view, Analyzer replaces it by a `SubqueryAlias` with the corresponding logical plan, which is stored in an in-memory HashMap. After replacement, it is impossible to detect whether the `SubqueryAlias` is added/generated from a temporary view. Thus, to detect the usage of a temporary view in view definition, this PR traverses the unresolved logical plan and uses the name of an `UnresolvedRelation` to detect whether it is a (global) temporary view. **Task 2:** detecting a temporary UDF from the query plan of view definition. Detecting usage of a temporary UDF in view definition is not straightfoward. First, in the analyzed plan, we are having different forms to represent the functions. More importantly, some classes (e.g., `HiveGenericUDF`) are not accessible from `CreateViewCommand`, which is part of `sql/core`. Thus, we used the unanalyzed plan `child` of `CreateViewCommand` to detect the usage of a temporary UDF. Because the plan has already been successfully analyzed, we can assume the functions have been defined/registered. Second, in Spark, the functions have four forms: Spark built-in functions, built-in hash functions, permanent UDFs and temporary UDFs. We do not have any direct way to determine whether a function is temporary or not. Thus, we introduced a function `isTemporaryFunction` in `SessionCatalog`. This function contains the detailed logics to determine whether a function is temporary or not. ### How was this patch tested? Added test cases. Author: gatorsmile Closes #15764 from gatorsmile/blockTempFromPermViewCreation. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1da64e1f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1da64e1f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1da64e1f Branch: refs/heads/master Commit: 1da64e1fa0970277d1fb47dec8adca47b068b1ec Parents: c1a0c66 Author: gatorsmile Authored: Mon Nov 7 18:34:21 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 18:34:21 2016 -0800 -- .../sql/catalyst/catalog/SessionCatalog.scala | 18 .../catalyst/catalog/SessionCatalogSuite.scala | 28 ++ .../spark/sql/execution/command/views.scala | 38 +++- .../spark/sql/hive/HiveSessionCatalog.scala | 1 + .../spark/sql/hive/execution/SQLViewSuite.scala | 99 ++-- 5 files changed, 172 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1da64e1f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 2d2120d..c8b61d8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -923,6 +923,24 @@ class SessionCatalog( } } + /** + * Returns whether it is a temporary function. If not existed, returns false. + */ + def isTemporaryFunction(name: FunctionIdentifier): Boolean = { +// copied from HiveSessionCatalog +val hiveFunctions = Seq( + "hash", + "histogram_numeric", + "percentile") + +// A temporary function is a function that has been registered in functionRegistry +// without a database name, and is neither a built-in function nor a Hive function +name.database.isEmpty && + functionRegistry.functionExists(name.funcName) && + !FunctionRegistry.builtin.functionExists(name.funcName) && + !hiveFunctions.contains(name.funcName.toLowerCase) + } + protected def failFunctionLookup(name: String): Nothing = { throw new NoSuchFunctionException(db = currentDb, func = name) } http://git-wip-us.apache.org/repos/asf/spark/blob/1da64e1f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala -
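For context, a minimal usage sketch of the behavior this change enforces; the view and table names are made up and the exact error message is not quoted from the patch. A permanent view referencing a temporary view should now be rejected, while temporary-over-temporary remains allowed:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("perm-view-check").getOrCreate()
import spark.implicits._

Seq((1, "a"), (2, "b")).toDF("id", "name").createOrReplaceTempView("tmp_people")

// Still allowed: a temporary view built on another temporary view.
spark.sql("CREATE TEMPORARY VIEW tmp_ids AS SELECT id FROM tmp_people")

// Disallowed after this change: a permanent view referencing a temporary view
// is expected to fail during analysis rather than being silently created.
spark.sql("CREATE VIEW perm_ids AS SELECT id FROM tmp_people")
```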
spark git commit: [SPARK-18217][SQL] Disallow creating permanent views based on temporary views or UDFs
Repository: spark Updated Branches: refs/heads/branch-2.1 4943929d8 -> 4cb4e5ff0 [SPARK-18217][SQL] Disallow creating permanent views based on temporary views or UDFs ### What changes were proposed in this pull request? Based on the discussion in [SPARK-18209](https://issues.apache.org/jira/browse/SPARK-18209). It doesn't really make sense to create permanent views based on temporary views or temporary UDFs. To disallow the supports and issue the exceptions, this PR needs to detect whether a temporary view/UDF is being used when defining a permanent view. Basically, this PR can be split to two sub-tasks: **Task 1:** detecting a temporary view from the query plan of view definition. When finding an unresolved temporary view, Analyzer replaces it by a `SubqueryAlias` with the corresponding logical plan, which is stored in an in-memory HashMap. After replacement, it is impossible to detect whether the `SubqueryAlias` is added/generated from a temporary view. Thus, to detect the usage of a temporary view in view definition, this PR traverses the unresolved logical plan and uses the name of an `UnresolvedRelation` to detect whether it is a (global) temporary view. **Task 2:** detecting a temporary UDF from the query plan of view definition. Detecting usage of a temporary UDF in view definition is not straightfoward. First, in the analyzed plan, we are having different forms to represent the functions. More importantly, some classes (e.g., `HiveGenericUDF`) are not accessible from `CreateViewCommand`, which is part of `sql/core`. Thus, we used the unanalyzed plan `child` of `CreateViewCommand` to detect the usage of a temporary UDF. Because the plan has already been successfully analyzed, we can assume the functions have been defined/registered. Second, in Spark, the functions have four forms: Spark built-in functions, built-in hash functions, permanent UDFs and temporary UDFs. We do not have any direct way to determine whether a function is temporary or not. Thus, we introduced a function `isTemporaryFunction` in `SessionCatalog`. This function contains the detailed logics to determine whether a function is temporary or not. ### How was this patch tested? Added test cases. Author: gatorsmile Closes #15764 from gatorsmile/blockTempFromPermViewCreation. 
(cherry picked from commit 1da64e1fa0970277d1fb47dec8adca47b068b1ec) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4cb4e5ff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4cb4e5ff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4cb4e5ff Branch: refs/heads/branch-2.1 Commit: 4cb4e5ff0ab9537758bf0b418ddd40dfe9537609 Parents: 4943929 Author: gatorsmile Authored: Mon Nov 7 18:34:21 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 18:34:29 2016 -0800 -- .../sql/catalyst/catalog/SessionCatalog.scala | 18 .../catalyst/catalog/SessionCatalogSuite.scala | 28 ++ .../spark/sql/execution/command/views.scala | 38 +++- .../spark/sql/hive/HiveSessionCatalog.scala | 1 + .../spark/sql/hive/execution/SQLViewSuite.scala | 99 ++-- 5 files changed, 172 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4cb4e5ff/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 2d2120d..c8b61d8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -923,6 +923,24 @@ class SessionCatalog( } } + /** + * Returns whether it is a temporary function. If not existed, returns false. + */ + def isTemporaryFunction(name: FunctionIdentifier): Boolean = { +// copied from HiveSessionCatalog +val hiveFunctions = Seq( + "hash", + "histogram_numeric", + "percentile") + +// A temporary function is a function that has been registered in functionRegistry +// without a database name, and is neither a built-in function nor a Hive function +name.database.isEmpty && + functionRegistry.functionExists(name.funcName) && + !FunctionRegistry.builtin.functionExists(name.funcName) && + !hiveFunctions.contains(name.funcName.toLowerCase) + } + protected def failFunctionLookup(name: String): Nothing = { throw new NoSuchFunctionException(db = currentDb, func = name) } http://git-wip-us.apache.org/repos/asf/spark/blob/4cb4e5ff/sql/catalys
spark git commit: [SPARK-18261][STRUCTURED STREAMING] Add statistics to MemorySink for joining
Repository: spark Updated Branches: refs/heads/branch-2.1 29f59c733 -> 4943929d8 [SPARK-18261][STRUCTURED STREAMING] Add statistics to MemorySink for joining ## What changes were proposed in this pull request? Right now, there is no way to join the output of a memory sink with any table: > UnsupportedOperationException: LeafNode MemoryPlan must implement statistics This patch adds statistics to MemorySink, making joining snapshots of memory streams with tables possible. ## How was this patch tested? Added a test case. Author: Liwei Lin Closes #15786 from lw-lin/memory-sink-stat. (cherry picked from commit c1a0c66bd2662bc40f312da474c3b95229fe92d0) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4943929d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4943929d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4943929d Branch: refs/heads/branch-2.1 Commit: 4943929d85a2aaf404c140d2d2589a597f484976 Parents: 29f59c7 Author: Liwei Lin Authored: Mon Nov 7 17:49:24 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 17:49:48 2016 -0800 -- .../spark/sql/execution/streaming/memory.scala | 6 +- .../spark/sql/streaming/MemorySinkSuite.scala | 16 2 files changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4943929d/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 48d9791..613c7cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -27,7 +27,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LeafNode +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -212,4 +212,8 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi */ case class MemoryPlan(sink: MemorySink, output: Seq[Attribute]) extends LeafNode { def this(sink: MemorySink) = this(sink, sink.schema.toAttributes) + + private val sizePerRow = sink.schema.toAttributes.map(_.dataType.defaultSize).sum + + override def statistics: Statistics = Statistics(sizePerRow * sink.allData.size) } http://git-wip-us.apache.org/repos/asf/spark/blob/4943929d/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala index 310d756..4e9fba9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala @@ -187,6 +187,22 @@ class MemorySinkSuite extends StreamTest with BeforeAndAfter { query.stop() } + test("MemoryPlan statistics") { +implicit val schema = new StructType().add(new StructField("value", IntegerType)) +val sink = new MemorySink(schema, InternalOutputModes.Append) +val plan = new MemoryPlan(sink) + +// Before 
adding data, check output +checkAnswer(sink.allData, Seq.empty) +assert(plan.statistics.sizeInBytes === 0) + +sink.addBatch(0, 1 to 3) +assert(plan.statistics.sizeInBytes === 12) + +sink.addBatch(1, 4 to 6) +assert(plan.statistics.sizeInBytes === 24) + } + ignore("stress test") { // Ignore the stress test as it takes several minutes to run (0 until 1000).foreach { _ =>
spark git commit: [SPARK-18261][STRUCTURED STREAMING] Add statistics to MemorySink for joining
Repository: spark Updated Branches: refs/heads/master 9b0593d5e -> c1a0c66bd [SPARK-18261][STRUCTURED STREAMING] Add statistics to MemorySink for joining ## What changes were proposed in this pull request? Right now, there is no way to join the output of a memory sink with any table: > UnsupportedOperationException: LeafNode MemoryPlan must implement statistics This patch adds statistics to MemorySink, making joining snapshots of memory streams with tables possible. ## How was this patch tested? Added a test case. Author: Liwei Lin Closes #15786 from lw-lin/memory-sink-stat. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1a0c66b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1a0c66b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1a0c66b Branch: refs/heads/master Commit: c1a0c66bd2662bc40f312da474c3b95229fe92d0 Parents: 9b0593d Author: Liwei Lin Authored: Mon Nov 7 17:49:24 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 17:49:24 2016 -0800 -- .../spark/sql/execution/streaming/memory.scala | 6 +- .../spark/sql/streaming/MemorySinkSuite.scala | 16 2 files changed, 21 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c1a0c66b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 48d9791..613c7cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -27,7 +27,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LeafNode +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -212,4 +212,8 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi */ case class MemoryPlan(sink: MemorySink, output: Seq[Attribute]) extends LeafNode { def this(sink: MemorySink) = this(sink, sink.schema.toAttributes) + + private val sizePerRow = sink.schema.toAttributes.map(_.dataType.defaultSize).sum + + override def statistics: Statistics = Statistics(sizePerRow * sink.allData.size) } http://git-wip-us.apache.org/repos/asf/spark/blob/c1a0c66b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala index 310d756..4e9fba9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala @@ -187,6 +187,22 @@ class MemorySinkSuite extends StreamTest with BeforeAndAfter { query.stop() } + test("MemoryPlan statistics") { +implicit val schema = new StructType().add(new StructField("value", IntegerType)) +val sink = new MemorySink(schema, InternalOutputModes.Append) +val plan = new MemoryPlan(sink) + +// Before adding data, check output +checkAnswer(sink.allData, Seq.empty) +assert(plan.statistics.sizeInBytes === 0) 
+ +sink.addBatch(0, 1 to 3) +assert(plan.statistics.sizeInBytes === 12) + +sink.addBatch(1, 4 to 6) +assert(plan.statistics.sizeInBytes === 24) + } + ignore("stress test") { // Ignore the stress test as it takes several minutes to run (0 until 1000).foreach { _ =>
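For context, a sketch of the join scenario this statistics change unblocks, using the internal MemoryStream test source; names and schema are illustrative, not taken from the patch:

```scala
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

val spark = SparkSession.builder().appName("memory-sink-join").getOrCreate()
import spark.implicits._
implicit val sqlCtx: SQLContext = spark.sqlContext   // MemoryStream needs an implicit SQLContext

val input = MemoryStream[Int]
val query = input.toDF().toDF("id")
  .writeStream
  .format("memory")
  .queryName("snapshot")       // exposes the sink's contents as the temp table "snapshot"
  .outputMode("append")
  .start()

input.addData(1, 2, 3)
query.processAllAvailable()

val lookup = Seq((1, "one"), (2, "two")).toDF("id", "name")
// Previously this failed with "LeafNode MemoryPlan must implement statistics";
// with MemoryPlan.statistics defined, the join can be planned and executed.
spark.table("snapshot").join(lookup, "id").show()
```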
spark git commit: [SPARK-18086] Add support for Hive session vars.
Repository: spark Updated Branches: refs/heads/branch-2.1 4af82d56f -> 29f59c733 [SPARK-18086] Add support for Hive session vars. ## What changes were proposed in this pull request? This adds support for Hive variables: * Makes values set via `spark-sql --hivevar name=value` accessible * Adds `getHiveVar` and `setHiveVar` to the `HiveClient` interface * Adds a SessionVariables trait for sessions like Hive that support variables (including Hive vars) * Adds SessionVariables support to variable substitution * Adds SessionVariables support to the SET command ## How was this patch tested? * Adds a test to all supported Hive versions for accessing Hive variables * Adds HiveVariableSubstitutionSuite Author: Ryan Blue Closes #15738 from rdblue/SPARK-18086-add-hivevar-support. (cherry picked from commit 9b0593d5e99bb919c4abb8d0836a126ec2eaf1d5) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29f59c73 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29f59c73 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29f59c73 Branch: refs/heads/branch-2.1 Commit: 29f59c73301628fb63086660f64fdb5272a312fe Parents: 4af82d5 Author: Ryan Blue Authored: Mon Nov 7 17:36:15 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 17:36:22 2016 -0800 -- .../sql/execution/command/SetCommand.scala | 11 + .../sql/internal/VariableSubstitution.scala | 5 +- .../hive/thriftserver/SparkSQLCLIDriver.scala | 6 ++- .../hive/HiveVariableSubstitutionSuite.scala| 50 4 files changed, 67 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29f59c73/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index af6def5..dc8d975 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -60,6 +60,13 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } (keyValueOutput, runFunc) +case Some((key @ SetCommand.VariableName(name), Some(value))) => + val runFunc = (sparkSession: SparkSession) => { +sparkSession.conf.set(name, value) +Seq(Row(key, value)) + } + (keyValueOutput, runFunc) + // Configures a single property. case Some((key, Some(value))) => val runFunc = (sparkSession: SparkSession) => { @@ -117,6 +124,10 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } +object SetCommand { + val VariableName = """hivevar:([^=]+)""".r +} + /** * This command is for resetting SQLConf to the default values. 
Command that runs * {{{ http://git-wip-us.apache.org/repos/asf/spark/blob/29f59c73/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 50725a0..791a9cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -17,10 +17,7 @@ package org.apache.spark.sql.internal -import java.util.regex.Pattern - import org.apache.spark.internal.config._ -import org.apache.spark.sql.AnalysisException /** * A helper class that enables substitution using syntax like @@ -37,6 +34,7 @@ class VariableSubstitution(conf: SQLConf) { private val reader = new ConfigReader(provider) .bind("spark", provider) .bind("sparkconf", provider) +.bind("hivevar", provider) .bind("hiveconf", provider) /** @@ -49,5 +47,4 @@ class VariableSubstitution(conf: SQLConf) { input } } - } http://git-wip-us.apache.org/repos/asf/spark/blob/29f59c73/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 5dafec1..0c79b6f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/s
spark git commit: [SPARK-18086] Add support for Hive session vars.
Repository: spark Updated Branches: refs/heads/master 3eda05703 -> 9b0593d5e [SPARK-18086] Add support for Hive session vars. ## What changes were proposed in this pull request? This adds support for Hive variables: * Makes values set via `spark-sql --hivevar name=value` accessible * Adds `getHiveVar` and `setHiveVar` to the `HiveClient` interface * Adds a SessionVariables trait for sessions like Hive that support variables (including Hive vars) * Adds SessionVariables support to variable substitution * Adds SessionVariables support to the SET command ## How was this patch tested? * Adds a test to all supported Hive versions for accessing Hive variables * Adds HiveVariableSubstitutionSuite Author: Ryan Blue Closes #15738 from rdblue/SPARK-18086-add-hivevar-support. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b0593d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b0593d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b0593d5 Branch: refs/heads/master Commit: 9b0593d5e99bb919c4abb8d0836a126ec2eaf1d5 Parents: 3eda057 Author: Ryan Blue Authored: Mon Nov 7 17:36:15 2016 -0800 Committer: Reynold Xin Committed: Mon Nov 7 17:36:15 2016 -0800 -- .../sql/execution/command/SetCommand.scala | 11 + .../sql/internal/VariableSubstitution.scala | 5 +- .../hive/thriftserver/SparkSQLCLIDriver.scala | 6 ++- .../hive/HiveVariableSubstitutionSuite.scala| 50 4 files changed, 67 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9b0593d5/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index af6def5..dc8d975 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -60,6 +60,13 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } (keyValueOutput, runFunc) +case Some((key @ SetCommand.VariableName(name), Some(value))) => + val runFunc = (sparkSession: SparkSession) => { +sparkSession.conf.set(name, value) +Seq(Row(key, value)) + } + (keyValueOutput, runFunc) + // Configures a single property. case Some((key, Some(value))) => val runFunc = (sparkSession: SparkSession) => { @@ -117,6 +124,10 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } +object SetCommand { + val VariableName = """hivevar:([^=]+)""".r +} + /** * This command is for resetting SQLConf to the default values. 
Command that runs * {{{ http://git-wip-us.apache.org/repos/asf/spark/blob/9b0593d5/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 50725a0..791a9cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -17,10 +17,7 @@ package org.apache.spark.sql.internal -import java.util.regex.Pattern - import org.apache.spark.internal.config._ -import org.apache.spark.sql.AnalysisException /** * A helper class that enables substitution using syntax like @@ -37,6 +34,7 @@ class VariableSubstitution(conf: SQLConf) { private val reader = new ConfigReader(provider) .bind("spark", provider) .bind("sparkconf", provider) +.bind("hivevar", provider) .bind("hiveconf", provider) /** @@ -49,5 +47,4 @@ class VariableSubstitution(conf: SQLConf) { input } } - } http://git-wip-us.apache.org/repos/asf/spark/blob/9b0593d5/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 5dafec1..0c79b6f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -38,
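For context, a minimal sketch of the substitution this patch adds; the variable and column names are examples. On the CLI the value would typically come from `spark-sql --hivevar name=value`; here it is set through the new `SET hivevar:` branch:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("hivevar-sketch").getOrCreate()

// The new SetCommand branch accepts keys of the form "hivevar:<name>" ...
spark.sql("SET hivevar:colname=id")

// ... and VariableSubstitution now binds the "hivevar" prefix, so ${hivevar:colname}
// is replaced with "id" before the statement is parsed.
spark.sql("SELECT 1 AS ${hivevar:colname}").show()   // column comes out named "id"
```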
spark git commit: [SPARK-18295][SQL] Make to_json function null safe (matching it to from_json)
Repository: spark Updated Branches: refs/heads/branch-2.1 9873d57f2 -> 4af82d56f [SPARK-18295][SQL] Make to_json function null safe (matching it to from_json) ## What changes were proposed in this pull request? This PR proposes to match up the behaviour of `to_json` to `from_json` function for null-safety. Currently, it throws `NullPointException` but this PR fixes this to produce `null` instead. with the data below: ```scala import spark.implicits._ val df = Seq(Some(Tuple1(Tuple1(1))), None).toDF("a") df.show() ``` ``` ++ | a| ++ | [1]| |null| ++ ``` the codes below ```scala import org.apache.spark.sql.functions._ df.select(to_json($"a")).show() ``` produces.. **Before** throws `NullPointException` as below: ``` java.lang.NullPointerException at org.apache.spark.sql.catalyst.json.JacksonGenerator.org$apache$spark$sql$catalyst$json$JacksonGenerator$$writeFields(JacksonGenerator.scala:138) at org.apache.spark.sql.catalyst.json.JacksonGenerator$$anonfun$write$1.apply$mcV$sp(JacksonGenerator.scala:194) at org.apache.spark.sql.catalyst.json.JacksonGenerator.org$apache$spark$sql$catalyst$json$JacksonGenerator$$writeObject(JacksonGenerator.scala:131) at org.apache.spark.sql.catalyst.json.JacksonGenerator.write(JacksonGenerator.scala:193) at org.apache.spark.sql.catalyst.expressions.StructToJson.eval(jsonExpressions.scala:544) at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:142) at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:48) at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:30) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) ``` **After** ``` +---+ |structtojson(a)| +---+ | {"_1":1}| | null| +---+ ``` ## How was this patch tested? Unit test in `JsonExpressionsSuite.scala` and `JsonFunctionsSuite.scala`. Author: hyukjinkwon Closes #15792 from HyukjinKwon/SPARK-18295. (cherry picked from commit 3eda05703f02413540f180ade01f0f114e70b9cc) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4af82d56 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4af82d56 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4af82d56 Branch: refs/heads/branch-2.1 Commit: 4af82d56f79ac3cceb08b702413ae2b35dfea48b Parents: 9873d57 Author: hyukjinkwon Authored: Mon Nov 7 16:54:40 2016 -0800 Committer: Michael Armbrust Committed: Mon Nov 7 16:54:57 2016 -0800 -- .../sql/catalyst/expressions/jsonExpressions.scala| 14 +- .../catalyst/expressions/JsonExpressionsSuite.scala | 13 +++-- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 14 ++ 3 files changed, 30 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4af82d56/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 89fe7c4..b61583d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -484,7 +484,7 @@ case class JsonTuple(children: Seq[Expression]) * Converts an json input string to a [[StructType]] with the specified schema. 
*/ case class JsonToStruct(schema: StructType, options: Map[String, String], child: Expression) - extends Expression with CodegenFallback with ExpectsInputTypes { + extends UnaryExpression with CodegenFallback with ExpectsInputTypes { override def nullable: Boolean = true @transient @@ -495,11 +495,8 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: new JSONOptions(options ++ Map("mode" -> ParseModes.FAIL_FAST_MODE))) override def dataType: DataType = schema - override def children: Seq[Expression] = child :: Nil - override def eval(input: InternalRow): Any = { -val json = child.eval(input) -if (json == null) return null + override def nullSafeEval(json: Any): Any = { try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } @@ -512,7 +509,7 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: * Converts a [[StructType]] to a json output string. */ case class StructToJson(options: Map[String, String], child: Expression) - extends Expression
spark git commit: [SPARK-18295][SQL] Make to_json function null safe (matching it to from_json)
Repository: spark Updated Branches: refs/heads/master 3a710b94b -> 3eda05703 [SPARK-18295][SQL] Make to_json function null safe (matching it to from_json) ## What changes were proposed in this pull request? This PR proposes to match up the behaviour of `to_json` to `from_json` function for null-safety. Currently, it throws `NullPointException` but this PR fixes this to produce `null` instead. with the data below: ```scala import spark.implicits._ val df = Seq(Some(Tuple1(Tuple1(1))), None).toDF("a") df.show() ``` ``` ++ | a| ++ | [1]| |null| ++ ``` the codes below ```scala import org.apache.spark.sql.functions._ df.select(to_json($"a")).show() ``` produces.. **Before** throws `NullPointException` as below: ``` java.lang.NullPointerException at org.apache.spark.sql.catalyst.json.JacksonGenerator.org$apache$spark$sql$catalyst$json$JacksonGenerator$$writeFields(JacksonGenerator.scala:138) at org.apache.spark.sql.catalyst.json.JacksonGenerator$$anonfun$write$1.apply$mcV$sp(JacksonGenerator.scala:194) at org.apache.spark.sql.catalyst.json.JacksonGenerator.org$apache$spark$sql$catalyst$json$JacksonGenerator$$writeObject(JacksonGenerator.scala:131) at org.apache.spark.sql.catalyst.json.JacksonGenerator.write(JacksonGenerator.scala:193) at org.apache.spark.sql.catalyst.expressions.StructToJson.eval(jsonExpressions.scala:544) at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:142) at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:48) at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:30) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) ``` **After** ``` +---+ |structtojson(a)| +---+ | {"_1":1}| | null| +---+ ``` ## How was this patch tested? Unit test in `JsonExpressionsSuite.scala` and `JsonFunctionsSuite.scala`. Author: hyukjinkwon Closes #15792 from HyukjinKwon/SPARK-18295. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3eda0570 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3eda0570 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3eda0570 Branch: refs/heads/master Commit: 3eda05703f02413540f180ade01f0f114e70b9cc Parents: 3a710b9 Author: hyukjinkwon Authored: Mon Nov 7 16:54:40 2016 -0800 Committer: Michael Armbrust Committed: Mon Nov 7 16:54:40 2016 -0800 -- .../sql/catalyst/expressions/jsonExpressions.scala| 14 +- .../catalyst/expressions/JsonExpressionsSuite.scala | 13 +++-- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 14 ++ 3 files changed, 30 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3eda0570/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 89fe7c4..b61583d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -484,7 +484,7 @@ case class JsonTuple(children: Seq[Expression]) * Converts an json input string to a [[StructType]] with the specified schema. 
*/ case class JsonToStruct(schema: StructType, options: Map[String, String], child: Expression) - extends Expression with CodegenFallback with ExpectsInputTypes { + extends UnaryExpression with CodegenFallback with ExpectsInputTypes { override def nullable: Boolean = true @transient @@ -495,11 +495,8 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: new JSONOptions(options ++ Map("mode" -> ParseModes.FAIL_FAST_MODE))) override def dataType: DataType = schema - override def children: Seq[Expression] = child :: Nil - override def eval(input: InternalRow): Any = { -val json = child.eval(input) -if (json == null) return null + override def nullSafeEval(json: Any): Any = { try parser.parse(json.toString).head catch { case _: SparkSQLJsonProcessingException => null } @@ -512,7 +509,7 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child: * Converts a [[StructType]] to a json output string. */ case class StructToJson(options: Map[String, String], child: Expression) - extends Expression with CodegenFallback with ExpectsInputTypes { + extends UnaryExpression with CodegenFallback with ExpectsInput
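The null-safety comes from switching these expressions from Expression to UnaryExpression, whose default eval short-circuits a null child value before nullSafeEval is ever called. A simplified model of that pattern, not the actual Catalyst classes (which operate on InternalRow and support codegen):

```scala
// Simplified model of the UnaryExpression null handling this fix relies on.
trait SimpleUnaryExpression {
  def child: () => Any            // stand-in for evaluating the child expression
  def nullSafeEval(value: Any): Any

  final def eval(): Any = {
    val v = child()
    if (v == null) null           // a null child value short-circuits to a null result...
    else nullSafeEval(v)          // ...so nullSafeEval never sees null (no NullPointerException)
  }
}
```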
spark git commit: [SPARK-18236] Reduce duplicate objects in Spark UI and HistoryServer
Repository: spark Updated Branches: refs/heads/master 19cf20806 -> 3a710b94b [SPARK-18236] Reduce duplicate objects in Spark UI and HistoryServer ## What changes were proposed in this pull request? When profiling heap dumps from the HistoryServer and live Spark web UIs, I found a large amount of memory being wasted on duplicated objects and strings. This patch's changes remove most of this duplication, resulting in over 40% memory savings for some benchmarks. - **Task metrics** (6441f0624dfcda9c7193a64bfb416a145b5aabdf): previously, every `TaskUIData` object would have its own instances of `InputMetricsUIData`, `OutputMetricsUIData`, `ShuffleReadMetrics`, and `ShuffleWriteMetrics`, but for many tasks these metrics are irrelevant because they're all zero. This patch changes how we construct these metrics in order to re-use a single immutable "empty" value for the cases where these metrics are empty. - **TaskInfo.accumulables** (ade86db901127bf13c0e0bdc3f09c933a093bb76): Previously, every `TaskInfo` object had its own empty `ListBuffer` for holding updates from named accumulators. Tasks which didn't use named accumulators still paid for the cost of allocating and storing this empty buffer. To avoid this overhead, I changed the `val` with a mutable buffer into a `var` which holds an immutable Scala list, allowing tasks which do not have named accumulator updates to share the same singleton `Nil` object. - **String.intern() in JSONProtocol** (7e05630e9a78c455db8c8c499f0590c864624e05): in the HistoryServer, executor hostnames and ids are deserialized from JSON, leading to massive duplication of these string objects. By calling `String.intern()` on the deserialized values we can remove all of this duplication. Since Spark now requires Java 7+ we don't have to worry about string interning exhausting the permgen (see http://java-performance.info/string-intern-in-java-6-7-8/). ## How was this patch tested? I ran ``` sc.parallelize(1 to 10, 10).count() ``` in `spark-shell` with event logging enabled, then loaded that event log in the HistoryServer, performed a full GC, and took a heap dump. According to YourKit, the changes in this patch reduced memory consumption by roughly 28 megabytes (or 770k Java objects): ![image](https://cloud.githubusercontent.com/assets/50748/19953276/4f3a28aa-a129-11e6-93df-d7fa91396f66.png) Here's a table illustrating the drop in objects due to deduplication (the drop is <100k for some objects because some events were dropped from the listener bus; this is a separate, existing bug that I'll address separately after CPU-profiling): ![image](https://cloud.githubusercontent.com/assets/50748/19953290/6a271290-a129-11e6-93ad-b825f1448886.png) Author: Josh Rosen Closes #15743 from JoshRosen/spark-ui-memory-usage. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3a710b94 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3a710b94 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3a710b94 Branch: refs/heads/master Commit: 3a710b94b0c853a2dd4c40dca446ecde4e7be959 Parents: 19cf208 Author: Josh Rosen Authored: Mon Nov 7 16:14:19 2016 -0800 Committer: Josh Rosen Committed: Mon Nov 7 16:14:19 2016 -0800 -- .../apache/spark/scheduler/DAGScheduler.scala | 3 +- .../org/apache/spark/scheduler/TaskInfo.scala | 10 ++- .../scala/org/apache/spark/ui/jobs/UIData.scala | 83 +++- .../org/apache/spark/util/JsonProtocol.scala| 10 +-- .../ui/jobs/JobProgressListenerSuite.scala | 2 +- .../apache/spark/util/JsonProtocolSuite.scala | 7 +- project/MimaExcludes.scala | 5 +- .../sql/execution/ui/SQLListenerSuite.scala | 2 +- 8 files changed, 84 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3a710b94/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index f251740..7fde34d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1089,7 +1089,8 @@ class DAGScheduler( // To avoid UI cruft, ignore cases where value wasn't updated if (acc.name.isDefined && !updates.isZero) { stage.latestInfo.accumulables(id) = acc.toInfo(None, Some(acc.value)) - event.taskInfo.accumulables += acc.toInfo(Some(updates.value), Some(acc.value)) + event.taskInfo.setAccumulables( +acc.toInfo(Some(updates.value), Some(acc.value)) +: event.taskInfo.accumulables) } } }
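A standalone sketch of two of the deduplication ideas described in this commit: sharing a single immutable "empty" metrics value, and interning strings that repeat across deserialized records. The class and value names below are illustrative, not Spark's.

```scala
// Illustrative only: toy metrics class, not Spark's InputMetricsUIData.
case class Metrics(bytesRead: Long, recordsRead: Long)

object Metrics {
  val Empty = Metrics(0L, 0L)  // one shared instance for the common all-zero case
  def of(bytes: Long, records: Long): Metrics =
    if (bytes == 0L && records == 0L) Empty else Metrics(bytes, records)
}

object DedupDemo extends App {
  // Tasks whose metrics are all zero now reference the same object.
  println(Metrics.of(0, 0) eq Metrics.of(0, 0))  // true: single shared instance

  // Hostnames deserialized from JSON arrive as distinct String objects until interned.
  val a = new String("host-1")
  val b = new String("host-1")
  println(a eq b)                    // false: two copies on the heap
  println(a.intern() eq b.intern())  // true: one canonical copy
}
```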
spark git commit: [SPARK-17490][SQL] Optimize SerializeFromObject() for a primitive array
Repository: spark Updated Branches: refs/heads/branch-2.1 d1eac3ef4 -> 9873d57f2 [SPARK-17490][SQL] Optimize SerializeFromObject() for a primitive array Waiting for merging #13680 This PR optimizes `SerializeFromObject()` for an primitive array. This is derived from #13758 to address one of problems by using a simple way in #13758. The current implementation always generates `GenericArrayData` from `SerializeFromObject()` for any type of an array in a logical plan. This involves a boxing at a constructor of `GenericArrayData` when `SerializedFromObject()` has an primitive array. This PR enables to generate `UnsafeArrayData` from `SerializeFromObject()` for a primitive array. It can avoid boxing to create an instance of `ArrayData` in the generated code by Catalyst. This PR also generate `UnsafeArrayData` in a case for `RowEncoder.serializeFor` or `CatalystTypeConverters.createToCatalystConverter`. Performance improvement of `SerializeFromObject()` is up to 2.0x ``` OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Without this PR Write an array in Dataset: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative Int556 / 608 15.1 66.3 1.0X Double1668 / 1746 5.0 198.8 0.3X with this PR Write an array in Dataset: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative Int352 / 401 23.8 42.0 1.0X Double 821 / 885 10.2 97.9 0.4X ``` Here is an example program that will happen in mllib as described in [SPARK-16070](https://issues.apache.org/jira/browse/SPARK-16070). ``` sparkContext.parallelize(Seq(Array(1, 2)), 1).toDS.map(e => e).show ``` Generated code before applying this PR ``` java /* 039 */ protected void processNext() throws java.io.IOException { /* 040 */ while (inputadapter_input.hasNext()) { /* 041 */ InternalRow inputadapter_row = (InternalRow) inputadapter_input.next(); /* 042 */ int[] inputadapter_value = (int[])inputadapter_row.get(0, null); /* 043 */ /* 044 */ Object mapelements_obj = ((Expression) references[0]).eval(null); /* 045 */ scala.Function1 mapelements_value1 = (scala.Function1) mapelements_obj; /* 046 */ /* 047 */ boolean mapelements_isNull = false || false; /* 048 */ int[] mapelements_value = null; /* 049 */ if (!mapelements_isNull) { /* 050 */ Object mapelements_funcResult = null; /* 051 */ mapelements_funcResult = mapelements_value1.apply(inputadapter_value); /* 052 */ if (mapelements_funcResult == null) { /* 053 */ mapelements_isNull = true; /* 054 */ } else { /* 055 */ mapelements_value = (int[]) mapelements_funcResult; /* 056 */ } /* 057 */ /* 058 */ } /* 059 */ mapelements_isNull = mapelements_value == null; /* 060 */ /* 061 */ serializefromobject_argIsNulls[0] = mapelements_isNull; /* 062 */ serializefromobject_argValue = mapelements_value; /* 063 */ /* 064 */ boolean serializefromobject_isNull = false; /* 065 */ for (int idx = 0; idx < 1; idx++) { /* 066 */ if (serializefromobject_argIsNulls[idx]) { serializefromobject_isNull = true; break; } /* 067 */ } /* 068 */ /* 069 */ final ArrayData serializefromobject_value = serializefromobject_isNull ? null : new org.apache.spark.sql.catalyst.util.GenericArrayData(serializefromobject_argValue); /* 070 */ serializefromobject_holder.reset(); /* 071 */ /* 072 */ serializefromobject_rowWriter.zeroOutNullBytes(); /* 073 */ /* 074 */ if (serializefromobject_isNull) { /* 075 */ serializefromobject_rowWriter.setNullAt(0); /* 076 */ } else { /* 077 */ // Remember the current cursor so that we can calculate how many bytes are /* 078 */ // written later. 
/* 079 */ final int serializefromobject_tmpCursor = serializefromobject_holder.cursor; /* 080 */ /* 081 */ if (serializefromobject_value instanceof UnsafeArrayData) { /* 082 */ final int serializefromobject_sizeInBytes = ((UnsafeArrayData) serializefromobject_value).getSizeInBytes(); /* 083 */ // grow the global buffer before writing data. /* 084 */ serializefromobject_holder.grow(serializefromobject_sizeInBytes); /* 085 */ ((UnsafeArrayData) serializefromobject_value).writeToMemory(serializefromobject_holder.buffer, serializefromobject_holder.cursor); /* 086 */ serializefromobject_holder.cursor += serializefromobject_sizeInBytes; /* 087 */ /* 088
spark git commit: [SPARK-17490][SQL] Optimize SerializeFromObject() for a primitive array
Repository: spark Updated Branches: refs/heads/master 8f0ea011a -> 19cf20806 [SPARK-17490][SQL] Optimize SerializeFromObject() for a primitive array ## What changes were proposed in this pull request? Waiting for merging #13680. This PR optimizes `SerializeFromObject()` for a primitive array. It is derived from #13758 and addresses one of the problems there using a simpler approach. The current implementation always generates `GenericArrayData` from `SerializeFromObject()` for any type of array in a logical plan. This involves boxing in the constructor of `GenericArrayData` when `SerializeFromObject()` has a primitive array. This PR enables generating `UnsafeArrayData` from `SerializeFromObject()` for a primitive array, which avoids boxing when creating the `ArrayData` instance in the code generated by Catalyst. This PR also generates `UnsafeArrayData` for the `RowEncoder.serializeFor` and `CatalystTypeConverters.createToCatalystConverter` cases. The performance improvement of `SerializeFromObject()` is up to 2.0x:

```
OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
Intel Xeon E3-12xx v2 (Ivy Bridge)

Without this PR
Write an array in Dataset:    Best/Avg Time(ms)    Rate(M/s)    Per Row(ns)    Relative
Int                                 556 /  608          15.1           66.3        1.0X
Double                             1668 / 1746           5.0          198.8        0.3X

With this PR
Write an array in Dataset:    Best/Avg Time(ms)    Rate(M/s)    Per Row(ns)    Relative
Int                                 352 /  401          23.8           42.0        1.0X
Double                              821 /  885          10.2           97.9        0.4X
```

Here is an example program of the kind that occurs in mllib, as described in [SPARK-16070](https://issues.apache.org/jira/browse/SPARK-16070):

```
sparkContext.parallelize(Seq(Array(1, 2)), 1).toDS.map(e => e).show
```

Generated code before applying this PR:

```java
/* 039 */ protected void processNext() throws java.io.IOException {
/* 040 */ while (inputadapter_input.hasNext()) {
/* 041 */ InternalRow inputadapter_row = (InternalRow) inputadapter_input.next();
/* 042 */ int[] inputadapter_value = (int[])inputadapter_row.get(0, null);
/* 043 */
/* 044 */ Object mapelements_obj = ((Expression) references[0]).eval(null);
/* 045 */ scala.Function1 mapelements_value1 = (scala.Function1) mapelements_obj;
/* 046 */
/* 047 */ boolean mapelements_isNull = false || false;
/* 048 */ int[] mapelements_value = null;
/* 049 */ if (!mapelements_isNull) {
/* 050 */ Object mapelements_funcResult = null;
/* 051 */ mapelements_funcResult = mapelements_value1.apply(inputadapter_value);
/* 052 */ if (mapelements_funcResult == null) {
/* 053 */ mapelements_isNull = true;
/* 054 */ } else {
/* 055 */ mapelements_value = (int[]) mapelements_funcResult;
/* 056 */ }
/* 057 */
/* 058 */ }
/* 059 */ mapelements_isNull = mapelements_value == null;
/* 060 */
/* 061 */ serializefromobject_argIsNulls[0] = mapelements_isNull;
/* 062 */ serializefromobject_argValue = mapelements_value;
/* 063 */
/* 064 */ boolean serializefromobject_isNull = false;
/* 065 */ for (int idx = 0; idx < 1; idx++) {
/* 066 */ if (serializefromobject_argIsNulls[idx]) { serializefromobject_isNull = true; break; }
/* 067 */ }
/* 068 */
/* 069 */ final ArrayData serializefromobject_value = serializefromobject_isNull ? null : new org.apache.spark.sql.catalyst.util.GenericArrayData(serializefromobject_argValue);
/* 070 */ serializefromobject_holder.reset();
/* 071 */
/* 072 */ serializefromobject_rowWriter.zeroOutNullBytes();
/* 073 */
/* 074 */ if (serializefromobject_isNull) {
/* 075 */ serializefromobject_rowWriter.setNullAt(0);
/* 076 */ } else {
/* 077 */ // Remember the current cursor so that we can calculate how many bytes are
/* 078 */ // written later.
/* 079 */ final int serializefromobject_tmpCursor = serializefromobject_holder.cursor;
/* 080 */
/* 081 */ if (serializefromobject_value instanceof UnsafeArrayData) {
/* 082 */ final int serializefromobject_sizeInBytes = ((UnsafeArrayData) serializefromobject_value).getSizeInBytes();
/* 083 */ // grow the global buffer before writing data.
/* 084 */ serializefromobject_holder.grow(serializefromobject_sizeInBytes);
/* 085 */ ((UnsafeArrayData) serializefromobject_value).writeToMemory(serializefromobject_holder.buffer, serializefromobject_holder.cursor);
/* 086 */ serializefromobject_holder.cursor += serializefromobject_sizeInBytes;
```
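As a rough, standalone illustration of the boxing this optimization avoids (toy containers below, not Spark's `ArrayData` classes): routing a primitive array through an `Array[Any]`-backed container allocates one boxed `java.lang.Integer` per element, while a primitive-specialized container keeps the original `int[]`.

```scala
// Illustrative only: stand-ins for GenericArrayData vs. UnsafeArrayData.
final class GenericArrayLike(val values: Array[Any])       // elements are boxed on the way in
final class PrimitiveIntArrayLike(val values: Array[Int])  // backing store stays unboxed

object BoxingDemo extends App {
  val ints = Array.tabulate(1000000)(identity)

  // Boxed path: copying Array[Int] into Array[Any] wraps every element in java.lang.Integer.
  val boxed = new GenericArrayLike(ints.map(x => x: Any))

  // Unboxed path: the int[] is used as-is, no per-element allocation.
  val unboxed = new PrimitiveIntArrayLike(ints)

  println(boxed.values.length == unboxed.values.length)  // true
}
```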
spark git commit: [SPARK-14914][CORE] Fix Resource not closed after using, mostly for unit tests
Repository: spark Updated Branches: refs/heads/master 0d95662e7 -> 8f0ea011a [SPARK-14914][CORE] Fix Resource not closed after using, mostly for unit tests ## What changes were proposed in this pull request? Close `FileStreams`, `ZipFiles` etc to release the resources after using. Not closing the resources will cause IO Exception to be raised while deleting temp files. ## How was this patch tested? Existing tests Author: U-FAREAST\tl Author: hyukjinkwon Author: Tao LI Closes #15618 from HyukjinKwon/SPARK-14914-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f0ea011 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f0ea011 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f0ea011 Branch: refs/heads/master Commit: 8f0ea011a7294679ec4275b2fef349ef45b6eb81 Parents: 0d95662 Author: Hyukjin Kwon Authored: Mon Nov 7 12:47:39 2016 -0800 Committer: Mridul Muralidharan Committed: Mon Nov 7 12:47:39 2016 -0800 -- .../spark/rdd/ReliableCheckpointRDD.scala | 13 +--- .../test/scala/org/apache/spark/FileSuite.scala | 13 +--- .../spark/deploy/RPackageUtilsSuite.scala | 35 .../deploy/history/FsHistoryProviderSuite.scala | 8 - .../scheduler/EventLoggingListenerSuite.scala | 26 +-- .../spark/scheduler/TaskResultGetterSuite.scala | 7 ++-- .../apache/spark/mllib/util/MLUtilsSuite.scala | 16 + .../apache/spark/streaming/JavaAPISuite.java| 1 + .../spark/streaming/CheckpointSuite.scala | 16 + .../spark/streaming/MasterFailureTest.scala | 1 + .../streaming/ReceivedBlockTrackerSuite.scala | 5 +++ .../streaming/util/WriteAheadLogSuite.scala | 1 + 12 files changed, 93 insertions(+), 49 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f0ea011/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index eac901d..9f800e3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -239,12 +239,17 @@ private[spark] object ReliableCheckpointRDD extends Logging { val fs = partitionerFilePath.getFileSystem(sc.hadoopConfiguration) val fileInputStream = fs.open(partitionerFilePath, bufferSize) val serializer = SparkEnv.get.serializer.newInstance() - val deserializeStream = serializer.deserializeStream(fileInputStream) - val partitioner = Utils.tryWithSafeFinally[Partitioner] { -deserializeStream.readObject[Partitioner] + val partitioner = Utils.tryWithSafeFinally { +val deserializeStream = serializer.deserializeStream(fileInputStream) +Utils.tryWithSafeFinally { + deserializeStream.readObject[Partitioner] +} { + deserializeStream.close() +} } { -deserializeStream.close() +fileInputStream.close() } + logDebug(s"Read partitioner from $partitionerFilePath") Some(partitioner) } catch { http://git-wip-us.apache.org/repos/asf/spark/blob/8f0ea011/core/src/test/scala/org/apache/spark/FileSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index cc52bb1..89f0b1c 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -58,10 +58,15 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { nums.saveAsTextFile(outputDir) // Read the plain text file and check it's OK val outputFile = new File(outputDir, 
"part-0") -val content = Source.fromFile(outputFile).mkString -assert(content === "1\n2\n3\n4\n") -// Also try reading it in as a text file RDD -assert(sc.textFile(outputDir).collect().toList === List("1", "2", "3", "4")) +val bufferSrc = Source.fromFile(outputFile) +Utils.tryWithSafeFinally { + val content = bufferSrc.mkString + assert(content === "1\n2\n3\n4\n") + // Also try reading it in as a text file RDD + assert(sc.textFile(outputDir).collect().toList === List("1", "2", "3", "4")) +} { + bufferSrc.close() +} } test("text files (compressed)") { http://git-wip-us.apache.org/repos/asf/spark/blob/8f0ea011/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spar
spark git commit: [SPARK-17108][SQL] Fix BIGINT and INT comparison failure in spark sql
Repository: spark Updated Branches: refs/heads/master b06c23db9 -> 0d95662e7 [SPARK-17108][SQL] Fix BIGINT and INT comparison failure in spark sql ## What changes were proposed in this pull request? Add a function to check if two integers are compatible when invoking `acceptsType()` in `DataType`. ## How was this patch tested? Manually. E.g. ``` spark.sql("create table t3(a map>)") spark.sql("select * from t3 where a[1] is not null") ``` Before: ``` cannot resolve 't.`a`[1]' due to data type mismatch: argument 2 requires bigint type, however, '1' is of int type.; line 1 pos 22 org.apache.spark.sql.AnalysisException: cannot resolve 't.`a`[1]' due to data type mismatch: argument 2 requires bigint type, however, '1' is of int type.; line 1 pos 22 at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:82) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:307) ``` After: Run the sql queries above. No errors. Author: Weiqing Yang Closes #15448 from weiqingy/SPARK_17108. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d95662e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d95662e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d95662e Branch: refs/heads/master Commit: 0d95662e7fff26669d4f70e88fdac7a4128a4f49 Parents: b06c23d Author: Weiqing Yang Authored: Mon Nov 7 21:33:01 2016 +0100 Committer: Herman van Hovell Committed: Mon Nov 7 21:33:01 2016 +0100 -- .../catalyst/expressions/complexTypeExtractors.scala| 2 +- .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 12 2 files changed, 13 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d95662e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index abb5594..0c256c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -260,7 +260,7 @@ case class GetArrayItem(child: Expression, ordinal: Expression) * We need to do type checking here as `key` expression maybe unresolved. 
*/ case class GetMapValue(child: Expression, key: Expression) - extends BinaryExpression with ExpectsInputTypes with ExtractValue { + extends BinaryExpression with ImplicitCastInputTypes with ExtractValue { private def keyType = child.dataType.asInstanceOf[MapType].keyType http://git-wip-us.apache.org/repos/asf/spark/blob/0d95662e/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 5e08ef3..c21db35 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1939,6 +1939,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + + test("SPARK-17108: Fix BIGINT and INT comparison failure in spark sql") { +sql("create table t1(a map>)") +sql("select * from t1 where a[1] is not null") + +sql("create table t2(a map>)") +sql("select * from t2 where a[1] is not null") + +sql("create table t3(a map>)") +sql("select * from t3 where a[1L] is not null") + } + test("SPARK-17796 Support wildcard character in filename for LOAD DATA LOCAL INPATH") { withTempDir { dir => for (i <- 1 to 3) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
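The compatibility check behind this fix is a one-line idea; here is a minimal standalone sketch (toy types, not Spark's `DataType` hierarchy): an `INT` value always fits in a `BIGINT`, so an integer literal should be accepted where a bigint map key is expected.

```scala
// Illustrative only: not Spark's TypeCoercion / ImplicitCastInputTypes machinery.
sealed trait SqlType
case object IntType extends SqlType
case object LongType extends SqlType

object WideningDemo extends App {
  def canImplicitlyCast(from: SqlType, to: SqlType): Boolean = (from, to) match {
    case (a, b) if a == b    => true
    case (IntType, LongType) => true   // safe widening: every INT value fits in BIGINT
    case _                   => false
  }

  println(canImplicitlyCast(IntType, LongType))  // true: a[1] against a bigint key resolves
  println(canImplicitlyCast(LongType, IntType))  // false: narrowing is not implicit
}
```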
spark git commit: [SPARK-17108][SQL] Fix BIGINT and INT comparison failure in spark sql
Repository: spark Updated Branches: refs/heads/branch-2.1 7a84edb24 -> d1eac3ef4 [SPARK-17108][SQL] Fix BIGINT and INT comparison failure in spark sql ## What changes were proposed in this pull request? Add a function to check if two integers are compatible when invoking `acceptsType()` in `DataType`. ## How was this patch tested? Manually. E.g. ``` spark.sql("create table t3(a map>)") spark.sql("select * from t3 where a[1] is not null") ``` Before: ``` cannot resolve 't.`a`[1]' due to data type mismatch: argument 2 requires bigint type, however, '1' is of int type.; line 1 pos 22 org.apache.spark.sql.AnalysisException: cannot resolve 't.`a`[1]' due to data type mismatch: argument 2 requires bigint type, however, '1' is of int type.; line 1 pos 22 at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:82) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:307) ``` After: Run the sql queries above. No errors. Author: Weiqing Yang Closes #15448 from weiqingy/SPARK_17108. (cherry picked from commit 0d95662e7fff26669d4f70e88fdac7a4128a4f49) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1eac3ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1eac3ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1eac3ef Branch: refs/heads/branch-2.1 Commit: d1eac3ef4af2f8c58395ff6f8bb58a1806a8c09b Parents: 7a84edb Author: Weiqing Yang Authored: Mon Nov 7 21:33:01 2016 +0100 Committer: Herman van Hovell Committed: Mon Nov 7 21:33:13 2016 +0100 -- .../catalyst/expressions/complexTypeExtractors.scala| 2 +- .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 12 2 files changed, 13 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d1eac3ef/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index abb5594..0c256c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -260,7 +260,7 @@ case class GetArrayItem(child: Expression, ordinal: Expression) * We need to do type checking here as `key` expression maybe unresolved. 
*/ case class GetMapValue(child: Expression, key: Expression) - extends BinaryExpression with ExpectsInputTypes with ExtractValue { + extends BinaryExpression with ImplicitCastInputTypes with ExtractValue { private def keyType = child.dataType.asInstanceOf[MapType].keyType http://git-wip-us.apache.org/repos/asf/spark/blob/d1eac3ef/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 5e08ef3..c21db35 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1939,6 +1939,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + + test("SPARK-17108: Fix BIGINT and INT comparison failure in spark sql") { +sql("create table t1(a map>)") +sql("select * from t1 where a[1] is not null") + +sql("create table t2(a map>)") +sql("select * from t2 where a[1] is not null") + +sql("create table t3(a map>)") +sql("select * from t3 where a[1L] is not null") + } + test("SPARK-17796 Support wildcard character in filename for LOAD DATA LOCAL INPATH") { withTempDir { dir => for (i <- 1 to 3) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[1/2] spark git commit: Preparing Spark release v2.0.2-rc3
Repository: spark Updated Branches: refs/heads/branch-2.0 10525c294 -> a39f8c101 Preparing Spark release v2.0.2-rc3 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/584354ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/584354ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/584354ea Branch: refs/heads/branch-2.0 Commit: 584354eaac02531c9584188b143367ba694b0c34 Parents: 10525c2 Author: Patrick Wendell Authored: Mon Nov 7 12:26:31 2016 -0800 Committer: Patrick Wendell Committed: Mon Nov 7 12:26:31 2016 -0800 -- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 37 files changed, 38 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/584354ea/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 0b01ca8..dfb7e22 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,7 +1,7 @@ Package: SparkR Type: Package Title: R Frontend for Apache Spark -Version: 2.0.3 +Version: 2.0.2 Date: 2016-08-27 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "shiva...@cs.berkeley.edu"), http://git-wip-us.apache.org/repos/asf/spark/blob/584354ea/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index de09fce..58feedc 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.3-SNAPSHOT +2.0.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/584354ea/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 2ee104f..a75d222 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.3-SNAPSHOT +2.0.2 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/584354ea/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index b20f9e2..828a407 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.3-SNAPSHOT +2.0.2 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/584354ea/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 06895c6..30891f3 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark 
spark-parent_2.11 -2.0.3-SNAPSHOT +2.0.2 ../../pom.xml http://git-wip-us.apache
[2/2] spark git commit: Preparing development version 2.0.3-SNAPSHOT
Preparing development version 2.0.3-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a39f8c10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a39f8c10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a39f8c10 Branch: refs/heads/branch-2.0 Commit: a39f8c101283f29dac2e49d8d219588f2350fe94 Parents: 584354e Author: Patrick Wendell Authored: Mon Nov 7 12:26:38 2016 -0800 Committer: Patrick Wendell Committed: Mon Nov 7 12:26:38 2016 -0800 -- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 37 files changed, 38 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index dfb7e22..0b01ca8 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,7 +1,7 @@ Package: SparkR Type: Package Title: R Frontend for Apache Spark -Version: 2.0.2 +Version: 2.0.3 Date: 2016-08-27 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "shiva...@cs.berkeley.edu"), http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 58feedc..de09fce 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.2 +2.0.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index a75d222..2ee104f 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.2 +2.0.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 828a407..b20f9e2 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.2 +2.0.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 30891f3..06895c6 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.2 +2.0.3-SNAPSHOT ../../pom.xml 
http://git-wip-us.apache.org/repos/asf/spark/blob/a39f8c10/common/sketch/pom.xml --
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.0.2-rc3 [created] 584354eaa
spark git commit: [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest
Repository: spark Updated Branches: refs/heads/master daa975f4b -> b06c23db9 [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest ## What changes were proposed in this pull request? Added test to check whether default starting offset in latest ## How was this patch tested? new unit test Author: Tathagata Das Closes #15778 from tdas/SPARK-18283. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b06c23db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b06c23db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b06c23db Branch: refs/heads/master Commit: b06c23db9aedae48c9eba9d702ae82fa5647cfe5 Parents: daa975f Author: Tathagata Das Authored: Mon Nov 7 10:43:36 2016 -0800 Committer: Shixiong Zhu Committed: Mon Nov 7 10:43:36 2016 -0800 -- .../spark/sql/kafka010/KafkaSourceSuite.scala | 24 1 file changed, 24 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b06c23db/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index ed4cc75..89e713f 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -306,6 +306,30 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("starting offset is latest by default") { +val topic = newTopic() +testUtils.createTopic(topic, partitions = 5) +testUtils.sendMessages(topic, Array("0")) +require(testUtils.getLatestOffsets(Set(topic)).size === 5) + +val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + +val kafka = reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] +val mapped = kafka.map(_.toInt) + +testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(1, 2, 3) // should not have 0 +) + } + test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
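For context, a reader-configuration sketch in spark-shell style (the broker address and topic name are placeholders, and it assumes the option is named `startingOffsets`, as in the Kafka source on these branches): omitting the option is equivalent to starting from the latest offsets, which is exactly the behaviour the new test pins down, while `"earliest"` replays whatever Kafka has retained.

```scala
// Assumed option name: "startingOffsets"; addresses and topic are placeholders.
val fromLatest = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "broker1:9092")
  .option("subscribe", "events")
  .load()                                 // default: start from the latest offsets

val fromEarliest = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "broker1:9092")
  .option("subscribe", "events")
  .option("startingOffsets", "earliest")  // replay retained records instead
  .load()
```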
spark git commit: [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest
Repository: spark Updated Branches: refs/heads/branch-2.0 b5d7217af -> 10525c294 [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest ## What changes were proposed in this pull request? Added test to check whether default starting offset in latest ## How was this patch tested? new unit test Author: Tathagata Das Closes #15778 from tdas/SPARK-18283. (cherry picked from commit b06c23db9aedae48c9eba9d702ae82fa5647cfe5) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/10525c29 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/10525c29 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/10525c29 Branch: refs/heads/branch-2.0 Commit: 10525c2947d9d1593e77e6af692573b03de6a71f Parents: b5d7217 Author: Tathagata Das Authored: Mon Nov 7 10:43:36 2016 -0800 Committer: Shixiong Zhu Committed: Mon Nov 7 10:44:05 2016 -0800 -- .../spark/sql/kafka010/KafkaSourceSuite.scala | 24 1 file changed, 24 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/10525c29/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index ed4cc75..89e713f 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -306,6 +306,30 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("starting offset is latest by default") { +val topic = newTopic() +testUtils.createTopic(topic, partitions = 5) +testUtils.sendMessages(topic, Array("0")) +require(testUtils.getLatestOffsets(Set(topic)).size === 5) + +val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + +val kafka = reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] +val mapped = kafka.map(_.toInt) + +testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(1, 2, 3) // should not have 0 +) + } + test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest
Repository: spark Updated Branches: refs/heads/branch-2.1 6b332909f -> 7a84edb24 [SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whether default starting offset in latest ## What changes were proposed in this pull request? Added test to check whether default starting offset in latest ## How was this patch tested? new unit test Author: Tathagata Das Closes #15778 from tdas/SPARK-18283. (cherry picked from commit b06c23db9aedae48c9eba9d702ae82fa5647cfe5) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a84edb2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a84edb2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a84edb2 Branch: refs/heads/branch-2.1 Commit: 7a84edb2475446ff3a98e8cc8dcf62ee801fbbb9 Parents: 6b33290 Author: Tathagata Das Authored: Mon Nov 7 10:43:36 2016 -0800 Committer: Shixiong Zhu Committed: Mon Nov 7 10:43:53 2016 -0800 -- .../spark/sql/kafka010/KafkaSourceSuite.scala | 24 1 file changed, 24 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a84edb2/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala -- diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index ed4cc75..89e713f 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -306,6 +306,30 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("starting offset is latest by default") { +val topic = newTopic() +testUtils.createTopic(topic, partitions = 5) +testUtils.sendMessages(topic, Array("0")) +require(testUtils.getLatestOffsets(Set(topic)).size === 5) + +val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + +val kafka = reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] +val mapped = kafka.map(_.toInt) + +testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(1, 2, 3) // should not have 0 +) + } + test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18125][SQL][BRANCH-2.0] Fix a compilation error in codegen due to splitExpression
Repository: spark Updated Branches: refs/heads/branch-2.0 dd5cb0a98 -> b5d7217af [SPARK-18125][SQL][BRANCH-2.0] Fix a compilation error in codegen due to splitExpression ## What changes were proposed in this pull request? Backport to branch 2.0. As reported in the jira, sometimes the generated java code in codegen will cause compilation error. Code snippet to test it: case class Route(src: String, dest: String, cost: Int) case class GroupedRoutes(src: String, dest: String, routes: Seq[Route]) val ds = sc.parallelize(Array( Route("a", "b", 1), Route("a", "b", 2), Route("a", "c", 2), Route("a", "d", 10), Route("b", "a", 1), Route("b", "a", 5), Route("b", "c", 6)) ).toDF.as[Route] val grped = ds.map(r => GroupedRoutes(r.src, r.dest, Seq(r))) .groupByKey(r => (r.src, r.dest)) .reduceGroups { (g1: GroupedRoutes, g2: GroupedRoutes) => GroupedRoutes(g1.src, g1.dest, g1.routes ++ g2.routes) }.map(_._2) The problem here is, in `ReferenceToExpressions` we evaluate the children vars to local variables. Then the result expression is evaluated to use those children variables. In the above case, the result expression code is too long and will be split by `CodegenContext.splitExpression`. So those local variables cannot be accessed and cause compilation error. ## How was this patch tested? Jenkins tests. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: Liang-Chi Hsieh Closes #15796 from viirya/fix-codege-compilation-error-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5d7217a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5d7217a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5d7217a Branch: refs/heads/branch-2.0 Commit: b5d7217aff80c4c407672dc1858c824954953b1d Parents: dd5cb0a Author: Liang-Chi Hsieh Authored: Mon Nov 7 19:09:18 2016 +0100 Committer: Herman van Hovell Committed: Mon Nov 7 19:09:18 2016 +0100 -- .../expressions/ReferenceToExpressions.scala| 28 +++ .../org/apache/spark/sql/DatasetSuite.scala | 37 2 files changed, 59 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5d7217a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala index 502d791..6c75a7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala @@ -45,6 +45,7 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal + case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + @@ -62,15 +63,30 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val childrenGen = children.map(_.genCode(ctx)) -val childrenVars = childrenGen.zip(children).map { - case (childGen, child) => LambdaVariable(childGen.value, childGen.isNull, child.dataType) -} +val (classChildrenVars, initClassChildrenVars) = 
childrenGen.zip(children).map { + case (childGen, child) => +// SPARK-18125: The children vars are local variables. If the result expression uses +// splitExpression, those variables cannot be accessed so compilation fails. +// To fix it, we use class variables to hold those local variables. +val classChildVarName = ctx.freshName("classChildVar") +val classChildVarIsNull = ctx.freshName("classChildVarIsNull") +ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "") +ctx.addMutableState("boolean", classChildVarIsNull, "") + +val classChildVar = + LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType) + +val initCode = s"${classChildVar.value} = ${childGen.value};\n" + + s"${classChildVar.isNull} = ${childGen.isNull};" + +(classChildVar, initCode) +}.unzip val resultGen = result.transform { - case b: BoundReference => childrenVars(b.
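Abstracting away from Catalyst's generated Java, the failure mode and the fix come down to variable scope. A standalone Scala sketch (toy code, not the codegen itself): a method split out of a long body cannot see the caller's local variables, but it can see members of the enclosing class, which is what the patch switches to.

```scala
// Illustrative only: why splitting a long method breaks access to locals but not to fields.
object SplitDemo {
  // Before (conceptually): a split-out helper cannot see the caller's local variable.
  //   def compute(): Int = { val localChildVar = 42; helper() }
  //   private def helper(): Int = localChildVar + 1   // does not compile: not in scope

  // After: the value lives in a member ("class variable"), visible to every helper method.
  private var classChildVar: Int = _
  def compute(): Int = { classChildVar = 42; helper() }
  private def helper(): Int = classChildVar + 1
}

// SplitDemo.compute() == 43
```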
spark git commit: [SPARK-18291][SPARKR][ML] SparkR glm predict should output original label when family = binomial.
Repository: spark Updated Branches: refs/heads/branch-2.1 df40ee2b4 -> 6b332909f [SPARK-18291][SPARKR][ML] SparkR glm predict should output original label when family = binomial. ## What changes were proposed in this pull request? SparkR ```spark.glm``` predict should output original label when family = "binomial". ## How was this patch tested? Add unit test. You can also run the following code to test: ```R training <- suppressWarnings(createDataFrame(iris)) training <- training[training$Species %in% c("versicolor", "virginica"), ] model <- spark.glm(training, Species ~ Sepal_Length + Sepal_Width,family = binomial(link = "logit")) showDF(predict(model, training)) ``` Before this change: ``` ++---++---+--+-+---+ |Sepal_Length|Sepal_Width|Petal_Length|Petal_Width| Species|label| prediction| ++---++---+--+-+---+ | 7.0|3.2| 4.7|1.4|versicolor| 0.0| 0.8271421517601544| | 6.4|3.2| 4.5|1.5|versicolor| 0.0| 0.6044595910413112| | 6.9|3.1| 4.9|1.5|versicolor| 0.0| 0.7916340858281998| | 5.5|2.3| 4.0|1.3|versicolor| 0.0|0.16080518180591158| | 6.5|2.8| 4.6|1.5|versicolor| 0.0| 0.6112229217050189| | 5.7|2.8| 4.5|1.3|versicolor| 0.0| 0.2555087295500885| | 6.3|3.3| 4.7|1.6|versicolor| 0.0| 0.5681507664364834| | 4.9|2.4| 3.3|1.0|versicolor| 0.0|0.05990570219972002| | 6.6|2.9| 4.6|1.3|versicolor| 0.0| 0.6644434078306246| | 5.2|2.7| 3.9|1.4|versicolor| 0.0|0.11293577405862379| | 5.0|2.0| 3.5|1.0|versicolor| 0.0|0.06152372321585971| | 5.9|3.0| 4.2|1.5|versicolor| 0.0|0.35250697207602555| | 6.0|2.2| 4.0|1.0|versicolor| 0.0|0.32267018290814303| | 6.1|2.9| 4.7|1.4|versicolor| 0.0| 0.433391153814592| | 5.6|2.9| 3.6|1.3|versicolor| 0.0| 0.2280744262436993| | 6.7|3.1| 4.4|1.4|versicolor| 0.0| 0.7219848389339459| | 5.6|3.0| 4.5|1.5|versicolor| 0.0|0.23527698971404695| | 5.8|2.7| 4.1|1.0|versicolor| 0.0| 0.285024533520016| | 6.2|2.2| 4.5|1.5|versicolor| 0.0| 0.4107047877447493| | 5.6|2.5| 3.9|1.1|versicolor| 0.0|0.20083561961645083| ++---++---+--+-+---+ ``` After this change: ``` ++---++---+--+-+--+ |Sepal_Length|Sepal_Width|Petal_Length|Petal_Width| Species|label|prediction| ++---++---+--+-+--+ | 7.0|3.2| 4.7|1.4|versicolor| 0.0| virginica| | 6.4|3.2| 4.5|1.5|versicolor| 0.0| virginica| | 6.9|3.1| 4.9|1.5|versicolor| 0.0| virginica| | 5.5|2.3| 4.0|1.3|versicolor| 0.0|versicolor| | 6.5|2.8| 4.6|1.5|versicolor| 0.0| virginica| | 5.7|2.8| 4.5|1.3|versicolor| 0.0|versicolor| | 6.3|3.3| 4.7|1.6|versicolor| 0.0| virginica| | 4.9|2.4| 3.3|1.0|versicolor| 0.0|versicolor| | 6.6|2.9| 4.6|1.3|versicolor| 0.0| virginica| | 5.2|2.7| 3.9|1.4|versicolor| 0.0|versicolor| | 5.0|2.0| 3.5|1.0|versicolor| 0.0|versicolor| | 5.9|3.0| 4.2|1.5|versicolor| 0.0|versicolor| | 6.0|2.2| 4.0|1.0|versicolor| 0.0|versicolor| | 6.1|2.9| 4.7|1.4|versicolor| 0.0|versicolor| | 5.6|2.9| 3.6|1.3|versicolor| 0.0|versicolor| | 6.7|3.1| 4.4|1.4|versicolor| 0.0| virginica| | 5.6|3.0| 4.5|1.5|versicolor| 0.0|versicolor| | 5.8|2.7| 4.1|1.0|versicolor| 0.0|versicolor| | 6.2|2.2| 4.5|1.5|versicolor| 0.0|versicolor| | 5.6|2.5| 3.9|1.1|versicolor| 0.0|versicolor| ++---++---+--+-+--+ ``` Author: Yanbo Liang Closes #15788 from yanboliang/spark-18291. (cherry picked from commit daa975f4bfa4f904697bf3365a4be9987032e490) Signed-off-by: Yanbo Liang Project: http:/
spark git commit: [SPARK-18291][SPARKR][ML] SparkR glm predict should output original label when family = binomial.
Repository: spark Updated Branches: refs/heads/master a814eeac6 -> daa975f4b [SPARK-18291][SPARKR][ML] SparkR glm predict should output original label when family = binomial. ## What changes were proposed in this pull request? SparkR ```spark.glm``` predict should output original label when family = "binomial". ## How was this patch tested? Add unit test. You can also run the following code to test: ```R training <- suppressWarnings(createDataFrame(iris)) training <- training[training$Species %in% c("versicolor", "virginica"), ] model <- spark.glm(training, Species ~ Sepal_Length + Sepal_Width,family = binomial(link = "logit")) showDF(predict(model, training)) ``` Before this change: ``` ++---++---+--+-+---+ |Sepal_Length|Sepal_Width|Petal_Length|Petal_Width| Species|label| prediction| ++---++---+--+-+---+ | 7.0|3.2| 4.7|1.4|versicolor| 0.0| 0.8271421517601544| | 6.4|3.2| 4.5|1.5|versicolor| 0.0| 0.6044595910413112| | 6.9|3.1| 4.9|1.5|versicolor| 0.0| 0.7916340858281998| | 5.5|2.3| 4.0|1.3|versicolor| 0.0|0.16080518180591158| | 6.5|2.8| 4.6|1.5|versicolor| 0.0| 0.6112229217050189| | 5.7|2.8| 4.5|1.3|versicolor| 0.0| 0.2555087295500885| | 6.3|3.3| 4.7|1.6|versicolor| 0.0| 0.5681507664364834| | 4.9|2.4| 3.3|1.0|versicolor| 0.0|0.05990570219972002| | 6.6|2.9| 4.6|1.3|versicolor| 0.0| 0.6644434078306246| | 5.2|2.7| 3.9|1.4|versicolor| 0.0|0.11293577405862379| | 5.0|2.0| 3.5|1.0|versicolor| 0.0|0.06152372321585971| | 5.9|3.0| 4.2|1.5|versicolor| 0.0|0.35250697207602555| | 6.0|2.2| 4.0|1.0|versicolor| 0.0|0.32267018290814303| | 6.1|2.9| 4.7|1.4|versicolor| 0.0| 0.433391153814592| | 5.6|2.9| 3.6|1.3|versicolor| 0.0| 0.2280744262436993| | 6.7|3.1| 4.4|1.4|versicolor| 0.0| 0.7219848389339459| | 5.6|3.0| 4.5|1.5|versicolor| 0.0|0.23527698971404695| | 5.8|2.7| 4.1|1.0|versicolor| 0.0| 0.285024533520016| | 6.2|2.2| 4.5|1.5|versicolor| 0.0| 0.4107047877447493| | 5.6|2.5| 3.9|1.1|versicolor| 0.0|0.20083561961645083| ++---++---+--+-+---+ ``` After this change: ``` ++---++---+--+-+--+ |Sepal_Length|Sepal_Width|Petal_Length|Petal_Width| Species|label|prediction| ++---++---+--+-+--+ | 7.0|3.2| 4.7|1.4|versicolor| 0.0| virginica| | 6.4|3.2| 4.5|1.5|versicolor| 0.0| virginica| | 6.9|3.1| 4.9|1.5|versicolor| 0.0| virginica| | 5.5|2.3| 4.0|1.3|versicolor| 0.0|versicolor| | 6.5|2.8| 4.6|1.5|versicolor| 0.0| virginica| | 5.7|2.8| 4.5|1.3|versicolor| 0.0|versicolor| | 6.3|3.3| 4.7|1.6|versicolor| 0.0| virginica| | 4.9|2.4| 3.3|1.0|versicolor| 0.0|versicolor| | 6.6|2.9| 4.6|1.3|versicolor| 0.0| virginica| | 5.2|2.7| 3.9|1.4|versicolor| 0.0|versicolor| | 5.0|2.0| 3.5|1.0|versicolor| 0.0|versicolor| | 5.9|3.0| 4.2|1.5|versicolor| 0.0|versicolor| | 6.0|2.2| 4.0|1.0|versicolor| 0.0|versicolor| | 6.1|2.9| 4.7|1.4|versicolor| 0.0|versicolor| | 5.6|2.9| 3.6|1.3|versicolor| 0.0|versicolor| | 6.7|3.1| 4.4|1.4|versicolor| 0.0| virginica| | 5.6|3.0| 4.5|1.5|versicolor| 0.0|versicolor| | 5.8|2.7| 4.1|1.0|versicolor| 0.0|versicolor| | 6.2|2.2| 4.5|1.5|versicolor| 0.0|versicolor| | 5.6|2.5| 3.9|1.1|versicolor| 0.0|versicolor| ++---++---+--+-+--+ ``` Author: Yanbo Liang Closes #15788 from yanboliang/spark-18291. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commi
spark git commit: [SPARK-18125][SQL] Fix a compilation error in codegen due to splitExpression
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 410102957 -> df40ee2b4

[SPARK-18125][SQL] Fix a compilation error in codegen due to splitExpression

## What changes were proposed in this pull request?

As reported in the JIRA ticket, the Java code produced by codegen can sometimes fail to compile. Code snippet to reproduce it:

    case class Route(src: String, dest: String, cost: Int)
    case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])

    val ds = sc.parallelize(Array(
      Route("a", "b", 1),
      Route("a", "b", 2),
      Route("a", "c", 2),
      Route("a", "d", 10),
      Route("b", "a", 1),
      Route("b", "a", 5),
      Route("b", "c", 6))
    ).toDF.as[Route]

    val grped = ds.map(r => GroupedRoutes(r.src, r.dest, Seq(r)))
      .groupByKey(r => (r.src, r.dest))
      .reduceGroups { (g1: GroupedRoutes, g2: GroupedRoutes) =>
        GroupedRoutes(g1.src, g1.dest, g1.routes ++ g2.routes)
      }.map(_._2)

The problem is that `ReferenceToExpressions` evaluates the children expressions into local variables, and the result expression is then generated against those variables. In the case above, the result expression's code is long enough to be split by `CodegenContext.splitExpression`, so the local variables can no longer be accessed and compilation fails.

## How was this patch tested?

Jenkins tests.

Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request.

Author: Liang-Chi Hsieh

Closes #15693 from viirya/fix-codege-compilation-error.

(cherry picked from commit a814eeac6b3c38d1294b88c60cd083fc4d01bd25)
Signed-off-by: Herman van Hovell

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df40ee2b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df40ee2b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df40ee2b

Branch: refs/heads/branch-2.1
Commit: df40ee2b483989a47cb85d248280cc02f527112d
Parents: 4101029
Author: Liang-Chi Hsieh
Authored: Mon Nov 7 12:18:19 2016 +0100
Committer: Herman van Hovell
Committed: Mon Nov 7 12:18:34 2016 +0100

--
 .../expressions/ReferenceToExpressions.scala    | 27 ++
 .../org/apache/spark/sql/DatasetSuite.scala     | 37
 2 files changed, 58 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/df40ee2b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
index 127797c..6c75a7a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
@@ -63,15 +63,30 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression])
   override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val childrenGen = children.map(_.genCode(ctx))
-    val childrenVars = childrenGen.zip(children).map {
-      case (childGen, child) => LambdaVariable(childGen.value, childGen.isNull, child.dataType)
-    }
+    val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map {
+      case (childGen, child) =>
+        // SPARK-18125: The children vars are local variables. If the result expression uses
+        // splitExpression, those variables cannot be accessed so compilation fails.
+        // To fix it, we use class variables to hold those local variables.
+        val classChildVarName = ctx.freshName("classChildVar")
+        val classChildVarIsNull = ctx.freshName("classChildVarIsNull")
+        ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "")
+        ctx.addMutableState("boolean", classChildVarIsNull, "")
+
+        val classChildVar =
+          LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType)
+
+        val initCode = s"${classChildVar.value} = ${childGen.value};\n" +
+          s"${classChildVar.isNull} = ${childGen.isNull};"
+
+        (classChildVar, initCode)
+    }.unzip

     val resultGen = result.transform {
-      case b: BoundReference => childrenVars(b.ordinal)
+      case b: BoundReference => classChildrenVars(b.ordinal)
     }.genCode(ctx)

-    ExprCode(code = childrenGen.map(_.code).mkString("\n") + "\n" + resultGen.code,
-      isNull = resultGen.isNull, value = resultGen.value)
+    ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\
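For context, one way to exercise the fixed code path from a spark-shell session is to run the reproduction snippet from the description above and check the grouped result. The expected contents below are inferred from the sample input data in that snippet, not taken from the patch itself:

    // Assumes the Route / GroupedRoutes case classes and the `grped` Dataset
    // defined in the reproduction snippet above are already in scope.
    val result = grped.collect()
      .map(g => ((g.src, g.dest), g.routes.map(_.cost).toSet))
      .toMap

    assert(result(("a", "b")) == Set(1, 2))  // the two a->b routes are merged into one group
    assert(result(("b", "a")) == Set(1, 5))
    assert(result(("a", "d")) == Set(10))

Before the fix, simply materializing `grped` could fail with a Janino compilation error; with the class-level variables in place, collecting the Dataset should succeed.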
spark git commit: [SPARK-18125][SQL] Fix a compilation error in codegen due to splitExpression
Repository: spark
Updated Branches:
  refs/heads/master 57626a557 -> a814eeac6

[SPARK-18125][SQL] Fix a compilation error in codegen due to splitExpression

## What changes were proposed in this pull request?

As reported in the JIRA ticket, the Java code produced by codegen can sometimes fail to compile. Code snippet to reproduce it:

    case class Route(src: String, dest: String, cost: Int)
    case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])

    val ds = sc.parallelize(Array(
      Route("a", "b", 1),
      Route("a", "b", 2),
      Route("a", "c", 2),
      Route("a", "d", 10),
      Route("b", "a", 1),
      Route("b", "a", 5),
      Route("b", "c", 6))
    ).toDF.as[Route]

    val grped = ds.map(r => GroupedRoutes(r.src, r.dest, Seq(r)))
      .groupByKey(r => (r.src, r.dest))
      .reduceGroups { (g1: GroupedRoutes, g2: GroupedRoutes) =>
        GroupedRoutes(g1.src, g1.dest, g1.routes ++ g2.routes)
      }.map(_._2)

The problem is that `ReferenceToExpressions` evaluates the children expressions into local variables, and the result expression is then generated against those variables. In the case above, the result expression's code is long enough to be split by `CodegenContext.splitExpression`, so the local variables can no longer be accessed and compilation fails.

## How was this patch tested?

Jenkins tests.

Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request.

Author: Liang-Chi Hsieh

Closes #15693 from viirya/fix-codege-compilation-error.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a814eeac
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a814eeac
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a814eeac

Branch: refs/heads/master
Commit: a814eeac6b3c38d1294b88c60cd083fc4d01bd25
Parents: 57626a5
Author: Liang-Chi Hsieh
Authored: Mon Nov 7 12:18:19 2016 +0100
Committer: Herman van Hovell
Committed: Mon Nov 7 12:18:19 2016 +0100

--
 .../expressions/ReferenceToExpressions.scala    | 27 ++
 .../org/apache/spark/sql/DatasetSuite.scala     | 37
 2 files changed, 58 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a814eeac/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
index 127797c..6c75a7a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala
@@ -63,15 +63,30 @@ case class ReferenceToExpressions(result: Expression, children: Seq[Expression])
   override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val childrenGen = children.map(_.genCode(ctx))
-    val childrenVars = childrenGen.zip(children).map {
-      case (childGen, child) => LambdaVariable(childGen.value, childGen.isNull, child.dataType)
-    }
+    val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map {
+      case (childGen, child) =>
+        // SPARK-18125: The children vars are local variables. If the result expression uses
+        // splitExpression, those variables cannot be accessed so compilation fails.
+        // To fix it, we use class variables to hold those local variables.
+        val classChildVarName = ctx.freshName("classChildVar")
+        val classChildVarIsNull = ctx.freshName("classChildVarIsNull")
+        ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "")
+        ctx.addMutableState("boolean", classChildVarIsNull, "")
+
+        val classChildVar =
+          LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType)
+
+        val initCode = s"${classChildVar.value} = ${childGen.value};\n" +
+          s"${classChildVar.isNull} = ${childGen.isNull};"
+
+        (classChildVar, initCode)
+    }.unzip

     val resultGen = result.transform {
-      case b: BoundReference => childrenVars(b.ordinal)
+      case b: BoundReference => classChildrenVars(b.ordinal)
     }.genCode(ctx)

-    ExprCode(code = childrenGen.map(_.code).mkString("\n") + "\n" + resultGen.code,
-      isNull = resultGen.isNull, value = resultGen.value)
+    ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\n") +
+      resultGen.code, isNull = resultGen.isNull, value = resultGen.value)
   }
 }

http://git-wip-us.apac
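The diffstat above also lists a 37-line addition to DatasetSuite.scala. A sketch of what such a regression test could look like follows; the suite name, exact input data, and assertions are assumptions for illustration, not the code actually added by the patch:

    // Hypothetical regression-test sketch in the style of Spark's SQL test suites.
    import org.apache.spark.sql.QueryTest
    import org.apache.spark.sql.test.SharedSQLContext

    case class Route(src: String, dest: String, cost: Int)
    case class GroupedRoutes(src: String, dest: String, routes: Seq[Route])

    class Spark18125RegressionSuite extends QueryTest with SharedSQLContext {
      import testImplicits._

      test("SPARK-18125: splitExpression should not break ReferenceToExpressions") {
        val ds = Seq(Route("a", "b", 1), Route("a", "b", 2), Route("b", "a", 5)).toDS()

        val grouped = ds
          .map(r => GroupedRoutes(r.src, r.dest, Seq(r)))
          .groupByKey(r => (r.src, r.dest))
          .reduceGroups { (g1: GroupedRoutes, g2: GroupedRoutes) =>
            GroupedRoutes(g1.src, g1.dest, g1.routes ++ g2.routes)
          }
          .map(_._2)

        // Collecting the result exercises the generated code path that used to fail.
        val costSums = grouped.collect().map(_.routes.map(_.cost).sum).sorted.toSeq
        assert(costSums === Seq(3, 5))
      }
    }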
spark git commit: [SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 2fa1a632a -> 410102957

[SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry

### What changes were proposed in this pull request?

The Hive built-in `hash` function has not been used by Spark since Spark 2.0, because the public interface does not allow users to unregister Spark's built-in functions; users therefore never reach Hive's `hash`. The only exception is `TestHiveFunctionRegistry`, which allows unregistering built-in functions, so Hive's hash function can still be loaded in the test cases. If it is disabled, 10+ test cases fail because the results differ from the Hive golden answer files.

This PR removes `hash` from the list of `hiveFunctions` in `HiveSessionCatalog` and also removes `TestHiveFunctionRegistry`. This removal makes it easier to remove `TestHiveSessionState` in the future.

### How was this patch tested?

N/A

Author: gatorsmile

Closes #14498 from gatorsmile/removeHash.

(cherry picked from commit 57626a55703a189e03148398f67c36cd0e557044)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/41010295
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/41010295
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/41010295

Branch: refs/heads/branch-2.1
Commit: 4101029579de920215b426ca6537c1f0e4e4e5ae
Parents: 2fa1a63
Author: gatorsmile
Authored: Mon Nov 7 01:16:37 2016 -0800
Committer: Reynold Xin
Committed: Mon Nov 7 01:16:43 2016 -0800

--
 .../hive/execution/HiveCompatibilitySuite.scala | 41 ++--
 .../spark/sql/hive/HiveSessionCatalog.scala     |  1 -
 .../apache/spark/sql/hive/test/TestHive.scala   | 28 -
 3 files changed, 20 insertions(+), 50 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/41010295/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
--
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f5d10de..5cd4935 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -57,8 +57,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
     // Enable in-memory partition pruning for testing purposes
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
-    // Use Hive hash expression instead of the native one
-    TestHive.sessionState.functionRegistry.unregisterFunction("hash")
     // Ensures that the plans generation use metastore relation and not OrcRelation
     // Was done because SqlBuilder does not work with plans having logical relation
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, false)
@@ -76,7 +74,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, originalConvertMetastoreOrc)
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
-    TestHive.sessionState.functionRegistry.restore()

     // For debugging dump some statistics about how much time was spent in various optimizer rules
     logWarning(RuleExecutor.dumpTimeSpent())
@@ -581,7 +578,26 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "auto_join6",
     "auto_join7",
     "auto_join8",
-    "auto_join9"
+    "auto_join9",
+
+    // These tests are based on the Hive's hash function, which is different from Spark
+    "auto_join19",
+    "auto_join22",
+    "auto_join25",
+    "auto_join26",
+    "auto_join27",
+    "auto_join28",
+    "auto_join30",
+    "auto_join31",
+    "auto_join_nulls",
+    "auto_join_reordering_values",
+    "correlationoptimizer1",
+    "correlationoptimizer2",
+    "correlationoptimizer3",
+    "correlationoptimizer4",
+    "multiMapJoin1",
+    "orc_dictionary_threshold",
+    "udf_hash"
   )

   /**
@@ -601,16 +617,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "annotate_stats_part",
     "annotate_stats_table",
     "annotate_stats_union",
-    "auto_join19",
-    "auto_join22",
-    "auto_join25",
-    "auto_join26",
-    "auto_join27",
-    "auto_join28",
-    "au
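With Hive's version no longer registered, the name `hash` resolves to Spark's native Murmur3-based expression even in a Hive-enabled session. A quick sanity check from spark-shell, assuming a SparkSession named `spark` (output values depend on the hash implementation and are omitted here):

    // `hash` now always refers to Spark's built-in expression, not Hive's UDF.
    spark.sql("DESCRIBE FUNCTION hash").show(truncate = false)
    spark.sql("SELECT hash('Spark', 42) AS h").show()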
spark git commit: [SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry
Repository: spark
Updated Branches:
  refs/heads/master 9db06c442 -> 57626a557

[SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry

### What changes were proposed in this pull request?

The Hive built-in `hash` function has not been used by Spark since Spark 2.0, because the public interface does not allow users to unregister Spark's built-in functions; users therefore never reach Hive's `hash`. The only exception is `TestHiveFunctionRegistry`, which allows unregistering built-in functions, so Hive's hash function can still be loaded in the test cases. If it is disabled, 10+ test cases fail because the results differ from the Hive golden answer files.

This PR removes `hash` from the list of `hiveFunctions` in `HiveSessionCatalog` and also removes `TestHiveFunctionRegistry`. This removal makes it easier to remove `TestHiveSessionState` in the future.

### How was this patch tested?

N/A

Author: gatorsmile

Closes #14498 from gatorsmile/removeHash.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57626a55
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57626a55
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57626a55

Branch: refs/heads/master
Commit: 57626a55703a189e03148398f67c36cd0e557044
Parents: 9db06c4
Author: gatorsmile
Authored: Mon Nov 7 01:16:37 2016 -0800
Committer: Reynold Xin
Committed: Mon Nov 7 01:16:37 2016 -0800

--
 .../hive/execution/HiveCompatibilitySuite.scala | 41 ++--
 .../spark/sql/hive/HiveSessionCatalog.scala     |  1 -
 .../apache/spark/sql/hive/test/TestHive.scala   | 28 -
 3 files changed, 20 insertions(+), 50 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/57626a55/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
--
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f5d10de..5cd4935 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -57,8 +57,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
     // Enable in-memory partition pruning for testing purposes
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
-    // Use Hive hash expression instead of the native one
-    TestHive.sessionState.functionRegistry.unregisterFunction("hash")
     // Ensures that the plans generation use metastore relation and not OrcRelation
     // Was done because SqlBuilder does not work with plans having logical relation
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, false)
@@ -76,7 +74,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, originalConvertMetastoreOrc)
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
-    TestHive.sessionState.functionRegistry.restore()

     // For debugging dump some statistics about how much time was spent in various optimizer rules
     logWarning(RuleExecutor.dumpTimeSpent())
@@ -581,7 +578,26 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "auto_join6",
     "auto_join7",
     "auto_join8",
-    "auto_join9"
+    "auto_join9",
+
+    // These tests are based on the Hive's hash function, which is different from Spark
+    "auto_join19",
+    "auto_join22",
+    "auto_join25",
+    "auto_join26",
+    "auto_join27",
+    "auto_join28",
+    "auto_join30",
+    "auto_join31",
+    "auto_join_nulls",
+    "auto_join_reordering_values",
+    "correlationoptimizer1",
+    "correlationoptimizer2",
+    "correlationoptimizer3",
+    "correlationoptimizer4",
+    "multiMapJoin1",
+    "orc_dictionary_threshold",
+    "udf_hash"
   )

   /**
@@ -601,16 +617,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "annotate_stats_part",
     "annotate_stats_table",
     "annotate_stats_union",
-    "auto_join19",
-    "auto_join22",
-    "auto_join25",
-    "auto_join26",
-    "auto_join27",
-    "auto_join28",
-    "auto_join30",
-    "auto_join31",
-    "auto_join_nulls",
-    "auto_join_reordering_values",
     "binary_constant",
     "bina
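The same built-in expression backs the DataFrame API as well. A minimal sketch, assuming a SparkSession named `spark` is in scope:

    import org.apache.spark.sql.functions.hash
    import spark.implicits._

    // hash(...) uses Spark's Murmur3-based implementation, independent of Hive.
    val df = Seq("a", "b", "c").toDF("value")
    df.select($"value", hash($"value").as("hashed")).show()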