spark git commit: [SPARK-21042][SQL] Document Dataset.union is resolution by position
Repository: spark Updated Branches: refs/heads/branch-2.2 869af5bcb -> 815a0820b [SPARK-21042][SQL] Document Dataset.union is resolution by position ## What changes were proposed in this pull request? Document that Dataset.union resolves columns by position, not by name, since this has been a confusing point for many users. ## How was this patch tested? N/A - doc-only change. Author: Reynold Xin Closes #18256 from rxin/SPARK-21042. (cherry picked from commit b78e3849b20d0d09b7146efd7ce8f203ef67b890) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/815a0820 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/815a0820 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/815a0820 Branch: refs/heads/branch-2.2 Commit: 815a0820b1808118ae198a44f4aa0f0f2b6511e6 Parents: 869af5b Author: Reynold Xin Authored: Fri Jun 9 18:29:33 2017 -0700 Committer: Reynold Xin Committed: Fri Jun 9 18:29:39 2017 -0700 -- R/pkg/R/DataFrame.R | 1 + python/pyspark/sql/dataframe.py | 13 + .../src/main/scala/org/apache/spark/sql/Dataset.scala | 14 -- 3 files changed, 18 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/815a0820/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a7b1e3b..b606f1f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' Input SparkDataFrames can have different schemas (names and data types). #' #' Note: This does not remove duplicate rows across the two SparkDataFrames. +#' Also as standard in SQL, this function resolves columns by position (not by name). #' #' @param x A SparkDataFrame #' @param y A SparkDataFrame http://git-wip-us.apache.org/repos/asf/spark/blob/815a0820/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index b1eb80e..d1b336d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1166,18 +1166,23 @@ class DataFrame(object): @since(2.0) def union(self, other): -""" Return a new :class:`DataFrame` containing union of rows in this -frame and another frame. +""" Return a new :class:`DataFrame` containing union of rows in this and another frame. This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by a distinct. + +Also as standard in SQL, this function resolves columns by position (not by name). """ return DataFrame(self._jdf.union(other._jdf), self.sql_ctx) @since(1.3) def unionAll(self, other): -""" Return a new :class:`DataFrame` containing union of rows in this -frame and another frame. +""" Return a new :class:`DataFrame` containing union of rows in this and another frame. + +This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union +(that does deduplication of elements), use this function followed by a distinct. + +Also as standard in SQL, this function resolves columns by position (not by name). .. note:: Deprecated in 2.0, use union instead. 
""" http://git-wip-us.apache.org/repos/asf/spark/blob/815a0820/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index f37d433..3658890 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1630,10 +1630,11 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing union of rows in this Dataset and another Dataset. - * This is equivalent to `UNION ALL` in SQL. * - * To do a SQL-style set union (that does deduplication of elements), use this function followed - * by a [[distinct]]. + * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does + * deduplication of elements), use this function followed by a [[distinct]]. + * + * Also as standard in SQL, this function resolves columns by position (not by name). * * @group typedrel * @since 2.0.0 @@ -1643,10 +1644,11 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing union of rows in this Dataset
spark git commit: [SPARK-21042][SQL] Document Dataset.union is resolution by position
Repository: spark Updated Branches: refs/heads/master 571635488 -> b78e3849b [SPARK-21042][SQL] Document Dataset.union is resolution by position ## What changes were proposed in this pull request? Document that Dataset.union resolves columns by position, not by name, since this has been a confusing point for many users. ## How was this patch tested? N/A - doc-only change. Author: Reynold Xin Closes #18256 from rxin/SPARK-21042. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b78e3849 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b78e3849 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b78e3849 Branch: refs/heads/master Commit: b78e3849b20d0d09b7146efd7ce8f203ef67b890 Parents: 5716354 Author: Reynold Xin Authored: Fri Jun 9 18:29:33 2017 -0700 Committer: Reynold Xin Committed: Fri Jun 9 18:29:33 2017 -0700 -- R/pkg/R/DataFrame.R | 1 + python/pyspark/sql/dataframe.py | 13 + .../src/main/scala/org/apache/spark/sql/Dataset.scala | 14 -- 3 files changed, 18 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b78e3849/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 166b398..3b9d42d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2646,6 +2646,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' Input SparkDataFrames can have different schemas (names and data types). #' #' Note: This does not remove duplicate rows across the two SparkDataFrames. +#' Also as standard in SQL, this function resolves columns by position (not by name). #' #' @param x A SparkDataFrame #' @param y A SparkDataFrame http://git-wip-us.apache.org/repos/asf/spark/blob/b78e3849/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 99abfcc..8541403 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1175,18 +1175,23 @@ class DataFrame(object): @since(2.0) def union(self, other): -""" Return a new :class:`DataFrame` containing union of rows in this -frame and another frame. +""" Return a new :class:`DataFrame` containing union of rows in this and another frame. This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by a distinct. + +Also as standard in SQL, this function resolves columns by position (not by name). """ return DataFrame(self._jdf.union(other._jdf), self.sql_ctx) @since(1.3) def unionAll(self, other): -""" Return a new :class:`DataFrame` containing union of rows in this -frame and another frame. +""" Return a new :class:`DataFrame` containing union of rows in this and another frame. + +This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union +(that does deduplication of elements), use this function followed by a distinct. + +Also as standard in SQL, this function resolves columns by position (not by name). .. note:: Deprecated in 2.0, use union instead. 
""" http://git-wip-us.apache.org/repos/asf/spark/blob/b78e3849/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index f7637e0..d28ff78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1734,10 +1734,11 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing union of rows in this Dataset and another Dataset. - * This is equivalent to `UNION ALL` in SQL. * - * To do a SQL-style set union (that does deduplication of elements), use this function followed - * by a [[distinct]]. + * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does + * deduplication of elements), use this function followed by a [[distinct]]. + * + * Also as standard in SQL, this function resolves columns by position (not by name). * * @group typedrel * @since 2.0.0 @@ -1747,10 +1748,11 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing union of rows in this Dataset and another Dataset. - * This is equivalent to `UNION ALL` in SQL. * - * To do a SQL-style set u
spark git commit: [SPARK-20918][SQL] Use FunctionIdentifier as function identifiers in FunctionRegistry
Repository: spark Updated Branches: refs/heads/master 82faacd79 -> 571635488 [SPARK-20918][SQL] Use FunctionIdentifier as function identifiers in FunctionRegistry ### What changes were proposed in this pull request? Currently, the unquoted string of a function identifier is being used as the function identifier in the function registry. This could cause incorrect behavior when users use `.` in function names. This PR takes `FunctionIdentifier` as the identifier in the function registry. - Add one new function `createOrReplaceTempFunction` to `FunctionRegistry` ```Scala final def createOrReplaceTempFunction(name: String, builder: FunctionBuilder): Unit ``` ### How was this patch tested? Add extra test cases to verify the included bug fixes. Author: Xiao Li Author: gatorsmile Closes #18142 from gatorsmile/fuctionRegistry. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57163548 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57163548 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57163548 Branch: refs/heads/master Commit: 571635488d6e16eee82f09ae0247c2f6ad5b7541 Parents: 82faacd Author: Xiao Li Authored: Fri Jun 9 10:16:30 2017 -0700 Committer: Wenchen Fan Committed: Fri Jun 9 10:16:30 2017 -0700 -- .../catalyst/analysis/FunctionRegistry.scala| 97 +++--- .../sql/catalyst/catalog/SessionCatalog.scala | 37 --- .../catalyst/catalog/SessionCatalogSuite.scala | 2 +- .../org/apache/spark/sql/UDFRegistration.scala | 100 +-- .../spark/sql/execution/command/functions.scala | 2 +- .../spark/sql/GeneratorFunctionSuite.scala | 3 +- .../org/apache/spark/sql/SQLQuerySuite.scala| 2 +- .../apache/spark/sql/SessionStateSuite.scala| 9 +- .../python/BatchEvalPythonExecSuite.scala | 5 +- .../spark/sql/internal/CatalogSuite.scala | 4 + .../spark/sql/hive/HiveSessionCatalog.scala | 4 +- .../spark/sql/hive/execution/HiveUDFSuite.scala | 13 ++- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- 13 files changed, 162 insertions(+), 118 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57163548/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 116b26f..4245b70 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -17,51 +17,68 @@ package org.apache.spark.sql.catalyst.analysis -import java.lang.reflect.Modifier +import java.util.Locale +import javax.annotation.concurrent.GuardedBy +import scala.collection.mutable import scala.language.existentials import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.xml._ -import org.apache.spark.sql.catalyst.util.StringKeyHashMap import org.apache.spark.sql.types._ /** * A catalog for looking up user defined functions, used by an [[Analyzer]]. 
* - * Note: The implementation should be thread-safe to allow concurrent access. + * Note: + * 1) The implementation should be thread-safe to allow concurrent access. + * 2) the database name is always case-sensitive here, callers are responsible to + * format the database name w.r.t. case-sensitive config. */ trait FunctionRegistry { - final def registerFunction(name: String, builder: FunctionBuilder): Unit = { -registerFunction(name, new ExpressionInfo(builder.getClass.getCanonicalName, name), builder) + final def registerFunction(name: FunctionIdentifier, builder: FunctionBuilder): Unit = { +val info = new ExpressionInfo( + builder.getClass.getCanonicalName, name.database.orNull, name.funcName) +registerFunction(name, info, builder) } - def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder): Unit + def registerFunction( +name: FunctionIdentifier, +info: ExpressionInfo, +builder: FunctionBuilder): Unit + + /* Create or replace a temporary function. */ + final def createOrReplaceTempFunction(name: String, builder: FunctionBuilder): Unit = { +regis
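A small sketch of the ambiguity the `FunctionIdentifier` key removes, assuming the public case-class shape of `FunctionIdentifier` and its `unquotedString` rendering (registry internals elided; not part of the patch above):

```scala
import org.apache.spark.sql.catalyst.FunctionIdentifier

// A temporary function literally named "db1.func"...
val dotted    = FunctionIdentifier("db1.func", None)
// ...versus a function "func" qualified with database "db1".
val qualified = FunctionIdentifier("func", Some("db1"))

// Keyed by the unquoted string -- the old StringKeyHashMap key -- the two
// collide; keyed by FunctionIdentifier they remain distinct values.
assert(dotted.unquotedString == qualified.unquotedString)  // both "db1.func"
assert(dotted != qualified)
```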
spark git commit: [SPARK-20997][CORE] driver-cores' standalone or Mesos or YARN in Cluster deploy mode only.
Repository: spark Updated Branches: refs/heads/master 6491cbf06 -> 82faacd79 [SPARK-20997][CORE] driver-cores' standalone or Mesos or YARN in Cluster deploy mode only. ## What changes were proposed in this pull request? `--driver-cores` applies to standalone, Mesos, or YARN in cluster deploy mode only, so the spark-submit help text describing it was not accurate. ## How was this patch tested? Manual tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Author: 郭小龙 10207633 Author: guoxiaolongzte Closes #18241 from guoxiaolongzte/SPARK-20997. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82faacd7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82faacd7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82faacd7 Branch: refs/heads/master Commit: 82faacd791d1d62bb8ac186a2a3290e160a20bd5 Parents: 6491cbf Author: guoxiaolong Authored: Fri Jun 9 14:26:54 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 14:26:54 2017 +0100 -- .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/82faacd7/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index b76a3d2..3d9a14c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -558,8 +558,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | --verbose, -v Print additional debug output. | --version, Print the version of current Spark. | -| Spark standalone with cluster deploy mode only: -| --driver-cores NUM Cores for driver (Default: 1). +| Cluster deploy mode only: +| --driver-cores NUM Number of cores used by the driver, only in cluster mode +| (Default: 1). | | Spark standalone or Mesos with cluster deploy mode only: | --supervise If given, restarts the driver on failure. @@ -574,8 +575,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | or all available cores on the worker in standalone mode) | | YARN-only: -| --driver-cores NUM Number of cores used by the driver, only in cluster mode -| (Default: 1). | --queue QUEUE_NAME The YARN queue to submit to (Default: "default"). | --num-executors NUM Number of executors to launch (Default: 2). | If dynamic allocation is enabled, the initial number of
[1/2] spark-website git commit: Break out security info into dedicated page, to match other ASF projects who list security information
Repository: spark-website Updated Branches: refs/heads/asf-site 004856aaa -> fda1364c8 http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-0-1.html -- diff --git a/site/releases/spark-release-1-0-1.html b/site/releases/spark-release-1-0-1.html index ee4b01e..7d006b9 100644 --- a/site/releases/spark-release-1-0-1.html +++ b/site/releases/spark-release-1-0-1.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-0-2.html -- diff --git a/site/releases/spark-release-1-0-2.html b/site/releases/spark-release-1-0-2.html index 2b340e4..7478fc1 100644 --- a/site/releases/spark-release-1-0-2.html +++ b/site/releases/spark-release-1-0-2.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-1-0.html -- diff --git a/site/releases/spark-release-1-1-0.html b/site/releases/spark-release-1-1-0.html index f2ac6a2..86c6d8b 100644 --- a/site/releases/spark-release-1-1-0.html +++ b/site/releases/spark-release-1-1-0.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-1-1.html -- diff --git a/site/releases/spark-release-1-1-1.html b/site/releases/spark-release-1-1-1.html index 4ed8548..545a3f8 100644 --- a/site/releases/spark-release-1-1-1.html +++ b/site/releases/spark-release-1-1-1.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-2-0.html -- diff --git a/site/releases/spark-release-1-2-0.html b/site/releases/spark-release-1-2-0.html index 63f3988..54b95bc 100644 --- a/site/releases/spark-release-1-2-0.html +++ b/site/releases/spark-release-1-2-0.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-2-1.html -- diff --git a/site/releases/spark-release-1-2-1.html b/site/releases/spark-release-1-2-1.html index dcc0d48..04afe25 100644 --- a/site/releases/spark-release-1-2-1.html +++ b/site/releases/spark-release-1-2-1.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-2-2.html -- diff --git a/site/releases/spark-release-1-2-2.html b/site/releases/spark-release-1-2-2.html index 2f17201..e126395 100644 --- a/site/releases/spark-release-1-2-2.html +++ b/site/releases/spark-release-1-2-2.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-3-0.html -- diff --git a/site/releases/spark-release-1-3-0.html b/site/releases/spark-release-1-3-0.html index 5bb6ae9..14a4c6a 100644 --- a/site/releases/spark-release-1-3-0.html +++ b/site/releases/spark-release-1-3-0.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security http://git-wip-us.apache.org/repos/asf/spark-website/blob/fda1364c/site/releases/spark-release-1-3-1.html -- diff --git a/site/releases/spark-release-1-3-1.html 
b/site/releases/spark-release-1-3-1.html index a44c8a4..7f7c652 100644 --- a/site/releases/spark-release-1-3-1.html +++ b/site/releases/spark-release-1-3-1.html @@ -133,6 +133,7 @@ Useful Developer Tools Versioning Policy Release Process + Security
[2/2] spark-website git commit: Break out security info into dedicated page, to match other ASF projects who list security information
Break out security info into dedicated page, to match other ASF projects who list security information Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/fda1364c Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/fda1364c Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/fda1364c Branch: refs/heads/asf-site Commit: fda1364c849779048c1e095011c1cbf1209aaf90 Parents: 004856a Author: Sean Owen Authored: Fri Jun 9 11:18:29 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 11:18:29 2017 +0100 -- _layouts/global.html| 1 + community.md| 7 +- security.md | 20 ++ site/committers.html| 1 + site/community.html | 8 +- site/contributing.html | 1 + site/developer-tools.html | 1 + site/documentation.html | 1 + site/downloads.html | 1 + site/examples.html | 1 + site/faq.html | 1 + site/graphx/index.html | 1 + site/improvement-proposals.html | 1 + site/index.html | 1 + site/mailing-lists.html | 3 +- site/mllib/index.html | 1 + site/news/amp-camp-2013-registration-ope.html | 1 + .../news/announcing-the-first-spark-summit.html | 1 + .../news/fourth-spark-screencast-published.html | 1 + site/news/index.html| 1 + site/news/nsdi-paper.html | 1 + site/news/one-month-to-spark-summit-2015.html | 1 + .../proposals-open-for-spark-summit-east.html | 1 + ...registration-open-for-spark-summit-east.html | 1 + .../news/run-spark-and-shark-on-amazon-emr.html | 1 + site/news/spark-0-6-1-and-0-5-2-released.html | 1 + site/news/spark-0-6-2-released.html | 1 + site/news/spark-0-7-0-released.html | 1 + site/news/spark-0-7-2-released.html | 1 + site/news/spark-0-7-3-released.html | 1 + site/news/spark-0-8-0-released.html | 1 + site/news/spark-0-8-1-released.html | 1 + site/news/spark-0-9-0-released.html | 1 + site/news/spark-0-9-1-released.html | 1 + site/news/spark-0-9-2-released.html | 1 + site/news/spark-1-0-0-released.html | 1 + site/news/spark-1-0-1-released.html | 1 + site/news/spark-1-0-2-released.html | 1 + site/news/spark-1-1-0-released.html | 1 + site/news/spark-1-1-1-released.html | 1 + site/news/spark-1-2-0-released.html | 1 + site/news/spark-1-2-1-released.html | 1 + site/news/spark-1-2-2-released.html | 1 + site/news/spark-1-3-0-released.html | 1 + site/news/spark-1-4-0-released.html | 1 + site/news/spark-1-4-1-released.html | 1 + site/news/spark-1-5-0-released.html | 1 + site/news/spark-1-5-1-released.html | 1 + site/news/spark-1-5-2-released.html | 1 + site/news/spark-1-6-0-released.html | 1 + site/news/spark-1-6-1-released.html | 1 + site/news/spark-1-6-2-released.html | 1 + site/news/spark-1-6-3-released.html | 1 + site/news/spark-2-0-0-released.html | 1 + site/news/spark-2-0-1-released.html | 1 + site/news/spark-2-0-2-released.html | 1 + site/news/spark-2-1-0-released.html | 1 + site/news/spark-2-1-1-released.html | 1 + site/news/spark-2.0.0-preview.html | 1 + .../spark-accepted-into-apache-incubator.html | 1 + site/news/spark-and-shark-in-the-news.html | 1 + site/news/spark-becomes-tlp.html| 1 + site/news/spark-featured-in-wired.html | 1 + .../spark-mailing-lists-moving-to-apache.html | 1 + site/news/spark-meetups.html| 1 + site/news/spark-screencasts-published.html | 1 + site/news/spark-summit-2013-is-a-wrap.html | 1 + site/news/spark-summit-2014-videos-posted.html | 1 + site/news/spark-summit-2015-videos-posted.html | 1 + site/news/spark-summit-agenda-posted.html | 1 + .../spark-summit-east-2015-videos-posted.html | 1 + .../spark-summit-east-2016-cfp-closing.html | 1 + 
.../spark-summit-east-2017-agenda-posted.html | 1 + site/news/spark-summit-east-agenda-posted.html | 1 + .../news/spark-summit-europe-agenda-posted.html | 1 + site/news/spark-summit-europe.html
spark git commit: Fix bug in JavaRegressionMetricsExample.
Repository: spark Updated Branches: refs/heads/branch-2.2 714153c79 -> 869af5bcb Fix bug in JavaRegressionMetricsExample. The original code can't visit the last element of the "parts" array, so v[v.length-1] always equals 0. ## What changes were proposed in this pull request? Change the loop range from (1 to parts.length-1) to (1 to parts.length). ## How was this patch tested? Debugged it in Eclipse. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: junzhi lu <452756...@qq.com> Closes #18237 from masterwugui/patch-1. (cherry picked from commit 6491cbf065254e28bca61c9ef55b84f4009ac36c) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/869af5bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/869af5bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/869af5bc Branch: refs/heads/branch-2.2 Commit: 869af5bcb2670857ee7924a3e9798fa3638a5e3a Parents: 714153c Author: junzhi lu <452756...@qq.com> Authored: Fri Jun 9 10:49:04 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 10:49:13 2017 +0100 -- .../apache/spark/examples/mllib/JavaRegressionMetricsExample.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/869af5bc/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java index 7bb9993..00033b5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java @@ -40,7 +40,7 @@ public class JavaRegressionMetricsExample { JavaRDD parsedData = data.map(line -> { String[] parts = line.split(" "); double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) { + for (int i = 1; i < parts.length; i++) { v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); } return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
spark git commit: Fix bug in JavaRegressionMetricsExample.
Repository: spark Updated Branches: refs/heads/master 033839559 -> 6491cbf06 Fix bug in JavaRegressionMetricsExample. The original code can't visit the last element of the "parts" array, so v[v.length-1] always equals 0. ## What changes were proposed in this pull request? Change the loop range from (1 to parts.length-1) to (1 to parts.length). ## How was this patch tested? Debugged it in Eclipse. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: junzhi lu <452756...@qq.com> Closes #18237 from masterwugui/patch-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6491cbf0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6491cbf0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6491cbf0 Branch: refs/heads/master Commit: 6491cbf065254e28bca61c9ef55b84f4009ac36c Parents: 0338395 Author: junzhi lu <452756...@qq.com> Authored: Fri Jun 9 10:49:04 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 10:49:04 2017 +0100 -- .../apache/spark/examples/mllib/JavaRegressionMetricsExample.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6491cbf0/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java index 7bb9993..00033b5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java @@ -40,7 +40,7 @@ public class JavaRegressionMetricsExample { JavaRDD parsedData = data.map(line -> { String[] parts = line.split(" "); double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) { + for (int i = 1; i < parts.length; i++) { v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); } return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
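The off-by-one restated as a small Scala sketch over a hypothetical LIBSVM-style input line (illustration only, not part of the patch):

```scala
val line  = "1.0 1:0.5 2:0.8 3:0.9"          // label followed by index:value pairs
val parts = line.split(" ")                   // length 4: label + 3 features
val v     = new Array[Double](parts.length - 1)

// The old bound, i < parts.length - 1, stopped before parts(3), so the
// last feature slot v(2) kept its default value 0.0.
for (i <- 1 until parts.length) {
  v(i - 1) = parts(i).split(":")(1).toDouble
}
// v now holds Array(0.5, 0.8, 0.9)
```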
spark git commit: Fixed broken link
Repository: spark Updated Branches: refs/heads/master bdcd6e4c6 -> 033839559 Fixed broken link ## What changes were proposed in this pull request? I fixed some incorrect formatting on a link in the docs. ## How was this patch tested? I looked at the markdown preview before and after, and the link was fixed. Before: https://user-images.githubusercontent.com/17733030/26956272-a62cd558-4c79-11e7-862f-9d0e0184b18a.png After: https://user-images.githubusercontent.com/17733030/26956276-b1135ef6-4c79-11e7-8028-84d19c392fda.png Author: Corey Woodfield Closes #18246 from coreywoodfield/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03383955 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03383955 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03383955 Branch: refs/heads/master Commit: 033839559eab280760e3c5687940d8c62cc9c048 Parents: bdcd6e4 Author: Corey Woodfield Authored: Fri Jun 9 10:24:49 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 10:24:49 2017 +0100 -- docs/running-on-mesos.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/03383955/docs/running-on-mesos.md -- diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index c1344ad..8745e76 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -156,7 +156,7 @@ passing in the Mesos master URL (e.g: mesos://host:5050). This starts the `Mesos If you like to run the `MesosClusterDispatcher` with Marathon, you need to run the `MesosClusterDispatcher` in the foreground (i.e: `bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher`). Note that the `MesosClusterDispatcher` not yet supports multiple instances for HA. The `MesosClusterDispatcher` also supports writing recovery state into Zookeeper. This will allow the `MesosClusterDispatcher` to be able to recover all submitted and running containers on relaunch. In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spark-env by configuring `spark.deploy.recoveryMode` and related spark.deploy.zookeeper.* configurations. -For more information about these configurations please refer to the configurations (doc)[configurations.html#deploy]. +For more information about these configurations please refer to the configurations [doc](configurations.html#deploy). From the client, you can submit a job to Mesos cluster by running `spark-submit` and specifying the master URL to the URL of the `MesosClusterDispatcher` (e.g: mesos://dispatcher:7077). You can view driver statuses on the
spark git commit: Fixed broken link
Repository: spark Updated Branches: refs/heads/branch-2.2 3f6812cf8 -> 714153c79 Fixed broken link ## What changes were proposed in this pull request? I fixed some incorrect formatting on a link in the docs. ## How was this patch tested? I looked at the markdown preview before and after, and the link was fixed. Before: https://user-images.githubusercontent.com/17733030/26956272-a62cd558-4c79-11e7-862f-9d0e0184b18a.png After: https://user-images.githubusercontent.com/17733030/26956276-b1135ef6-4c79-11e7-8028-84d19c392fda.png Author: Corey Woodfield Closes #18246 from coreywoodfield/master. (cherry picked from commit 033839559eab280760e3c5687940d8c62cc9c048) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/714153c7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/714153c7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/714153c7 Branch: refs/heads/branch-2.2 Commit: 714153c794da62164302e94af620e8107dbc6b5e Parents: 3f6812c Author: Corey Woodfield Authored: Fri Jun 9 10:24:49 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 10:24:58 2017 +0100 -- docs/running-on-mesos.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/714153c7/docs/running-on-mesos.md -- diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 314a806..847a659 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -156,7 +156,7 @@ passing in the Mesos master URL (e.g: mesos://host:5050). This starts the `Mesos If you like to run the `MesosClusterDispatcher` with Marathon, you need to run the `MesosClusterDispatcher` in the foreground (i.e: `bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher`). Note that the `MesosClusterDispatcher` not yet supports multiple instances for HA. The `MesosClusterDispatcher` also supports writing recovery state into Zookeeper. This will allow the `MesosClusterDispatcher` to be able to recover all submitted and running containers on relaunch. In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spark-env by configuring `spark.deploy.recoveryMode` and related spark.deploy.zookeeper.* configurations. -For more information about these configurations please refer to the configurations (doc)[configurations.html#deploy]. +For more information about these configurations please refer to the configurations [doc](configurations.html#deploy). From the client, you can submit a job to Mesos cluster by running `spark-submit` and specifying the master URL to the URL of the `MesosClusterDispatcher` (e.g: mesos://dispatcher:7077). You can view driver statuses on the
spark git commit: [SPARK-20995][CORE] Spark-env.sh.template' should add 'YARN_CONF_DIR' configuration instructions.
Repository: spark Updated Branches: refs/heads/master 5a3371883 -> bdcd6e4c6 [SPARK-20995][CORE] Spark-env.sh.template' should add 'YARN_CONF_DIR' configuration instructions. ## What changes were proposed in this pull request? Ensure that `HADOOP_CONF_DIR` or `YARN_CONF_DIR` points to the directory which contains the (client side) configuration files for the Hadoop cluster. These configs are used to write to HDFS and connect to the YARN ResourceManager. The configuration contained in this directory will be distributed to the YARN cluster so that all containers used by the application use the same configuration. Sometimes `HADOOP_CONF_DIR` is set to the HDFS configuration file path, so `YARN_CONF_DIR` should be set to the YARN configuration file path. My project's 'spark-env.sh' configuration is as follows: ![1](https://cloud.githubusercontent.com/assets/26266482/26819987/d4acb814-4ad3-11e7-8458-a21aea57a53d.png) The 'HADOOP_CONF_DIR' configuration file path; the relevant documents are listed below: ![3](https://cloud.githubusercontent.com/assets/26266482/26820116/47b6b9fe-4ad4-11e7-8131-fe07c8d8bc21.png) The 'YARN_CONF_DIR' configuration file path; the relevant documents are listed below: ![2](https://cloud.githubusercontent.com/assets/26266482/26820078/274ad79a-4ad4-11e7-83d4-ff359dbb397c.png) So, 'Spark-env.sh.template' should add 'YARN_CONF_DIR' configuration instructions. ## How was this patch tested? Manual tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Author: 郭小龙 10207633 Author: guoxiaolongzte Closes #18212 from guoxiaolongzte/SPARK-20995. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdcd6e4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdcd6e4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdcd6e4c Branch: refs/heads/master Commit: bdcd6e4c680ebd3ddf5c1baaeba31134b143dfb4 Parents: 5a33718 Author: guoxiaolong Authored: Fri Jun 9 09:26:30 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 09:26:30 2017 +0100 -- conf/spark-env.sh.template | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bdcd6e4c/conf/spark-env.sh.template -- diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index b7c985a..b9aab5a 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -34,6 +34,7 @@ # Options read in YARN client mode # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
spark git commit: [SPARK-14408][CORE] Changed RDD.treeAggregate to use fold instead of reduce
Repository: spark Updated Branches: refs/heads/master 2a23cdd07 -> 5a3371883 [SPARK-14408][CORE] Changed RDD.treeAggregate to use fold instead of reduce ## What changes were proposed in this pull request? Previously, `RDD.treeAggregate` used `reduceByKey` and `reduce` in its implementation, neither of which technically allows the `seq`/`combOps` to modify and return their first arguments. This PR uses `foldByKey` and `fold` instead and notes that `aggregate` and `treeAggregate` are semantically identical in the Scala doc. Note that this had some test failures for unknown reasons. This was actually fixed in https://github.com/apache/spark/commit/e3554605b36bdce63ac180cc66dbdee5c1528ec7. The root cause was that the `zeroValue` now becomes an `AFTAggregator` and it compares `totalCnt` (where the value is actually 0). It starts merging one by one and keeps returning `this` where `totalCnt` is 0. So this does not look like a bug in the current change, and it is now fixed in the commit, so this should pass the tests. ## How was this patch tested? Test case added in `RDDSuite`. Closes #12217 Author: Joseph K. Bradley Author: hyukjinkwon Closes #18198 from HyukjinKwon/SPARK-14408. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a337188 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a337188 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a337188 Branch: refs/heads/master Commit: 5a3371883acf8ac8f94a71cbffa75166605c91bc Parents: 2a23cdd Author: Joseph K. Bradley Authored: Fri Jun 9 08:53:18 2017 +0100 Committer: Sean Owen Committed: Fri Jun 9 08:53:18 2017 +0100 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 9 +++--- .../scala/org/apache/spark/rdd/RDDSuite.scala | 31 +++- 2 files changed, 35 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a337188/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 63a87e7..2985c90 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1118,9 +1118,9 @@ abstract class RDD[T: ClassTag]( /** * Aggregates the elements of this RDD in a multi-level tree pattern. + * This method is semantically identical to [[org.apache.spark.rdd.RDD#aggregate]]. 
* * @param depth suggested depth of the tree (default: 2) - * @see [[org.apache.spark.rdd.RDD#aggregate]] */ def treeAggregate[U: ClassTag](zeroValue: U)( seqOp: (U, T) => U, @@ -1134,7 +1134,7 @@ abstract class RDD[T: ClassTag]( val cleanCombOp = context.clean(combOp) val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp) - var partiallyAggregated = mapPartitions(it => Iterator(aggregatePartition(it))) + var partiallyAggregated: RDD[U] = mapPartitions(it => Iterator(aggregatePartition(it))) var numPartitions = partiallyAggregated.partitions.length val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2) // If creating an extra level doesn't help reduce @@ -1146,9 +1146,10 @@ abstract class RDD[T: ClassTag]( val curNumPartitions = numPartitions partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => iter.map((i % curNumPartitions, _)) -}.reduceByKey(new HashPartitioner(curNumPartitions), cleanCombOp).values +}.foldByKey(zeroValue, new HashPartitioner(curNumPartitions))(cleanCombOp).values } - partiallyAggregated.reduce(cleanCombOp) + val copiedZeroValue = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance()) + partiallyAggregated.fold(copiedZeroValue)(cleanCombOp) } } http://git-wip-us.apache.org/repos/asf/spark/blob/5a337188/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 8d06f54..386c006 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -192,6 +192,23 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext { assert(ser.serialize(union.partitions.head).limit() < 2000) } + test("fold") { +val rdd = sc.makeRDD(-1000 until 1000, 10) +def op: (Int, Int) => Int = (c: Int, x: Int) => c + x +val sum = rdd.fold(0)(op) +assert(sum === -1000) + } + + test("fold with op modifying first arg") { +v
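A sketch of the contract the fold-based implementation now honors: seqOp/combOp may mutate and return their first argument, as `aggregate` allows (illustration only, assuming a SparkContext named `sc`):

```scala
import scala.collection.mutable.ArrayBuffer

val rdd = sc.parallelize(1 to 1000, 10)

// Both ops mutate their first argument in place and return it -- the
// contract fold/foldByKey supports, and which the previous
// reduceByKey/reduce-based implementation did not technically allow.
val tens = rdd.treeAggregate(ArrayBuffer.empty[Int])(
  seqOp  = (buf, x) => { if (x % 10 == 0) buf += x; buf },
  combOp = (a, b) => { a ++= b; a },
  depth  = 2)

assert(tens.length == 100)  // the 100 multiples of 10 in 1..1000
```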