spark git commit: [SPARK-23907] Removes regr_* functions in functions.scala
Repository: spark Updated Branches: refs/heads/master f27a035da -> e3dabdf6e [SPARK-23907] Removes regr_* functions in functions.scala ## What changes were proposed in this pull request? This patch removes the various regr_* functions in functions.scala. They are so uncommon that I don't think they deserve real estate in functions.scala. We can consider adding them later if more users need them. ## How was this patch tested? Removed the associated test case as well. Author: Reynold Xin Closes #21309 from rxin/SPARK-23907. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e3dabdf6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e3dabdf6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e3dabdf6 Branch: refs/heads/master Commit: e3dabdf6ef210fb9f4337e305feb9c4983a57350 Parents: f27a035 Author: Reynold Xin Authored: Sat May 12 12:15:36 2018 +0800 Committer: hyukjinkwon Committed: Sat May 12 12:15:36 2018 +0800 -- .../scala/org/apache/spark/sql/functions.scala | 171 --- .../spark/sql/DataFrameAggregateSuite.scala | 68 2 files changed, 239 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e3dabdf6/sql/core/src/main/scala/org/apache/spark/sql/functions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index e7f866d..3c9ace4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -811,177 +811,6 @@ object functions { */ def var_pop(columnName: String): Column = var_pop(Column(columnName)) - /** - * Aggregate function: returns the number of non-null pairs. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_count(y: Column, x: Column): Column = withAggregateFunction { -RegrCount(y.expr, x.expr) - } - - /** - * Aggregate function: returns the number of non-null pairs. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_count(y: String, x: String): Column = regr_count(Column(y), Column(x)) - - /** - * Aggregate function: returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_sxx(y: Column, x: Column): Column = withAggregateFunction { -RegrSXX(y.expr, x.expr) - } - - /** - * Aggregate function: returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_sxx(y: String, x: String): Column = regr_sxx(Column(y), Column(x)) - - /** - * Aggregate function: returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_syy(y: Column, x: Column): Column = withAggregateFunction { -RegrSYY(y.expr, x.expr) - } - - /** - * Aggregate function: returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_syy(y: String, x: String): Column = regr_syy(Column(y), Column(x)) - - /** - * Aggregate function: returns the average of y. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_avgy(y: Column, x: Column): Column = withAggregateFunction { -RegrAvgY(y.expr, x.expr) - } - - /** - * Aggregate function: returns the average of y. Any pair with a NULL is ignored. 
- * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_avgy(y: String, x: String): Column = regr_avgy(Column(y), Column(x)) - - /** - * Aggregate function: returns the average of x. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_avgx(y: Column, x: Column): Column = withAggregateFunction { -RegrAvgX(y.expr, x.expr) - } - - /** - * Aggregate function: returns the average of x. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_avgx(y: String, x: String): Column = regr_avgx(Column(y), Column(x)) - - /** - * Aggregate function: returns the covariance of y and x multiplied for the number of items in - * the dataset. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_sxy(y: Column, x: Column): Column = withAggregateFunction { -RegrSXY(y.expr, x.expr) - } - - /** - * Aggregate function: returns the covariance of y and x multiplied for the number of items in - * the dataset. Any pair with a NULL is ignored. - * - * @group agg_funcs - * @since 2.4.0 - */ - def regr_sxy(y: String, x: String): Column = regr_sxy(Column(y), Column(x)) -
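For anyone who was planning to use these wrappers, the removed doc comments spell out the formulas, so the same quantities can still be computed with the aggregates that remain in functions.scala. A minimal sketch, assuming a DataFrame with numeric columns `y` and `x` (all names below are made up for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("regr-by-hand").getOrCreate()
import spark.implicits._

val df = Seq((1.0, 2.0), (2.0, 4.0), (3.0, 7.0)).toDF("y", "x")

// "Any pair with a NULL is ignored": keep only rows where both columns are non-null.
val pairs = df.where($"y".isNotNull && $"x".isNotNull)
val n = count(lit(1))

pairs.agg(
  n.as("regr_count"),                                             // number of non-null pairs
  avg($"x").as("regr_avgx"),                                      // average of x
  avg($"y").as("regr_avgy"),                                      // average of y
  (sum($"x" * $"x") - sum($"x") * sum($"x") / n).as("regr_sxx"),  // SUM(x*x) - SUM(x)*SUM(x)/N
  (sum($"y" * $"y") - sum($"y") * sum($"y") / n).as("regr_syy"),  // SUM(y*y) - SUM(y)*SUM(y)/N
  (sum($"x" * $"y") - sum($"x") * sum($"y") / n).as("regr_sxy")   // SUM(x*y) - SUM(x)*SUM(y)/N
).show()
```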
svn commit: r26865 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_20_01-f27a035-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Sat May 12 03:16:06 2018
New Revision: 26865

Log: Apache Spark 2.4.0-SNAPSHOT-2018_05_11_20_01-f27a035 docs

[This commit notification would consist of 1462 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
svn commit: r26862 - in /dev/spark/2.3.1-SNAPSHOT-2018_05_11_18_01-7de4bef-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Sat May 12 01:15:20 2018
New Revision: 26862

Log: Apache Spark 2.3.1-SNAPSHOT-2018_05_11_18_01-7de4bef docs

[This commit notification would consist of 1443 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
spark git commit: [SPARKR] Require Java 8 for SparkR
Repository: spark Updated Branches: refs/heads/branch-2.3 1d598b771 -> 7de4bef9e [SPARKR] Require Java 8 for SparkR This change updates the SystemRequirements and also includes a runtime check if the JVM is being launched by R. The runtime check is done by querying `java -version` ## How was this patch tested? Tested on a Mac and Windows machine Author: Shivaram Venkataraman Closes #21278 from shivaram/sparkr-skip-solaris. (cherry picked from commit f27a035daf705766d3445e5c6a99867c11c552b0) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7de4bef9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7de4bef9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7de4bef9 Branch: refs/heads/branch-2.3 Commit: 7de4bef9ec37440aa36e6b0e9d8656de07d03b68 Parents: 1d598b7 Author: Shivaram Venkataraman Authored: Fri May 11 17:00:51 2018 -0700 Committer: Shivaram Venkataraman Committed: Fri May 11 17:01:02 2018 -0700 -- R/pkg/DESCRIPTION | 1 + R/pkg/R/client.R | 35 +++ R/pkg/R/sparkR.R | 1 + R/pkg/R/utils.R | 4 ++-- 4 files changed, 39 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 29a8a00..632bcb3 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -13,6 +13,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), License: Apache License (== 2.0) URL: http://www.apache.org/ http://spark.apache.org/ BugReports: http://spark.apache.org/contributing.html +SystemRequirements: Java (== 8) Depends: R (>= 3.0), methods http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/R/client.R -- diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 7244cc9..e9295e0 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -60,6 +60,40 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack combinedArgs } +checkJavaVersion <- function() { + javaBin <- "java" + javaHome <- Sys.getenv("JAVA_HOME") + javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements")) + sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L)) + if (javaHome != "") { +javaBin <- file.path(javaHome, "bin", javaBin) + } + + # If java is missing from PATH, we get an error in Unix and a warning in Windows + javaVersionOut <- tryCatch( + launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE), + error = function(e) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", e) + }, + warning = function(w) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", w) + }) + javaVersionFilter <- Filter( + function(x) { +grepl("java version", x) + }, javaVersionOut) + + javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2] + # javaVersionStr is of the form 1.8.0_92. 
+ # Extract 8 from it to compare to sparkJavaVersion + javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2]) + if (javaVersionNum != sparkJavaVersion) { +stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) + } +} + launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { sparkSubmitBinName <- determineSparkSubmitBin() if (sparkHome != "") { @@ -67,6 +101,7 @@ launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { } else { sparkSubmitBin <- sparkSubmitBinName } + combinedArgs <- generateSparkSubmitArgs(args, sparkHome, jars, sparkSubmitOpts, packages) cat("Launching java with spark-submit command", sparkSubmitBin, combinedArgs, "\n") invisible(launchScript(sparkSubmitBin, combinedArgs)) http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 7430d84..2cd8b0c 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -170,6 +170,7 @@ sparkR.sparkContext <- function( submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) +checkJavaVersion()
spark git commit: [SPARKR] Require Java 8 for SparkR
Repository: spark Updated Branches: refs/heads/master 92f6f52ff -> f27a035da [SPARKR] Require Java 8 for SparkR This change updates the SystemRequirements and also includes a runtime check if the JVM is being launched by R. The runtime check is done by querying `java -version` ## How was this patch tested? Tested on a Mac and Windows machine Author: Shivaram Venkataraman Closes #21278 from shivaram/sparkr-skip-solaris. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f27a035d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f27a035d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f27a035d Branch: refs/heads/master Commit: f27a035daf705766d3445e5c6a99867c11c552b0 Parents: 92f6f52 Author: Shivaram Venkataraman Authored: Fri May 11 17:00:51 2018 -0700 Committer: Shivaram Venkataraman Committed: Fri May 11 17:00:51 2018 -0700 -- R/pkg/DESCRIPTION | 1 + R/pkg/R/client.R | 35 +++ R/pkg/R/sparkR.R | 1 + R/pkg/R/utils.R | 4 ++-- 4 files changed, 39 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 855eb5b..f52d785 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -13,6 +13,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), License: Apache License (== 2.0) URL: http://www.apache.org/ http://spark.apache.org/ BugReports: http://spark.apache.org/contributing.html +SystemRequirements: Java (== 8) Depends: R (>= 3.0), methods http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/R/client.R -- diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 7244cc9..e9295e0 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -60,6 +60,40 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack combinedArgs } +checkJavaVersion <- function() { + javaBin <- "java" + javaHome <- Sys.getenv("JAVA_HOME") + javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements")) + sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L)) + if (javaHome != "") { +javaBin <- file.path(javaHome, "bin", javaBin) + } + + # If java is missing from PATH, we get an error in Unix and a warning in Windows + javaVersionOut <- tryCatch( + launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE), + error = function(e) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", e) + }, + warning = function(w) { + stop("Java version check failed. Please make sure Java is installed", + " and set JAVA_HOME to point to the installation directory.", w) + }) + javaVersionFilter <- Filter( + function(x) { +grepl("java version", x) + }, javaVersionOut) + + javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2] + # javaVersionStr is of the form 1.8.0_92. 
+ # Extract 8 from it to compare to sparkJavaVersion + javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2]) + if (javaVersionNum != sparkJavaVersion) { +stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr)) + } +} + launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { sparkSubmitBinName <- determineSparkSubmitBin() if (sparkHome != "") { @@ -67,6 +101,7 @@ launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) { } else { sparkSubmitBin <- sparkSubmitBinName } + combinedArgs <- generateSparkSubmitArgs(args, sparkHome, jars, sparkSubmitOpts, packages) cat("Launching java with spark-submit command", sparkSubmitBin, combinedArgs, "\n") invisible(launchScript(sparkSubmitBin, combinedArgs)) http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 38ee794..d6a2d08 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -167,6 +167,7 @@ sparkR.sparkContext <- function( submitOps <- getClientModeSparkSubmitOpts( Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"), sparkEnvirMap) +checkJavaVersion() launchBackend( args = path, sparkHome = sparkHome, http://git-wip-us.apache.org/repos/asf/spark
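For readers who don't follow R, here is a rough Scala rendition of the same check: run `java -version`, find the line that carries the version string, and pull the major number out of a `1.8.0_92`-style value. The function name and error message below are ours, not part of the patch:

```scala
import scala.sys.process._

def detectedJavaMajor(): Int = {
  val lines = scala.collection.mutable.ArrayBuffer.empty[String]
  // `java -version` writes to stderr, so capture both output streams.
  "java -version" ! ProcessLogger(lines += _, lines += _)
  val versionLine = lines.find(_.contains("version"))
    .getOrElse(sys.error("java not found on PATH"))
  val versionStr = versionLine.split("\"")(1)   // e.g. 1.8.0_92
  versionStr.split("\\.")(1).toInt              // second field of a 1.x version string -> 8
}

// The R helper stops with an error unless this matches the SystemRequirements value (8).
println(detectedJavaMajor())
```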
[1/2] spark git commit: [PYSPARK] Update py4j to version 0.10.7.
Repository: spark Updated Branches: refs/heads/branch-2.1 8177b2148 -> 1d569f684 [PYSPARK] Update py4j to version 0.10.7. (cherry picked from commit cc613b552e753d03cb62661591de59e1c8d82c74) Signed-off-by: Marcelo Vanzin (cherry picked from commit 323dc3ad02e63a7c99b5bd6da618d6020657ecba) Signed-off-by: Marcelo Vanzin (cherry picked from commit 850b7d868bbe23e58e1164fc551fac2facbebead) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/61a8c4c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/61a8c4c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/61a8c4c0 Branch: refs/heads/branch-2.1 Commit: 61a8c4c0929cdd854b0a93eb76ea9686e7f0 Parents: 8177b21 Author: Marcelo Vanzin Authored: Fri Apr 13 14:28:24 2018 -0700 Committer: Marcelo Vanzin Committed: Fri May 11 11:13:32 2018 -0700 -- LICENSE | 2 +- bin/pyspark | 6 +- bin/pyspark2.cmd| 2 +- core/pom.xml| 2 +- .../org/apache/spark/SecurityManager.scala | 11 +- .../spark/api/python/PythonGatewayServer.scala | 50 ++--- .../org/apache/spark/api/python/PythonRDD.scala | 29 -- .../apache/spark/api/python/PythonUtils.scala | 2 +- .../spark/api/python/PythonWorkerFactory.scala | 21 ++-- .../org/apache/spark/deploy/PythonRunner.scala | 12 ++- .../apache/spark/internal/config/package.scala | 6 ++ .../spark/security/SocketAuthHelper.scala | 101 +++ .../scala/org/apache/spark/util/Utils.scala | 11 ++ .../spark/security/SocketAuthHelperSuite.scala | 97 ++ dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- python/README.md| 2 +- python/docs/Makefile| 2 +- python/lib/py4j-0.10.6-src.zip | Bin 80352 -> 0 bytes python/lib/py4j-0.10.7-src.zip | Bin 0 -> 42437 bytes python/pyspark/context.py | 4 +- python/pyspark/daemon.py| 21 +++- python/pyspark/java_gateway.py | 93 ++--- python/pyspark/rdd.py | 21 ++-- python/pyspark/sql/dataframe.py | 8 +- python/pyspark/worker.py| 7 +- python/setup.py | 2 +- sbin/spark-config.sh| 2 +- .../scala/org/apache/spark/sql/Dataset.scala| 4 +- .../org/apache/spark/deploy/yarn/Client.scala | 2 +- .../spark/deploy/yarn/YarnClusterSuite.scala| 2 +- 34 files changed, 417 insertions(+), 115 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/61a8c4c0/LICENSE -- diff --git a/LICENSE b/LICENSE index 02aaef2..83a5e2b 100644 --- a/LICENSE +++ b/LICENSE @@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt. 
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/) + (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) (BSD licence) sbt and sbt-launch-lib.bash (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) http://git-wip-us.apache.org/repos/asf/spark/blob/61a8c4c0/bin/pyspark -- diff --git a/bin/pyspark b/bin/pyspark index d3b512e..95ab628 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -25,14 +25,14 @@ source "${SPARK_HOME}"/bin/load-spark-env.sh export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]" # In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option -# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython +# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython # to use IPython a
[2/2] spark git commit: [SPARKR] Match pyspark features in SparkR communication protocol.
[SPARKR] Match pyspark features in SparkR communication protocol. (cherry picked from commit 628c7b517969c4a7ccb26ea67ab3dd61266073ca) Signed-off-by: Marcelo Vanzin (cherry picked from commit 16cd9ac5264831e061c033b26fe1173ebc88e5d1) Signed-off-by: Marcelo Vanzin (cherry picked from commit f96d13dd67b2d39e5fff80a6bc4a1b1fc36745c6) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d569f68 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d569f68 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d569f68 Branch: refs/heads/branch-2.1 Commit: 1d569f684fe41a33dc1708da6cde5d5b2a7857af Parents: 61a8c4c Author: Marcelo Vanzin Authored: Tue Apr 17 13:29:43 2018 -0700 Committer: Marcelo Vanzin Committed: Fri May 11 11:13:44 2018 -0700 -- R/pkg/R/client.R| 4 +- R/pkg/R/deserialize.R | 10 ++-- R/pkg/R/sparkR.R| 39 -- R/pkg/inst/worker/daemon.R | 4 +- R/pkg/inst/worker/worker.R | 5 +- .../org/apache/spark/api/r/RAuthHelper.scala| 38 ++ .../scala/org/apache/spark/api/r/RBackend.scala | 43 --- .../spark/api/r/RBackendAuthHandler.scala | 55 .../scala/org/apache/spark/api/r/RRunner.scala | 35 + .../scala/org/apache/spark/deploy/RRunner.scala | 6 ++- 10 files changed, 210 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/client.R -- diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 9d82814..7244cc9 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -19,7 +19,7 @@ # Creates a SparkR client connection object # if one doesn't already exist -connectBackend <- function(hostname, port, timeout) { +connectBackend <- function(hostname, port, timeout, authSecret) { if (exists(".sparkRcon", envir = .sparkREnv)) { if (isOpen(.sparkREnv[[".sparkRCon"]])) { cat("SparkRBackend client connection already exists\n") @@ -29,7 +29,7 @@ connectBackend <- function(hostname, port, timeout) { con <- socketConnection(host = hostname, port = port, server = FALSE, blocking = TRUE, open = "wb", timeout = timeout) - + doServerAuth(con, authSecret) assign(".sparkRCon", con, envir = .sparkREnv) con } http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/deserialize.R -- diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 0e99b17..dc7d37e 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -60,14 +60,18 @@ readTypedObject <- function(con, type) { stop(paste("Unsupported type for deserialization", type))) } -readString <- function(con) { - stringLen <- readInt(con) - raw <- readBin(con, raw(), stringLen, endian = "big") +readStringData <- function(con, len) { + raw <- readBin(con, raw(), len, endian = "big") string <- rawToChar(raw) Encoding(string) <- "UTF-8" string } +readString <- function(con) { + stringLen <- readInt(con) + readStringData(con, stringLen) +} + readInt <- function(con) { readBin(con, integer(), n = 1, endian = "big") } http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d0a12b7..6c0392a 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -161,6 +161,10 @@ sparkR.sparkContext <- function( " please use the --packages commandline instead", sep = ",")) } backendPort <- existingPort +authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") +if (nchar(authSecret) == 0) { + stop("Auth secret not provided in environment.") +} } else { path <- tempfile(pattern = "backend_port") submitOps <- getClientModeSparkSubmitOpts( @@ -189,16 
+193,27 @@ sparkR.sparkContext <- function( monitorPort <- readInt(f) rLibPath <- readString(f) connectionTimeout <- readInt(f) + +# Don't use readString() so that we can provide a useful +# error message if the R and Java versions are mismatched. +authSecretLen = readInt(f) +if (length(authSecretLen) == 0 || authSecretLen == 0) { + stop("Unexpected EOF in JVM connection data. Mismatched versions?") +} +authSecret <- readStringData(f, authSecretLen) close(f) file.remove(path) if (length(backendPort) == 0 || backendPort == 0 || length(monitorPort) == 0 || monitorPort == 0 || -length(rLibPath) != 1) { +length(rLibPath) != 1 ||
svn commit: r26859 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_16_01-92f6f52-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Fri May 11 23:15:44 2018
New Revision: 26859

Log: Apache Spark 2.4.0-SNAPSHOT-2018_05_11_16_01-92f6f52 docs

[This commit notification would consist of 1462 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
svn commit: r26856 - in /dev/spark/2.3.1-SNAPSHOT-2018_05_11_14_01-1d598b7-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Fri May 11 21:15:49 2018
New Revision: 26856

Log: Apache Spark 2.3.1-SNAPSHOT-2018_05_11_14_01-1d598b7 docs

[This commit notification would consist of 1443 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
spark git commit: [MINOR][DOCS] Documenting months_between direction
Repository: spark Updated Branches: refs/heads/master 928845a42 -> 92f6f52ff [MINOR][DOCS] Documenting months_between direction ## What changes were proposed in this pull request? It's useful to know what relationship between date1 and date2 results in a positive number. Author: aditkumar Author: Adit Kumar Closes #20787 from aditkumar/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92f6f52f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92f6f52f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92f6f52f Branch: refs/heads/master Commit: 92f6f52ff0ce47e046656ca8bed7d7bfbbb42dcb Parents: 928845a Author: aditkumar Authored: Fri May 11 14:42:23 2018 -0500 Committer: Sean Owen Committed: Fri May 11 14:42:23 2018 -0500 -- R/pkg/R/functions.R | 6 +- python/pyspark/sql/functions.py | 7 +-- .../catalyst/expressions/datetimeExpressions.scala| 14 +++--- .../spark/sql/catalyst/util/DateTimeUtils.scala | 8 .../main/scala/org/apache/spark/sql/functions.scala | 7 ++- 5 files changed, 31 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 1f97054..4964594 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1912,6 +1912,7 @@ setMethod("atan2", signature(y = "Column"), #' @details #' \code{datediff}: Returns the number of days from \code{y} to \code{x}. +#' If \code{y} is later than \code{x} then the result is positive. #' #' @rdname column_datetime_diff_functions #' @aliases datediff datediff,Column-method @@ -1971,7 +1972,10 @@ setMethod("levenshtein", signature(y = "Column"), }) #' @details -#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}. +#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}. +#' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x} +#' are on the same day of month, or both are the last day of month, time of day will be ignored. +#' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits. #' #' @rdname column_datetime_diff_functions #' @aliases months_between months_between,Column-method http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index f5a5841..b62748e 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1108,8 +1108,11 @@ def add_months(start, months): @since(1.5) def months_between(date1, date2, roundOff=True): """ -Returns the number of months between date1 and date2. -Unless `roundOff` is set to `False`, the result is rounded off to 8 digits. +Returns number of months between dates date1 and date2. +If date1 is later than date2, then the result is positive. +If date1 and date2 are on the same day of month, or both are the last day of month, +returns an integer (time of day will be ignored). +The result is rounded off to 8 digits unless `roundOff` is set to `False`. 
>>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2']) >>> df.select(months_between(df.date1, df.date2).alias('months')).collect() http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 76aa614..03422fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1194,13 +1194,21 @@ case class AddMonths(startDate: Expression, numMonths: Expression) } /** - * Returns number of months between dates date1 and date2. + * Returns number of months between times `timestamp1` and `timestamp2`. + * If `timestamp1` is later than `timestamp2`, then the result is positive. + * If `timestamp1` and `timestamp2` are on the same day of month, or both + * are the last day of month, time of day will be ignored. Otherwise, the + * difference is calculated based on 31 days per month, a
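A short Scala illustration of the documented direction, reusing the dates from the Python doctest above (the session setup and column names exist only for this example):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("months-between").getOrCreate()
import spark.implicits._

val df = Seq(("1997-02-28 10:30:00", "1996-10-30")).toDF("date1", "date2")

// date1 is later than date2, so the result is positive (about 3.9496 here).
df.select(months_between(col("date1"), col("date2")).alias("months")).show()

// Swapping the arguments flips the sign.
df.select(months_between(col("date2"), col("date1")).alias("months")).show()
```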
svn commit: r26854 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_12_01-928845a-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Fri May 11 19:16:20 2018
New Revision: 26854

Log: Apache Spark 2.4.0-SNAPSHOT-2018_05_11_12_01-928845a docs

[This commit notification would consist of 1462 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
spark git commit: [SPARK-24067][BACKPORT-2.3][STREAMING][KAFKA] Allow non-consecutive offsets
Repository: spark Updated Branches: refs/heads/branch-2.3 414e4e3d7 -> 1d598b771 [SPARK-24067][BACKPORT-2.3][STREAMING][KAFKA] Allow non-consecutive offsets ## What changes were proposed in this pull request? Backport of the bugfix in SPARK-17147 Add a configuration spark.streaming.kafka.allowNonConsecutiveOffsets to allow streaming jobs to proceed on compacted topics (or other situations involving gaps between offsets in the log). ## How was this patch tested? Added new unit test justinrmiller has been testing this branch in production for a few weeks Author: cody koeninger Closes #21300 from koeninger/branch-2.3_kafkafix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d598b77 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d598b77 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d598b77 Branch: refs/heads/branch-2.3 Commit: 1d598b771de3b588a2f377ae7ccf8193156641f2 Parents: 414e4e3 Author: cody koeninger Authored: Fri May 11 13:40:36 2018 -0500 Committer: cody koeninger Committed: Fri May 11 13:40:36 2018 -0500 -- .../kafka010/CachedKafkaConsumer.scala | 55 - .../spark/streaming/kafka010/KafkaRDD.scala | 236 +-- .../streaming/kafka010/KafkaRDDSuite.scala | 106 + .../streaming/kafka010/KafkaTestUtils.scala | 25 +- .../kafka010/mocks/MockScheduler.scala | 96 .../streaming/kafka010/mocks/MockTime.scala | 51 6 files changed, 487 insertions(+), 82 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d598b77/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala -- diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala index fa3ea61..aeb8c1d 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala @@ -22,10 +22,8 @@ import java.{ util => ju } import org.apache.kafka.clients.consumer.{ ConsumerConfig, ConsumerRecord, KafkaConsumer } import org.apache.kafka.common.{ KafkaException, TopicPartition } -import org.apache.spark.SparkConf import org.apache.spark.internal.Logging - /** * Consumer of single topicpartition, intended for cached reuse. 
* Underlying consumer is not threadsafe, so neither is this, @@ -38,7 +36,7 @@ class CachedKafkaConsumer[K, V] private( val partition: Int, val kafkaParams: ju.Map[String, Object]) extends Logging { - assert(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), + require(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG), "groupId used for cache key must match the groupId in kafkaParams") val topicPartition = new TopicPartition(topic, partition) @@ -53,7 +51,7 @@ class CachedKafkaConsumer[K, V] private( // TODO if the buffer was kept around as a random-access structure, // could possibly optimize re-calculating of an RDD in the same batch - protected var buffer = ju.Collections.emptyList[ConsumerRecord[K, V]]().iterator + protected var buffer = ju.Collections.emptyListIterator[ConsumerRecord[K, V]]() protected var nextOffset = -2L def close(): Unit = consumer.close() @@ -71,7 +69,7 @@ class CachedKafkaConsumer[K, V] private( } if (!buffer.hasNext()) { poll(timeout) } -assert(buffer.hasNext(), +require(buffer.hasNext(), s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout") var record = buffer.next() @@ -79,17 +77,56 @@ class CachedKafkaConsumer[K, V] private( logInfo(s"Buffer miss for $groupId $topic $partition $offset") seek(offset) poll(timeout) - assert(buffer.hasNext(), + require(buffer.hasNext(), s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout") record = buffer.next() - assert(record.offset == offset, -s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset") + require(record.offset == offset, +s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset " + + s"got offset ${record.offset} instead. If this is a compacted topic, consider enabling " + + "spark.streaming.kafka.allowNonConsecutiveOffsets" + ) } nextOffset = offset + 1 record } + /** + * Start a batch on a compacted topic + */ + def compactedStart(offset: Long, timeout: Long): Unit = { +
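For context, a hedged sketch of how a streaming job reading a compacted topic would opt in to the new behavior; only the configuration key comes from this patch, everything else is illustrative:

```scala
import org.apache.spark.SparkConf

// Enable the flag added by this backport so batches tolerate gaps between offsets
// (e.g. on compacted topics). The app name is made up for the example.
val conf = new SparkConf()
  .setAppName("compacted-topic-stream")
  .set("spark.streaming.kafka.allowNonConsecutiveOffsets", "true")
```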
spark git commit: [SPARK-24172][SQL] we should not apply operator pushdown to data source v2 many times
Repository: spark Updated Branches: refs/heads/master 54032682b -> 928845a42 [SPARK-24172][SQL] we should not apply operator pushdown to data source v2 many times ## What changes were proposed in this pull request? In `PushDownOperatorsToDataSource`, we use `transformUp` to match `PhysicalOperation` and apply pushdown. This is problematic if we have multiple `Filter` and `Project` above the data source v2 relation. e.g. for a query ``` Project Filter DataSourceV2Relation ``` The pattern match will be triggered twice and we will do operator pushdown twice. This is unnecessary, we can use `mapChildren` to only apply pushdown once. ## How was this patch tested? existing test Author: Wenchen Fan Closes #21230 from cloud-fan/step2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/928845a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/928845a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/928845a4 Branch: refs/heads/master Commit: 928845a42230a2c0a318011002a54ad871468b2e Parents: 5403268 Author: Wenchen Fan Authored: Fri May 11 10:00:28 2018 -0700 Committer: gatorsmile Committed: Fri May 11 10:00:28 2018 -0700 -- .../v2/PushDownOperatorsToDataSource.scala | 15 +-- 1 file changed, 5 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/928845a4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala index 9293d4f..e894f8a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala @@ -23,17 +23,10 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project import org.apache.spark.sql.catalyst.rules.Rule object PushDownOperatorsToDataSource extends Rule[LogicalPlan] { - override def apply( - plan: LogicalPlan): LogicalPlan = plan transformUp { + override def apply(plan: LogicalPlan): LogicalPlan = plan match { // PhysicalOperation guarantees that filters are deterministic; no need to check -case PhysicalOperation(project, newFilters, relation : DataSourceV2Relation) => - // merge the filters - val filters = relation.filters match { -case Some(existing) => - existing ++ newFilters -case _ => - newFilters - } +case PhysicalOperation(project, filters, relation: DataSourceV2Relation) => + assert(relation.filters.isEmpty, "data source v2 should do push down only once.") val projectAttrs = project.map(_.toAttribute) val projectSet = AttributeSet(project.flatMap(_.references)) @@ -67,5 +60,7 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] { } else { filtered } + +case other => other.mapChildren(apply) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
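To make the problematic plan shape concrete, here is a small, illustrative sketch; a local relation stands in for the DataSourceV2Relation so the snippet runs without a v2 source, but the stacked Filter/Project nesting is the point:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("pushdown-shape").getOrCreate()
import spark.implicits._

// Two Filters and two Projects stacked above the relation: the shape the commit describes.
val df = Seq((1, 10), (2, 20), (3, 30)).toDF("a", "b")
val q = df.filter($"a" > 0).select($"a", $"b").filter($"b" < 25).select($"a")

// explain(true) prints the analyzed/optimized/physical plans, showing the nested
// Project/Filter chain that a transformUp-based rule would visit bottom-up.
q.explain(true)
```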
[2/2] spark git commit: [SPARKR] Match pyspark features in SparkR communication protocol.
[SPARKR] Match pyspark features in SparkR communication protocol. (cherry picked from commit 628c7b517969c4a7ccb26ea67ab3dd61266073ca) Signed-off-by: Marcelo Vanzin (cherry picked from commit 16cd9ac5264831e061c033b26fe1173ebc88e5d1) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f96d13dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f96d13dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f96d13dd Branch: refs/heads/branch-2.2 Commit: f96d13dd67b2d39e5fff80a6bc4a1b1fc36745c6 Parents: 850b7d8 Author: Marcelo Vanzin Authored: Tue Apr 17 13:29:43 2018 -0700 Committer: Marcelo Vanzin Committed: Thu May 10 12:42:58 2018 -0700 -- R/pkg/R/client.R| 4 +- R/pkg/R/deserialize.R | 10 ++-- R/pkg/R/sparkR.R| 39 -- R/pkg/inst/worker/daemon.R | 4 +- R/pkg/inst/worker/worker.R | 5 +- .../org/apache/spark/api/r/RAuthHelper.scala| 38 ++ .../scala/org/apache/spark/api/r/RBackend.scala | 43 --- .../spark/api/r/RBackendAuthHandler.scala | 55 .../scala/org/apache/spark/api/r/RRunner.scala | 35 + .../scala/org/apache/spark/deploy/RRunner.scala | 6 ++- 10 files changed, 210 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/client.R -- diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 9d82814..7244cc9 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -19,7 +19,7 @@ # Creates a SparkR client connection object # if one doesn't already exist -connectBackend <- function(hostname, port, timeout) { +connectBackend <- function(hostname, port, timeout, authSecret) { if (exists(".sparkRcon", envir = .sparkREnv)) { if (isOpen(.sparkREnv[[".sparkRCon"]])) { cat("SparkRBackend client connection already exists\n") @@ -29,7 +29,7 @@ connectBackend <- function(hostname, port, timeout) { con <- socketConnection(host = hostname, port = port, server = FALSE, blocking = TRUE, open = "wb", timeout = timeout) - + doServerAuth(con, authSecret) assign(".sparkRCon", con, envir = .sparkREnv) con } http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/deserialize.R -- diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 0e99b17..dc7d37e 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -60,14 +60,18 @@ readTypedObject <- function(con, type) { stop(paste("Unsupported type for deserialization", type))) } -readString <- function(con) { - stringLen <- readInt(con) - raw <- readBin(con, raw(), stringLen, endian = "big") +readStringData <- function(con, len) { + raw <- readBin(con, raw(), len, endian = "big") string <- rawToChar(raw) Encoding(string) <- "UTF-8" string } +readString <- function(con) { + stringLen <- readInt(con) + readStringData(con, stringLen) +} + readInt <- function(con) { readBin(con, integer(), n = 1, endian = "big") } http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 9ebd344..daa855b 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -161,6 +161,10 @@ sparkR.sparkContext <- function( " please use the --packages commandline instead", sep = ",")) } backendPort <- existingPort +authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") +if (nchar(authSecret) == 0) { + stop("Auth secret not provided in environment.") +} } else { path <- tempfile(pattern = "backend_port") submitOps <- getClientModeSparkSubmitOpts( @@ -189,16 +193,27 @@ sparkR.sparkContext <- function( monitorPort <- readInt(f) rLibPath <- readString(f) 
connectionTimeout <- readInt(f) + +# Don't use readString() so that we can provide a useful +# error message if the R and Java versions are mismatched. +authSecretLen = readInt(f) +if (length(authSecretLen) == 0 || authSecretLen == 0) { + stop("Unexpected EOF in JVM connection data. Mismatched versions?") +} +authSecret <- readStringData(f, authSecretLen) close(f) file.remove(path) if (length(backendPort) == 0 || backendPort == 0 || length(monitorPort) == 0 || monitorPort == 0 || -length(rLibPath) != 1) { +length(rLibPath) != 1 || length(authSecret) == 0) { stop("JVM failed to launch") } -assign(".monitorConn", -
[1/2] spark git commit: [PYSPARK] Update py4j to version 0.10.7.
Repository: spark Updated Branches: refs/heads/branch-2.2 f9d6a16ce -> f96d13dd6 [PYSPARK] Update py4j to version 0.10.7. (cherry picked from commit cc613b552e753d03cb62661591de59e1c8d82c74) Signed-off-by: Marcelo Vanzin (cherry picked from commit 323dc3ad02e63a7c99b5bd6da618d6020657ecba) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/850b7d86 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/850b7d86 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/850b7d86 Branch: refs/heads/branch-2.2 Commit: 850b7d868bbe23e58e1164fc551fac2facbebead Parents: f9d6a16 Author: Marcelo Vanzin Authored: Fri Apr 13 14:28:24 2018 -0700 Committer: Marcelo Vanzin Committed: Thu May 10 12:20:58 2018 -0700 -- LICENSE | 2 +- bin/pyspark | 6 +- bin/pyspark2.cmd| 2 +- core/pom.xml| 2 +- .../org/apache/spark/SecurityManager.scala | 11 +- .../spark/api/python/PythonGatewayServer.scala | 50 ++--- .../org/apache/spark/api/python/PythonRDD.scala | 29 -- .../apache/spark/api/python/PythonUtils.scala | 2 +- .../spark/api/python/PythonWorkerFactory.scala | 21 ++-- .../org/apache/spark/deploy/PythonRunner.scala | 12 ++- .../apache/spark/internal/config/package.scala | 5 + .../spark/security/SocketAuthHelper.scala | 101 +++ .../scala/org/apache/spark/util/Utils.scala | 11 ++ .../spark/security/SocketAuthHelperSuite.scala | 97 ++ dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- dev/run-pip-tests | 2 +- python/README.md| 2 +- python/docs/Makefile| 2 +- python/lib/py4j-0.10.6-src.zip | Bin 80352 -> 0 bytes python/lib/py4j-0.10.7-src.zip | Bin 0 -> 42437 bytes python/pyspark/context.py | 4 +- python/pyspark/daemon.py| 21 +++- python/pyspark/java_gateway.py | 93 ++--- python/pyspark/rdd.py | 21 ++-- python/pyspark/sql/dataframe.py | 8 +- python/pyspark/worker.py| 7 +- python/setup.py | 2 +- .../org/apache/spark/deploy/yarn/Client.scala | 2 +- .../spark/deploy/yarn/YarnClusterSuite.scala| 2 +- sbin/spark-config.sh| 2 +- .../scala/org/apache/spark/sql/Dataset.scala| 4 +- 32 files changed, 414 insertions(+), 113 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/850b7d86/LICENSE -- diff --git a/LICENSE b/LICENSE index 39fe0dc..b948cca 100644 --- a/LICENSE +++ b/LICENSE @@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt. 
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/) + (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) (BSD licence) sbt and sbt-launch-lib.bash (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) http://git-wip-us.apache.org/repos/asf/spark/blob/850b7d86/bin/pyspark -- diff --git a/bin/pyspark b/bin/pyspark index d3b512e..95ab628 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -25,14 +25,14 @@ source "${SPARK_HOME}"/bin/load-spark-env.sh export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]" # In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option -# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython +# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython # to use IPython and set PYSPARK_DRIVER_PYTHON_OPTS to pass options when starting the Python driver # (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook'). This supports full customization of the IPython # and executor Python executables.
svn commit: r26844 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_04_01-5403268-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Fri May 11 11:18:59 2018
New Revision: 26844

Log: Apache Spark 2.4.0-SNAPSHOT-2018_05_11_04_01-5403268 docs

[This commit notification would consist of 1462 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.]
spark git commit: [SPARK-24182][YARN] Improve error message when client AM fails.
Repository: spark Updated Branches: refs/heads/master 75cf369c7 -> 54032682b [SPARK-24182][YARN] Improve error message when client AM fails. Instead of always throwing a generic exception when the AM fails, print a generic error and throw the exception with the YARN diagnostics containing the reason for the failure. There was an issue with YARN sometimes providing a generic diagnostic message, even though the AM provides a failure reason when unregistering. That was happening because the AM was registering too late, and if errors happened before the registration, YARN would just create a generic "ExitCodeException" which wasn't very helpful. Since most errors in this path are a result of not being able to connect to the driver, this change modifies the AM registration a bit so that the AM is registered before the connection to the driver is established. That way, errors are properly propagated through YARN back to the driver. As part of that, I also removed the code that retried connections to the driver from the client AM. At that point, the driver should already be up and waiting for connections, so it's unlikely that retrying would help - and in case it does, that means a flaky network, which would mean problems would probably show up again. The effect of that is that connection-related errors are reported back to the driver much faster now (through the YARN report). One thing to note is that there seems to be a race on the YARN side that causes a report to be sent to the client without the corresponding diagnostics string from the AM; the diagnostics are available later from the RM web page. For that reason, the generic error messages are kept in the Spark scheduler code, to help guide users to a way of debugging their failure. Also of note is that if YARN's max attempts configuration is lower than Spark's, Spark will not unregister the AM with a proper diagnostics message. Unfortunately there seems to be no way to unregister the AM and still allow further re-attempts to happen. Testing: - existing unit tests - some of our integration tests - hardcoded an invalid driver address in the code and verified the error in the shell. e.g. ``` scala> 18/05/04 15:09:34 ERROR cluster.YarnClientSchedulerBackend: YARN application has exited unexpectedly with state FAILED! Check the YARN application logs for more details. 18/05/04 15:09:34 ERROR cluster.YarnClientSchedulerBackend: Diagnostics message: Uncaught exception: org.apache.spark.SparkException: Exception thrown in awaitResult: Caused by: java.io.IOException: Failed to connect to localhost/127.0.0.1:1234 ``` Author: Marcelo Vanzin Closes #21243 from vanzin/SPARK-24182. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54032682 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54032682 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54032682 Branch: refs/heads/master Commit: 54032682b910dc5089af27d2c7b6efe55700f034 Parents: 75cf369 Author: Marcelo Vanzin Authored: Fri May 11 17:40:35 2018 +0800 Committer: jerryshao Committed: Fri May 11 17:40:35 2018 +0800 -- docs/running-on-yarn.md | 5 +- .../spark/deploy/yarn/ApplicationMaster.scala | 103 +++ .../org/apache/spark/deploy/yarn/Client.scala | 43 +--- .../apache/spark/deploy/yarn/YarnRMClient.scala | 29 -- .../cluster/YarnClientSchedulerBackend.scala| 35 +-- 5 files changed, 112 insertions(+), 103 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54032682/docs/running-on-yarn.md -- diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index ceda8a3..c9e68c3 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -133,9 +133,8 @@ To use a custom metrics.properties for the application master and executors, upd spark.yarn.am.waitTime 100s -In cluster mode, time for the YARN Application Master to wait for the -SparkContext to be initialized. In client mode, time for the YARN Application Master to wait -for the driver to connect to it. +Only used in cluster mode. Time for the YARN Application Master to wait for the +SparkContext to be initialized. http://git-wip-us.apache.org/repos/asf/spark/blob/54032682/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala -- diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 595077e..3d6ee50 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark