spark git commit: [SPARK-23907] Removes regr_* functions in functions.scala

2018-05-11 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/master f27a035da -> e3dabdf6e


[SPARK-23907] Removes regr_* functions in functions.scala

## What changes were proposed in this pull request?
This patch removes the various regr_* functions from functions.scala. They are 
uncommon enough that I don't think they deserve real estate in functions.scala; we 
can consider adding them back later if more users need them.
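
For callers that still need these aggregates, a minimal sketch of equivalents built 
from the aggregate functions that remain in functions.scala (the local session, 
sample DataFrame, and column names below are hypothetical, not part of this patch):

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().appName("regr-equivalents").master("local[*]").getOrCreate()
import spark.implicits._

// Hypothetical sample data: y is the dependent variable, x the independent one.
val df = Seq((1.0, 2.0), (2.0, 3.5), (3.0, 5.0)).toDF("y", "x")

// The regr_* aggregates ignore any pair containing a NULL, so drop those pairs first.
val pairs = df.where($"y".isNotNull && $"x".isNotNull)
val n = count(lit(1))

pairs.agg(
  n.as("regr_count"),                                            // number of non-null pairs
  (sum($"x" * $"x") - sum($"x") * sum($"x") / n).as("regr_sxx"), // SUM(x*x)-SUM(x)*SUM(x)/N
  (sum($"y" * $"y") - sum($"y") * sum($"y") / n).as("regr_syy"), // SUM(y*y)-SUM(y)*SUM(y)/N
  (sum($"x" * $"y") - sum($"x") * sum($"y") / n).as("regr_sxy"), // N times the covariance of y and x
  avg($"y").as("regr_avgy"),
  avg($"x").as("regr_avgx")
).show()
```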

## How was this patch tested?
Removed the associated test case as well.

Author: Reynold Xin 

Closes #21309 from rxin/SPARK-23907.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e3dabdf6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e3dabdf6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e3dabdf6

Branch: refs/heads/master
Commit: e3dabdf6ef210fb9f4337e305feb9c4983a57350
Parents: f27a035
Author: Reynold Xin 
Authored: Sat May 12 12:15:36 2018 +0800
Committer: hyukjinkwon 
Committed: Sat May 12 12:15:36 2018 +0800

--
 .../scala/org/apache/spark/sql/functions.scala  | 171 ---
 .../spark/sql/DataFrameAggregateSuite.scala |  68 
 2 files changed, 239 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e3dabdf6/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e7f866d..3c9ace4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -811,177 +811,6 @@ object functions {
*/
   def var_pop(columnName: String): Column = var_pop(Column(columnName))
 
-  /**
-   * Aggregate function: returns the number of non-null pairs.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_count(y: Column, x: Column): Column = withAggregateFunction {
-    RegrCount(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns the number of non-null pairs.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_count(y: String, x: String): Column = regr_count(Column(y), Column(x))
-
-  /**
-   * Aggregate function: returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_sxx(y: Column, x: Column): Column = withAggregateFunction {
-    RegrSXX(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_sxx(y: String, x: String): Column = regr_sxx(Column(y), Column(x))
-
-  /**
-   * Aggregate function: returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_syy(y: Column, x: Column): Column = withAggregateFunction {
-    RegrSYY(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_syy(y: String, x: String): Column = regr_syy(Column(y), Column(x))
-
-  /**
-   * Aggregate function: returns the average of y. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_avgy(y: Column, x: Column): Column = withAggregateFunction {
-    RegrAvgY(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns the average of y. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_avgy(y: String, x: String): Column = regr_avgy(Column(y), Column(x))
-
-  /**
-   * Aggregate function: returns the average of x. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_avgx(y: Column, x: Column): Column = withAggregateFunction {
-    RegrAvgX(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns the average of x. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_avgx(y: String, x: String): Column = regr_avgx(Column(y), Column(x))
-
-  /**
-   * Aggregate function: returns the covariance of y and x multiplied for the number of items in
-   * the dataset. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_sxy(y: Column, x: Column): Column = withAggregateFunction {
-    RegrSXY(y.expr, x.expr)
-  }
-
-  /**
-   * Aggregate function: returns the covariance of y and x multiplied for the number of items in
-   * the dataset. Any pair with a NULL is ignored.
-   *
-   * @group agg_funcs
-   * @since 2.4.0
-   */
-  def regr_sxy(y: String, x: String): Column = regr_sxy(Column(y), Column(x))
-

svn commit: r26865 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_20_01-f27a035-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Sat May 12 03:16:06 2018
New Revision: 26865

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_05_11_20_01-f27a035 docs


[This commit notification would consist of 1462 parts, which exceeds the limit of 50, so it was shortened to this summary.]




svn commit: r26862 - in /dev/spark/2.3.1-SNAPSHOT-2018_05_11_18_01-7de4bef-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Sat May 12 01:15:20 2018
New Revision: 26862

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_05_11_18_01-7de4bef docs


[This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARKR] Require Java 8 for SparkR

2018-05-11 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 1d598b771 -> 7de4bef9e


[SPARKR] Require Java 8 for SparkR

This change updates the SystemRequirements field and also adds a runtime check that 
runs when the JVM is launched from R. The check is done by querying `java -version`.

## How was this patch tested?

Tested on a Mac and Windows machine

Author: Shivaram Venkataraman 

Closes #21278 from shivaram/sparkr-skip-solaris.

(cherry picked from commit f27a035daf705766d3445e5c6a99867c11c552b0)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7de4bef9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7de4bef9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7de4bef9

Branch: refs/heads/branch-2.3
Commit: 7de4bef9ec37440aa36e6b0e9d8656de07d03b68
Parents: 1d598b7
Author: Shivaram Venkataraman 
Authored: Fri May 11 17:00:51 2018 -0700
Committer: Shivaram Venkataraman 
Committed: Fri May 11 17:01:02 2018 -0700

--
 R/pkg/DESCRIPTION |  1 +
 R/pkg/R/client.R  | 35 +++
 R/pkg/R/sparkR.R  |  1 +
 R/pkg/R/utils.R   |  4 ++--
 4 files changed, 39 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 29a8a00..632bcb3 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -13,6 +13,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
 License: Apache License (== 2.0)
 URL: http://www.apache.org/ http://spark.apache.org/
 BugReports: http://spark.apache.org/contributing.html
+SystemRequirements: Java (== 8)
 Depends:
 R (>= 3.0),
 methods

http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/R/client.R
--
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 7244cc9..e9295e0 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -60,6 +60,40 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
   combinedArgs
 }
 
+checkJavaVersion <- function() {
+  javaBin <- "java"
+  javaHome <- Sys.getenv("JAVA_HOME")
+  javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements"))
+  sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L))
+  if (javaHome != "") {
+    javaBin <- file.path(javaHome, "bin", javaBin)
+  }
+
+  # If java is missing from PATH, we get an error in Unix and a warning in Windows
+  javaVersionOut <- tryCatch(
+      launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE),
+      error = function(e) {
+        stop("Java version check failed. Please make sure Java is installed",
+             " and set JAVA_HOME to point to the installation directory.", e)
+      },
+      warning = function(w) {
+        stop("Java version check failed. Please make sure Java is installed",
+             " and set JAVA_HOME to point to the installation directory.", w)
+      })
+  javaVersionFilter <- Filter(
+      function(x) {
+        grepl("java version", x)
+      }, javaVersionOut)
+
+  javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2]
+  # javaVersionStr is of the form 1.8.0_92.
+  # Extract 8 from it to compare to sparkJavaVersion
+  javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2])
+  if (javaVersionNum != sparkJavaVersion) {
+    stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr))
+  }
+}
+
 launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) {
   sparkSubmitBinName <- determineSparkSubmitBin()
   if (sparkHome != "") {
@@ -67,6 +101,7 @@ launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) {
   } else {
     sparkSubmitBin <- sparkSubmitBinName
   }
+
   combinedArgs <- generateSparkSubmitArgs(args, sparkHome, jars, sparkSubmitOpts, packages)
   cat("Launching java with spark-submit command", sparkSubmitBin, combinedArgs, "\n")
   invisible(launchScript(sparkSubmitBin, combinedArgs))

http://git-wip-us.apache.org/repos/asf/spark/blob/7de4bef9/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 7430d84..2cd8b0c 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -170,6 +170,7 @@ sparkR.sparkContext <- function(
 submitOps <- getClientModeSparkSubmitOpts(
 Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"),
 sparkEnvirMap)
+checkJavaVersion()
   

spark git commit: [SPARKR] Require Java 8 for SparkR

2018-05-11 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 92f6f52ff -> f27a035da


[SPARKR] Require Java 8 for SparkR

This change updates the SystemRequirements field and also adds a runtime check that 
runs when the JVM is launched from R. The check is done by querying `java -version`.

## How was this patch tested?

Tested on a Mac and Windows machine

Author: Shivaram Venkataraman 

Closes #21278 from shivaram/sparkr-skip-solaris.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f27a035d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f27a035d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f27a035d

Branch: refs/heads/master
Commit: f27a035daf705766d3445e5c6a99867c11c552b0
Parents: 92f6f52
Author: Shivaram Venkataraman 
Authored: Fri May 11 17:00:51 2018 -0700
Committer: Shivaram Venkataraman 
Committed: Fri May 11 17:00:51 2018 -0700

--
 R/pkg/DESCRIPTION |  1 +
 R/pkg/R/client.R  | 35 +++
 R/pkg/R/sparkR.R  |  1 +
 R/pkg/R/utils.R   |  4 ++--
 4 files changed, 39 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 855eb5b..f52d785 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -13,6 +13,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
 License: Apache License (== 2.0)
 URL: http://www.apache.org/ http://spark.apache.org/
 BugReports: http://spark.apache.org/contributing.html
+SystemRequirements: Java (== 8)
 Depends:
 R (>= 3.0),
 methods

http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/R/client.R
--
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 7244cc9..e9295e0 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -60,6 +60,40 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
   combinedArgs
 }
 
+checkJavaVersion <- function() {
+  javaBin <- "java"
+  javaHome <- Sys.getenv("JAVA_HOME")
+  javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements"))
+  sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L))
+  if (javaHome != "") {
+    javaBin <- file.path(javaHome, "bin", javaBin)
+  }
+
+  # If java is missing from PATH, we get an error in Unix and a warning in Windows
+  javaVersionOut <- tryCatch(
+      launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE),
+      error = function(e) {
+        stop("Java version check failed. Please make sure Java is installed",
+             " and set JAVA_HOME to point to the installation directory.", e)
+      },
+      warning = function(w) {
+        stop("Java version check failed. Please make sure Java is installed",
+             " and set JAVA_HOME to point to the installation directory.", w)
+      })
+  javaVersionFilter <- Filter(
+      function(x) {
+        grepl("java version", x)
+      }, javaVersionOut)
+
+  javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2]
+  # javaVersionStr is of the form 1.8.0_92.
+  # Extract 8 from it to compare to sparkJavaVersion
+  javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2])
+  if (javaVersionNum != sparkJavaVersion) {
+    stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr))
+  }
+}
+
 launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) {
   sparkSubmitBinName <- determineSparkSubmitBin()
   if (sparkHome != "") {
@@ -67,6 +101,7 @@ launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) {
   } else {
     sparkSubmitBin <- sparkSubmitBinName
   }
+
   combinedArgs <- generateSparkSubmitArgs(args, sparkHome, jars, sparkSubmitOpts, packages)
   cat("Launching java with spark-submit command", sparkSubmitBin, combinedArgs, "\n")
   invisible(launchScript(sparkSubmitBin, combinedArgs))

http://git-wip-us.apache.org/repos/asf/spark/blob/f27a035d/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 38ee794..d6a2d08 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -167,6 +167,7 @@ sparkR.sparkContext <- function(
 submitOps <- getClientModeSparkSubmitOpts(
 Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"),
 sparkEnvirMap)
+checkJavaVersion()
 launchBackend(
 args = path,
 sparkHome = sparkHome,

http://git-wip-us.apache.org/repos/asf/spark

[1/2] spark git commit: [PYSPARK] Update py4j to version 0.10.7.

2018-05-11 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 8177b2148 -> 1d569f684


[PYSPARK] Update py4j to version 0.10.7.

(cherry picked from commit cc613b552e753d03cb62661591de59e1c8d82c74)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 323dc3ad02e63a7c99b5bd6da618d6020657ecba)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 850b7d868bbe23e58e1164fc551fac2facbebead)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/61a8c4c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/61a8c4c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/61a8c4c0

Branch: refs/heads/branch-2.1
Commit: 61a8c4c0929cdd854b0a93eb76ea9686e7f0
Parents: 8177b21
Author: Marcelo Vanzin 
Authored: Fri Apr 13 14:28:24 2018 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 11 11:13:32 2018 -0700

--
 LICENSE |   2 +-
 bin/pyspark |   6 +-
 bin/pyspark2.cmd|   2 +-
 core/pom.xml|   2 +-
 .../org/apache/spark/SecurityManager.scala  |  11 +-
 .../spark/api/python/PythonGatewayServer.scala  |  50 ++---
 .../org/apache/spark/api/python/PythonRDD.scala |  29 --
 .../apache/spark/api/python/PythonUtils.scala   |   2 +-
 .../spark/api/python/PythonWorkerFactory.scala  |  21 ++--
 .../org/apache/spark/deploy/PythonRunner.scala  |  12 ++-
 .../apache/spark/internal/config/package.scala  |   6 ++
 .../spark/security/SocketAuthHelper.scala   | 101 +++
 .../scala/org/apache/spark/util/Utils.scala |  11 ++
 .../spark/security/SocketAuthHelperSuite.scala  |  97 ++
 dev/deps/spark-deps-hadoop-2.2  |   2 +-
 dev/deps/spark-deps-hadoop-2.3  |   2 +-
 dev/deps/spark-deps-hadoop-2.4  |   2 +-
 dev/deps/spark-deps-hadoop-2.6  |   2 +-
 dev/deps/spark-deps-hadoop-2.7  |   2 +-
 python/README.md|   2 +-
 python/docs/Makefile|   2 +-
 python/lib/py4j-0.10.6-src.zip  | Bin 80352 -> 0 bytes
 python/lib/py4j-0.10.7-src.zip  | Bin 0 -> 42437 bytes
 python/pyspark/context.py   |   4 +-
 python/pyspark/daemon.py|  21 +++-
 python/pyspark/java_gateway.py  |  93 ++---
 python/pyspark/rdd.py   |  21 ++--
 python/pyspark/sql/dataframe.py |   8 +-
 python/pyspark/worker.py|   7 +-
 python/setup.py |   2 +-
 sbin/spark-config.sh|   2 +-
 .../scala/org/apache/spark/sql/Dataset.scala|   4 +-
 .../org/apache/spark/deploy/yarn/Client.scala   |   2 +-
 .../spark/deploy/yarn/YarnClusterSuite.scala|   2 +-
 34 files changed, 417 insertions(+), 115 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/61a8c4c0/LICENSE
--
diff --git a/LICENSE b/LICENSE
index 02aaef2..83a5e2b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
  (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
  (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
  (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
- (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/)
+ (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/)
  (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
  (BSD licence) sbt and sbt-launch-lib.bash
  (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

http://git-wip-us.apache.org/repos/asf/spark/blob/61a8c4c0/bin/pyspark
--
diff --git a/bin/pyspark b/bin/pyspark
index d3b512e..95ab628 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -25,14 +25,14 @@ source "${SPARK_HOME}"/bin/load-spark-env.sh
 export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]"
 
 # In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option
-# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython 
+# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython
 # to use IPython a

[2/2] spark git commit: [SPARKR] Match pyspark features in SparkR communication protocol.

2018-05-11 Thread vanzin
[SPARKR] Match pyspark features in SparkR communication protocol.

(cherry picked from commit 628c7b517969c4a7ccb26ea67ab3dd61266073ca)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 16cd9ac5264831e061c033b26fe1173ebc88e5d1)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit f96d13dd67b2d39e5fff80a6bc4a1b1fc36745c6)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d569f68
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d569f68
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d569f68

Branch: refs/heads/branch-2.1
Commit: 1d569f684fe41a33dc1708da6cde5d5b2a7857af
Parents: 61a8c4c
Author: Marcelo Vanzin 
Authored: Tue Apr 17 13:29:43 2018 -0700
Committer: Marcelo Vanzin 
Committed: Fri May 11 11:13:44 2018 -0700

--
 R/pkg/R/client.R|  4 +-
 R/pkg/R/deserialize.R   | 10 ++--
 R/pkg/R/sparkR.R| 39 --
 R/pkg/inst/worker/daemon.R  |  4 +-
 R/pkg/inst/worker/worker.R  |  5 +-
 .../org/apache/spark/api/r/RAuthHelper.scala| 38 ++
 .../scala/org/apache/spark/api/r/RBackend.scala | 43 ---
 .../spark/api/r/RBackendAuthHandler.scala   | 55 
 .../scala/org/apache/spark/api/r/RRunner.scala  | 35 +
 .../scala/org/apache/spark/deploy/RRunner.scala |  6 ++-
 10 files changed, 210 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/client.R
--
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 9d82814..7244cc9 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -19,7 +19,7 @@
 
 # Creates a SparkR client connection object
 # if one doesn't already exist
-connectBackend <- function(hostname, port, timeout) {
+connectBackend <- function(hostname, port, timeout, authSecret) {
   if (exists(".sparkRcon", envir = .sparkREnv)) {
 if (isOpen(.sparkREnv[[".sparkRCon"]])) {
   cat("SparkRBackend client connection already exists\n")
@@ -29,7 +29,7 @@ connectBackend <- function(hostname, port, timeout) {
 
   con <- socketConnection(host = hostname, port = port, server = FALSE,
   blocking = TRUE, open = "wb", timeout = timeout)
-
+  doServerAuth(con, authSecret)
   assign(".sparkRCon", con, envir = .sparkREnv)
   con
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/deserialize.R
--
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 0e99b17..dc7d37e 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -60,14 +60,18 @@ readTypedObject <- function(con, type) {
 stop(paste("Unsupported type for deserialization", type)))
 }
 
-readString <- function(con) {
-  stringLen <- readInt(con)
-  raw <- readBin(con, raw(), stringLen, endian = "big")
+readStringData <- function(con, len) {
+  raw <- readBin(con, raw(), len, endian = "big")
   string <- rawToChar(raw)
   Encoding(string) <- "UTF-8"
   string
 }
 
+readString <- function(con) {
+  stringLen <- readInt(con)
+  readStringData(con, stringLen)
+}
+
 readInt <- function(con) {
   readBin(con, integer(), n = 1, endian = "big")
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/1d569f68/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index d0a12b7..6c0392a 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -161,6 +161,10 @@ sparkR.sparkContext <- function(
 " please use the --packages commandline instead", sep = 
","))
 }
 backendPort <- existingPort
+authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET")
+if (nchar(authSecret) == 0) {
+  stop("Auth secret not provided in environment.")
+}
   } else {
 path <- tempfile(pattern = "backend_port")
 submitOps <- getClientModeSparkSubmitOpts(
@@ -189,16 +193,27 @@ sparkR.sparkContext <- function(
 monitorPort <- readInt(f)
 rLibPath <- readString(f)
 connectionTimeout <- readInt(f)
+
+# Don't use readString() so that we can provide a useful
+# error message if the R and Java versions are mismatched.
+authSecretLen = readInt(f)
+if (length(authSecretLen) == 0 || authSecretLen == 0) {
+  stop("Unexpected EOF in JVM connection data. Mismatched versions?")
+}
+authSecret <- readStringData(f, authSecretLen)
 close(f)
 file.remove(path)
 if (length(backendPort) == 0 || backendPort == 0 ||
 length(monitorPort) == 0 || monitorPort == 0 ||
-length(rLibPath) != 1) {
+length(rLibPath) != 1 ||

svn commit: r26859 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_16_01-92f6f52-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Fri May 11 23:15:44 2018
New Revision: 26859

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_05_11_16_01-92f6f52 docs


[This commit notification would consist of 1462 parts, which exceeds the limit of 50, so it was shortened to this summary.]




svn commit: r26856 - in /dev/spark/2.3.1-SNAPSHOT-2018_05_11_14_01-1d598b7-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Fri May 11 21:15:49 2018
New Revision: 26856

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_05_11_14_01-1d598b7 docs


[This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [MINOR][DOCS] Documenting months_between direction

2018-05-11 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 928845a42 -> 92f6f52ff


[MINOR][DOCS] Documenting months_between direction

## What changes were proposed in this pull request?

It's useful to know what relationship between date1 and date2 results in a 
positive number.
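
A small sketch of the documented sign convention against the Scala DataFrame API, 
reusing the date pair from the pyspark doctest in this patch (assumes a local 
SparkSession started just for the example):

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, months_between}

val spark = SparkSession.builder().appName("months-between-direction").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("1997-02-28 10:30:00", "1996-10-30")).toDF("date1", "date2")

// date1 is later than date2, so the result is positive (about 3.95 for this pair).
df.select(months_between(col("date1"), col("date2")).alias("months")).show()

// Swapping the arguments flips the sign.
df.select(months_between(col("date2"), col("date1")).alias("months")).show()
```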

Author: aditkumar 
Author: Adit Kumar 

Closes #20787 from aditkumar/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92f6f52f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92f6f52f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92f6f52f

Branch: refs/heads/master
Commit: 92f6f52ff0ce47e046656ca8bed7d7bfbbb42dcb
Parents: 928845a
Author: aditkumar 
Authored: Fri May 11 14:42:23 2018 -0500
Committer: Sean Owen 
Committed: Fri May 11 14:42:23 2018 -0500

--
 R/pkg/R/functions.R   |  6 +-
 python/pyspark/sql/functions.py   |  7 +--
 .../catalyst/expressions/datetimeExpressions.scala| 14 +++---
 .../spark/sql/catalyst/util/DateTimeUtils.scala   |  8 
 .../main/scala/org/apache/spark/sql/functions.scala   |  7 ++-
 5 files changed, 31 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 1f97054..4964594 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1912,6 +1912,7 @@ setMethod("atan2", signature(y = "Column"),
 
 #' @details
 #' \code{datediff}: Returns the number of days from \code{y} to \code{x}.
+#' If \code{y} is later than \code{x} then the result is positive.
 #'
 #' @rdname column_datetime_diff_functions
 #' @aliases datediff datediff,Column-method
@@ -1971,7 +1972,10 @@ setMethod("levenshtein", signature(y = "Column"),
   })
 
 #' @details
-#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
+#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
+#' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x}
+#' are on the same day of month, or both are the last day of month, time of day will be ignored.
+#' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits.
 #'
 #' @rdname column_datetime_diff_functions
 #' @aliases months_between months_between,Column-method

http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/python/pyspark/sql/functions.py
--
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index f5a5841..b62748e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1108,8 +1108,11 @@ def add_months(start, months):
 @since(1.5)
 def months_between(date1, date2, roundOff=True):
 """
-Returns the number of months between date1 and date2.
-Unless `roundOff` is set to `False`, the result is rounded off to 8 digits.
+Returns number of months between dates date1 and date2.
+If date1 is later than date2, then the result is positive.
+If date1 and date2 are on the same day of month, or both are the last day 
of month,
+returns an integer (time of day will be ignored).
+The result is rounded off to 8 digits unless `roundOff` is set to `False`.
 
 >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], 
['date1', 'date2'])
 >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()

http://git-wip-us.apache.org/repos/asf/spark/blob/92f6f52f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 76aa614..03422fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -1194,13 +1194,21 @@ case class AddMonths(startDate: Expression, numMonths: 
Expression)
 }
 
 /**
- * Returns number of months between dates date1 and date2.
+ * Returns number of months between times `timestamp1` and `timestamp2`.
+ * If `timestamp1` is later than `timestamp2`, then the result is positive.
+ * If `timestamp1` and `timestamp2` are on the same day of month, or both
+ * are the last day of month, time of day will be ignored. Otherwise, the
+ * difference is calculated based on 31 days per month, a

svn commit: r26854 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_12_01-928845a-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Fri May 11 19:16:20 2018
New Revision: 26854

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_05_11_12_01-928845a docs


[This commit notification would consist of 1462 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-24067][BACKPORT-2.3][STREAMING][KAFKA] Allow non-consecutive offsets

2018-05-11 Thread koeninger
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 414e4e3d7 -> 1d598b771


[SPARK-24067][BACKPORT-2.3][STREAMING][KAFKA] Allow non-consecutive offsets

## What changes were proposed in this pull request?

Backport of the bugfix in SPARK-17147

Add a configuration, spark.streaming.kafka.allowNonConsecutiveOffsets, to allow 
streaming jobs to proceed on compacted topics (or in other situations involving 
gaps between offsets in the log).
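
A minimal sketch of enabling the flag when building a streaming context; the 
application name, master, and batch interval are placeholders, and only the config 
key comes from this patch:

```
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("compacted-topic-consumer")   // placeholder app name
  .setMaster("local[2]")                    // placeholder master
  .set("spark.streaming.kafka.allowNonConsecutiveOffsets", "true")

val ssc = new StreamingContext(conf, Seconds(10))
// ... create the direct stream against the compacted topic as usual ...
```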

## How was this patch tested?

Added new unit test

justinrmiller has been testing this branch in production for a few weeks

Author: cody koeninger 

Closes #21300 from koeninger/branch-2.3_kafkafix.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d598b77
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d598b77
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d598b77

Branch: refs/heads/branch-2.3
Commit: 1d598b771de3b588a2f377ae7ccf8193156641f2
Parents: 414e4e3
Author: cody koeninger 
Authored: Fri May 11 13:40:36 2018 -0500
Committer: cody koeninger 
Committed: Fri May 11 13:40:36 2018 -0500

--
 .../kafka010/CachedKafkaConsumer.scala  |  55 -
 .../spark/streaming/kafka010/KafkaRDD.scala | 236 +--
 .../streaming/kafka010/KafkaRDDSuite.scala  | 106 +
 .../streaming/kafka010/KafkaTestUtils.scala |  25 +-
 .../kafka010/mocks/MockScheduler.scala  |  96 
 .../streaming/kafka010/mocks/MockTime.scala |  51 
 6 files changed, 487 insertions(+), 82 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1d598b77/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala
--
diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala
index fa3ea61..aeb8c1d 100644
--- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala
+++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala
@@ -22,10 +22,8 @@ import java.{ util => ju }
 import org.apache.kafka.clients.consumer.{ ConsumerConfig, ConsumerRecord, KafkaConsumer }
 import org.apache.kafka.common.{ KafkaException, TopicPartition }
 
-import org.apache.spark.SparkConf
 import org.apache.spark.internal.Logging
 
-
 /**
  * Consumer of single topicpartition, intended for cached reuse.
  * Underlying consumer is not threadsafe, so neither is this,
@@ -38,7 +36,7 @@ class CachedKafkaConsumer[K, V] private(
   val partition: Int,
   val kafkaParams: ju.Map[String, Object]) extends Logging {
 
-  assert(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG),
+  require(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG),
     "groupId used for cache key must match the groupId in kafkaParams")
 
   val topicPartition = new TopicPartition(topic, partition)
@@ -53,7 +51,7 @@ class CachedKafkaConsumer[K, V] private(
 
   // TODO if the buffer was kept around as a random-access structure,
   // could possibly optimize re-calculating of an RDD in the same batch
-  protected var buffer = ju.Collections.emptyList[ConsumerRecord[K, V]]().iterator
+  protected var buffer = ju.Collections.emptyListIterator[ConsumerRecord[K, V]]()
   protected var nextOffset = -2L
 
   def close(): Unit = consumer.close()
@@ -71,7 +69,7 @@ class CachedKafkaConsumer[K, V] private(
     }
 
     if (!buffer.hasNext()) { poll(timeout) }
-    assert(buffer.hasNext(),
+    require(buffer.hasNext(),
       s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout")
     var record = buffer.next()
 
@@ -79,17 +77,56 @@ class CachedKafkaConsumer[K, V] private(
       logInfo(s"Buffer miss for $groupId $topic $partition $offset")
       seek(offset)
       poll(timeout)
-      assert(buffer.hasNext(),
+      require(buffer.hasNext(),
        s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout")
       record = buffer.next()
-      assert(record.offset == offset,
-        s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset")
+      require(record.offset == offset,
+        s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset " +
+          s"got offset ${record.offset} instead. If this is a compacted topic, consider enabling " +
+          "spark.streaming.kafka.allowNonConsecutiveOffsets"
+      )
     }
 
     nextOffset = offset + 1
     record
   }
 
+  /**
+   * Start a batch on a compacted topic
+   */
+  def compactedStart(offset: Long, timeout: Long): Unit = {
+

spark git commit: [SPARK-24172][SQL] we should not apply operator pushdown to data source v2 many times

2018-05-11 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 54032682b -> 928845a42


[SPARK-24172][SQL] we should not apply operator pushdown to data source v2 many 
times

## What changes were proposed in this pull request?

In `PushDownOperatorsToDataSource`, we use `transformUp` to match 
`PhysicalOperation` and apply pushdown. This is problematic if we have multiple 
`Filter` and `Project` above the data source v2 relation.

e.g. for a query
```
Project
  Filter
DataSourceV2Relation
```

The pattern match will be triggered twice and we will do operator pushdown twice. 
This is unnecessary; we can use `mapChildren` to apply the pushdown only once.
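
A self-contained sketch, using toy plan classes rather than Spark's actual TreeNode 
and PhysicalOperation APIs, of why a transformUp-style traversal fires such a rule 
at every matching Filter/Project prefix, while recursing only through the children 
(as mapChildren does here) applies it once:

```
// Toy plan nodes standing in for Project / Filter / DataSourceV2Relation.
sealed trait Plan
case class Relation(name: String) extends Plan
case class Filter(child: Plan) extends Plan
case class Project(child: Plan) extends Plan

// transformUp-style traversal: rewrite the children first, then offer every node to the rule.
def transformUp(plan: Plan)(rule: PartialFunction[Plan, Plan]): Plan = {
  val withNewChildren = plan match {
    case Filter(c)  => Filter(transformUp(c)(rule))
    case Project(c) => Project(transformUp(c)(rule))
    case leaf       => leaf
  }
  rule.applyOrElse(withNewChildren, identity[Plan])
}

var pushdowns = 0
// Like PhysicalOperation, this matches any Filter/Project prefix that ends in a Relation.
val pushdownRule: PartialFunction[Plan, Plan] = {
  case p @ (Filter(Relation(_)) | Project(Filter(Relation(_)))) =>
    pushdowns += 1
    p
}

transformUp(Project(Filter(Relation("t"))))(pushdownRule)
println(pushdowns)  // 2: the rule fired at the Filter and again at the enclosing Project
```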

## How was this patch tested?

existing test

Author: Wenchen Fan 

Closes #21230 from cloud-fan/step2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/928845a4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/928845a4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/928845a4

Branch: refs/heads/master
Commit: 928845a42230a2c0a318011002a54ad871468b2e
Parents: 5403268
Author: Wenchen Fan 
Authored: Fri May 11 10:00:28 2018 -0700
Committer: gatorsmile 
Committed: Fri May 11 10:00:28 2018 -0700

--
 .../v2/PushDownOperatorsToDataSource.scala   | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/928845a4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala
index 9293d4f..e894f8a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala
@@ -23,17 +23,10 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project
 import org.apache.spark.sql.catalyst.rules.Rule
 
 object PushDownOperatorsToDataSource extends Rule[LogicalPlan] {
-  override def apply(
-      plan: LogicalPlan): LogicalPlan = plan transformUp {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan match {
     // PhysicalOperation guarantees that filters are deterministic; no need to check
-    case PhysicalOperation(project, newFilters, relation : DataSourceV2Relation) =>
-      // merge the filters
-      val filters = relation.filters match {
-        case Some(existing) =>
-          existing ++ newFilters
-        case _ =>
-          newFilters
-      }
+    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
+      assert(relation.filters.isEmpty, "data source v2 should do push down only once.")
 
       val projectAttrs = project.map(_.toAttribute)
       val projectSet = AttributeSet(project.flatMap(_.references))
@@ -67,5 +60,7 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] {
     } else {
       filtered
     }
+
+    case other => other.mapChildren(apply)
   }
 }





[2/2] spark git commit: [SPARKR] Match pyspark features in SparkR communication protocol.

2018-05-11 Thread vanzin
[SPARKR] Match pyspark features in SparkR communication protocol.

(cherry picked from commit 628c7b517969c4a7ccb26ea67ab3dd61266073ca)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 16cd9ac5264831e061c033b26fe1173ebc88e5d1)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f96d13dd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f96d13dd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f96d13dd

Branch: refs/heads/branch-2.2
Commit: f96d13dd67b2d39e5fff80a6bc4a1b1fc36745c6
Parents: 850b7d8
Author: Marcelo Vanzin 
Authored: Tue Apr 17 13:29:43 2018 -0700
Committer: Marcelo Vanzin 
Committed: Thu May 10 12:42:58 2018 -0700

--
 R/pkg/R/client.R|  4 +-
 R/pkg/R/deserialize.R   | 10 ++--
 R/pkg/R/sparkR.R| 39 --
 R/pkg/inst/worker/daemon.R  |  4 +-
 R/pkg/inst/worker/worker.R  |  5 +-
 .../org/apache/spark/api/r/RAuthHelper.scala| 38 ++
 .../scala/org/apache/spark/api/r/RBackend.scala | 43 ---
 .../spark/api/r/RBackendAuthHandler.scala   | 55 
 .../scala/org/apache/spark/api/r/RRunner.scala  | 35 +
 .../scala/org/apache/spark/deploy/RRunner.scala |  6 ++-
 10 files changed, 210 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/client.R
--
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 9d82814..7244cc9 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -19,7 +19,7 @@
 
 # Creates a SparkR client connection object
 # if one doesn't already exist
-connectBackend <- function(hostname, port, timeout) {
+connectBackend <- function(hostname, port, timeout, authSecret) {
   if (exists(".sparkRcon", envir = .sparkREnv)) {
 if (isOpen(.sparkREnv[[".sparkRCon"]])) {
   cat("SparkRBackend client connection already exists\n")
@@ -29,7 +29,7 @@ connectBackend <- function(hostname, port, timeout) {
 
   con <- socketConnection(host = hostname, port = port, server = FALSE,
   blocking = TRUE, open = "wb", timeout = timeout)
-
+  doServerAuth(con, authSecret)
   assign(".sparkRCon", con, envir = .sparkREnv)
   con
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/deserialize.R
--
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 0e99b17..dc7d37e 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -60,14 +60,18 @@ readTypedObject <- function(con, type) {
 stop(paste("Unsupported type for deserialization", type)))
 }
 
-readString <- function(con) {
-  stringLen <- readInt(con)
-  raw <- readBin(con, raw(), stringLen, endian = "big")
+readStringData <- function(con, len) {
+  raw <- readBin(con, raw(), len, endian = "big")
   string <- rawToChar(raw)
   Encoding(string) <- "UTF-8"
   string
 }
 
+readString <- function(con) {
+  stringLen <- readInt(con)
+  readStringData(con, stringLen)
+}
+
 readInt <- function(con) {
   readBin(con, integer(), n = 1, endian = "big")
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f96d13dd/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 9ebd344..daa855b 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -161,6 +161,10 @@ sparkR.sparkContext <- function(
 " please use the --packages commandline instead", sep = 
","))
 }
 backendPort <- existingPort
+authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET")
+if (nchar(authSecret) == 0) {
+  stop("Auth secret not provided in environment.")
+}
   } else {
 path <- tempfile(pattern = "backend_port")
 submitOps <- getClientModeSparkSubmitOpts(
@@ -189,16 +193,27 @@ sparkR.sparkContext <- function(
 monitorPort <- readInt(f)
 rLibPath <- readString(f)
 connectionTimeout <- readInt(f)
+
+# Don't use readString() so that we can provide a useful
+# error message if the R and Java versions are mismatched.
+authSecretLen = readInt(f)
+if (length(authSecretLen) == 0 || authSecretLen == 0) {
+  stop("Unexpected EOF in JVM connection data. Mismatched versions?")
+}
+authSecret <- readStringData(f, authSecretLen)
 close(f)
 file.remove(path)
 if (length(backendPort) == 0 || backendPort == 0 ||
 length(monitorPort) == 0 || monitorPort == 0 ||
-length(rLibPath) != 1) {
+length(rLibPath) != 1 || length(authSecret) == 0) {
   stop("JVM failed to launch")
 }
-assign(".monitorConn",
-

[1/2] spark git commit: [PYSPARK] Update py4j to version 0.10.7.

2018-05-11 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 f9d6a16ce -> f96d13dd6


[PYSPARK] Update py4j to version 0.10.7.

(cherry picked from commit cc613b552e753d03cb62661591de59e1c8d82c74)
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 323dc3ad02e63a7c99b5bd6da618d6020657ecba)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/850b7d86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/850b7d86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/850b7d86

Branch: refs/heads/branch-2.2
Commit: 850b7d868bbe23e58e1164fc551fac2facbebead
Parents: f9d6a16
Author: Marcelo Vanzin 
Authored: Fri Apr 13 14:28:24 2018 -0700
Committer: Marcelo Vanzin 
Committed: Thu May 10 12:20:58 2018 -0700

--
 LICENSE |   2 +-
 bin/pyspark |   6 +-
 bin/pyspark2.cmd|   2 +-
 core/pom.xml|   2 +-
 .../org/apache/spark/SecurityManager.scala  |  11 +-
 .../spark/api/python/PythonGatewayServer.scala  |  50 ++---
 .../org/apache/spark/api/python/PythonRDD.scala |  29 --
 .../apache/spark/api/python/PythonUtils.scala   |   2 +-
 .../spark/api/python/PythonWorkerFactory.scala  |  21 ++--
 .../org/apache/spark/deploy/PythonRunner.scala  |  12 ++-
 .../apache/spark/internal/config/package.scala  |   5 +
 .../spark/security/SocketAuthHelper.scala   | 101 +++
 .../scala/org/apache/spark/util/Utils.scala |  11 ++
 .../spark/security/SocketAuthHelperSuite.scala  |  97 ++
 dev/deps/spark-deps-hadoop-2.6  |   2 +-
 dev/deps/spark-deps-hadoop-2.7  |   2 +-
 dev/run-pip-tests   |   2 +-
 python/README.md|   2 +-
 python/docs/Makefile|   2 +-
 python/lib/py4j-0.10.6-src.zip  | Bin 80352 -> 0 bytes
 python/lib/py4j-0.10.7-src.zip  | Bin 0 -> 42437 bytes
 python/pyspark/context.py   |   4 +-
 python/pyspark/daemon.py|  21 +++-
 python/pyspark/java_gateway.py  |  93 ++---
 python/pyspark/rdd.py   |  21 ++--
 python/pyspark/sql/dataframe.py |   8 +-
 python/pyspark/worker.py|   7 +-
 python/setup.py |   2 +-
 .../org/apache/spark/deploy/yarn/Client.scala   |   2 +-
 .../spark/deploy/yarn/YarnClusterSuite.scala|   2 +-
 sbin/spark-config.sh|   2 +-
 .../scala/org/apache/spark/sql/Dataset.scala|   4 +-
 32 files changed, 414 insertions(+), 113 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/850b7d86/LICENSE
--
diff --git a/LICENSE b/LICENSE
index 39fe0dc..b948cca 100644
--- a/LICENSE
+++ b/LICENSE
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
  (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
  (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
  (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
- (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/)
+ (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.7 - http://py4j.sourceforge.net/)
  (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
  (BSD licence) sbt and sbt-launch-lib.bash
  (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

http://git-wip-us.apache.org/repos/asf/spark/blob/850b7d86/bin/pyspark
--
diff --git a/bin/pyspark b/bin/pyspark
index d3b512e..95ab628 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -25,14 +25,14 @@ source "${SPARK_HOME}"/bin/load-spark-env.sh
 export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]"
 
 # In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option
-# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython 
+# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython
 # to use IPython and set PYSPARK_DRIVER_PYTHON_OPTS to pass options when starting the Python driver
 # (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook').  This supports full customization of the IPython
 # and executor Python executables.

svn commit: r26844 - in /dev/spark/2.4.0-SNAPSHOT-2018_05_11_04_01-5403268-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-05-11 Thread pwendell
Author: pwendell
Date: Fri May 11 11:18:59 2018
New Revision: 26844

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_05_11_04_01-5403268 docs


[This commit notification would consist of 1462 parts, which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-24182][YARN] Improve error message when client AM fails.

2018-05-11 Thread jshao
Repository: spark
Updated Branches:
  refs/heads/master 75cf369c7 -> 54032682b


[SPARK-24182][YARN] Improve error message when client AM fails.

Instead of always throwing a generic exception when the AM fails,
print a generic error and throw the exception with the YARN
diagnostics containing the reason for the failure.

There was an issue with YARN sometimes providing a generic diagnostic
message, even though the AM provides a failure reason when
unregistering. That was happening because the AM was registering
too late, and if errors happened before the registration, YARN would
just create a generic "ExitCodeException" which wasn't very helpful.

Since most errors in this path are a result of not being able to
connect to the driver, this change modifies the AM registration
a bit so that the AM is registered before the connection to the
driver is established. That way, errors are properly propagated
through YARN back to the driver.

As part of that, I also removed the code that retried connections
to the driver from the client AM. At that point, the driver should
already be up and waiting for connections, so it's unlikely that
retrying would help - and in case it does, that means a flaky
network, which would mean problems would probably show up again.
The effect of that is that connection-related errors are reported
back to the driver much faster now (through the YARN report).

One thing to note is that there seems to be a race on the YARN
side that causes a report to be sent to the client without the
corresponding diagnostics string from the AM; the diagnostics are
available later from the RM web page. For that reason, the generic
error messages are kept in the Spark scheduler code, to help
guide users to a way of debugging their failure.

Also of note is that if YARN's max attempts configuration is lower
than Spark's, Spark will not unregister the AM with a proper
diagnostics message. Unfortunately there seems to be no way to
unregister the AM and still allow further re-attempts to happen.

Testing:
- existing unit tests
- some of our integration tests
- hardcoded an invalid driver address in the code and verified
  the error in the shell. e.g.

```
scala> 18/05/04 15:09:34 ERROR cluster.YarnClientSchedulerBackend: YARN 
application has exited unexpectedly with state FAILED! Check the YARN 
application logs for more details.
18/05/04 15:09:34 ERROR cluster.YarnClientSchedulerBackend: Diagnostics 
message: Uncaught exception: org.apache.spark.SparkException: Exception thrown 
in awaitResult:
  
Caused by: java.io.IOException: Failed to connect to localhost/127.0.0.1:1234
  
```

Author: Marcelo Vanzin 

Closes #21243 from vanzin/SPARK-24182.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54032682
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54032682
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54032682

Branch: refs/heads/master
Commit: 54032682b910dc5089af27d2c7b6efe55700f034
Parents: 75cf369
Author: Marcelo Vanzin 
Authored: Fri May 11 17:40:35 2018 +0800
Committer: jerryshao 
Committed: Fri May 11 17:40:35 2018 +0800

--
 docs/running-on-yarn.md |   5 +-
 .../spark/deploy/yarn/ApplicationMaster.scala   | 103 +++
 .../org/apache/spark/deploy/yarn/Client.scala   |  43 +---
 .../apache/spark/deploy/yarn/YarnRMClient.scala |  29 --
 .../cluster/YarnClientSchedulerBackend.scala|  35 +--
 5 files changed, 112 insertions(+), 103 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/54032682/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index ceda8a3..c9e68c3 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -133,9 +133,8 @@ To use a custom metrics.properties for the application master and executors, upd
   spark.yarn.am.waitTime
   100s
   
-In cluster mode, time for the YARN Application Master to wait 
for the
-SparkContext to be initialized. In client mode, time for the 
YARN Application Master to wait
-for the driver to connect to it.
+Only used in cluster mode. Time for the YARN Application 
Master to wait for the
+SparkContext to be initialized.
   
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/54032682/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
--
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 595077e..3d6ee50 100644
--- 
a/resource-managers/yarn/src/main/scala/org/apache/spark