[spark] branch master updated (a89006a -> e7fa778)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from a89006a [SPARK-31393][SQL] Show the correct alias in schema for expression add e7fa778 [SPARK-30699][ML][PYSPARK] GMM blockify input vectors No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/ml/linalg/BLAS.scala| 2 +- .../stat/distribution/MultivariateGaussian.scala | 32 +- .../distribution/MultivariateGaussianSuite.scala | 10 + .../spark/ml/clustering/GaussianMixture.scala | 324 - .../spark/ml/clustering/GaussianMixtureSuite.scala | 11 + python/pyspark/ml/clustering.py| 22 +- 6 files changed, 325 insertions(+), 76 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-30699][ML][PYSPARK] GMM blockify input vectors
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e7fa778 [SPARK-30699][ML][PYSPARK] GMM blockify input vectors e7fa778 is described below commit e7fa778dc7a695d3b1426de6f98a401f2fb98f39 Author: zhengruifeng AuthorDate: Tue May 12 12:54:03 2020 +0800 [SPARK-30699][ML][PYSPARK] GMM blockify input vectors ### What changes were proposed in this pull request? 1, add new param blockSize; 2, if blockSize==1, keep original behavior, code path trainOnRows; 3, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path trainOnBlocks ### Why are the changes needed? performance gain on dense dataset HIGGS: 1, save about 45% RAM; 2, 3X faster with openBLAS ### Does this PR introduce any user-facing change? add a new expert param `blockSize` ### How was this patch tested? added testsuites Closes #27473 from zhengruifeng/blockify_gmm. Authored-by: zhengruifeng Signed-off-by: zhengruifeng --- .../scala/org/apache/spark/ml/linalg/BLAS.scala| 2 +- .../stat/distribution/MultivariateGaussian.scala | 32 +- .../distribution/MultivariateGaussianSuite.scala | 10 + .../spark/ml/clustering/GaussianMixture.scala | 324 - .../spark/ml/clustering/GaussianMixtureSuite.scala | 11 + python/pyspark/ml/clustering.py| 22 +- 6 files changed, 325 insertions(+), 76 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala index 3d3e7a2..368f177 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala @@ -271,7 +271,7 @@ private[spark] object BLAS extends Serializable { } /** - * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR. 
+ * Adds alpha * v * v.t to a matrix in-place. This is the same as BLAS's ?SPR. * * @param U the upper triangular part of the matrix packed in an array (column major) */ diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala b/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala index 42746b5..a08b8af 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussian.scala @@ -55,7 +55,7 @@ class MultivariateGaussian @Since("2.0.0") ( */ @transient private lazy val tuple = { val (rootSigmaInv, u) = calculateCovarianceConstants -val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv) +val rootSigmaInvMat = Matrices.fromBreeze(rootSigmaInv).toDense val rootSigmaInvMulMu = rootSigmaInvMat.multiply(mean) (rootSigmaInvMat, u, rootSigmaInvMulMu) } @@ -81,6 +81,36 @@ class MultivariateGaussian @Since("2.0.0") ( u - 0.5 * BLAS.dot(v, v) } + private[ml] def pdf(X: Matrix): DenseVector = { +val mat = DenseMatrix.zeros(X.numRows, X.numCols) +pdf(X, mat) + } + + private[ml] def pdf(X: Matrix, mat: DenseMatrix): DenseVector = { +require(!mat.isTransposed) + +BLAS.gemm(1.0, X, rootSigmaInvMat.transpose, 0.0, mat) +val m = mat.numRows +val n = mat.numCols + +val pdfVec = mat.multiply(rootSigmaInvMulMu) + +val blas = BLAS.getBLAS(n) +val squared1 = blas.ddot(n, rootSigmaInvMulMu.values, 1, rootSigmaInvMulMu.values, 1) + +val localU = u +var i = 0 +while (i < m) { + val squared2 = blas.ddot(n, mat.values, i, m, mat.values, i, m) + val dot = pdfVec(i) + val squaredSum = squared1 + squared2 - dot - dot + pdfVec.values(i) = math.exp(localU - 0.5 * squaredSum) + i += 1 +} + +pdfVec + } + /** * Calculate distribution dependent components used for the density function: *pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu)) diff --git 
a/mllib-local/src/test/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussianSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussianSuite.scala index f2ecff1..8652d31 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussianSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/stat/distribution/MultivariateGaussianSuite.scala @@ -27,6 +27,7 @@ class MultivariateGaussianSuite extends SparkMLFunSuite { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) +val mat = Matrices.fromVectors(Seq(x1, x2)) val mu = Vectors.dense(0.0)
[spark] branch branch-3.0 updated: [SPARK-31393][SQL] Show the correct alias in schema for expression
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new cb253b1 [SPARK-31393][SQL] Show the correct alias in schema for expression cb253b1 is described below commit cb253b1c5f9b3e6c71adb5d43d8e82514aba00ef Author: beliefer AuthorDate: Tue May 12 10:25:04 2020 +0900 [SPARK-31393][SQL] Show the correct alias in schema for expression ### What changes were proposed in this pull request? Some expression aliases do not display correctly in the schema. This PR will fix them. - `TimeWindow` - `MaxBy` - `MinBy` - `UnaryMinus` - `BitwiseCount` This PR also fixes a typo issue; please look at https://github.com/apache/spark/blob/b7cde42b04b21c9bfee6535199cf385855c15853/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala#L142 Note: 1. `MaxBy` and `MinBy` extend `MaxMinBy`, and the latter adds a method `funcName` that is not needed. We can reuse `prettyName` to replace `funcName`. 2. Some Spark SQL functions lack an elegant implementation. For example: `BitwiseCount` overrides the sql method as shown below: `override def sql: String = s"bit_count(${child.sql})"` I don't think it's elegant enough, because `Expression` gives the following definition. ``` def sql: String = { val childrenSQL = children.map(_.sql).mkString(", ") s"$prettyName($childrenSQL)" } ``` By this definition, `BitwiseCount` should override the `prettyName` method. ### Why are the changes needed? Improve the implementation of some expressions. ### Does this PR introduce any user-facing change? 'Yes'. This PR will let users see the correct alias in the schema. ### How was this patch tested? Jenkins test. Closes #28164 from beliefer/elegant-pretty-name-for-function. 
Lead-authored-by: beliefer Co-authored-by: gengjiaan Signed-off-by: HyukjinKwon (cherry picked from commit a89006aba03a623960e5c4c6864ca8c899c81db9) Signed-off-by: HyukjinKwon --- .../apache/spark/sql/catalyst/analysis/FunctionRegistry.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/TimeWindow.scala | 1 + .../spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala | 9 + .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala | 7 ++- .../spark/sql/catalyst/expressions/bitwiseExpressions.scala | 4 ++-- .../src/test/resources/sql-functions/sql-expression-schema.md| 8 sql/core/src/test/resources/sql-tests/results/operators.sql.out | 2 +- .../test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index d3b0731..c2c0df5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -253,7 +253,7 @@ object FunctionRegistry { expression[Log2]("log2"), expression[Log]("ln"), expression[Remainder]("mod", true), -expression[UnaryMinus]("negative"), +expression[UnaryMinus]("negative", true), expression[Pi]("pi"), expression[Pmod]("pmod"), expression[UnaryPositive]("positive"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index caacb71..82d6894 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -63,6 +63,7 @@ case class TimeWindow( override def dataType: DataType = new StructType() 
.add(StructField("start", TimestampType)) .add(StructField("end", TimestampType)) + override def prettyName: String = "window" // This expression is replaced in the analyzer. override lazy val resolved = false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala index 2e20224..6d3d3da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala @@ -31,7 +31,6 @@ abstract class MaxMinBy extends DeclarativeAggregate { def valueExpr: Expression def orderingExpr: Expression - protected def funcName:
[spark] branch branch-3.0 updated: [SPARK-31393][SQL] Show the correct alias in schema for expression
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new cb253b1 [SPARK-31393][SQL] Show the correct alias in schema for expression cb253b1 is described below commit cb253b1c5f9b3e6c71adb5d43d8e82514aba00ef Author: beliefer AuthorDate: Tue May 12 10:25:04 2020 +0900 [SPARK-31393][SQL] Show the correct alias in schema for expression ### What changes were proposed in this pull request? Some expression aliases do not display correctly in the schema. This PR will fix them. - `TimeWindow` - `MaxBy` - `MinBy` - `UnaryMinus` - `BitwiseCount` This PR also fixes a typo issue; please look at https://github.com/apache/spark/blob/b7cde42b04b21c9bfee6535199cf385855c15853/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala#L142 Note: 1. `MaxBy` and `MinBy` extend `MaxMinBy`, and the latter adds a method `funcName` that is not needed. We can reuse `prettyName` to replace `funcName`. 2. Some Spark SQL functions lack an elegant implementation. For example: `BitwiseCount` overrides the sql method as shown below: `override def sql: String = s"bit_count(${child.sql})"` I don't think it's elegant enough, because `Expression` gives the following definition. ``` def sql: String = { val childrenSQL = children.map(_.sql).mkString(", ") s"$prettyName($childrenSQL)" } ``` By this definition, `BitwiseCount` should override the `prettyName` method. ### Why are the changes needed? Improve the implementation of some expressions. ### Does this PR introduce any user-facing change? 'Yes'. This PR will let users see the correct alias in the schema. ### How was this patch tested? Jenkins test. Closes #28164 from beliefer/elegant-pretty-name-for-function. 
Lead-authored-by: beliefer Co-authored-by: gengjiaan Signed-off-by: HyukjinKwon (cherry picked from commit a89006aba03a623960e5c4c6864ca8c899c81db9) Signed-off-by: HyukjinKwon --- .../apache/spark/sql/catalyst/analysis/FunctionRegistry.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/TimeWindow.scala | 1 + .../spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala | 9 + .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala | 7 ++- .../spark/sql/catalyst/expressions/bitwiseExpressions.scala | 4 ++-- .../src/test/resources/sql-functions/sql-expression-schema.md| 8 sql/core/src/test/resources/sql-tests/results/operators.sql.out | 2 +- .../test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index d3b0731..c2c0df5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -253,7 +253,7 @@ object FunctionRegistry { expression[Log2]("log2"), expression[Log]("ln"), expression[Remainder]("mod", true), -expression[UnaryMinus]("negative"), +expression[UnaryMinus]("negative", true), expression[Pi]("pi"), expression[Pmod]("pmod"), expression[UnaryPositive]("positive"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index caacb71..82d6894 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -63,6 +63,7 @@ case class TimeWindow( override def dataType: DataType = new StructType() 
.add(StructField("start", TimestampType)) .add(StructField("end", TimestampType)) + override def prettyName: String = "window" // This expression is replaced in the analyzer. override lazy val resolved = false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala index 2e20224..6d3d3da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala @@ -31,7 +31,6 @@ abstract class MaxMinBy extends DeclarativeAggregate { def valueExpr: Expression def orderingExpr: Expression - protected def funcName:
[spark] branch master updated (842b1dc -> a89006a)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 842b1dc [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available add a89006a [SPARK-31393][SQL] Show the correct alias in schema for expression No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/analysis/FunctionRegistry.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/TimeWindow.scala | 1 + .../spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala | 9 + .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala | 7 ++- .../spark/sql/catalyst/expressions/bitwiseExpressions.scala | 4 ++-- .../src/test/resources/sql-functions/sql-expression-schema.md| 8 sql/core/src/test/resources/sql-tests/results/operators.sql.out | 2 +- .../test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (842b1dc -> a89006a)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 842b1dc [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available add a89006a [SPARK-31393][SQL] Show the correct alias in schema for expression No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/analysis/FunctionRegistry.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/TimeWindow.scala | 1 + .../spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala | 9 + .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala | 7 ++- .../spark/sql/catalyst/expressions/bitwiseExpressions.scala | 4 ++-- .../src/test/resources/sql-functions/sql-expression-schema.md| 8 sql/core/src/test/resources/sql-tests/results/operators.sql.out | 2 +- .../test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9469831 [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available 9469831 is described below commit 9469831c3751e898ebe78cb642266b50ea167f22 Author: Jungtaek Lim (HeartSaVioR) AuthorDate: Mon May 11 17:25:41 2020 -0700 [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available ### What changes were proposed in this pull request? This patch re-obtains tokens at the start of AM for yarn cluster mode, if principal and keytab are available. It basically transfers the credentials from the original user, so this patch puts the new tokens into credentials from the original user via overwriting. To obtain tokens from providers in the user application, this patch leverages the user classloader as context classloader while initializing the token manager in the startup of AM. ### Why are the changes needed? The submitter will obtain delegation tokens for yarn-cluster mode, and add these credentials to the launch context. AM will be launched with these credentials, and AM and driver are able to leverage these tokens. In Yarn cluster mode, the driver is launched in AM, which in turn initializes the token manager (while initializing SparkContext) and obtains delegation tokens (+ schedules renewal) if both principal and keytab are available. That said, even if we provide principal and keytab to run an application in yarn-cluster mode, AM always starts with the initial tokens from the launch context until the token manager runs and obtains delegation tokens. So there's a "gap", and if user code (driver) accesses an external system with delegation tokens (e.g. HDFS) before initializing SparkContext, it cannot leverage the tokens the token manager will obtain. 
It will make the application fail if AM is killed "after" the initial tokens are expired and relaunched. This is even a regression: see the code below in branch-2.4: https://github.com/apache/spark/blob/branch-2.4/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala https://github.com/apache/spark/blob/branch-2.4/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala In Spark 2.4.x, AM runs AMCredentialRenewer at initialization, and AMCredentialRenewer obtains tokens and merges them with the credentials provided with the launch context of AM. So it guarantees new tokens in the driver run. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manually tested with a specifically crafted application (simple reproducer) - https://github.com/HeartSaVioR/spark-delegation-token-experiment/blob/master/src/main/scala/net/heartsavior/spark/example/LongRunningAppWithHDFSConfig.scala Before this patch, the new AM attempt failed when I killed AM after the expiration of tokens. After this patch the new AM attempt runs fine. Closes #28336 from HeartSaVioR/SPARK-31559. 
Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Marcelo Vanzin (cherry picked from commit 842b1dcdff0ecab4af9f292c2ff7b2b9ae1ac40a) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 1e8f408..862acd8 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.history.HistoryServer +import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ @@ -860,10 +861,22 @@ object ApplicationMaster extends Logging { val ugi = sparkConf.get(PRINCIPAL) match { // We only need to log in with the keytab in cluster mode. In client mode, the driver // handles the user keytab. - case Some(principal) if amArgs.userClass != null => + case Some(principal) if master.isClusterMode => val originalCreds = UserGroupInformation.getCurrentUser().getCredentials()
[spark] branch master updated (64fb358 -> 842b1dc)
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 64fb358 [SPARK-31671][ML] Wrong error message in VectorAssembler add 842b1dc [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available No new revisions were added by this update. Summary of changes: .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9469831 [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available 9469831 is described below commit 9469831c3751e898ebe78cb642266b50ea167f22 Author: Jungtaek Lim (HeartSaVioR) AuthorDate: Mon May 11 17:25:41 2020 -0700 [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available ### What changes were proposed in this pull request? This patch re-obtains tokens at the start of AM for yarn cluster mode, if principal and keytab are available. It basically transfers the credentials from the original user, so this patch puts the new tokens into credentials from the original user via overwriting. To obtain tokens from providers in the user application, this patch leverages the user classloader as context classloader while initializing the token manager in the startup of AM. ### Why are the changes needed? The submitter will obtain delegation tokens for yarn-cluster mode, and add these credentials to the launch context. AM will be launched with these credentials, and AM and driver are able to leverage these tokens. In Yarn cluster mode, the driver is launched in AM, which in turn initializes the token manager (while initializing SparkContext) and obtains delegation tokens (+ schedules renewal) if both principal and keytab are available. That said, even if we provide principal and keytab to run an application in yarn-cluster mode, AM always starts with the initial tokens from the launch context until the token manager runs and obtains delegation tokens. So there's a "gap", and if user code (driver) accesses an external system with delegation tokens (e.g. HDFS) before initializing SparkContext, it cannot leverage the tokens the token manager will obtain. 
It will make the application fail if AM is killed "after" the initial tokens are expired and relaunched. This is even a regression: see the code below in branch-2.4: https://github.com/apache/spark/blob/branch-2.4/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala https://github.com/apache/spark/blob/branch-2.4/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala In Spark 2.4.x, AM runs AMCredentialRenewer at initialization, and AMCredentialRenewer obtains tokens and merges them with the credentials provided with the launch context of AM. So it guarantees new tokens in the driver run. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manually tested with a specifically crafted application (simple reproducer) - https://github.com/HeartSaVioR/spark-delegation-token-experiment/blob/master/src/main/scala/net/heartsavior/spark/example/LongRunningAppWithHDFSConfig.scala Before this patch, the new AM attempt failed when I killed AM after the expiration of tokens. After this patch the new AM attempt runs fine. Closes #28336 from HeartSaVioR/SPARK-31559. 
Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Marcelo Vanzin (cherry picked from commit 842b1dcdff0ecab4af9f292c2ff7b2b9ae1ac40a) Signed-off-by: Marcelo Vanzin --- .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 1e8f408..862acd8 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.history.HistoryServer +import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ @@ -860,10 +861,22 @@ object ApplicationMaster extends Logging { val ugi = sparkConf.get(PRINCIPAL) match { // We only need to log in with the keytab in cluster mode. In client mode, the driver // handles the user keytab. - case Some(principal) if amArgs.userClass != null => + case Some(principal) if master.isClusterMode => val originalCreds = UserGroupInformation.getCurrentUser().getCredentials()
[spark] branch master updated (64fb358 -> 842b1dc)
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 64fb358 [SPARK-31671][ML] Wrong error message in VectorAssembler add 842b1dc [SPARK-31559][YARN] Re-obtain tokens at the startup of AM for yarn cluster mode if principal and keytab are available No new revisions were added by this update. Summary of changes: .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1f85cd7 [SPARK-31671][ML] Wrong error message in VectorAssembler 1f85cd7 is described below commit 1f85cd7504623b9b4e7957aab5856f72e981cbd9 Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However, the error message accompanying this exception is not consistent. I changed the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduces difficulties when I try to resolve this exception, because I do not know which column requires vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 9192e72..994681a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -228,7 +228,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch branch-2.4 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1f85cd7 [SPARK-31671][ML] Wrong error message in VectorAssembler 1f85cd7 is described below commit 1f85cd7504623b9b4e7957aab5856f72e981cbd9 Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 9192e72..994681a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -228,7 +228,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch branch-3.0 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7e226a2 [SPARK-31671][ML] Wrong error message in VectorAssembler 7e226a2 is described below commit 7e226a25efeaf083c95f04ee0d9c3a6e5b6d763d Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 3070012..7bc5e56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -233,7 +233,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch branch-2.4 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1f85cd7 [SPARK-31671][ML] Wrong error message in VectorAssembler 1f85cd7 is described below commit 1f85cd7504623b9b4e7957aab5856f72e981cbd9 Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 9192e72..994681a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -228,7 +228,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch branch-3.0 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7e226a2 [SPARK-31671][ML] Wrong error message in VectorAssembler 7e226a2 is described below commit 7e226a25efeaf083c95f04ee0d9c3a6e5b6d763d Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 3070012..7bc5e56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -233,7 +233,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch master updated (d7c3e9e -> 64fb358)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from d7c3e9e [SPARK-31456][CORE] Fix shutdown hook priority edge cases add 64fb358 [SPARK-31671][ML] Wrong error message in VectorAssembler No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1f85cd7 [SPARK-31671][ML] Wrong error message in VectorAssembler 1f85cd7 is described below commit 1f85cd7504623b9b4e7957aab5856f72e981cbd9 Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 9192e72..994681a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -228,7 +228,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch branch-3.0 updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7e226a2 [SPARK-31671][ML] Wrong error message in VectorAssembler 7e226a2 is described below commit 7e226a25efeaf083c95f04ee0d9c3a6e5b6d763d Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen (cherry picked from commit 64fb358a994d3fff651a742fa067c194b7455853) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 3070012..7bc5e56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -233,7 +233,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail:
[spark] branch master updated (d7c3e9e -> 64fb358)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from d7c3e9e [SPARK-31456][CORE] Fix shutdown hook priority edge cases add 64fb358 [SPARK-31671][ML] Wrong error message in VectorAssembler No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-31671][ML] Wrong error message in VectorAssembler
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 64fb358 [SPARK-31671][ML] Wrong error message in VectorAssembler 64fb358 is described below commit 64fb358a994d3fff651a742fa067c194b7455853 Author: fan31415 AuthorDate: Mon May 11 18:23:23 2020 -0500 [SPARK-31671][ML] Wrong error message in VectorAssembler ### What changes were proposed in this pull request? When input column lengths can not be inferred and handleInvalid = "keep", VectorAssembler will throw a runtime exception. However the error message with this exception is not consistent. I change the content of this error message to make it work properly. ### Why are the changes needed? This is a bug. Here is a simple example to reproduce it. ``` // create a df without vector size val df = Seq( (Vectors.dense(1.0), Vectors.dense(2.0)) ).toDF("n1", "n2") // only set vector size hint for n1 column val hintedDf = new VectorSizeHint() .setInputCol("n1") .setSize(1) .transform(df) // assemble n1, n2 val output = new VectorAssembler() .setInputCols(Array("n1", "n2")) .setOutputCol("features") .setHandleInvalid("keep") .transform(hintedDf) // because only n1 has vector size, the error message should tell us to set vector size for n2 too output.show() ``` Expected error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n2]. ``` Actual error message: ``` Can not infer column lengths with handleInvalid = "keep". Consider using VectorSizeHint to add metadata for columns: [n1, n2]. ``` This introduce difficulties when I try to resolve this exception, for I do not know which column required vectorSizeHint. This is especially troublesome when you have a large number of columns to deal with. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Add test in VectorAssemblerSuite. Closes #28487 from fan31415/SPARK-31671. Lead-authored-by: fan31415 Co-authored-by: yijiefan Signed-off-by: Sean Owen --- .../scala/org/apache/spark/ml/feature/VectorAssembler.scala | 2 +- .../org/apache/spark/ml/feature/VectorAssemblerSuite.scala| 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 3070012..7bc5e56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -233,7 +233,7 @@ object VectorAssembler extends DefaultParamsReadable[VectorAssembler] { getVectorLengthsFromFirstRow(dataset.na.drop(missingColumns), missingColumns) case (true, VectorAssembler.KEEP_INVALID) => throw new RuntimeException( s"""Can not infer column lengths with handleInvalid = "keep". 
Consider using VectorSizeHint - |to add metadata for columns: ${columns.mkString("[", ", ", "]")}.""" + |to add metadata for columns: ${missingColumns.mkString("[", ", ", "]")}.""" .stripMargin.replaceAll("\n", " ")) case (_, _) => Map.empty } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index a4d388f..4957f6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -261,4 +261,15 @@ class VectorAssemblerSuite val output = vectorAssembler.transform(dfWithNullsAndNaNs) assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) } + + test("SPARK-31671: should give explicit error message when can not infer column lengths") { +val df = Seq( + (Vectors.dense(1.0), Vectors.dense(2.0)) +).toDF("n1", "n2") +val hintedDf = new VectorSizeHint().setInputCol("n1").setSize(1).transform(df) +val assembler = new VectorAssembler() + .setInputCols(Array("n1", "n2")).setOutputCol("features") + assert(!intercept[RuntimeException](assembler.setHandleInvalid("keep").transform(hintedDf)) + .getMessage.contains("n1"), "should only show no vector size columns' name") + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new b28c1fb [SPARK-31456][CORE] Fix shutdown hook priority edge cases b28c1fb is described below commit b28c1fb99f1c0d6252321e4052f47169f43abc75 Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index b702838..859da72 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8b959ab..00f96b9 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -702,10 +702,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new b28c1fb [SPARK-31456][CORE] Fix shutdown hook priority edge cases b28c1fb is described below commit b28c1fb99f1c0d6252321e4052f47169f43abc75 Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index b702838..859da72 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8b959ab..00f96b9 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -702,10 +702,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 37c352c [SPARK-31456][CORE] Fix shutdown hook priority edge cases 37c352c is described below commit 37c352c592a9b8650dc5d1c5413af7a96f01631b Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index 4f13112..4db26860 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f5e438b..931eb6b 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -745,10 +745,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-2.4 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-2.4 by this push: new b28c1fb [SPARK-31456][CORE] Fix shutdown hook priority edge cases b28c1fb is described below commit b28c1fb99f1c0d6252321e4052f47169f43abc75 Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index b702838..859da72 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8b959ab..00f96b9 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -702,10 +702,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 37c352c [SPARK-31456][CORE] Fix shutdown hook priority edge cases 37c352c is described below commit 37c352c592a9b8650dc5d1c5413af7a96f01631b Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index 4f13112..4db26860 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f5e438b..931eb6b 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -745,10 +745,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (5a5af46 -> d7c3e9e)
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 5a5af46 [SPARK-31575][SQL] Synchronise global JVM security configuration modification add d7c3e9e [SPARK-31456][CORE] Fix shutdown hook priority edge cases No new revisions were added by this update. Summary of changes: core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31456][CORE] Fix shutdown hook priority edge cases
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 37c352c [SPARK-31456][CORE] Fix shutdown hook priority edge cases 37c352c is described below commit 37c352c592a9b8650dc5d1c5413af7a96f01631b Author: oleg AuthorDate: Mon May 11 13:10:39 2020 -0700 [SPARK-31456][CORE] Fix shutdown hook priority edge cases ### What changes were proposed in this pull request? Fix application order for shutdown hooks for the priorities of Int.MaxValue, Int.MinValue ### Why are the changes needed? The bug causes out-of-order execution of shutdown hooks if their priorities were Int.MinValue or Int.MaxValue ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a test covering the change. Closes #28494 from oleg-smith/SPARK-31456_shutdown_hook_priority. Authored-by: oleg Signed-off-by: Dongjoon Hyun (cherry picked from commit d7c3e9e53e01011f809b6cb145349ee8a9c5e5f0) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index 4f13112..4db26860 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { -other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = 
other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f5e438b..931eb6b 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -745,10 +745,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MinValue, () => output += Int.MinValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) +manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() -assert(output.toList === List(4, 3, 2)) +assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (5a5af46 -> d7c3e9e)
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 5a5af46 [SPARK-31575][SQL] Synchronise global JVM security configuration modification add d7c3e9e [SPARK-31456][CORE] Fix shutdown hook priority edge cases No new revisions were added by this update. Summary of changes: core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala | 4 +--- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 6 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7b567e4 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps 7b567e4 is described below commit 7b567e4f1414e6a3def73f0d3b2f8b058ae58d43 Author: Max Gekk AuthorDate: Mon May 11 12:59:41 2020 + [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps ### What changes were proposed in this pull request? Modified `RandomDataGenerator.forType` for DateType and TimestampType to generate special date//timestamp values with 0.5 probability. This will trigger dictionary encoding in Parquet datasource test HadoopFsRelationTest "test all data types". Currently, dictionary encoding is tested only for numeric types like ShortType. ### Why are the changes needed? To extend test coverage. Currently, probability of testing of dictionary encoding in the test HadoopFsRelationTest "test all data types" for DateType and TimestampType is close to zero because dates/timestamps are uniformly distributed in wide range, and the chance of generating the same values is pretty low. In this way, parquet datasource cannot apply dictionary encoding for such column types. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `ParquetHadoopFsRelationSuite` and `JsonHadoopFsRelationSuite`. Closes #28481 from MaxGekk/test-random-parquet-dict-enc. 
Authored-by: Max Gekk Signed-off-by: Wenchen Fan (cherry picked from commit 32a5398b659695c338cd002d9094bdf19a89a716) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/RandomDataGenerator.scala | 100 - .../spark/sql/sources/HadoopFsRelationTest.scala | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 5a4d23d..cf8d772 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -162,52 +162,66 @@ object RandomDataGenerator { }) case BooleanType => Some(() => rand.nextBoolean()) case DateType => -val generator = - () => { -var milliseconds = rand.nextLong() % 25340232959L -// -6213574080L is the number of milliseconds before January 1, 1970, 00:00:00 GMT -// for "0001-01-01 00:00:00.00". We need to find a -// number that is greater or equals to this number as a valid timestamp value. -while (milliseconds < -6213574080L) { - // 25340232959L is the number of milliseconds since - // January 1, 1970, 00:00:00 GMT for "-12-31 23:59:59.99". - milliseconds = rand.nextLong() % 25340232959L -} -val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) -// The generated `date` is based on the hybrid calendar Julian + Gregorian since -// 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used -// by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to -// a local date in Proleptic Gregorian calendar to satisfy this requirement. -// Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar. -// As the consequence of that, 29 February of such years might not exist in Proleptic -// Gregorian calendar. When this happens, we shift the date by one day. 
-Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) +def uniformDateRand(rand: Random): java.sql.Date = { + var milliseconds = rand.nextLong() % 25340232959L + // -6213574080L is the number of milliseconds before January 1, 1970, 00:00:00 GMT + // for "0001-01-01 00:00:00.00". We need to find a + // number that is greater or equals to this number as a valid timestamp value. + while (milliseconds < -6213574080L) { +// 25340232959L is the number of milliseconds since +// January 1, 1970, 00:00:00 GMT for "-12-31 23:59:59.99". +milliseconds = rand.nextLong() % 25340232959L } -Some(generator) + val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) + // The generated `date` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic
[spark] branch master updated (32a5398 -> 7a670b5)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 32a5398 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps add 7a670b5 [SPARK-31667][ML][PYSPARK] Python side flatten the result dataframe of ANOVATest/ChisqTest/FValueTest No new revisions were added by this update. Summary of changes: python/pyspark/ml/stat.py | 60 +-- 1 file changed, 48 insertions(+), 12 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (32a5398 -> 7a670b5)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 32a5398 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps add 7a670b5 [SPARK-31667][ML][PYSPARK] Python side flatten the result dataframe of ANOVATest/ChisqTest/FValueTest No new revisions were added by this update. Summary of changes: python/pyspark/ml/stat.py | 60 +-- 1 file changed, 48 insertions(+), 12 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (7a670b5 -> 5a5af46)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7a670b5 [SPARK-31667][ML][PYSPARK] Python side flatten the result dataframe of ANOVATest/ChisqTest/FValueTest add 5a5af46 [SPARK-31575][SQL] Synchronise global JVM security configuration modification No new revisions were added by this update. Summary of changes: .../jdbc/connection/DB2ConnectionProvider.scala| 2 +- .../connection/MariaDBConnectionProvider.scala | 2 +- .../connection/PostgresConnectionProvider.scala| 2 +- .../jdbc/connection/SecureConnectionProvider.scala | 9 - .../jdbc/connection/ConnectionProviderSuite.scala | 45 ++ 5 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/ConnectionProviderSuite.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (7a670b5 -> 5a5af46)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7a670b5 [SPARK-31667][ML][PYSPARK] Python side flatten the result dataframe of ANOVATest/ChisqTest/FValueTest add 5a5af46 [SPARK-31575][SQL] Synchronise global JVM security configuration modification No new revisions were added by this update. Summary of changes: .../jdbc/connection/DB2ConnectionProvider.scala| 2 +- .../connection/MariaDBConnectionProvider.scala | 2 +- .../connection/PostgresConnectionProvider.scala| 2 +- .../jdbc/connection/SecureConnectionProvider.scala | 9 - .../jdbc/connection/ConnectionProviderSuite.scala | 45 ++ 5 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/ConnectionProviderSuite.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (b80309b -> 32a5398)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from b80309b [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental add 32a5398 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/RandomDataGenerator.scala | 100 - .../spark/sql/sources/HadoopFsRelationTest.scala | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (b80309b -> 32a5398)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from b80309b [SPARK-31674][CORE][DOCS] Make Prometheus metric endpoints experimental add 32a5398 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/RandomDataGenerator.scala | 100 - .../spark/sql/sources/HadoopFsRelationTest.scala | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7b567e4 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps 7b567e4 is described below commit 7b567e4f1414e6a3def73f0d3b2f8b058ae58d43 Author: Max Gekk AuthorDate: Mon May 11 12:59:41 2020 + [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps ### What changes were proposed in this pull request? Modified `RandomDataGenerator.forType` for DateType and TimestampType to generate special date//timestamp values with 0.5 probability. This will trigger dictionary encoding in Parquet datasource test HadoopFsRelationTest "test all data types". Currently, dictionary encoding is tested only for numeric types like ShortType. ### Why are the changes needed? To extend test coverage. Currently, probability of testing of dictionary encoding in the test HadoopFsRelationTest "test all data types" for DateType and TimestampType is close to zero because dates/timestamps are uniformly distributed in wide range, and the chance of generating the same values is pretty low. In this way, parquet datasource cannot apply dictionary encoding for such column types. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `ParquetHadoopFsRelationSuite` and `JsonHadoopFsRelationSuite`. Closes #28481 from MaxGekk/test-random-parquet-dict-enc. 
Authored-by: Max Gekk Signed-off-by: Wenchen Fan (cherry picked from commit 32a5398b659695c338cd002d9094bdf19a89a716) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/RandomDataGenerator.scala | 100 - .../spark/sql/sources/HadoopFsRelationTest.scala | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 5a4d23d..cf8d772 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -162,52 +162,66 @@ object RandomDataGenerator { }) case BooleanType => Some(() => rand.nextBoolean()) case DateType => -val generator = - () => { -var milliseconds = rand.nextLong() % 25340232959L -// -6213574080L is the number of milliseconds before January 1, 1970, 00:00:00 GMT -// for "0001-01-01 00:00:00.00". We need to find a -// number that is greater or equals to this number as a valid timestamp value. -while (milliseconds < -6213574080L) { - // 25340232959L is the number of milliseconds since - // January 1, 1970, 00:00:00 GMT for "-12-31 23:59:59.99". - milliseconds = rand.nextLong() % 25340232959L -} -val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) -// The generated `date` is based on the hybrid calendar Julian + Gregorian since -// 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used -// by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to -// a local date in Proleptic Gregorian calendar to satisfy this requirement. -// Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar. -// As the consequence of that, 29 February of such years might not exist in Proleptic -// Gregorian calendar. When this happens, we shift the date by one day. 
-Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) +def uniformDateRand(rand: Random): java.sql.Date = { + var milliseconds = rand.nextLong() % 253402329599999L + // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT + // for "0001-01-01 00:00:00.000000". We need to find a + // number that is greater or equals to this number as a valid timestamp value. + while (milliseconds < -62135740800000L) { +// 253402329599999L is the number of milliseconds since +// January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999". +milliseconds = rand.nextLong() % 253402329599999L } -Some(generator) + val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) + // The generated `date` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic
[spark] branch branch-3.0 updated: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 7b567e4 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps 7b567e4 is described below commit 7b567e4f1414e6a3def73f0d3b2f8b058ae58d43 Author: Max Gekk AuthorDate: Mon May 11 12:59:41 2020 +0000 [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps ### What changes were proposed in this pull request? Modified `RandomDataGenerator.forType` for DateType and TimestampType to generate special date/timestamp values with 0.5 probability. This will trigger dictionary encoding in Parquet datasource test HadoopFsRelationTest "test all data types". Currently, dictionary encoding is tested only for numeric types like ShortType. ### Why are the changes needed? To extend test coverage. Currently, probability of testing of dictionary encoding in the test HadoopFsRelationTest "test all data types" for DateType and TimestampType is close to zero because dates/timestamps are uniformly distributed in wide range, and the chance of generating the same values is pretty low. In this way, parquet datasource cannot apply dictionary encoding for such column types. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `ParquetHadoopFsRelationSuite` and `JsonHadoopFsRelationSuite`. Closes #28481 from MaxGekk/test-random-parquet-dict-enc. 
Authored-by: Max Gekk Signed-off-by: Wenchen Fan (cherry picked from commit 32a5398b659695c338cd002d9094bdf19a89a716) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/RandomDataGenerator.scala | 100 - .../spark/sql/sources/HadoopFsRelationTest.scala | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 5a4d23d..cf8d772 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -162,52 +162,66 @@ object RandomDataGenerator { }) case BooleanType => Some(() => rand.nextBoolean()) case DateType => -val generator = - () => { -var milliseconds = rand.nextLong() % 253402329599999L -// -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT -// for "0001-01-01 00:00:00.000000". We need to find a -// number that is greater or equals to this number as a valid timestamp value. -while (milliseconds < -62135740800000L) { - // 253402329599999L is the number of milliseconds since - // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999". - milliseconds = rand.nextLong() % 253402329599999L -} -val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) -// The generated `date` is based on the hybrid calendar Julian + Gregorian since -// 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used -// by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to -// a local date in Proleptic Gregorian calendar to satisfy this requirement. -// Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar. -// As the consequence of that, 29 February of such years might not exist in Proleptic -// Gregorian calendar. When this happens, we shift the date by one day. 
-Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) +def uniformDateRand(rand: Random): java.sql.Date = { + var milliseconds = rand.nextLong() % 253402329599999L + // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT + // for "0001-01-01 00:00:00.000000". We need to find a + // number that is greater or equals to this number as a valid timestamp value. + while (milliseconds < -62135740800000L) { +// 253402329599999L is the number of milliseconds since +// January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999". +milliseconds = rand.nextLong() % 253402329599999L } -Some(generator) + val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt) + // The generated `date` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic