[spark] branch master updated: [SPARK-39052][SQL] Support Literal.create(Char, StringType)

2022-04-27 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new abc2dc03fc8 [SPARK-39052][SQL] Support Literal.create(Char, StringType)
abc2dc03fc8 is described below

commit abc2dc03fc8f910ab95054205cdea4e3cb25801f
Author: Hyukjin Kwon 
AuthorDate: Thu Apr 28 07:53:50 2022 +0300

[SPARK-39052][SQL] Support Literal.create(Char, StringType)

### What changes were proposed in this pull request?

This is sort of a followup of https://github.com/apache/spark/commit/54fcaafb094e299f21c18370fddb4a727c88d875. `Literal.create` should support `Char` as well.
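
For illustration, a minimal sketch (not part of the patch) of what this enables:

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType

// Literal.apply already maps a Scala Char to a string literal (see the
// referenced commit); after this change Literal.create accepts a Char too.
val fromApply  = Literal('a')                      // string literal "a"
val fromCreate = Literal.create('a', StringType)   // now also "a"
```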

### Why are the changes needed?

To make the support of the external type `Char` the same as in `Literal.apply`.

### Does this PR introduce _any_ user-facing change?

No, this isn't exposed to users. `Literal.create(Char, StringType)` isn't currently used internally in the codebase either. This PR is just for completeness.

### How was this patch tested?

Unit tests were added.

Closes #36389 from HyukjinKwon/SPARK-39052.

Authored-by: Hyukjin Kwon 
Signed-off-by: Max Gekk 
---
 .../scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala  | 1 +
 .../org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala   | 1 +
 .../spark/sql/catalyst/expressions/LiteralExpressionSuite.scala   | 4 
 3 files changed, 6 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 3e6d31e79b7..263d3734217 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -499,6 +499,7 @@ object CatalystTypeConverters {
*/
   def convertToCatalyst(a: Any): Any = a match {
 case s: String => StringConverter.toCatalyst(s)
+case c: Char => StringConverter.toCatalyst(c.toString)
 case d: Date => DateConverter.toCatalyst(d)
 case ld: LocalDate => LocalDateConverter.toCatalyst(ld)
 case t: Timestamp => TimestampConverter.toCatalyst(t)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
index b559e219882..bf194a2288b 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala
@@ -152,6 +152,7 @@ class CatalystTypeConvertersSuite extends SparkFunSuite 
with SQLHelper {
 val converter = 
CatalystTypeConverters.createToCatalystConverter(StringType)
 val expected = UTF8String.fromString("X")
 assert(converter(chr) === expected)
+assert(CatalystTypeConverters.convertToCatalyst('a') === 
UTF8String.fromString("a"))
   }
 
   test("SPARK-33390: Make Literal support char array") {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
index 6ce51f1eec8..80e7a3206aa 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -247,6 +247,10 @@ class LiteralExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
 // scalastyle:on
   }
 
+  test("SPARK-39052: Support Char in Literal.create") {
+checkEvaluation(Literal.create('a', StringType), "a")
+  }
+
   test("construct literals from java.time.LocalDate") {
 Seq(
   LocalDate.of(1, 1, 1),





[spark] branch master updated: [SPARK-39047][SQL] Replace the error class ILLEGAL_SUBSTRING by INVALID_PARAMETER_VALUE

2022-04-27 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9dcc24c36f6 [SPARK-39047][SQL] Replace the error class 
ILLEGAL_SUBSTRING by INVALID_PARAMETER_VALUE
9dcc24c36f6 is described below

commit 9dcc24c36f6fcdf43bf66fe50415be575f7b2918
Author: Max Gekk 
AuthorDate: Thu Apr 28 07:46:44 2022 +0300

[SPARK-39047][SQL] Replace the error class ILLEGAL_SUBSTRING by 
INVALID_PARAMETER_VALUE

### What changes were proposed in this pull request?
In the PR, I propose to remove the `ILLEGAL_SUBSTRING` error class and use `INVALID_PARAMETER_VALUE` when the `strfmt` parameter of the `format_string()` function contains `%0$`. That argument index is handled differently across JDKs: _"... Java 8 and Java 11 uses it as "%1$", and Java 17 throws IllegalFormatArgumentIndexException(Illegal format argument index = 0)"_.
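
A small Scala sketch (assumed behavior, not from the patch) of why `%0$` is problematic:

```scala
// java.util.Formatter treats argument index 0 differently across JDK versions.
val out = String.format("%0$s and %1$s", "a", "b")
// JDK 8/11: %0$ behaves like %1$ (per the quote above) -> "a and a"
// JDK 17:   throws IllegalFormatArgumentIndexException: Illegal format argument index = 0
```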

### Why are the changes needed?
To improve code maintenance and user experience with Spark SQL by reducing 
the number of user-facing error classes.

### Does this PR introduce _any_ user-facing change?
Yes, it changes the user-facing error message.

Before:
```sql
spark-sql> select format_string('%0$s', 'Hello');
Error in query: [ILLEGAL_SUBSTRING] The argument_index of string format 
cannot contain position 0$.; line 1 pos 7
```

After:
```sql
spark-sql> select format_string('%0$s', 'Hello');
Error in query: [INVALID_PARAMETER_VALUE] The value of parameter(s) 
'strfmt' in `format_string` is invalid: expects %1$, %2$ and so on, but got 
%0$.; line 1 pos 7
```

### How was this patch tested?
By running the affected test suites:
```
$ build/sbt "test:testOnly *SparkThrowableSuite"
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z 
text.sql"
$ build/sbt "test:testOnly *QueryCompilationErrorsSuite"
```

Closes #36380 from MaxGekk/error-class-ILLEGAL_SUBSTRING.

Authored-by: Max Gekk 
Signed-off-by: Max Gekk 
---
 core/src/main/resources/error/error-classes.json   | 3 ---
 .../apache/spark/sql/catalyst/expressions/stringExpressions.scala  | 3 +--
 .../scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala | 7 ---
 .../src/test/resources/sql-tests/results/postgreSQL/text.sql.out   | 2 +-
 .../org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala  | 7 ---
 5 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json 
b/core/src/main/resources/error/error-classes.json
index 673866e6c35..4738599685b 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -71,9 +71,6 @@
   "GROUPING_SIZE_LIMIT_EXCEEDED" : {
 "message" : [ "Grouping sets size cannot be greater than " ]
   },
-  "ILLEGAL_SUBSTRING" : {
-"message" : [ " cannot contain ." ]
-  },
   "INCOMPARABLE_PIVOT_COLUMN" : {
 "message" : [ "Invalid pivot column ''. Pivot columns must be 
comparable." ],
 "sqlState" : "42000"
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 976caeb3502..9089ff46637 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1898,8 +1898,7 @@ case class FormatString(children: Expression*) extends 
Expression with ImplicitC
*/
   private def checkArgumentIndexNotZero(expression: Expression): Unit = 
expression match {
 case StringLiteral(pattern) if pattern.contains("%0$") =>
-  throw QueryCompilationErrors.illegalSubstringError(
-"The argument_index of string format", "position 0$")
+  throw QueryCompilationErrors.zeroArgumentIndexError()
 case _ => // do nothing
   }
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 7f212ed5891..3d379fb4f71 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -66,10 +66,11 @@ object QueryCompilationErrors extends QueryErrorsBase {
   messageParameters = Array(sizeLimit.toString))
   }
 
-  def illegalSubstringError(subject: String, illegalContent: String): 
Throwable = {
+  def zeroArgumentIndexError(): Throwable = {
 new AnalysisException(
-  errorClass = "ILLEGAL_SUBSTRING",
-  messageParameters = Array(subject, illegalContent))
+  er

[GitHub] [spark-website] yaooqinn commented on pull request #385: Fix Apache Project Website Checks - Events

2022-04-27 Thread GitBox


yaooqinn commented on PR #385:
URL: https://github.com/apache/spark-website/pull/385#issuecomment-717872

   cc @srowen @dongjoon-hyun @HyukjinKwon 





[GitHub] [spark-website] yaooqinn opened a new pull request, #385: Fix Apache Project Website Checks - Events

2022-04-27 Thread GitBox


yaooqinn opened a new pull request, #385:
URL: https://github.com/apache/spark-website/pull/385

   
   
   
   https://whimsy.apache.org/site/project/spark in red light 





[spark] branch branch-3.3 updated: [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set

2022-04-27 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 606a99f4f2d [SPARK-39046][SQL] Return an empty context string if 
TreeNode.origin is wrongly set
606a99f4f2d is described below

commit 606a99f4f2d91ea30c81285d6c95ee566e80577f
Author: Gengliang Wang 
AuthorDate: Thu Apr 28 09:59:17 2022 +0800

[SPARK-39046][SQL] Return an empty context string if TreeNode.origin is 
wrongly set

### What changes were proposed in this pull request?

For the query context `TreeNode.origin.context`, this PR proposes to return an empty context string (see the sketch after this list) if
* the query text, the start index, or the stop index is missing
* the start index is less than 0
* the stop index is larger than the length of the query text
* the start index is larger than the stop index
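
A minimal sketch of the new behavior (assuming `Origin`'s remaining fields default to `None`):

```scala
import org.apache.spark.sql.catalyst.trees.Origin

// An out-of-range stopIndex now yields an empty context instead of an
// exception from String.substring.
val origin = Origin(
  sqlText = Some("SELECT 1"),
  startIndex = Some(0),
  stopIndex = Some(100))   // beyond sqlText length -> treated as invalid
assert(origin.context == "")
```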

### Why are the changes needed?

There are downstream projects that depend on Spark. There is no guarantee 
for the correctness of TreeNode.origin. Developers may create a plan/expression 
with an Origin containing a wrong startIndex/stopIndex/sqlText.
Thus, to avoid errors in calling `String.substring` or showing misleading 
debug information, I suggest returning an empty context string if 
TreeNode.origin is wrongly set. The query context is just for better error 
messages and we should handle it cautiously.

### Does this PR introduce _any_ user-facing change?

No, the context framework is not released yet.

### How was this patch tested?

UT

Closes #36379 from gengliangwang/safeContext.

Authored-by: Gengliang Wang 
Signed-off-by: Gengliang Wang 
(cherry picked from commit 7fe2759e9f81ec267e92e1c6f8a48f42042db791)
Signed-off-by: Gengliang Wang 
---
 .../apache/spark/sql/catalyst/trees/TreeNode.scala | 126 +++--
 .../expressions/ArithmeticExpressionSuite.scala|   6 +
 .../spark/sql/catalyst/trees/TreeNodeSuite.scala   |  37 ++
 3 files changed, 110 insertions(+), 59 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 00690abf18f..079abd3f2e0 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -72,71 +72,79 @@ case class Origin(
* SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i
*  ^^^
*/
-  lazy val context: String = sqlText.map { text =>
-val positionContext = if (line.isDefined && startPosition.isDefined) {
-  s"(line ${line.get}, position ${startPosition.get})"
-} else {
+  lazy val context: String = {
+// If the query context is missing or incorrect, simply return an empty 
string.
+if (sqlText.isEmpty || startIndex.isEmpty || stopIndex.isEmpty ||
+  startIndex.get < 0 || stopIndex.get >= sqlText.get.length || 
startIndex.get > stopIndex.get) {
   ""
-}
-val objectContext = if (objectType.isDefined && objectName.isDefined) {
-  s" of ${objectType.get} ${objectName.get}"
 } else {
-  ""
-}
-val builder = new StringBuilder
-builder ++= s"\n== SQL$objectContext$positionContext ==\n"
-
-val start = startIndex.getOrElse(0)
-val stop = stopIndex.getOrElse(sqlText.get.length - 1)
-// Ideally we should show all the lines which contains the SQL text 
context of the current node:
-// [additional text] [current tree node] [additional text]
-// However, we need to truncate the additional text in case it is too 
long. The following
-// variable is to define the max length of additional text.
-val maxExtraContextLength = 32
-val truncatedText = "..."
-var lineStartIndex = start
-// Collect the SQL text within the starting line of current Node.
-// The text is truncated if it is too long.
-while(lineStartIndex >= 0 &&
-  start - lineStartIndex <= maxExtraContextLength &&
-  text.charAt(lineStartIndex) != '\n') {
-  lineStartIndex -= 1
-}
-val startTruncated = start - lineStartIndex > maxExtraContextLength
-var currentIndex = lineStartIndex
-if (startTruncated) {
-  currentIndex -= truncatedText.length
-}
+  val positionContext = if (line.isDefined && startPosition.isDefined) {
+s"(line ${line.get}, position ${startPosition.get})"
+  } else {
+""
+  }
+  val objectContext = if (objectType.isDefined && objectName.isDefined) {
+s" of ${objectType.get} ${objectName.get}"
+  } else {
+""
+  }
+  val builder = new StringBuilder
+  builder ++= s"\n== SQL$objectContext$positionContext ==\n"
+
+  val text = sqlText.get

[spark] branch master updated: [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set

2022-04-27 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 7fe2759e9f8 [SPARK-39046][SQL] Return an empty context string if 
TreeNode.origin is wrongly set
7fe2759e9f8 is described below

commit 7fe2759e9f81ec267e92e1c6f8a48f42042db791
Author: Gengliang Wang 
AuthorDate: Thu Apr 28 09:59:17 2022 +0800

[SPARK-39046][SQL] Return an empty context string if TreeNode.origin is 
wrongly set

### What changes were proposed in this pull request?

For the query context `TreeNode.origin.context`, this PR proposes to return an empty context string if
* the query text, the start index, or the stop index is missing
* the start index is less than 0
* the stop index is larger than the length of the query text
* the start index is larger than the stop index

### Why are the changes needed?

There are downstream projects that depend on Spark. There is no guarantee 
for the correctness of TreeNode.origin. Developers may create a plan/expression 
with an Origin containing a wrong startIndex/stopIndex/sqlText.
Thus, to avoid errors in calling `String.substring` or showing misleading 
debug information, I suggest returning an empty context string if 
TreeNode.origin is wrongly set. The query context is just for better error 
messages and we should handle it cautiously.

### Does this PR introduce _any_ user-facing change?

No, the context framework is not released yet.

### How was this patch tested?

UT

Closes #36379 from gengliangwang/safeContext.

Authored-by: Gengliang Wang 
Signed-off-by: Gengliang Wang 
---
 .../apache/spark/sql/catalyst/trees/TreeNode.scala | 126 +++--
 .../expressions/ArithmeticExpressionSuite.scala|   6 +
 .../spark/sql/catalyst/trees/TreeNodeSuite.scala   |  37 ++
 3 files changed, 110 insertions(+), 59 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index eed59b9e1bf..0714898e19d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -72,71 +72,79 @@ case class Origin(
* SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i
*  ^^^
*/
-  lazy val context: String = sqlText.map { text =>
-val positionContext = if (line.isDefined && startPosition.isDefined) {
-  s"(line ${line.get}, position ${startPosition.get})"
-} else {
+  lazy val context: String = {
+// If the query context is missing or incorrect, simply return an empty 
string.
+if (sqlText.isEmpty || startIndex.isEmpty || stopIndex.isEmpty ||
+  startIndex.get < 0 || stopIndex.get >= sqlText.get.length || 
startIndex.get > stopIndex.get) {
   ""
-}
-val objectContext = if (objectType.isDefined && objectName.isDefined) {
-  s" of ${objectType.get} ${objectName.get}"
 } else {
-  ""
-}
-val builder = new StringBuilder
-builder ++= s"\n== SQL$objectContext$positionContext ==\n"
-
-val start = startIndex.getOrElse(0)
-val stop = stopIndex.getOrElse(sqlText.get.length - 1)
-// Ideally we should show all the lines which contains the SQL text 
context of the current node:
-// [additional text] [current tree node] [additional text]
-// However, we need to truncate the additional text in case it is too 
long. The following
-// variable is to define the max length of additional text.
-val maxExtraContextLength = 32
-val truncatedText = "..."
-var lineStartIndex = start
-// Collect the SQL text within the starting line of current Node.
-// The text is truncated if it is too long.
-while(lineStartIndex >= 0 &&
-  start - lineStartIndex <= maxExtraContextLength &&
-  text.charAt(lineStartIndex) != '\n') {
-  lineStartIndex -= 1
-}
-val startTruncated = start - lineStartIndex > maxExtraContextLength
-var currentIndex = lineStartIndex
-if (startTruncated) {
-  currentIndex -= truncatedText.length
-}
+  val positionContext = if (line.isDefined && startPosition.isDefined) {
+s"(line ${line.get}, position ${startPosition.get})"
+  } else {
+""
+  }
+  val objectContext = if (objectType.isDefined && objectName.isDefined) {
+s" of ${objectType.get} ${objectName.get}"
+  } else {
+""
+  }
+  val builder = new StringBuilder
+  builder ++= s"\n== SQL$objectContext$positionContext ==\n"
+
+  val text = sqlText.get
+  val start = math.max(startIndex.get, 0)
+  val stop = math.min(stopIndex.getOrElse(text.length - 1), text

[spark] branch branch-3.3 updated: [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 96d66b030d9 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of 
`DataFrame.info`
96d66b030d9 is described below

commit 96d66b030d914ccd7ded74e33287e45d09935e27
Author: Xinrong Meng 
AuthorDate: Thu Apr 28 09:25:37 2022 +0900

[SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`

### What changes were proposed in this pull request?
Suppress PerformanceWarnings of DataFrame.info

### Why are the changes needed?
To improve usability.

### Does this PR introduce _any_ user-facing change?
No. Only PerformanceWarnings of DataFrame.info are suppressed.

### How was this patch tested?
Manual tests.

Closes #36367 from xinrong-databricks/frame.info.

Authored-by: Xinrong Meng 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 594337fad131280f62107326062fb554f0566d43)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/pandas/conversion.py | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/sql/pandas/conversion.py 
b/python/pyspark/sql/pandas/conversion.py
index 808444f1e2e..fff0bac5480 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -15,9 +15,9 @@
 # limitations under the License.
 #
 import sys
-import warnings
 from collections import Counter
 from typing import List, Optional, Type, Union, no_type_check, overload, 
TYPE_CHECKING
+from warnings import catch_warnings, simplefilter, warn
 
 from pyspark.rdd import _load_from_socket
 from pyspark.sql.pandas.serializers import ArrowCollectSerializer
@@ -111,7 +111,7 @@ class PandasConversionMixin:
 "'spark.sql.execution.arrow.pyspark.fallback.enabled' 
is set to "
 "true." % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 use_arrow = False
 else:
 msg = (
@@ -121,7 +121,7 @@ class PandasConversionMixin:
 "with 
'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
 "false.\n  %s" % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 raise
 
 # Try to use Arrow optimization when the schema is supported and 
the required version
@@ -198,7 +198,7 @@ class PandasConversionMixin:
 "effect on failures in the middle of "
 "computation.\n  %s" % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 raise
 
 # Below is toPandas without Arrow optimization.
@@ -247,13 +247,18 @@ class PandasConversionMixin:
 if (t is not None and not is_timedelta64_dtype(t)) or 
should_check_timedelta:
 series = series.astype(t, copy=False)
 
-# `insert` API makes copy of data, we only do it for Series of 
duplicate column names.
-# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work 
because `iloc` could
-# return a view or a copy depending by context.
-if column_counter[column_name] > 1:
-df.insert(index, column_name, series, allow_duplicates=True)
-else:
-df[column_name] = series
+with catch_warnings():
+from pandas.errors import PerformanceWarning
+
+simplefilter(action="ignore", category=PerformanceWarning)
+# `insert` API makes copy of data,
+# we only do it for Series of duplicate column names.
+# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always 
work
+# because `iloc` could return a view or a copy depending by 
context.
+if column_counter[column_name] > 1:
+df.insert(index, column_name, series, 
allow_duplicates=True)
+else:
+df[column_name] = series
 
 if timezone is None:
 return df
@@ -417,7 +422,7 @@ class SparkConversionMixin:
 "'spark.sql.execution.arrow.pyspark.fallback.enabled' 
is set to "
 "true." % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 else:
 msg = (
 "createDataFrame attempted Arrow optimization because "
@@ -426,7 +431,7 @@ class SparkConversionMixin:
 "fallback with 
'spark.sql.execution.arrow.pyspark.fallback.enabled' "
 

[spark] branch master updated: [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 594337fad13 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of 
`DataFrame.info`
594337fad13 is described below

commit 594337fad131280f62107326062fb554f0566d43
Author: Xinrong Meng 
AuthorDate: Thu Apr 28 09:25:37 2022 +0900

[SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`

### What changes were proposed in this pull request?
Suppress PerformanceWarnings of DataFrame.info

### Why are the changes needed?
To improve usability.

### Does this PR introduce _any_ user-facing change?
No. Only PerformanceWarnings of DataFrame.info are suppressed.

### How was this patch tested?
Manual tests.

Closes #36367 from xinrong-databricks/frame.info.

Authored-by: Xinrong Meng 
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/pandas/conversion.py | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/sql/pandas/conversion.py 
b/python/pyspark/sql/pandas/conversion.py
index 808444f1e2e..fff0bac5480 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -15,9 +15,9 @@
 # limitations under the License.
 #
 import sys
-import warnings
 from collections import Counter
 from typing import List, Optional, Type, Union, no_type_check, overload, 
TYPE_CHECKING
+from warnings import catch_warnings, simplefilter, warn
 
 from pyspark.rdd import _load_from_socket
 from pyspark.sql.pandas.serializers import ArrowCollectSerializer
@@ -111,7 +111,7 @@ class PandasConversionMixin:
 "'spark.sql.execution.arrow.pyspark.fallback.enabled' 
is set to "
 "true." % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 use_arrow = False
 else:
 msg = (
@@ -121,7 +121,7 @@ class PandasConversionMixin:
 "with 
'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
 "false.\n  %s" % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 raise
 
 # Try to use Arrow optimization when the schema is supported and 
the required version
@@ -198,7 +198,7 @@ class PandasConversionMixin:
 "effect on failures in the middle of "
 "computation.\n  %s" % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 raise
 
 # Below is toPandas without Arrow optimization.
@@ -247,13 +247,18 @@ class PandasConversionMixin:
 if (t is not None and not is_timedelta64_dtype(t)) or 
should_check_timedelta:
 series = series.astype(t, copy=False)
 
-# `insert` API makes copy of data, we only do it for Series of 
duplicate column names.
-# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work 
because `iloc` could
-# return a view or a copy depending by context.
-if column_counter[column_name] > 1:
-df.insert(index, column_name, series, allow_duplicates=True)
-else:
-df[column_name] = series
+with catch_warnings():
+from pandas.errors import PerformanceWarning
+
+simplefilter(action="ignore", category=PerformanceWarning)
+# `insert` API makes copy of data,
+# we only do it for Series of duplicate column names.
+# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always 
work
+# because `iloc` could return a view or a copy depending by 
context.
+if column_counter[column_name] > 1:
+df.insert(index, column_name, series, 
allow_duplicates=True)
+else:
+df[column_name] = series
 
 if timezone is None:
 return df
@@ -417,7 +422,7 @@ class SparkConversionMixin:
 "'spark.sql.execution.arrow.pyspark.fallback.enabled' 
is set to "
 "true." % str(e)
 )
-warnings.warn(msg)
+warn(msg)
 else:
 msg = (
 "createDataFrame attempted Arrow optimization because "
@@ -426,7 +431,7 @@ class SparkConversionMixin:
 "fallback with 
'spark.sql.execution.arrow.pyspark.fallback.enabled' "
 "has been set to false.\n  %s" % str(e)
 )
-warnings.warn(msg)
+  

[spark] branch branch-3.3 updated: [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass`

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new c9b6b50ff64 [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass`
c9b6b50ff64 is described below

commit c9b6b50ff640b531cc953249311a78f5b75ce349
Author: Bjørn Jørgensen 
AuthorDate: Thu Apr 28 09:20:53 2022 +0900

[SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass`

### What changes were proposed in this pull request?
Remove unneeded `pass`

### Why are the changes needed?
The classes Estimator, Transformer and Evaluator are abstract classes that already define methods, so the trailing `pass` statements are redundant.

The `except ValueError` block in `def run()` already contains code.

Removing the unneeded `pass` statements makes the code easier to read, understand and reuse.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests passed.

Closes #36383 from bjornjorgensen/remove-unneeded-pass.

Lead-authored-by: Bjørn Jørgensen 
Co-authored-by: bjornjorgensen 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 0e875875059c1cbf36de49205a4ce8dbc483d9d1)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/ml/base.py   | 4 
 python/pyspark/ml/evaluation.py | 2 --
 python/pyspark/tests/test_worker.py | 1 -
 3 files changed, 7 deletions(-)

diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py
index 20540ebbef6..34c3aa9c62c 100644
--- a/python/pyspark/ml/base.py
+++ b/python/pyspark/ml/base.py
@@ -110,8 +110,6 @@ class Estimator(Params, Generic[M], metaclass=ABCMeta):
 .. versionadded:: 1.3.0
 """
 
-pass
-
 @abstractmethod
 def _fit(self, dataset: DataFrame) -> M:
 """
@@ -220,8 +218,6 @@ class Transformer(Params, metaclass=ABCMeta):
 .. versionadded:: 1.3.0
 """
 
-pass
-
 @abstractmethod
 def _transform(self, dataset: DataFrame) -> DataFrame:
 """
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index ff0e5b91e42..19d123debae 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -67,8 +67,6 @@ class Evaluator(Params, metaclass=ABCMeta):
 .. versionadded:: 1.4.0
 """
 
-pass
-
 @abstractmethod
 def _evaluate(self, dataset: DataFrame) -> float:
 """
diff --git a/python/pyspark/tests/test_worker.py 
b/python/pyspark/tests/test_worker.py
index 0fdf6adb031..06ada8f81d5 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -70,7 +70,6 @@ class WorkerTests(ReusedPySparkTestCase):
 try:
 daemon_pid, worker_pid = map(int, data)
 except ValueError:
-pass
 # In case the value is not written yet.
 cnt += 1
 if cnt == 10:





[spark] branch master updated (c19fadabde3 -> 0e875875059)

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from c19fadabde3 [SPARK-39051][PYTHON] Minor refactoring of 
`python/pyspark/sql/pandas/conversion.py`
 add 0e875875059 [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass`

No new revisions were added by this update.

Summary of changes:
 python/pyspark/ml/base.py   | 4 
 python/pyspark/ml/evaluation.py | 2 --
 python/pyspark/tests/test_worker.py | 1 -
 3 files changed, 7 deletions(-)





[spark] branch branch-3.3 updated: [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py`

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 84addc5d1d8 [SPARK-39051][PYTHON] Minor refactoring of 
`python/pyspark/sql/pandas/conversion.py`
84addc5d1d8 is described below

commit 84addc5d1d8359a5b716ec869489fc961af23cf2
Author: Xinrong Meng 
AuthorDate: Thu Apr 28 09:17:24 2022 +0900

[SPARK-39051][PYTHON] Minor refactoring of 
`python/pyspark/sql/pandas/conversion.py`

Minor refactoring of `python/pyspark/sql/pandas/conversion.py`, which 
includes:
- doc change
- renaming

To improve code readability and maintainability.

No.

Existing tests.

Closes #36384 from xinrong-databricks/conversion.py.

Authored-by: Xinrong Meng 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit c19fadabde3ef3f9c7e4fa9bf74632a4f8e1f3e2)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/pandas/conversion.py | 52 -
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/python/pyspark/sql/pandas/conversion.py 
b/python/pyspark/sql/pandas/conversion.py
index 7153450d2bc..808444f1e2e 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -51,7 +51,7 @@ if TYPE_CHECKING:
 
 class PandasConversionMixin:
 """
-Min-in for the conversion from Spark to pandas. Currently, only 
:class:`DataFrame`
+Mix-in for the conversion from Spark to pandas. Currently, only 
:class:`DataFrame`
 can use this class.
 """
 
@@ -65,10 +65,10 @@ class PandasConversionMixin:
 
 Notes
 -
-This method should only be used if the resulting Pandas's 
:class:`DataFrame` is
+This method should only be used if the resulting Pandas 
``pandas.DataFrame`` is
 expected to be small, as all the data is loaded into the driver's 
memory.
 
-Usage with spark.sql.execution.arrow.pyspark.enabled=True is 
experimental.
+Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is 
experimental.
 
 Examples
 
@@ -136,8 +136,7 @@ class PandasConversionMixin:
 
 # Rename columns to avoid duplicated column names.
 tmp_column_names = ["col_{}".format(i) for i in 
range(len(self.columns))]
-c = self.sparkSession._jconf
-self_destruct = c.arrowPySparkSelfDestructEnabled()
+self_destruct = jconf.arrowPySparkSelfDestructEnabled()
 batches = self.toDF(*tmp_column_names)._collect_as_arrow(
 split_batches=self_destruct
 )
@@ -176,11 +175,11 @@ class PandasConversionMixin:
 else:
 corrected_panda_types = {}
 for index, field in enumerate(self.schema):
-panda_type = 
PandasConversionMixin._to_corrected_pandas_type(
+pandas_type = 
PandasConversionMixin._to_corrected_pandas_type(
 field.dataType
 )
 corrected_panda_types[tmp_column_names[index]] = (
-np.object0 if panda_type is None else 
panda_type
+np.object0 if pandas_type is None else 
pandas_type
 )
 
 pdf = pd.DataFrame(columns=tmp_column_names).astype(
@@ -206,36 +205,37 @@ class PandasConversionMixin:
 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
 column_counter = Counter(self.columns)
 
-dtype: List[Optional[Type]] = [None] * len(self.schema)
-for fieldIdx, field in enumerate(self.schema):
-# For duplicate column name, we use `iloc` to access it.
+corrected_dtypes: List[Optional[Type]] = [None] * len(self.schema)
+for index, field in enumerate(self.schema):
+# We use `iloc` to access columns with duplicate column names.
 if column_counter[field.name] > 1:
-pandas_col = pdf.iloc[:, fieldIdx]
+pandas_col = pdf.iloc[:, index]
 else:
 pandas_col = pdf[field.name]
 
 pandas_type = 
PandasConversionMixin._to_corrected_pandas_type(field.dataType)
 # SPARK-21766: if an integer field is nullable and has null 
values, it can be
-# inferred by pandas as float column. Once we convert the column 
with NaN back
-# to integer type e.g., np.int16, we will hit exception. So we use 
the inferred
-# float type, not the corrected type from the schema in this case.
+# inferred by pandas as a float column. If we convert the column 
with NaN back
+

[spark] branch master updated (7b637eef77b -> c19fadabde3)

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 7b637eef77b [SPARK-39038][CI] Skip reporting test results if 
triggering workflow was skipped
 add c19fadabde3 [SPARK-39051][PYTHON] Minor refactoring of 
`python/pyspark/sql/pandas/conversion.py`

No new revisions were added by this update.

Summary of changes:
 python/pyspark/sql/pandas/conversion.py | 52 -
 1 file changed, 25 insertions(+), 27 deletions(-)





[spark] branch master updated: [SPARK-39038][CI] Skip reporting test results if triggering workflow was skipped

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 7b637eef77b [SPARK-39038][CI] Skip reporting test results if 
triggering workflow was skipped
7b637eef77b is described below

commit 7b637eef77b5069cb8825db8ff79389a935364d8
Author: Enrico Minack 
AuthorDate: Thu Apr 28 09:00:17 2022 +0900

[SPARK-39038][CI] Skip reporting test results if triggering workflow was 
skipped

### What changes were proposed in this pull request?
The `"Report test results"` workflow should be skipped when the triggering 
workflow completed with conclusion `'skipped'`.

### Why are the changes needed?
The `"Report test results"` workflow is triggered when either `"Build and 
test"` or `"Build and test (ANSI)"` complete. On fork repositories, workflow 
`"Build and test (ANSI)"` is always skipped.

The triggered `"Report test results"` workflow downloads artifacts from the 
triggering workflow and errors because there are none artifacts.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
In personal repo:
- https://github.com/EnricoMi/spark/actions/runs/2231657986
- triggered by https://github.com/EnricoMi/spark/actions/runs/2231657828

Closes #36371 from EnricoMi/master.

Authored-by: Enrico Minack 
Signed-off-by: Hyukjin Kwon 
---
 .github/workflows/test_report.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test_report.yml 
b/.github/workflows/test_report.yml
index a3f09c06ed9..5f46985a975 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -26,6 +26,7 @@ on:
 
 jobs:
   test_report:
+if: github.event.workflow_run.conclusion != 'skipped'
 runs-on: ubuntu-latest
 steps:
 - name: Download test results to report





[spark] branch branch-3.3 updated: [SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 4a4e35a30c7 [SPARK-38997][SQL] DS V2 aggregate push-down supports 
group by expressions
4a4e35a30c7 is described below

commit 4a4e35a30c7bb7534aece8e917a2813d47c2c498
Author: Jiaan Geng 
AuthorDate: Thu Apr 28 00:43:55 2022 +0800

[SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions

### What changes were proposed in this pull request?
Currently, Spark DS V2 aggregate push-down only supports grouping by columns.
But the SQL shown below is very useful and common.
```
SELECT
  CASE
WHEN 'SALARY' > 8000.00
  AND 'SALARY' < 1.00
THEN 'SALARY'
ELSE 0.00
  END AS key,
  SUM('SALARY')
FROM "test"."employee"
GROUP BY key
```

### Why are the changes needed?
Let DS V2 aggregate push-down support group by expressions.

### Does this PR introduce _any_ user-facing change?
'No'.
New feature.

### How was this patch tested?
New tests

Closes #36325 from beliefer/SPARK-38997.

Authored-by: Jiaan Geng 
Signed-off-by: Wenchen Fan 
(cherry picked from commit ee6ea3c68694e35c36ad006a7762297800d1e463)
Signed-off-by: Wenchen Fan 
---
 .../expressions/aggregate/Aggregation.java |  10 +-
 .../spark/sql/execution/DataSourceScanExec.scala   |   2 +-
 .../datasources/AggregatePushDownUtils.scala   |  23 ++--
 .../execution/datasources/DataSourceStrategy.scala |   7 +-
 .../sql/execution/datasources/orc/OrcUtils.scala   |   2 +-
 .../datasources/parquet/ParquetUtils.scala |   2 +-
 .../datasources/v2/V2ScanRelationPushDown.scala|  23 ++--
 .../datasources/v2/jdbc/JDBCScanBuilder.scala  |  27 ++---
 .../sql/execution/datasources/v2/orc/OrcScan.scala |   2 +-
 .../datasources/v2/parquet/ParquetScan.scala   |   2 +-
 .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala| 120 -
 11 files changed, 151 insertions(+), 69 deletions(-)

diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java
index cf7dbb2978d..11d9e475ca1 100644
--- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java
@@ -20,7 +20,7 @@ package org.apache.spark.sql.connector.expressions.aggregate;
 import java.io.Serializable;
 
 import org.apache.spark.annotation.Evolving;
-import org.apache.spark.sql.connector.expressions.NamedReference;
+import org.apache.spark.sql.connector.expressions.Expression;
 
 /**
  * Aggregation in SQL statement.
@@ -30,14 +30,14 @@ import 
org.apache.spark.sql.connector.expressions.NamedReference;
 @Evolving
 public final class Aggregation implements Serializable {
   private final AggregateFunc[] aggregateExpressions;
-  private final NamedReference[] groupByColumns;
+  private final Expression[] groupByExpressions;
 
-  public Aggregation(AggregateFunc[] aggregateExpressions, NamedReference[] 
groupByColumns) {
+  public Aggregation(AggregateFunc[] aggregateExpressions, Expression[] 
groupByExpressions) {
 this.aggregateExpressions = aggregateExpressions;
-this.groupByColumns = groupByColumns;
+this.groupByExpressions = groupByExpressions;
   }
 
   public AggregateFunc[] aggregateExpressions() { return aggregateExpressions; 
}
 
-  public NamedReference[] groupByColumns() { return groupByColumns; }
+  public Expression[] groupByExpressions() { return groupByExpressions; }
 }
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
index 5067cd7fa3c..ac0f3af5725 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -163,7 +163,7 @@ case class RowDataSourceScanExec(
   "PushedFilters" -> pushedFilters) ++
   pushedDownOperators.aggregation.fold(Map[String, String]()) { v =>
 Map("PushedAggregates" -> 
seqToString(v.aggregateExpressions.map(_.describe())),
-  "PushedGroupByColumns" -> 
seqToString(v.groupByColumns.map(_.describe(} ++
+  "PushedGroupByExpressions" -> 
seqToString(v.groupByExpressions.map(_.describe(} ++
   topNOrLimitInfo ++
   pushedDownOperators.sample.map(v => "PushedSample" ->
 s"SAMPLE (${(v.upperBound - v.lowerBound) * 100}) ${v.withReplacement} 
SEED(${v.seed})"
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datas

[spark] branch master updated (852997d6ed6 -> ee6ea3c6869)

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 852997d6ed6 [SPARK-38914][SQL] Allow user to insert specified columns 
into insertable view
 add ee6ea3c6869 [SPARK-38997][SQL] DS V2 aggregate push-down supports 
group by expressions

No new revisions were added by this update.

Summary of changes:
 .../expressions/aggregate/Aggregation.java |  10 +-
 .../spark/sql/execution/DataSourceScanExec.scala   |   2 +-
 .../datasources/AggregatePushDownUtils.scala   |  23 ++--
 .../execution/datasources/DataSourceStrategy.scala |   7 +-
 .../sql/execution/datasources/orc/OrcUtils.scala   |   2 +-
 .../datasources/parquet/ParquetUtils.scala |   2 +-
 .../datasources/v2/V2ScanRelationPushDown.scala|  23 ++--
 .../datasources/v2/jdbc/JDBCScanBuilder.scala  |  27 ++---
 .../sql/execution/datasources/v2/orc/OrcScan.scala |   2 +-
 .../datasources/v2/parquet/ParquetScan.scala   |   2 +-
 .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala| 120 -
 11 files changed, 151 insertions(+), 69 deletions(-)





[spark] branch master updated: [SPARK-38914][SQL] Allow user to insert specified columns into insertable view

2022-04-27 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 852997d6ed6 [SPARK-38914][SQL] Allow user to insert specified columns 
into insertable view
852997d6ed6 is described below

commit 852997d6ed61f4e098803b96927e7cbbd24e3d7c
Author: morvenhuang 
AuthorDate: Wed Apr 27 23:35:45 2022 +0800

[SPARK-38914][SQL] Allow user to insert specified columns into insertable 
view

### What changes were proposed in this pull request?
Allow user to insert specified columns into insertable view, for example,
```
CREATE TEMPORARY VIEW v1 (c1 int, c2 string) USING 
org.apache.spark.sql.json.DefaultSource OPTIONS (path 'json_dir')
INSERT INTO v1(c1) VALUES(100)
SELECT c1, c2 FROM v1;
+---++
| c1|  c2|
+---++
|100|null|
+---++
```

### Why are the changes needed?
The option spark.sql.defaultColumn.useNullsForMissingDefautValues allows us to insert specified columns into a table (https://issues.apache.org/jira/browse/SPARK-38795), but currently this option does not work for insertable views.

To keep consistency with the behavior of INSERT INTO a table, we should also allow users to specify columns when running INSERT INTO on a view.

### Does this PR introduce _any_ user-facing change?
Yes.

### How was this patch tested?
New unit tests added.

Closes #36212 from morvenhuang/SPARK-38914.

Authored-by: morvenhuang 
Signed-off-by: Gengliang Wang 
---
 .../catalyst/analysis/ResolveDefaultColumns.scala  |  6 ++-
 .../org/apache/spark/sql/internal/SQLConf.scala|  2 +-
 .../org/apache/spark/sql/sources/InsertSuite.scala | 45 ++
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala
index 447769be300..422a1e422be 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala
@@ -79,7 +79,8 @@ case class ResolveDefaultColumns(
   replaceExplicitDefaultColumnValues(analyzer, 
expanded).getOrElse(table)
 replaced
 
-  case i@InsertIntoStatement(_, _, _, project: Project, _, _) =>
+  case i@InsertIntoStatement(_, _, _, project: Project, _, _)
+if !project.projectList.exists(_.isInstanceOf[Star]) =>
 enclosingInsert = Some(i)
 insertTableSchemaWithoutPartitionColumns = 
getInsertTableSchemaWithoutPartitionColumns
 val expanded: Project = 
addMissingDefaultColumnValues(project).getOrElse(project)
@@ -280,6 +281,9 @@ case class ResolveDefaultColumns(
   case SubqueryAlias(_, r: UnresolvedCatalogRelation) =>
 StructType(r.tableMeta.schema.fields.dropRight(
   enclosingInsert.get.partitionSpec.size))
+  case SubqueryAlias(_, r: View) if r.isTempView =>
+StructType(r.schema.fields.dropRight(
+  enclosingInsert.get.partitionSpec.size))
   case _ => return None
 }
 // Rearrange the columns in the result schema to match the order of the 
explicit column list,
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0ba870d10e9..8876d780799 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2880,7 +2880,7 @@ object SQLConf {
   .createWithDefault(true)
 
   val USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES =
-buildConf("spark.sql.defaultColumn.useNullsForMissingDefautValues")
+buildConf("spark.sql.defaultColumn.useNullsForMissingDefaultValues")
   .internal()
   .doc("When true, and DEFAULT columns are enabled, allow column 
definitions lacking " +
 "explicit default values to behave as if they had specified DEFAULT 
NULL instead. " +
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index 04acedb7ead..1312353f537 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -856,6 +856,33 @@ class InsertSuite extends DataSourceTest with 
SharedSparkSession {
 }
   }
 
+  test("Allow user to insert specified columns into insertable view") {
+withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> 
"true") {
+  sql("INSERT OVERWRITE TABLE jsonTable SELECT a FROM jt")
+

[spark] branch master updated: [SPARK-38979][SQL] Improve error log readability in OrcUtils.requestedColumnIds

2022-04-27 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 70b4b1d1f69 [SPARK-38979][SQL] Improve error log readability in 
OrcUtils.requestedColumnIds
70b4b1d1f69 is described below

commit 70b4b1d1f69be3a15eadb0e798139982c152b7bb
Author: sychen 
AuthorDate: Wed Apr 27 08:38:28 2022 -0500

[SPARK-38979][SQL] Improve error log readability in 
OrcUtils.requestedColumnIds

### What changes were proposed in this pull request?
Add detailed log in `OrcUtils#requestedColumnIds`.

### Why are the changes needed?
In `OrcUtils#requestedColumnIds`, the assertion sometimes fails because `orcFieldNames.length > dataSchema.length`, and the log message is not very clear.

```
java.lang.AssertionError: assertion failed: The given data schema 
struct has less fields than the actual ORC physical schema, no idea 
which columns were dropped, fail to read.
```

after the change
```
java.lang.AssertionError: assertion failed: The given data schema 
struct (length:1) has fewer 1 fields than the actual ORC physical 
schema struct (length:2), no idea which columns were 
dropped, fail to read.
```

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Existing UTs / local test

Closes #36296 from cxzl25/SPARK-38979.

Authored-by: sychen 
Signed-off-by: Sean Owen 
---
 .../org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index f07573beae6..1783aadaa78 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -224,7 +224,9 @@ object OrcUtils extends Logging {
 // the physical schema doesn't match the data schema).
 // In these cases we map the physical schema to the data schema by 
index.
 assert(orcFieldNames.length <= dataSchema.length, "The given data 
schema " +
-  s"${dataSchema.catalogString} has less fields than the actual ORC 
physical schema, " +
+  s"${dataSchema.catalogString} (length:${dataSchema.length}) " +
+  s"has fewer ${orcFieldNames.length - dataSchema.length} fields than 
" +
+  s"the actual ORC physical schema $orcSchema 
(length:${orcFieldNames.length}), " +
   "no idea which columns were dropped, fail to read.")
 // for ORC file written by Hive, no field names
 // in the physical schema, there is a need to send the





[spark] branch branch-3.3 updated (b3ecff34ab6 -> b25276f4385)

2022-04-27 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


from b3ecff34ab6 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in 
InjectRuntimeFilterSuite
 add b25276f4385 [SPARK-39015][SQL][3.3] Remove the usage of toSQLValue(v) 
without an explicit type

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/expressions/Cast.scala  | 58 --
 .../expressions/complexTypeExtractors.scala|  5 +-
 .../spark/sql/catalyst/util/DateTimeUtils.scala| 14 --
 .../spark/sql/catalyst/util/IntervalUtils.scala| 23 +
 .../apache/spark/sql/errors/QueryErrorsBase.scala  | 14 ++
 .../spark/sql/errors/QueryExecutionErrors.scala| 47 ++
 .../scala/org/apache/spark/sql/types/Decimal.scala | 21 +---
 .../org/apache/spark/sql/types/numerics.scala  | 13 +++--
 .../catalyst/expressions/AnsiCastSuiteBase.scala   |  3 +-
 .../test/resources/sql-tests/inputs/ansi/map.sql   |  1 +
 .../resources/sql-tests/results/ansi/map.sql.out   | 14 +-
 11 files changed, 125 insertions(+), 88 deletions(-)





[spark] branch master updated (e49147af4a8 -> 4e84f339973)

2022-04-27 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from e49147af4a8 [SPARK-39015][SQL] Remove the usage of toSQLValue(v) 
without an explicit type
 add 4e84f339973 [SPARK-39027][SQL] Output SQL statements in error messages 
in upper case and w/o double quotes

No new revisions were added by this update.

Summary of changes:
 python/pyspark/sql/tests/test_udf.py |  2 +-
 .../apache/spark/sql/errors/QueryErrorsBase.scala|  3 +--
 .../ExtractPythonUDFFromJoinConditionSuite.scala |  2 +-
 .../resources/sql-tests/results/describe.sql.out |  4 ++--
 .../sql/errors/QueryCompilationErrorsSuite.scala |  6 +++---
 .../spark/sql/errors/QueryParsingErrorsSuite.scala   | 20 ++--
 .../spark/sql/execution/command/DDLParserSuite.scala |  4 ++--
 7 files changed, 20 insertions(+), 21 deletions(-)





[spark] branch master updated (d05e01d5402 -> e49147af4a8)

2022-04-27 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from d05e01d5402 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in 
InjectRuntimeFilterSuite
 add e49147af4a8 [SPARK-39015][SQL] Remove the usage of toSQLValue(v) 
without an explicit type

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/expressions/Cast.scala  | 58 --
 .../expressions/complexTypeExtractors.scala|  5 +-
 .../spark/sql/catalyst/util/DateTimeUtils.scala| 14 --
 .../spark/sql/catalyst/util/IntervalUtils.scala| 23 +
 .../apache/spark/sql/errors/QueryErrorsBase.scala  | 14 ++
 .../spark/sql/errors/QueryExecutionErrors.scala| 47 ++
 .../scala/org/apache/spark/sql/types/Decimal.scala | 21 +---
 .../org/apache/spark/sql/types/numerics.scala  | 13 +++--
 .../catalyst/expressions/AnsiCastSuiteBase.scala   |  3 +-
 .../test/resources/sql-tests/inputs/ansi/map.sql   |  1 +
 .../resources/sql-tests/results/ansi/map.sql.out   | 14 +-
 .../sql/errors/QueryExecutionAnsiErrorsSuite.scala |  5 +-
 12 files changed, 128 insertions(+), 90 deletions(-)





[spark] branch branch-3.3 updated: [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new b3ecff34ab6 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in 
InjectRuntimeFilterSuite
b3ecff34ab6 is described below

commit b3ecff34ab6e3f7b0852db7c0b391cefd176e6ca
Author: Peter Toth 
AuthorDate: Wed Apr 27 16:16:15 2022 +0800

[SPARK-34079][SQL][FOLLOW-UP] Revert some changes in 
InjectRuntimeFilterSuite

To remove unnecessary changes from `InjectRuntimeFilterSuite` after 
https://github.com/apache/spark/pull/32298. These are not needed after 
https://github.com/apache/spark/pull/34929 as the final optimized plan doesn't 
contain any `WithCTE` nodes.

No need for those changes.

No.

Added new test.
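
For reference, a minimal sketch of how an optimizer rule can be excluded around a test, assuming an existing `SparkSession` named `spark`; this snippet is illustrative only and not part of the patch:

```scala
// Illustrative sketch (not part of this patch): exclude MergeScalarSubqueries so the
// optimized plan keeps one scalar subquery per filter, which simplifies plan assertions.
// Assumes a running SparkSession named `spark`.
spark.conf.set(
  "spark.sql.optimizer.excludedRules",
  "org.apache.spark.sql.catalyst.optimizer.MergeScalarSubqueries")

// ... run the queries under test and inspect their plans ...

// Restore the default afterwards so other tests see the normal rule set.
spark.conf.unset("spark.sql.optimizer.excludedRules")
```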

Closes #36361 from 
peter-toth/SPARK-34079-multi-column-scalar-subquery-follow-up-2.

Authored-by: Peter Toth 
Signed-off-by: Wenchen Fan 
(cherry picked from commit d05e01d54024e3844f1e48e03bad3fd814b8f6b9)
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/InjectRuntimeFilterSuite.scala   | 73 +-
 1 file changed, 57 insertions(+), 16 deletions(-)

diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala
index b541419c823..6c6bd1799e1 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala
@@ -19,13 +19,17 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.catalyst.expressions.{Alias, 
BloomFilterMightContain, Literal}
 import 
org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, 
BloomFilterAggregate}
+import org.apache.spark.sql.catalyst.optimizer.MergeScalarSubqueries
 import org.apache.spark.sql.catalyst.plans.LeftSemi
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Join, 
LogicalPlan}
+import org.apache.spark.sql.execution.{ReusedSubqueryExec, SubqueryExec}
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, 
AQEPropagateEmptyRelation}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}
 import org.apache.spark.sql.types.{IntegerType, StructType}
 
-class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with 
SharedSparkSession {
+class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with 
SharedSparkSession
+  with AdaptiveSparkPlanHelper {
 
   protected override def beforeAll(): Unit = {
 super.beforeAll()
@@ -201,9 +205,16 @@ class InjectRuntimeFilterSuite extends QueryTest with 
SQLTestUtils with SharedSp
 sql("analyze table bf4 compute statistics for columns a4, b4, c4, d4, e4, 
f4")
 sql("analyze table bf5part compute statistics for columns a5, b5, c5, d5, 
e5, f5")
 sql("analyze table bf5filtered compute statistics for columns a5, b5, c5, 
d5, e5, f5")
+
+// `MergeScalarSubqueries` can duplicate subqueries in the optimized plan 
and would make testing
+// complicated.
+conf.setConfString(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, 
MergeScalarSubqueries.ruleName)
   }
 
   protected override def afterAll(): Unit = try {
+conf.setConfString(SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
+  SQLConf.OPTIMIZER_EXCLUDED_RULES.defaultValueString)
+
 sql("DROP TABLE IF EXISTS bf1")
 sql("DROP TABLE IF EXISTS bf2")
 sql("DROP TABLE IF EXISTS bf3")
@@ -264,24 +275,28 @@ class InjectRuntimeFilterSuite extends QueryTest with 
SQLTestUtils with SharedSp
 }
   }
 
-  // `MergeScalarSubqueries` can duplicate subqueries in the optimized plan, 
but the subqueries will
-  // be reused in the physical plan.
-  def getNumBloomFilters(plan: LogicalPlan, scalarSubqueryCTEMultiplicator: 
Int = 1): Integer = {
-val numBloomFilterAggs = plan.collectWithSubqueries {
-  case Aggregate(_, aggregateExpressions, _) =>
-aggregateExpressions.collect {
-  case Alias(AggregateExpression(bfAgg: BloomFilterAggregate, _, _, _, 
_), _) =>
-assert(bfAgg.estimatedNumItemsExpression.isInstanceOf[Literal])
-assert(bfAgg.numBitsExpression.isInstanceOf[Literal])
-1
+  def getNumBloomFilters(plan: LogicalPlan): Integer = {
+val numBloomFilterAggs = plan.collect {
+  case Filter(condition, _) => condition.collect {
+case subquery: org.apache.spark.sql.catalyst.expressions.ScalarSubquery
+=> subquery.plan.collect {
+  case Aggregate(_, aggregateExpressions, _) =>
+aggregateExpressions.map {
+  case Alias(AggregateExpression(bfAgg : BloomFilterAggregate, _, 
_, _, _),
+  _) =>
+assert(bfAgg.estimatedNumItemsExpression.isInstanceOf[Li

[spark] branch master updated (5b53bdfa830 -> d05e01d5402)

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 5b53bdfa830 [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
 add d05e01d5402 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in 
InjectRuntimeFilterSuite

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/InjectRuntimeFilterSuite.scala   | 73 +-
 1 file changed, 57 insertions(+), 16 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()"

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 46fa4998a4c Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()"
46fa4998a4c is described below

commit 46fa4998a4c2f0d858c66c3629b13bd91d372cdd
Author: Hyukjin Kwon 
AuthorDate: Wed Apr 27 16:59:34 2022 +0900

Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()"

This reverts commit 821a348ae7f9cd5958d29ccf342719f5d753ae28.
---
 python/pyspark/sql/functions.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 803213513da..041f8418176 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -780,8 +780,6 @@ def when(condition, value):
 :param condition: a boolean :class:`Column` expression.
 :param value: a literal value, or a :class:`Column` expression.
 
-Examples
-
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.0 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
 new 821a348ae7f [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
821a348ae7f is described below

commit 821a348ae7f9cd5958d29ccf342719f5d753ae28
Author: vadim <86705+va...@users.noreply.github.com>
AuthorDate: Wed Apr 27 16:56:18 2022 +0900

[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

### What changes were proposed in this pull request?
Fix missing keyword for `pyspark.sql.functions.when()` documentation.

### Why are the changes needed?

[Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html)
 is not formatted correctly

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All tests passed.

Closes #36369 from vadim/SPARK-39032.

Authored-by: vadim <86705+va...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 041f8418176..803213513da 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -780,6 +780,8 @@ def when(condition, value):
 :param condition: a boolean :class:`Column` expression.
 :param value: a literal value, or a :class:`Column` expression.
 
+Examples
+
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.1 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
 new 30be8d0098d [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
30be8d0098d is described below

commit 30be8d0098d037e65d798999d80efd4bc8195b82
Author: vadim <86705+va...@users.noreply.github.com>
AuthorDate: Wed Apr 27 16:56:18 2022 +0900

[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

### What changes were proposed in this pull request?
Fix missing keyword for `pyspark.sql.functions.when()` documentation.

### Why are the changes needed?

[Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html)
 is not formatted correctly

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All tests passed.

Closes #36369 from vadim/SPARK-39032.

Authored-by: vadim <86705+va...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 88b7c4dfb64..ec21bbe5ce2 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1404,6 +1404,8 @@ def when(condition, value):
 value :
 a literal value, or a :class:`~pyspark.sql.Column` expression.
 
+Examples
+
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.2 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 5b35caeed4b [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
5b35caeed4b is described below

commit 5b35caeed4bd2e0803887dfde61788844eb920a6
Author: vadim <86705+va...@users.noreply.github.com>
AuthorDate: Wed Apr 27 16:56:18 2022 +0900

[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

### What changes were proposed in this pull request?
Fix missing keyword for `pyspark.sql.functions.when()` documentation.

### Why are the changes needed?

[Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html)
 is not formatted correctly

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All tests passed.

Closes #36369 from vadim/SPARK-39032.

Authored-by: vadim <86705+va...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index acde817fbda..2d254dc9e54 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1505,6 +1505,8 @@ def when(condition, value):
 value :
 a literal value, or a :class:`~pyspark.sql.Column` expression.
 
+Examples
+
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.3 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new d59f11816b7 [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
d59f11816b7 is described below

commit d59f11816b7e1d9195cc08806820469f78c3e0aa
Author: vadim <86705+va...@users.noreply.github.com>
AuthorDate: Wed Apr 27 16:56:18 2022 +0900

[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

### What changes were proposed in this pull request?
Fix missing keyword for `pyspark.sql.functions.when()` documentation.

### Why are the changes needed?

[Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html)
 is not formatted correctly

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All tests passed.

Closes #36369 from vadim/SPARK-39032.

Authored-by: vadim <86705+va...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 06fdbf1ed39..019f64b5171 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1716,6 +1716,8 @@ def when(condition: Column, value: Any) -> Column:
 value :
 a literal value, or a :class:`~pyspark.sql.Column` expression.
 
+Examples
+
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

2022-04-27 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 5b53bdfa830 [SPARK-39032][PYTHON][DOCS] Examples' tag for 
pyspark.sql.functions.when()
5b53bdfa830 is described below

commit 5b53bdfa83061c160652e07b999f996fc8bd2ece
Author: vadim <86705+va...@users.noreply.github.com>
AuthorDate: Wed Apr 27 16:56:18 2022 +0900

[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()

### What changes were proposed in this pull request?
Fix missing keyword for `pyspark.sql.functions.when()` documentation.

### Why are the changes needed?

[Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html)
 is not formatted correctly

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All tests passed.

Closes #36369 from vadim/SPARK-39032.

Authored-by: vadim <86705+va...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 06fdbf1ed39..019f64b5171 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1716,6 +1716,8 @@ def when(condition: Column, value: Any) -> Column:
 value :
 a literal value, or a :class:`~pyspark.sql.Column` expression.
 
+Examples
+
 >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
 [Row(age=3), Row(age=4)]
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch branch-3.1 updated: [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
 new 7217c51bd7d [SPARK-38868][SQL][3.2] Don't propagate exceptions from 
filter predicate when optimizing outer joins
7217c51bd7d is described below

commit 7217c51bd7dc4c68c28af6473d144737c5db0669
Author: Bruce Robbins 
AuthorDate: Wed Apr 27 15:38:22 2022 +0800

[SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate 
when optimizing outer joins

Backport of #36230

### What changes were proposed in this pull request?

Change `EliminateOuterJoin#canFilterOutNull` to return `false` when a 
`where` condition throws an exception.

### Why are the changes needed?

Consider this query:
```
select *
from (select id, id as b from range(0, 10)) l
left outer join (select id, id + 1 as c from range(0, 10)) r
on l.id = r.id
where assert_true(c > 0) is null;
```
The query should succeed, but instead fails with
```
java.lang.RuntimeException: '(c#1L > cast(0 as bigint))' is not true!
```
This happens even though there is no row where `c > 0` is false.

The `EliminateOuterJoin` rule checks if it can convert the outer join to an 
inner join based on the expression in the where clause, which in this case is
```
assert_true(c > 0) is null
```
`EliminateOuterJoin#canFilterOutNull` evaluates that expression with `c` 
set to `null` to see if the result is `null` or `false`. The rule doesn't 
expect the evaluation to throw a `RuntimeException`, but in this case it always does.

That is, the assertion is failing during optimization, not at run time.
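
A minimal, self-contained sketch of the conservative behaviour adopted by this fix (simplified, not the actual Spark code; `canFilterOutNull` here takes a thunk instead of a bound Catalyst expression):

```scala
import scala.util.control.NonFatal

// Simplified sketch: decide whether a predicate filters out null-extended rows by
// evaluating it with all inputs set to null. If evaluation throws, answer "no"
// conservatively so the outer join is kept.
def canFilterOutNull(evalWithNulls: () => Any): Boolean =
  try {
    val v = evalWithNulls()
    v == null || v == false
  } catch {
    case NonFatal(_) => false
  }

// A predicate like `assert_true(c > 0) is null` throws when c is null:
println(canFilterOutNull(() => throw new RuntimeException("'(c > 0)' is not true!"))) // false
// whereas a predicate that simply yields null keeps the old behaviour:
println(canFilterOutNull(() => null)) // true
```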

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

Closes #36341 from bersprockets/outer_join_eval_assert_issue_32.

Authored-by: Bruce Robbins 
Signed-off-by: Wenchen Fan 
(cherry picked from commit 3690c8ceb9e5c2f642b9f9e1af526f76d2e2a71a)
Signed-off-by: Wenchen Fan 
---
 .../apache/spark/sql/catalyst/optimizer/joins.scala| 14 --
 .../catalyst/optimizer/OuterJoinEliminationSuite.scala | 18 +-
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
index 57c3f3dbd05..af3900ed4ad 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import scala.annotation.tailrec
+import scala.util.control.NonFatal
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
@@ -136,8 +137,17 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with 
PredicateHelper {
 val emptyRow = new GenericInternalRow(attributes.length)
 val boundE = BindReferences.bindReference(e, attributes)
 if (boundE.find(_.isInstanceOf[Unevaluable]).isDefined) return false
-val v = boundE.eval(emptyRow)
-v == null || v == false
+
+// some expressions, like map(), may throw an exception when dealing with 
null values.
+// therefore, we need to handle exceptions.
+try {
+  val v = boundE.eval(emptyRow)
+  v == null || v == false
+} catch {
+  case NonFatal(e) =>
+// cannot filter out null if `where` expression throws an exception 
with null input
+false
+}
   }
 
   private def buildNewJoinType(filter: Filter, join: Join): JoinType = {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
index 893c111c290..ea6ef525041 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
@@ -20,11 +20,13 @@ package org.apache.spark.sql.catalyst.optimizer
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Coalesce, IsNotNull}
+import org.apache.spark.sql.catalyst.expressions.{Coalesce, If, IsNotNull, 
Literal, RaiseError}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.ty

[spark] branch branch-3.2 updated: [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins

2022-04-27 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 3690c8ceb9e [SPARK-38868][SQL][3.2] Don't propagate exceptions from 
filter predicate when optimizing outer joins
3690c8ceb9e is described below

commit 3690c8ceb9e5c2f642b9f9e1af526f76d2e2a71a
Author: Bruce Robbins 
AuthorDate: Wed Apr 27 15:38:22 2022 +0800

[SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate 
when optimizing outer joins

Backport of #36230

### What changes were proposed in this pull request?

Change `EliminateOuterJoin#canFilterOutNull` to return `false` when a 
`where` condition throws an exception.

### Why are the changes needed?

Consider this query:
```
select *
from (select id, id as b from range(0, 10)) l
left outer join (select id, id + 1 as c from range(0, 10)) r
on l.id = r.id
where assert_true(c > 0) is null;
```
The query should succeed, but instead fails with
```
java.lang.RuntimeException: '(c#1L > cast(0 as bigint))' is not true!
```
This happens even though there is no row where `c > 0` is false.

The `EliminateOuterJoin` rule checks if it can convert the outer join to an 
inner join based on the expression in the where clause, which in this case is
```
assert_true(c > 0) is null
```
`EliminateOuterJoin#canFilterOutNull` evaluates that expression with `c` 
set to `null` to see if the result is `null` or `false`. The rule doesn't 
expect the evaluation to throw a `RuntimeException`, but in this case it always does.

That is, the assertion is failing during optimization, not at run time.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

Closes #36341 from bersprockets/outer_join_eval_assert_issue_32.

Authored-by: Bruce Robbins 
Signed-off-by: Wenchen Fan 
---
 .../apache/spark/sql/catalyst/optimizer/joins.scala| 14 --
 .../catalyst/optimizer/OuterJoinEliminationSuite.scala | 18 +-
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
index d6e2a59de0b..19f4338908e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import scala.annotation.tailrec
+import scala.util.control.NonFatal
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins
@@ -144,8 +145,17 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with 
PredicateHelper {
 val emptyRow = new GenericInternalRow(attributes.length)
 val boundE = BindReferences.bindReference(e, attributes)
 if (boundE.find(_.isInstanceOf[Unevaluable]).isDefined) return false
-val v = boundE.eval(emptyRow)
-v == null || v == false
+
+// some expressions, like map(), may throw an exception when dealing with 
null values.
+// therefore, we need to handle exceptions.
+try {
+  val v = boundE.eval(emptyRow)
+  v == null || v == false
+} catch {
+  case NonFatal(e) =>
+// cannot filter out null if `where` expression throws an exception 
with null input
+false
+}
   }
 
   private def buildNewJoinType(filter: Filter, join: Join): JoinType = {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
index 893c111c290..ea6ef525041 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala
@@ -20,11 +20,13 @@ package org.apache.spark.sql.catalyst.optimizer
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Coalesce, IsNotNull}
+import org.apache.spark.sql.catalyst.expressions.{Coalesce, If, IsNotNull, 
Literal, RaiseError}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StringType
+import org.apache.spark.unsafe.types.UTF8String
 
 class Oute