[spark] branch master updated: [SPARK-39052][SQL] Support Literal.create(Char, StringType)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new abc2dc03fc8 [SPARK-39052][SQL] Support Literal.create(Char, StringType) abc2dc03fc8 is described below commit abc2dc03fc8f910ab95054205cdea4e3cb25801f Author: Hyukjin Kwon AuthorDate: Thu Apr 28 07:53:50 2022 +0300 [SPARK-39052][SQL] Support Literal.create(Char, StringType) ### What changes were proposed in this pull request? This is sort of a followup of https://github.com/apache/spark/commit/54fcaafb094e299f21c18370fddb4a727c88d875. `Literal.create` should also support `Char` too. ### Why are the changes needed? To make the support of external type `Char` same as `Literal.apply`. ### Does this PR introduce _any_ user-facing change? No, this isn't exposed to users. `Literal.create(Char, StringType)` isn't also used in the current codebase internally. This PR is just for completeness. ### How was this patch tested? Unittests were added. Closes #36389 from HyukjinKwon/SPARK-39052. 
Authored-by: Hyukjin Kwon Signed-off-by: Max Gekk --- .../scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala | 1 + .../org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala | 1 + .../spark/sql/catalyst/expressions/LiteralExpressionSuite.scala | 4 3 files changed, 6 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 3e6d31e79b7..263d3734217 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -499,6 +499,7 @@ object CatalystTypeConverters { */ def convertToCatalyst(a: Any): Any = a match { case s: String => StringConverter.toCatalyst(s) +case c: Char => StringConverter.toCatalyst(c.toString) case d: Date => DateConverter.toCatalyst(d) case ld: LocalDate => LocalDateConverter.toCatalyst(ld) case t: Timestamp => TimestampConverter.toCatalyst(t) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala index b559e219882..bf194a2288b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystTypeConvertersSuite.scala @@ -152,6 +152,7 @@ class CatalystTypeConvertersSuite extends SparkFunSuite with SQLHelper { val converter = CatalystTypeConverters.createToCatalystConverter(StringType) val expected = UTF8String.fromString("X") assert(converter(chr) === expected) +assert(CatalystTypeConverters.convertToCatalyst('a') === UTF8String.fromString("a")) } test("SPARK-33390: Make Literal support char array") { diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index 6ce51f1eec8..80e7a3206aa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -247,6 +247,10 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:on } + test("SPARK-39052: Support Char in Literal.create") { +checkEvaluation(Literal.create('a', StringType), "a") + } + test("construct literals from java.time.LocalDate") { Seq( LocalDate.of(1, 1, 1), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-39047][SQL] Replace the error class ILLEGAL_SUBSTRING by INVALID_PARAMETER_VALUE
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9dcc24c36f6 [SPARK-39047][SQL] Replace the error class ILLEGAL_SUBSTRING by INVALID_PARAMETER_VALUE 9dcc24c36f6 is described below commit 9dcc24c36f6fcdf43bf66fe50415be575f7b2918 Author: Max Gekk AuthorDate: Thu Apr 28 07:46:44 2022 +0300 [SPARK-39047][SQL] Replace the error class ILLEGAL_SUBSTRING by INVALID_PARAMETER_VALUE ### What changes were proposed in this pull request? In the PR, I propose to remove the `ILLEGAL_SUBSTRING` error class, and use `INVALID_PARAMETER_VALUE` in the case when the `strfmt` parameter of the `format_string()` function contains `%0$`. The last value is handled differently by JDKs: _"... Java 8 and Java 11 uses it as "%1$", and Java 17 throws IllegalFormatArgumentIndexException(Illegal format argument index = 0)"_. ### Why are the changes needed? To improve code maintenance and user experience with Spark SQL by reducing the number of user-facing error classes. ### Does this PR introduce _any_ user-facing change? Yes, it changes user-facing error message. Before: ```sql spark-sql> select format_string('%0$s', 'Hello'); Error in query: [ILLEGAL_SUBSTRING] The argument_index of string format cannot contain position 0$.; line 1 pos 7 ``` After: ```sql spark-sql> select format_string('%0$s', 'Hello'); Error in query: [INVALID_PARAMETER_VALUE] The value of parameter(s) 'strfmt' in `format_string` is invalid: expects %1$, %2$ and so on, but got %0$.; line 1 pos 7 ``` ### How was this patch tested? By running the affected test suites: ``` $ build/sbt "test:testOnly *SparkThrowableSuite" $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z text.sql" $ build/sbt "test:testOnly *QueryCompilationErrorsSuite" ``` Closes #36380 from MaxGekk/error-class-ILLEGAL_SUBSTRING. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- core/src/main/resources/error/error-classes.json | 3 --- .../apache/spark/sql/catalyst/expressions/stringExpressions.scala | 3 +-- .../scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala | 7 --- .../src/test/resources/sql-tests/results/postgreSQL/text.sql.out | 2 +- .../org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala | 7 --- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 673866e6c35..4738599685b 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -71,9 +71,6 @@ "GROUPING_SIZE_LIMIT_EXCEEDED" : { "message" : [ "Grouping sets size cannot be greater than " ] }, - "ILLEGAL_SUBSTRING" : { -"message" : [ " cannot contain ." ] - }, "INCOMPARABLE_PIVOT_COLUMN" : { "message" : [ "Invalid pivot column ''. Pivot columns must be comparable." 
], "sqlState" : "42000" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 976caeb3502..9089ff46637 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1898,8 +1898,7 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC */ private def checkArgumentIndexNotZero(expression: Expression): Unit = expression match { case StringLiteral(pattern) if pattern.contains("%0$") => - throw QueryCompilationErrors.illegalSubstringError( -"The argument_index of string format", "position 0$") + throw QueryCompilationErrors.zeroArgumentIndexError() case _ => // do nothing } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 7f212ed5891..3d379fb4f71 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -66,10 +66,11 @@ object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Array(sizeLimit.toString)) } - def illegalSubstringError(subject: String, illegalContent: String): Throwable = { + def zeroArgumentIndexError(): Throwable = { new AnalysisException( - errorClass = "ILLEGAL_SUBSTRING", - messageParameters = Array(subject, illegalContent)) + er
[GitHub] [spark-website] yaooqinn commented on pull request #385: Fix Apache Project Website Checks - Events
yaooqinn commented on PR #385: URL: https://github.com/apache/spark-website/pull/385#issuecomment-717872 cc @srowen @dongjoon-hyun @HyukjinKwon -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] [spark-website] yaooqinn opened a new pull request, #385: Fix Apache Project Website Checks - Events
yaooqinn opened a new pull request, #385: URL: https://github.com/apache/spark-website/pull/385 https://whimsy.apache.org/site/project/spark in red light -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 606a99f4f2d [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set 606a99f4f2d is described below commit 606a99f4f2d91ea30c81285d6c95ee566e80577f Author: Gengliang Wang AuthorDate: Thu Apr 28 09:59:17 2022 +0800 [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set ### What changes were proposed in this pull request? For the query context `TreeNode.origin.context`, this PR proposes to return an empty context string if * the query text/ the start index/ the stop index is missing * the start index is less than 0 * the stop index is larger than the length of query text * the start index is larger than the stop index ### Why are the changes needed? There are downstream projects that depend on Spark. There is no guarantee for the correctness of TreeNode.origin. Developers may create a plan/expression with an Origin containing wrong startIndex/stopIndex/sqlText. Thus, to avoid errors in calling `String.substring` or showing misleading debug information, I suggest returning an empty context string if TreeNode.origin is wrongly set. The query context is just for better error messages and we should handle it cautiously. ### Does this PR introduce _any_ user-facing change? No, the context framework is not released yet. ### How was this patch tested? UT Closes #36379 from gengliangwang/safeContext. 
Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang (cherry picked from commit 7fe2759e9f81ec267e92e1c6f8a48f42042db791) Signed-off-by: Gengliang Wang --- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 126 +++-- .../expressions/ArithmeticExpressionSuite.scala| 6 + .../spark/sql/catalyst/trees/TreeNodeSuite.scala | 37 ++ 3 files changed, 110 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 00690abf18f..079abd3f2e0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -72,71 +72,79 @@ case class Origin( * SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i * ^^^ */ - lazy val context: String = sqlText.map { text => -val positionContext = if (line.isDefined && startPosition.isDefined) { - s"(line ${line.get}, position ${startPosition.get})" -} else { + lazy val context: String = { +// If the query context is missing or incorrect, simply return an empty string. +if (sqlText.isEmpty || startIndex.isEmpty || stopIndex.isEmpty || + startIndex.get < 0 || stopIndex.get >= sqlText.get.length || startIndex.get > stopIndex.get) { "" -} -val objectContext = if (objectType.isDefined && objectName.isDefined) { - s" of ${objectType.get} ${objectName.get}" } else { - "" -} -val builder = new StringBuilder -builder ++= s"\n== SQL$objectContext$positionContext ==\n" - -val start = startIndex.getOrElse(0) -val stop = stopIndex.getOrElse(sqlText.get.length - 1) -// Ideally we should show all the lines which contains the SQL text context of the current node: -// [additional text] [current tree node] [additional text] -// However, we need to truncate the additional text in case it is too long. The following -// variable is to define the max length of additional text. 
-val maxExtraContextLength = 32 -val truncatedText = "..." -var lineStartIndex = start -// Collect the SQL text within the starting line of current Node. -// The text is truncated if it is too long. -while(lineStartIndex >= 0 && - start - lineStartIndex <= maxExtraContextLength && - text.charAt(lineStartIndex) != '\n') { - lineStartIndex -= 1 -} -val startTruncated = start - lineStartIndex > maxExtraContextLength -var currentIndex = lineStartIndex -if (startTruncated) { - currentIndex -= truncatedText.length -} + val positionContext = if (line.isDefined && startPosition.isDefined) { +s"(line ${line.get}, position ${startPosition.get})" + } else { +"" + } + val objectContext = if (objectType.isDefined && objectName.isDefined) { +s" of ${objectType.get} ${objectName.get}" + } else { +"" + } + val builder = new StringBuilder + builder ++= s"\n== SQL$objectContext$positionContext ==\n" + + val text = sqlText.get
[spark] branch master updated: [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7fe2759e9f8 [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set 7fe2759e9f8 is described below commit 7fe2759e9f81ec267e92e1c6f8a48f42042db791 Author: Gengliang Wang AuthorDate: Thu Apr 28 09:59:17 2022 +0800 [SPARK-39046][SQL] Return an empty context string if TreeNode.origin is wrongly set ### What changes were proposed in this pull request? For the query context `TreeNode.origin.context`, this PR proposes to return an empty context string if * the query text/ the start index/ the stop index is missing * the start index is less than 0 * the stop index is larger than the length of query text * the start index is larger than the stop index ### Why are the changes needed? There are downstream projects that depend on Spark. There is no guarantee for the correctness of TreeNode.origin. Developers may create a plan/expression with an Origin containing wrong startIndex/stopIndex/sqlText. Thus, to avoid errors in calling `String.substring` or showing misleading debug information, I suggest returning an empty context string if TreeNode.origin is wrongly set. The query context is just for better error messages and we should handle it cautiously. ### Does this PR introduce _any_ user-facing change? No, the context framework is not released yet. ### How was this patch tested? UT Closes #36379 from gengliangwang/safeContext. 
Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang --- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 126 +++-- .../expressions/ArithmeticExpressionSuite.scala| 6 + .../spark/sql/catalyst/trees/TreeNodeSuite.scala | 37 ++ 3 files changed, 110 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index eed59b9e1bf..0714898e19d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -72,71 +72,79 @@ case class Origin( * SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i * ^^^ */ - lazy val context: String = sqlText.map { text => -val positionContext = if (line.isDefined && startPosition.isDefined) { - s"(line ${line.get}, position ${startPosition.get})" -} else { + lazy val context: String = { +// If the query context is missing or incorrect, simply return an empty string. +if (sqlText.isEmpty || startIndex.isEmpty || stopIndex.isEmpty || + startIndex.get < 0 || stopIndex.get >= sqlText.get.length || startIndex.get > stopIndex.get) { "" -} -val objectContext = if (objectType.isDefined && objectName.isDefined) { - s" of ${objectType.get} ${objectName.get}" } else { - "" -} -val builder = new StringBuilder -builder ++= s"\n== SQL$objectContext$positionContext ==\n" - -val start = startIndex.getOrElse(0) -val stop = stopIndex.getOrElse(sqlText.get.length - 1) -// Ideally we should show all the lines which contains the SQL text context of the current node: -// [additional text] [current tree node] [additional text] -// However, we need to truncate the additional text in case it is too long. The following -// variable is to define the max length of additional text. -val maxExtraContextLength = 32 -val truncatedText = "..." 
-var lineStartIndex = start -// Collect the SQL text within the starting line of current Node. -// The text is truncated if it is too long. -while(lineStartIndex >= 0 && - start - lineStartIndex <= maxExtraContextLength && - text.charAt(lineStartIndex) != '\n') { - lineStartIndex -= 1 -} -val startTruncated = start - lineStartIndex > maxExtraContextLength -var currentIndex = lineStartIndex -if (startTruncated) { - currentIndex -= truncatedText.length -} + val positionContext = if (line.isDefined && startPosition.isDefined) { +s"(line ${line.get}, position ${startPosition.get})" + } else { +"" + } + val objectContext = if (objectType.isDefined && objectName.isDefined) { +s" of ${objectType.get} ${objectName.get}" + } else { +"" + } + val builder = new StringBuilder + builder ++= s"\n== SQL$objectContext$positionContext ==\n" + + val text = sqlText.get + val start = math.max(startIndex.get, 0) + val stop = math.min(stopIndex.getOrElse(text.length - 1), text
[spark] branch branch-3.3 updated: [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 96d66b030d9 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info` 96d66b030d9 is described below commit 96d66b030d914ccd7ded74e33287e45d09935e27 Author: Xinrong Meng AuthorDate: Thu Apr 28 09:25:37 2022 +0900 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info` ### What changes were proposed in this pull request? Suppress PerformanceWarnings of DataFrame.info ### Why are the changes needed? To improve usability. ### Does this PR introduce _any_ user-facing change? No. Only PerformanceWarnings of DataFrame.info are suppressed. ### How was this patch tested? Manual tests. Closes #36367 from xinrong-databricks/frame.info. Authored-by: Xinrong Meng Signed-off-by: Hyukjin Kwon (cherry picked from commit 594337fad131280f62107326062fb554f0566d43) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/pandas/conversion.py | 31 ++- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 808444f1e2e..fff0bac5480 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -15,9 +15,9 @@ # limitations under the License. # import sys -import warnings from collections import Counter from typing import List, Optional, Type, Union, no_type_check, overload, TYPE_CHECKING +from warnings import catch_warnings, simplefilter, warn from pyspark.rdd import _load_from_socket from pyspark.sql.pandas.serializers import ArrowCollectSerializer @@ -111,7 +111,7 @@ class PandasConversionMixin: "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." 
% str(e) ) -warnings.warn(msg) +warn(msg) use_arrow = False else: msg = ( @@ -121,7 +121,7 @@ class PandasConversionMixin: "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " "false.\n %s" % str(e) ) -warnings.warn(msg) +warn(msg) raise # Try to use Arrow optimization when the schema is supported and the required version @@ -198,7 +198,7 @@ class PandasConversionMixin: "effect on failures in the middle of " "computation.\n %s" % str(e) ) -warnings.warn(msg) +warn(msg) raise # Below is toPandas without Arrow optimization. @@ -247,13 +247,18 @@ class PandasConversionMixin: if (t is not None and not is_timedelta64_dtype(t)) or should_check_timedelta: series = series.astype(t, copy=False) -# `insert` API makes copy of data, we only do it for Series of duplicate column names. -# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work because `iloc` could -# return a view or a copy depending by context. -if column_counter[column_name] > 1: -df.insert(index, column_name, series, allow_duplicates=True) -else: -df[column_name] = series +with catch_warnings(): +from pandas.errors import PerformanceWarning + +simplefilter(action="ignore", category=PerformanceWarning) +# `insert` API makes copy of data, +# we only do it for Series of duplicate column names. +# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work +# because `iloc` could return a view or a copy depending by context. +if column_counter[column_name] > 1: +df.insert(index, column_name, series, allow_duplicates=True) +else: +df[column_name] = series if timezone is None: return df @@ -417,7 +422,7 @@ class SparkConversionMixin: "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." % str(e) ) -warnings.warn(msg) +warn(msg) else: msg = ( "createDataFrame attempted Arrow optimization because " @@ -426,7 +431,7 @@ class SparkConversionMixin: "fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' "
[spark] branch master updated: [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 594337fad13 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info` 594337fad13 is described below commit 594337fad131280f62107326062fb554f0566d43 Author: Xinrong Meng AuthorDate: Thu Apr 28 09:25:37 2022 +0900 [SPARK-38988][PYTHON] Suppress PerformanceWarnings of `DataFrame.info` ### What changes were proposed in this pull request? Suppress PerformanceWarnings of DataFrame.info ### Why are the changes needed? To improve usability. ### Does this PR introduce _any_ user-facing change? No. Only PerformanceWarnings of DataFrame.info are suppressed. ### How was this patch tested? Manual tests. Closes #36367 from xinrong-databricks/frame.info. Authored-by: Xinrong Meng Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/pandas/conversion.py | 31 ++- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 808444f1e2e..fff0bac5480 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -15,9 +15,9 @@ # limitations under the License. # import sys -import warnings from collections import Counter from typing import List, Optional, Type, Union, no_type_check, overload, TYPE_CHECKING +from warnings import catch_warnings, simplefilter, warn from pyspark.rdd import _load_from_socket from pyspark.sql.pandas.serializers import ArrowCollectSerializer @@ -111,7 +111,7 @@ class PandasConversionMixin: "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." 
% str(e) ) -warnings.warn(msg) +warn(msg) use_arrow = False else: msg = ( @@ -121,7 +121,7 @@ class PandasConversionMixin: "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " "false.\n %s" % str(e) ) -warnings.warn(msg) +warn(msg) raise # Try to use Arrow optimization when the schema is supported and the required version @@ -198,7 +198,7 @@ class PandasConversionMixin: "effect on failures in the middle of " "computation.\n %s" % str(e) ) -warnings.warn(msg) +warn(msg) raise # Below is toPandas without Arrow optimization. @@ -247,13 +247,18 @@ class PandasConversionMixin: if (t is not None and not is_timedelta64_dtype(t)) or should_check_timedelta: series = series.astype(t, copy=False) -# `insert` API makes copy of data, we only do it for Series of duplicate column names. -# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work because `iloc` could -# return a view or a copy depending by context. -if column_counter[column_name] > 1: -df.insert(index, column_name, series, allow_duplicates=True) -else: -df[column_name] = series +with catch_warnings(): +from pandas.errors import PerformanceWarning + +simplefilter(action="ignore", category=PerformanceWarning) +# `insert` API makes copy of data, +# we only do it for Series of duplicate column names. +# `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work +# because `iloc` could return a view or a copy depending by context. +if column_counter[column_name] > 1: +df.insert(index, column_name, series, allow_duplicates=True) +else: +df[column_name] = series if timezone is None: return df @@ -417,7 +422,7 @@ class SparkConversionMixin: "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." 
% str(e) ) -warnings.warn(msg) +warn(msg) else: msg = ( "createDataFrame attempted Arrow optimization because " @@ -426,7 +431,7 @@ class SparkConversionMixin: "fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' " "has been set to false.\n %s" % str(e) ) -warnings.warn(msg) +
[spark] branch branch-3.3 updated: [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new c9b6b50ff64 [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass` c9b6b50ff64 is described below commit c9b6b50ff640b531cc953249311a78f5b75ce349 Author: Bjørn Jørgensen AuthorDate: Thu Apr 28 09:20:53 2022 +0900 [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass` ### What changes were proposed in this pull request? Remove unneeded `pass` ### Why are the changes needed? Class`s Estimator, Transformer and Evaluator are abstract classes. Which has functions. ValueError in def run() has code. By removing `pass` it will be easier to read, understand and reuse code. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests passed. Closes #36383 from bjornjorgensen/remove-unneeded-pass. Lead-authored-by: Bjørn Jørgensen Co-authored-by: bjornjorgensen Signed-off-by: Hyukjin Kwon (cherry picked from commit 0e875875059c1cbf36de49205a4ce8dbc483d9d1) Signed-off-by: Hyukjin Kwon --- python/pyspark/ml/base.py | 4 python/pyspark/ml/evaluation.py | 2 -- python/pyspark/tests/test_worker.py | 1 - 3 files changed, 7 deletions(-) diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py index 20540ebbef6..34c3aa9c62c 100644 --- a/python/pyspark/ml/base.py +++ b/python/pyspark/ml/base.py @@ -110,8 +110,6 @@ class Estimator(Params, Generic[M], metaclass=ABCMeta): .. versionadded:: 1.3.0 """ -pass - @abstractmethod def _fit(self, dataset: DataFrame) -> M: """ @@ -220,8 +218,6 @@ class Transformer(Params, metaclass=ABCMeta): .. 
versionadded:: 1.3.0 """ -pass - @abstractmethod def _transform(self, dataset: DataFrame) -> DataFrame: """ diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index ff0e5b91e42..19d123debae 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -67,8 +67,6 @@ class Evaluator(Params, metaclass=ABCMeta): .. versionadded:: 1.4.0 """ -pass - @abstractmethod def _evaluate(self, dataset: DataFrame) -> float: """ diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index 0fdf6adb031..06ada8f81d5 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -70,7 +70,6 @@ class WorkerTests(ReusedPySparkTestCase): try: daemon_pid, worker_pid = map(int, data) except ValueError: -pass # In case the value is not written yet. cnt += 1 if cnt == 10: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (c19fadabde3 -> 0e875875059)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from c19fadabde3 [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py` add 0e875875059 [SPARK-39049][PYTHON][CORE][ML] Remove unneeded `pass` No new revisions were added by this update. Summary of changes: python/pyspark/ml/base.py | 4 python/pyspark/ml/evaluation.py | 2 -- python/pyspark/tests/test_worker.py | 1 - 3 files changed, 7 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 84addc5d1d8 [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py` 84addc5d1d8 is described below commit 84addc5d1d8359a5b716ec869489fc961af23cf2 Author: Xinrong Meng AuthorDate: Thu Apr 28 09:17:24 2022 +0900 [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py` Minor refactoring of `python/pyspark/sql/pandas/conversion.py`, which includes: - doc change - renaming To improve code readability and maintainability. No. Existing tests. Closes #36384 from xinrong-databricks/conversion.py. Authored-by: Xinrong Meng Signed-off-by: Hyukjin Kwon (cherry picked from commit c19fadabde3ef3f9c7e4fa9bf74632a4f8e1f3e2) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/pandas/conversion.py | 52 - 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 7153450d2bc..808444f1e2e 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -51,7 +51,7 @@ if TYPE_CHECKING: class PandasConversionMixin: """ -Min-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame` +Mix-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame` can use this class. """ @@ -65,10 +65,10 @@ class PandasConversionMixin: Notes - -This method should only be used if the resulting Pandas's :class:`DataFrame` is +This method should only be used if the resulting Pandas ``pandas.DataFrame`` is expected to be small, as all the data is loaded into the driver's memory. -Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. +Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental. 
Examples @@ -136,8 +136,7 @@ class PandasConversionMixin: # Rename columns to avoid duplicated column names. tmp_column_names = ["col_{}".format(i) for i in range(len(self.columns))] -c = self.sparkSession._jconf -self_destruct = c.arrowPySparkSelfDestructEnabled() +self_destruct = jconf.arrowPySparkSelfDestructEnabled() batches = self.toDF(*tmp_column_names)._collect_as_arrow( split_batches=self_destruct ) @@ -176,11 +175,11 @@ class PandasConversionMixin: else: corrected_panda_types = {} for index, field in enumerate(self.schema): -panda_type = PandasConversionMixin._to_corrected_pandas_type( +pandas_type = PandasConversionMixin._to_corrected_pandas_type( field.dataType ) corrected_panda_types[tmp_column_names[index]] = ( -np.object0 if panda_type is None else panda_type +np.object0 if pandas_type is None else pandas_type ) pdf = pd.DataFrame(columns=tmp_column_names).astype( @@ -206,36 +205,37 @@ class PandasConversionMixin: pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) column_counter = Counter(self.columns) -dtype: List[Optional[Type]] = [None] * len(self.schema) -for fieldIdx, field in enumerate(self.schema): -# For duplicate column name, we use `iloc` to access it. +corrected_dtypes: List[Optional[Type]] = [None] * len(self.schema) +for index, field in enumerate(self.schema): +# We use `iloc` to access columns with duplicate column names. if column_counter[field.name] > 1: -pandas_col = pdf.iloc[:, fieldIdx] +pandas_col = pdf.iloc[:, index] else: pandas_col = pdf[field.name] pandas_type = PandasConversionMixin._to_corrected_pandas_type(field.dataType) # SPARK-21766: if an integer field is nullable and has null values, it can be -# inferred by pandas as float column. Once we convert the column with NaN back -# to integer type e.g., np.int16, we will hit exception. So we use the inferred -# float type, not the corrected type from the schema in this case. +# inferred by pandas as a float column. If we convert the column with NaN back +
[spark] branch master updated (7b637eef77b -> c19fadabde3)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 7b637eef77b [SPARK-39038][CI] Skip reporting test results if triggering workflow was skipped add c19fadabde3 [SPARK-39051][PYTHON] Minor refactoring of `python/pyspark/sql/pandas/conversion.py` No new revisions were added by this update. Summary of changes: python/pyspark/sql/pandas/conversion.py | 52 - 1 file changed, 25 insertions(+), 27 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-39038][CI] Skip reporting test results if triggering workflow was skipped
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7b637eef77b [SPARK-39038][CI] Skip reporting test results if triggering workflow was skipped 7b637eef77b is described below commit 7b637eef77b5069cb8825db8ff79389a935364d8 Author: Enrico Minack AuthorDate: Thu Apr 28 09:00:17 2022 +0900 [SPARK-39038][CI] Skip reporting test results if triggering workflow was skipped ### What changes were proposed in this pull request? The `"Report test results"` workflow should be skipped when the triggering workflow completed with conclusion `'skipped'`. ### Why are the changes needed? The `"Report test results"` workflow is triggered when either `"Build and test"` or `"Build and test (ANSI)"` complete. On fork repositories, workflow `"Build and test (ANSI)"` is always skipped. The triggered `"Report test results"` workflow downloads artifacts from the triggering workflow and errors because there are none artifacts. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? In personal repo: - https://github.com/EnricoMi/spark/actions/runs/2231657986 - triggered by https://github.com/EnricoMi/spark/actions/runs/2231657828 Closes #36371 from EnricoMi/master. 
Authored-by: Enrico Minack Signed-off-by: Hyukjin Kwon --- .github/workflows/test_report.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml index a3f09c06ed9..5f46985a975 100644 --- a/.github/workflows/test_report.yml +++ b/.github/workflows/test_report.yml @@ -26,6 +26,7 @@ on: jobs: test_report: +if: github.event.workflow_run.conclusion != 'skipped' runs-on: ubuntu-latest steps: - name: Download test results to report - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 4a4e35a30c7 [SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions 4a4e35a30c7 is described below commit 4a4e35a30c7bb7534aece8e917a2813d47c2c498 Author: Jiaan Geng AuthorDate: Thu Apr 28 00:43:55 2022 +0800 [SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions ### What changes were proposed in this pull request? Currently, Spark DS V2 aggregate push-down only supports group by column. But the SQL shown below is very useful and common. ``` SELECT CASE WHEN 'SALARY' > 8000.00 AND 'SALARY' < 1.00 THEN 'SALARY' ELSE 0.00 END AS key, SUM('SALARY') FROM "test"."employee" GROUP BY key ``` ### Why are the changes needed? Let DS V2 aggregate push-down support group by expressions ### Does this PR introduce _any_ user-facing change? 'No'. New feature. ### How was this patch tested? New tests Closes #36325 from beliefer/SPARK-38997. 
Authored-by: Jiaan Geng Signed-off-by: Wenchen Fan (cherry picked from commit ee6ea3c68694e35c36ad006a7762297800d1e463) Signed-off-by: Wenchen Fan --- .../expressions/aggregate/Aggregation.java | 10 +- .../spark/sql/execution/DataSourceScanExec.scala | 2 +- .../datasources/AggregatePushDownUtils.scala | 23 ++-- .../execution/datasources/DataSourceStrategy.scala | 7 +- .../sql/execution/datasources/orc/OrcUtils.scala | 2 +- .../datasources/parquet/ParquetUtils.scala | 2 +- .../datasources/v2/V2ScanRelationPushDown.scala| 23 ++-- .../datasources/v2/jdbc/JDBCScanBuilder.scala | 27 ++--- .../sql/execution/datasources/v2/orc/OrcScan.scala | 2 +- .../datasources/v2/parquet/ParquetScan.scala | 2 +- .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala| 120 - 11 files changed, 151 insertions(+), 69 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java index cf7dbb2978d..11d9e475ca1 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/Aggregation.java @@ -20,7 +20,7 @@ package org.apache.spark.sql.connector.expressions.aggregate; import java.io.Serializable; import org.apache.spark.annotation.Evolving; -import org.apache.spark.sql.connector.expressions.NamedReference; +import org.apache.spark.sql.connector.expressions.Expression; /** * Aggregation in SQL statement. 
@@ -30,14 +30,14 @@ import org.apache.spark.sql.connector.expressions.NamedReference; @Evolving public final class Aggregation implements Serializable { private final AggregateFunc[] aggregateExpressions; - private final NamedReference[] groupByColumns; + private final Expression[] groupByExpressions; - public Aggregation(AggregateFunc[] aggregateExpressions, NamedReference[] groupByColumns) { + public Aggregation(AggregateFunc[] aggregateExpressions, Expression[] groupByExpressions) { this.aggregateExpressions = aggregateExpressions; -this.groupByColumns = groupByColumns; +this.groupByExpressions = groupByExpressions; } public AggregateFunc[] aggregateExpressions() { return aggregateExpressions; } - public NamedReference[] groupByColumns() { return groupByColumns; } + public Expression[] groupByExpressions() { return groupByExpressions; } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 5067cd7fa3c..ac0f3af5725 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -163,7 +163,7 @@ case class RowDataSourceScanExec( "PushedFilters" -> pushedFilters) ++ pushedDownOperators.aggregation.fold(Map[String, String]()) { v => Map("PushedAggregates" -> seqToString(v.aggregateExpressions.map(_.describe())), - "PushedGroupByColumns" -> seqToString(v.groupByColumns.map(_.describe(} ++ + "PushedGroupByExpressions" -> seqToString(v.groupByExpressions.map(_.describe(} ++ topNOrLimitInfo ++ pushedDownOperators.sample.map(v => "PushedSample" -> s"SAMPLE (${(v.upperBound - v.lowerBound) * 100}) ${v.withReplacement} SEED(${v.seed})" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datas
[spark] branch master updated (852997d6ed6 -> ee6ea3c6869)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 852997d6ed6 [SPARK-38914][SQL] Allow user to insert specified columns into insertable view add ee6ea3c6869 [SPARK-38997][SQL] DS V2 aggregate push-down supports group by expressions No new revisions were added by this update. Summary of changes: .../expressions/aggregate/Aggregation.java | 10 +- .../spark/sql/execution/DataSourceScanExec.scala | 2 +- .../datasources/AggregatePushDownUtils.scala | 23 ++-- .../execution/datasources/DataSourceStrategy.scala | 7 +- .../sql/execution/datasources/orc/OrcUtils.scala | 2 +- .../datasources/parquet/ParquetUtils.scala | 2 +- .../datasources/v2/V2ScanRelationPushDown.scala| 23 ++-- .../datasources/v2/jdbc/JDBCScanBuilder.scala | 27 ++--- .../sql/execution/datasources/v2/orc/OrcScan.scala | 2 +- .../datasources/v2/parquet/ParquetScan.scala | 2 +- .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala| 120 - 11 files changed, 151 insertions(+), 69 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-38914][SQL] Allow user to insert specified columns into insertable view
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 852997d6ed6 [SPARK-38914][SQL] Allow user to insert specified columns into insertable view 852997d6ed6 is described below commit 852997d6ed61f4e098803b96927e7cbbd24e3d7c Author: morvenhuang AuthorDate: Wed Apr 27 23:35:45 2022 +0800 [SPARK-38914][SQL] Allow user to insert specified columns into insertable view ### What changes were proposed in this pull request? Allow user to insert specified columns into insertable view, for example, ``` CREATE TEMPORARY VIEW v1 (c1 int, c2 string) USING org.apache.spark.sql.json.DefaultSource OPTIONS (path 'json_dir') INSERT INTO v1(c1) VALUES(100) SELECT c1, c2 FROM v1; +---++ | c1| c2| +---++ |100|null| +---++ ``` ### Why are the changes needed? The option spark.sql.defaultColumn.useNullsForMissingDefautValues allows us to insert specified columns into table (https://issues.apache.org/jira/browse/SPARK-38795), but currently this option does not work for insertable view. To keep consistent with the behavior of INSERT INTO table, we should also allow user to specify columns when running INSERT INTO view. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? New unit tests added. Closes #36212 from morvenhuang/SPARK-38914. 
Authored-by: morvenhuang Signed-off-by: Gengliang Wang --- .../catalyst/analysis/ResolveDefaultColumns.scala | 6 ++- .../org/apache/spark/sql/internal/SQLConf.scala| 2 +- .../org/apache/spark/sql/sources/InsertSuite.scala | 45 ++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala index 447769be300..422a1e422be 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala @@ -79,7 +79,8 @@ case class ResolveDefaultColumns( replaceExplicitDefaultColumnValues(analyzer, expanded).getOrElse(table) replaced - case i@InsertIntoStatement(_, _, _, project: Project, _, _) => + case i@InsertIntoStatement(_, _, _, project: Project, _, _) +if !project.projectList.exists(_.isInstanceOf[Star]) => enclosingInsert = Some(i) insertTableSchemaWithoutPartitionColumns = getInsertTableSchemaWithoutPartitionColumns val expanded: Project = addMissingDefaultColumnValues(project).getOrElse(project) @@ -280,6 +281,9 @@ case class ResolveDefaultColumns( case SubqueryAlias(_, r: UnresolvedCatalogRelation) => StructType(r.tableMeta.schema.fields.dropRight( enclosingInsert.get.partitionSpec.size)) + case SubqueryAlias(_, r: View) if r.isTempView => +StructType(r.schema.fields.dropRight( + enclosingInsert.get.partitionSpec.size)) case _ => return None } // Rearrange the columns in the result schema to match the order of the explicit column list, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0ba870d10e9..8876d780799 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2880,7 +2880,7 @@ object SQLConf { .createWithDefault(true) val USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES = -buildConf("spark.sql.defaultColumn.useNullsForMissingDefautValues") +buildConf("spark.sql.defaultColumn.useNullsForMissingDefaultValues") .internal() .doc("When true, and DEFAULT columns are enabled, allow column definitions lacking " + "explicit default values to behave as if they had specified DEFAULT NULL instead. " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 04acedb7ead..1312353f537 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -856,6 +856,33 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } + test("Allow user to insert specified columns into insertable view") { +withSQLConf(SQLConf.USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES.key -> "true") { + sql("INSERT OVERWRITE TABLE jsonTable SELECT a FROM jt") +
[spark] branch master updated: [SPARK-38979][SQL] Improve error log readability in OrcUtils.requestedColumnIds
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 70b4b1d1f69 [SPARK-38979][SQL] Improve error log readability in OrcUtils.requestedColumnIds 70b4b1d1f69 is described below commit 70b4b1d1f69be3a15eadb0e798139982c152b7bb Author: sychen AuthorDate: Wed Apr 27 08:38:28 2022 -0500 [SPARK-38979][SQL] Improve error log readability in OrcUtils.requestedColumnIds ### What changes were proposed in this pull request? Add detailed log in `OrcUtils#requestedColumnIds`. ### Why are the changes needed? In `OrcUtils#requestedColumnIds` sometimes it fails because `orcFieldNames.length > dataSchema.length`, the log is not very clear. ``` java.lang.AssertionError: assertion failed: The given data schema struct has less fields than the actual ORC physical schema, no idea which columns were dropped, fail to read. ``` after the change ``` java.lang.AssertionError: assertion failed: The given data schema struct (length:1) has fewer 1 fields than the actual ORC physical schema struct (length:2), no idea which columns were dropped, fail to read. ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? exist UT / local test Closes #36296 from cxzl25/SPARK-38979. 
Authored-by: sychen Signed-off-by: Sean Owen --- .../org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index f07573beae6..1783aadaa78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -224,7 +224,9 @@ object OrcUtils extends Logging { // the physical schema doesn't match the data schema). // In these cases we map the physical schema to the data schema by index. assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + - s"${dataSchema.catalogString} has less fields than the actual ORC physical schema, " + + s"${dataSchema.catalogString} (length:${dataSchema.length}) " + + s"has fewer ${orcFieldNames.length - dataSchema.length} fields than " + + s"the actual ORC physical schema $orcSchema (length:${orcFieldNames.length}), " + "no idea which columns were dropped, fail to read.") // for ORC file written by Hive, no field names // in the physical schema, there is a need to send the - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated (b3ecff34ab6 -> b25276f4385)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git from b3ecff34ab6 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite add b25276f4385 [SPARK-39015][SQL][3.3] Remove the usage of toSQLValue(v) without an explicit type No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/expressions/Cast.scala | 58 -- .../expressions/complexTypeExtractors.scala| 5 +- .../spark/sql/catalyst/util/DateTimeUtils.scala| 14 -- .../spark/sql/catalyst/util/IntervalUtils.scala| 23 + .../apache/spark/sql/errors/QueryErrorsBase.scala | 14 ++ .../spark/sql/errors/QueryExecutionErrors.scala| 47 ++ .../scala/org/apache/spark/sql/types/Decimal.scala | 21 +--- .../org/apache/spark/sql/types/numerics.scala | 13 +++-- .../catalyst/expressions/AnsiCastSuiteBase.scala | 3 +- .../test/resources/sql-tests/inputs/ansi/map.sql | 1 + .../resources/sql-tests/results/ansi/map.sql.out | 14 +- 11 files changed, 125 insertions(+), 88 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (e49147af4a8 -> 4e84f339973)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from e49147af4a8 [SPARK-39015][SQL] Remove the usage of toSQLValue(v) without an explicit type add 4e84f339973 [SPARK-39027][SQL] Output SQL statements in error messages in upper case and w/o double quotes No new revisions were added by this update. Summary of changes: python/pyspark/sql/tests/test_udf.py | 2 +- .../apache/spark/sql/errors/QueryErrorsBase.scala| 3 +-- .../ExtractPythonUDFFromJoinConditionSuite.scala | 2 +- .../resources/sql-tests/results/describe.sql.out | 4 ++-- .../sql/errors/QueryCompilationErrorsSuite.scala | 6 +++--- .../spark/sql/errors/QueryParsingErrorsSuite.scala | 20 ++-- .../spark/sql/execution/command/DDLParserSuite.scala | 4 ++-- 7 files changed, 20 insertions(+), 21 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (d05e01d5402 -> e49147af4a8)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from d05e01d5402 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite add e49147af4a8 [SPARK-39015][SQL] Remove the usage of toSQLValue(v) without an explicit type No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/expressions/Cast.scala | 58 -- .../expressions/complexTypeExtractors.scala| 5 +- .../spark/sql/catalyst/util/DateTimeUtils.scala| 14 -- .../spark/sql/catalyst/util/IntervalUtils.scala| 23 + .../apache/spark/sql/errors/QueryErrorsBase.scala | 14 ++ .../spark/sql/errors/QueryExecutionErrors.scala| 47 ++ .../scala/org/apache/spark/sql/types/Decimal.scala | 21 +--- .../org/apache/spark/sql/types/numerics.scala | 13 +++-- .../catalyst/expressions/AnsiCastSuiteBase.scala | 3 +- .../test/resources/sql-tests/inputs/ansi/map.sql | 1 + .../resources/sql-tests/results/ansi/map.sql.out | 14 +- .../sql/errors/QueryExecutionAnsiErrorsSuite.scala | 5 +- 12 files changed, 128 insertions(+), 90 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new b3ecff34ab6 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite b3ecff34ab6 is described below commit b3ecff34ab6e3f7b0852db7c0b391cefd176e6ca Author: Peter Toth AuthorDate: Wed Apr 27 16:16:15 2022 +0800 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite To remove unnecessary changes from `InjectRuntimeFilterSuite` after https://github.com/apache/spark/pull/32298. These are not needed after https://github.com/apache/spark/pull/34929 as the final optimized plan doesn't contain any `WithCTE` nodes. No need for those changes. No. Added new test. Closes #36361 from peter-toth/SPARK-34079-multi-column-scalar-subquery-follow-up-2. Authored-by: Peter Toth Signed-off-by: Wenchen Fan (cherry picked from commit d05e01d54024e3844f1e48e03bad3fd814b8f6b9) Signed-off-by: Wenchen Fan --- .../spark/sql/InjectRuntimeFilterSuite.scala | 73 +- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala index b541419c823..6c6bd1799e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala @@ -19,13 +19,17 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.{Alias, BloomFilterMightContain, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, BloomFilterAggregate} +import org.apache.spark.sql.catalyst.optimizer.MergeScalarSubqueries import org.apache.spark.sql.catalyst.plans.LeftSemi import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, 
Join, LogicalPlan} +import org.apache.spark.sql.execution.{ReusedSubqueryExec, SubqueryExec} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, AQEPropagateEmptyRelation} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} import org.apache.spark.sql.types.{IntegerType, StructType} -class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSparkSession { +class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSparkSession + with AdaptiveSparkPlanHelper { protected override def beforeAll(): Unit = { super.beforeAll() @@ -201,9 +205,16 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp sql("analyze table bf4 compute statistics for columns a4, b4, c4, d4, e4, f4") sql("analyze table bf5part compute statistics for columns a5, b5, c5, d5, e5, f5") sql("analyze table bf5filtered compute statistics for columns a5, b5, c5, d5, e5, f5") + +// `MergeScalarSubqueries` can duplicate subqueries in the optimized plan and would make testing +// complicated. +conf.setConfString(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, MergeScalarSubqueries.ruleName) } protected override def afterAll(): Unit = try { +conf.setConfString(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, + SQLConf.OPTIMIZER_EXCLUDED_RULES.defaultValueString) + sql("DROP TABLE IF EXISTS bf1") sql("DROP TABLE IF EXISTS bf2") sql("DROP TABLE IF EXISTS bf3") @@ -264,24 +275,28 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp } } - // `MergeScalarSubqueries` can duplicate subqueries in the optimized plan, but the subqueries will - // be reused in the physical plan. 
- def getNumBloomFilters(plan: LogicalPlan, scalarSubqueryCTEMultiplicator: Int = 1): Integer = { -val numBloomFilterAggs = plan.collectWithSubqueries { - case Aggregate(_, aggregateExpressions, _) => -aggregateExpressions.collect { - case Alias(AggregateExpression(bfAgg: BloomFilterAggregate, _, _, _, _), _) => -assert(bfAgg.estimatedNumItemsExpression.isInstanceOf[Literal]) -assert(bfAgg.numBitsExpression.isInstanceOf[Literal]) -1 + def getNumBloomFilters(plan: LogicalPlan): Integer = { +val numBloomFilterAggs = plan.collect { + case Filter(condition, _) => condition.collect { +case subquery: org.apache.spark.sql.catalyst.expressions.ScalarSubquery +=> subquery.plan.collect { + case Aggregate(_, aggregateExpressions, _) => +aggregateExpressions.map { + case Alias(AggregateExpression(bfAgg : BloomFilterAggregate, _, _, _, _), + _) => +assert(bfAgg.estimatedNumItemsExpression.isInstanceOf[Li
[spark] branch master updated (5b53bdfa830 -> d05e01d5402)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 5b53bdfa830 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() add d05e01d5402 [SPARK-34079][SQL][FOLLOW-UP] Revert some changes in InjectRuntimeFilterSuite No new revisions were added by this update. Summary of changes: .../spark/sql/InjectRuntimeFilterSuite.scala | 73 +- 1 file changed, 57 insertions(+), 16 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()"
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 46fa4998a4c Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()" 46fa4998a4c is described below commit 46fa4998a4c2f0d858c66c3629b13bd91d372cdd Author: Hyukjin Kwon AuthorDate: Wed Apr 27 16:59:34 2022 +0900 Revert "[SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()" This reverts commit 821a348ae7f9cd5958d29ccf342719f5d753ae28. --- python/pyspark/sql/functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 803213513da..041f8418176 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -780,8 +780,6 @@ def when(condition, value): :param condition: a boolean :class:`Column` expression. :param value: a literal value, or a :class:`Column` expression. -Examples - >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.0 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 821a348ae7f [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() 821a348ae7f is described below commit 821a348ae7f9cd5958d29ccf342719f5d753ae28 Author: vadim <86705+va...@users.noreply.github.com> AuthorDate: Wed Apr 27 16:56:18 2022 +0900 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() ### What changes were proposed in this pull request? Fix missing keyword for `pyspark.sql.functions.when()` documentation. ### Why are the changes needed? [Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html) is not formatted correctly ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? All tests passed. Closes #36369 from vadim/SPARK-39032. Authored-by: vadim <86705+va...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon (cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 041f8418176..803213513da 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -780,6 +780,8 @@ def when(condition, value): :param condition: a boolean :class:`Column` expression. :param value: a literal value, or a :class:`Column` expression. +Examples + >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 30be8d0098d [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() 30be8d0098d is described below commit 30be8d0098d037e65d798999d80efd4bc8195b82 Author: vadim <86705+va...@users.noreply.github.com> AuthorDate: Wed Apr 27 16:56:18 2022 +0900 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() ### What changes were proposed in this pull request? Fix missing keyword for `pyspark.sql.functions.when()` documentation. ### Why are the changes needed? [Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html) is not formatted correctly ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? All tests passed. Closes #36369 from vadim/SPARK-39032. Authored-by: vadim <86705+va...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon (cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 88b7c4dfb64..ec21bbe5ce2 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1404,6 +1404,8 @@ def when(condition, value): value : a literal value, or a :class:`~pyspark.sql.Column` expression. +Examples + >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 5b35caeed4b [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() 5b35caeed4b is described below commit 5b35caeed4bd2e0803887dfde61788844eb920a6 Author: vadim <86705+va...@users.noreply.github.com> AuthorDate: Wed Apr 27 16:56:18 2022 +0900 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() ### What changes were proposed in this pull request? Fix missing keyword for `pyspark.sql.functions.when()` documentation. ### Why are the changes needed? [Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html) is not formatted correctly ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? All tests passed. Closes #36369 from vadim/SPARK-39032. Authored-by: vadim <86705+va...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon (cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index acde817fbda..2d254dc9e54 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1505,6 +1505,8 @@ def when(condition, value): value : a literal value, or a :class:`~pyspark.sql.Column` expression. +Examples + >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new d59f11816b7 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() d59f11816b7 is described below commit d59f11816b7e1d9195cc08806820469f78c3e0aa Author: vadim <86705+va...@users.noreply.github.com> AuthorDate: Wed Apr 27 16:56:18 2022 +0900 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() ### What changes were proposed in this pull request? Fix missing keyword for `pyspark.sql.functions.when()` documentation. ### Why are the changes needed? [Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html) is not formatted correctly ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? All tests passed. Closes #36369 from vadim/SPARK-39032. Authored-by: vadim <86705+va...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon (cherry picked from commit 5b53bdfa83061c160652e07b999f996fc8bd2ece) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 06fdbf1ed39..019f64b5171 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1716,6 +1716,8 @@ def when(condition: Column, value: Any) -> Column: value : a literal value, or a :class:`~pyspark.sql.Column` expression. +Examples + >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 5b53bdfa830 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() 5b53bdfa830 is described below commit 5b53bdfa83061c160652e07b999f996fc8bd2ece Author: vadim <86705+va...@users.noreply.github.com> AuthorDate: Wed Apr 27 16:56:18 2022 +0900 [SPARK-39032][PYTHON][DOCS] Examples' tag for pyspark.sql.functions.when() ### What changes were proposed in this pull request? Fix missing keyword for `pyspark.sql.functions.when()` documentation. ### Why are the changes needed? [Documentation](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.when.html) is not formatted correctly ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? All tests passed. Closes #36369 from vadim/SPARK-39032. Authored-by: vadim <86705+va...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 06fdbf1ed39..019f64b5171 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1716,6 +1716,8 @@ def when(condition: Column, value: Any) -> Column: value : a literal value, or a :class:`~pyspark.sql.Column` expression. +Examples + >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() [Row(age=3), Row(age=4)] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 7217c51bd7d [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins 7217c51bd7d is described below commit 7217c51bd7dc4c68c28af6473d144737c5db0669 Author: Bruce Robbins AuthorDate: Wed Apr 27 15:38:22 2022 +0800 [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins Backport of #36230 ### What changes were proposed in this pull request? Change `EliminateOuterJoin#canFilterOutNull` to return `false` when a `where` condition throws an exception. ### Why are the changes needed? Consider this query: ``` select * from (select id, id as b from range(0, 10)) l left outer join (select id, id + 1 as c from range(0, 10)) r on l.id = r.id where assert_true(c > 0) is null; ``` The query should succeed, but instead fails with ``` java.lang.RuntimeException: '(c#1L > cast(0 as bigint))' is not true! ``` This happens even though there is no row where `c > 0` is false. The `EliminateOuterJoin` rule checks if it can convert the outer join to an inner join based on the expression in the where clause, which in this case is ``` assert_true(c > 0) is null ``` `EliminateOuterJoin#canFilterOutNull` evaluates that expression with `c` set to `null` to see if the result is `null` or `false`. That rule doesn't expect the result to be a `RuntimeException`, but in this case it always is. That is, the assertion is failing during optimization, not at run time. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test. Closes #36341 from bersprockets/outer_join_eval_assert_issue_32. 
Authored-by: Bruce Robbins Signed-off-by: Wenchen Fan (cherry picked from commit 3690c8ceb9e5c2f642b9f9e1af526f76d2e2a71a) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/optimizer/joins.scala| 14 -- .../catalyst/optimizer/OuterJoinEliminationSuite.scala | 18 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index 57c3f3dbd05..af3900ed4ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec +import scala.util.control.NonFatal import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ @@ -136,8 +137,17 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper { val emptyRow = new GenericInternalRow(attributes.length) val boundE = BindReferences.bindReference(e, attributes) if (boundE.find(_.isInstanceOf[Unevaluable]).isDefined) return false -val v = boundE.eval(emptyRow) -v == null || v == false + +// some expressions, like map(), may throw an exception when dealing with null values. +// therefore, we need to handle exceptions. 
+try { + val v = boundE.eval(emptyRow) + v == null || v == false +} catch { + case NonFatal(e) => +// cannot filter out null if `where` expression throws an exception with null input +false +} } private def buildNewJoinType(filter: Filter, join: Join): JoinType = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala index 893c111c290..ea6ef525041 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala @@ -20,11 +20,13 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Coalesce, IsNotNull} +import org.apache.spark.sql.catalyst.expressions.{Coalesce, If, IsNotNull, Literal, RaiseError} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.ty
[spark] branch branch-3.2 updated: [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 3690c8ceb9e [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins 3690c8ceb9e is described below commit 3690c8ceb9e5c2f642b9f9e1af526f76d2e2a71a Author: Bruce Robbins AuthorDate: Wed Apr 27 15:38:22 2022 +0800 [SPARK-38868][SQL][3.2] Don't propagate exceptions from filter predicate when optimizing outer joins Backport of #36230 ### What changes were proposed in this pull request? Change `EliminateOuterJoin#canFilterOutNull` to return `false` when a `where` condition throws an exception. ### Why are the changes needed? Consider this query: ``` select * from (select id, id as b from range(0, 10)) l left outer join (select id, id + 1 as c from range(0, 10)) r on l.id = r.id where assert_true(c > 0) is null; ``` The query should succeed, but instead fails with ``` java.lang.RuntimeException: '(c#1L > cast(0 as bigint))' is not true! ``` This happens even though there is no row where `c > 0` is false. The `EliminateOuterJoin` rule checks if it can convert the outer join to an inner join based on the expression in the where clause, which in this case is ``` assert_true(c > 0) is null ``` `EliminateOuterJoin#canFilterOutNull` evaluates that expression with `c` set to `null` to see if the result is `null` or `false`. That rule doesn't expect the result to be a `RuntimeException`, but in this case it always is. That is, the assertion is failing during optimization, not at run time. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test. Closes #36341 from bersprockets/outer_join_eval_assert_issue_32. 
Authored-by: Bruce Robbins Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/optimizer/joins.scala| 14 -- .../catalyst/optimizer/OuterJoinEliminationSuite.scala | 18 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index d6e2a59de0b..19f4338908e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec +import scala.util.control.NonFatal import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins @@ -144,8 +145,17 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper { val emptyRow = new GenericInternalRow(attributes.length) val boundE = BindReferences.bindReference(e, attributes) if (boundE.find(_.isInstanceOf[Unevaluable]).isDefined) return false -val v = boundE.eval(emptyRow) -v == null || v == false + +// some expressions, like map(), may throw an exception when dealing with null values. +// therefore, we need to handle exceptions. 
+try { + val v = boundE.eval(emptyRow) + v == null || v == false +} catch { + case NonFatal(e) => +// cannot filter out null if `where` expression throws an exception with null input +false +} } private def buildNewJoinType(filter: Filter, join: Join): JoinType = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala index 893c111c290..ea6ef525041 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala @@ -20,11 +20,13 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Coalesce, IsNotNull} +import org.apache.spark.sql.catalyst.expressions.{Coalesce, If, IsNotNull, Literal, RaiseError} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StringType +import org.apache.spark.unsafe.types.UTF8String class Oute