[spark] branch master updated: [SPARK-41417][CORE][SQL] Rename `_LEGACY_ERROR_TEMP_0019` to `INVALID_TYPED_LITERAL`
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 6972341b06e [SPARK-41417][CORE][SQL] Rename `_LEGACY_ERROR_TEMP_0019` to `INVALID_TYPED_LITERAL`
6972341b06e is described below

commit 6972341b06eae40dda787306e2d1bde062501617
Author: yangjie01
AuthorDate: Sat Dec 10 09:50:08 2022 +0300

    [SPARK-41417][CORE][SQL] Rename `_LEGACY_ERROR_TEMP_0019` to `INVALID_TYPED_LITERAL`

    ### What changes were proposed in this pull request?
    This PR aims to rename `_LEGACY_ERROR_TEMP_0019` to `INVALID_TYPED_LITERAL`.

    ### Why are the changes needed?
    Proper names for error classes improve the user experience with Spark SQL.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Pass GitHub Actions

    Closes #38954 from LuciferYang/SPARK-41417.

    Authored-by: yangjie01
    Signed-off-by: Max Gekk
---
 core/src/main/resources/error/error-classes.json   | 11 +--
 .../spark/sql/errors/QueryParsingErrors.scala      |  7 +-
 .../catalyst/parser/ExpressionParserSuite.scala    | 31 +---
 .../resources/sql-tests/results/ansi/date.sql.out  | 21 +++---
 .../sql-tests/results/ansi/literals.sql.out        | 14 ++--
 .../sql-tests/results/ansi/timestamp.sql.out       | 21 +++---
 .../test/resources/sql-tests/results/date.sql.out  | 21 +++---
 .../sql-tests/results/datetime-legacy.sql.out      | 42 ++-
 .../resources/sql-tests/results/literals.sql.out   | 14 ++--
 .../sql-tests/results/postgreSQL/date.sql.out      | 84 --
 .../resources/sql-tests/results/timestamp.sql.out  | 21 +++---
 .../results/timestampNTZ/timestamp-ansi.sql.out    | 21 +++---
 .../results/timestampNTZ/timestamp.sql.out         | 21 +++---
 13 files changed, 192 insertions(+), 137 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index 19ab5ada2b5..a8738994e17 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -813,6 +813,12 @@
       }
     }
   },
+  "INVALID_TYPED_LITERAL" : {
+    "message" : [
+      "The value of the typed literal <valueType> is invalid: <value>."
+    ],
+    "sqlState" : "42000"
+  },
   "INVALID_WHERE_CONDITION" : {
     "message" : [
       "The WHERE condition <condition> contains invalid expressions: <expressionList>.",
@@ -1599,11 +1605,6 @@
       "Function trim doesn't support with type <trimOption>. Please use BOTH, LEADING or TRAILING as trim type."
     ]
   },
-  "_LEGACY_ERROR_TEMP_0019" : {
-    "message" : [
-      "Cannot parse the <valueType> value: <value>."
-    ]
-  },
   "_LEGACY_ERROR_TEMP_0020" : {
     "message" : [
       "Cannot parse the INTERVAL value: <value>."
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
index 018e9a12e01..ad6f72986d6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
@@ -211,8 +211,11 @@ private[sql] object QueryParsingErrors extends QueryErrorsBase {
   def cannotParseValueTypeError(
       valueType: String, value: String, ctx: TypeConstructorContext): Throwable = {
     new ParseException(
-      errorClass = "_LEGACY_ERROR_TEMP_0019",
-      messageParameters = Map("valueType" -> valueType, "value" -> value),
+      errorClass = "INVALID_TYPED_LITERAL",
+      messageParameters = Map(
+        "valueType" -> toSQLType(valueType),
+        "value" -> toSQLValue(value, StringType)
+      ),
       ctx)
   }

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
index 884e782736c..01c9907cb8c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
@@ -521,8 +521,12 @@ class ExpressionParserSuite extends AnalysisTest {
       Literal(Timestamp.valueOf("2016-03-11 20:54:00.000")))
     checkError(
       exception = parseException("timestamP_LTZ '2016-33-11 20:54:00.000'"),
-      errorClass = "_LEGACY_ERROR_TEMP_0019",
-      parameters = Map("valueType" -> "TIMESTAMP_LTZ", "value" -> "2016-33-11 20:54:00.000"),
+      errorClass = "INVALID_TYPED_LITERAL",
+      sqlState = "42000",
+      parameters = Map(
+        "valueType" -> "\"TIMESTAMP_LTZ\"",
+        "value" -> "'2016-33-11 20:54:00.000'"
+      ),
       context = ExpectedContext(
         fragment = "timestamP_LTZ '2016-33-11 20:54:00.000'",
         start =
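As a user-facing illustration, here is a minimal PySpark sketch of how the renamed error class now surfaces. The session setup is hypothetical; the failing typed literal is taken from the test change above.

```python
from pyspark.sql import SparkSession
from pyspark.sql.utils import ParseException

spark = SparkSession.builder.master("local[1]").getOrCreate()

try:
    # Month 33 is out of range, so the typed literal cannot be parsed.
    spark.sql("SELECT TIMESTAMP_LTZ '2016-33-11 20:54:00.000'")
except ParseException as e:
    # The message now reports INVALID_TYPED_LITERAL (sqlState 42000)
    # instead of the opaque _LEGACY_ERROR_TEMP_0019.
    print(e)
```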
[spark] branch master updated: [SPARK-41467][BUILD] Upgrade httpclient from 4.5.13 to 4.5.14
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new ef9113f790c [SPARK-41467][BUILD] Upgrade httpclient from 4.5.13 to 4.5.14
ef9113f790c is described below

commit ef9113f790c810f4f79c4b076c0340bbcb76edac
Author: panbingkun
AuthorDate: Fri Dec 9 22:09:39 2022 -0800

    [SPARK-41467][BUILD] Upgrade httpclient from 4.5.13 to 4.5.14

    ### What changes were proposed in this pull request?
    This PR upgrades `commons.httpclient` from `4.5.13` to `4.5.14`.

    ### Why are the changes needed?
    https://archive.apache.org/dist/httpcomponents/httpclient/RELEASE_NOTES-4.5.x.txt
    https://user-images.githubusercontent.com/15246973/206691092-72fa2bbd-3f22-4d65-868f-207c5ed8ef0a.png

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Pass GA.

    Closes #39005 from panbingkun/httpclient_4.5.14.

    Authored-by: panbingkun
    Signed-off-by: Dongjoon Hyun
---
 dev/deps/spark-deps-hadoop-2-hive-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-3-hive-2.3 | 2 +-
 pom.xml                               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3
index d6e257084a8..ae7cc9d592c 100644
--- a/dev/deps/spark-deps-hadoop-2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2-hive-2.3
@@ -105,7 +105,7 @@ hk2-api/2.6.1//hk2-api-2.6.1.jar
 hk2-locator/2.6.1//hk2-locator-2.6.1.jar
 hk2-utils/2.6.1//hk2-utils-2.6.1.jar
 htrace-core/3.1.0-incubating//htrace-core-3.1.0-incubating.jar
-httpclient/4.5.13//httpclient-4.5.13.jar
+httpclient/4.5.14//httpclient-4.5.14.jar
 httpcore/4.4.14//httpcore-4.4.14.jar
 istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
 ivy/2.5.1//ivy-2.5.1.jar
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index b4a4780b560..f70abedd34b 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -92,7 +92,7 @@ hive-storage-api/2.7.3//hive-storage-api-2.7.3.jar
 hk2-api/2.6.1//hk2-api-2.6.1.jar
 hk2-locator/2.6.1//hk2-locator-2.6.1.jar
 hk2-utils/2.6.1//hk2-utils-2.6.1.jar
-httpclient/4.5.13//httpclient-4.5.13.jar
+httpclient/4.5.14//httpclient-4.5.14.jar
 httpcore/4.4.14//httpcore-4.4.14.jar
 ini4j/0.5.4//ini4j-0.5.4.jar
 istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
diff --git a/pom.xml b/pom.xml
index 78eea522398..54eb9287385 100644
--- a/pom.xml
+++ b/pom.xml
@@ -161,7 +161,7 @@
     0.12.8
     hadoop3-2.2.7
-    4.5.13
+    4.5.14
     4.4.14
     3.6.1
[spark] branch master updated: [SPARK-41457][PYTHON][TESTS] Refactor type annotations and dependency checks in tests
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new bf4981fd4ad [SPARK-41457][PYTHON][TESTS] Refactor type annotations and dependency checks in tests
bf4981fd4ad is described below

commit bf4981fd4adfe96d3962e2e165c5a5d307a0033d
Author: Hyukjin Kwon
AuthorDate: Fri Dec 9 21:34:57 2022 -0800

    [SPARK-41457][PYTHON][TESTS] Refactor type annotations and dependency checks in tests

    ### What changes were proposed in this pull request?
    This PR proposes to:
    - Print out the correct error message when dependencies are not installed for `pyspark.sql.connect` in the main code.
    - Print out the correct message in tests if 1. dependencies are missing or 2. the Spark Connect build is not there.
    - Ignore all MyPy errors from tests:
      - The initial intention was to annotate types for public APIs only.
      - Also removed unused ignores in other modules.

    Note that, for Spark Connect, we should follow both rules below to make sure tests are properly skipped when dependencies are not there, or jars do not exist:
    - Always use `if should_test_connect` for `pyspark.sql.connect` and related dependency imports in tests.
    - Use `unittest.skipIf(not should_test_connect, connect_requirement_message)`.

    ### Why are the changes needed?
    - To make development easier and faster.
    - To make sure our Jenkins builds pass without Python dependencies installed.
      - See [Scaleaway](https://apache-spark.s3.fr-par.scw.cloud/index.html); see also https://github.com/apache/spark-website/blob/asf-site/developer-tools.md#test-coverage.

    ### Does this PR introduce _any_ user-facing change?
    No, dev and test-only.

    ### How was this patch tested?
    Manually tested without pandas, pyarrow and grpc, and verified the error messages.

    Closes #38991 from HyukjinKwon/SPARK-41457.

    Authored-by: Hyukjin Kwon
    Signed-off-by: Dongjoon Hyun
---
 dev/tox.ini                                          |  3 +-
 python/mypy.ini                                      | 53 ++--
 python/pyspark/ml/tests/test_algorithms.py           |  2 +-
 python/pyspark/ml/tests/test_base.py                 |  2 +-
 python/pyspark/ml/tests/test_evaluation.py           |  2 +-
 python/pyspark/ml/tests/test_feature.py              |  2 +-
 python/pyspark/ml/tests/test_image.py                |  2 +-
 python/pyspark/ml/tests/test_linalg.py               |  2 +-
 python/pyspark/ml/tests/test_param.py                |  2 +-
 python/pyspark/ml/tests/test_persistence.py          |  2 +-
 python/pyspark/ml/tests/test_pipeline.py             |  2 +-
 python/pyspark/ml/tests/test_stat.py                 |  2 +-
 python/pyspark/ml/tests/test_training_summary.py     |  2 +-
 python/pyspark/ml/tests/test_tuning.py               |  2 +-
 python/pyspark/ml/tests/test_util.py                 |  2 +-
 python/pyspark/ml/tests/test_wrapper.py              |  2 +-
 python/pyspark/mllib/tests/test_algorithms.py        |  2 +-
 python/pyspark/mllib/tests/test_feature.py           |  2 +-
 python/pyspark/mllib/tests/test_linalg.py            |  2 +-
 python/pyspark/mllib/tests/test_stat.py              |  2 +-
 .../mllib/tests/test_streaming_algorithms.py         |  2 +-
 python/pyspark/mllib/tests/test_util.py              |  2 +-
 .../pandas/tests/data_type_ops/test_base.py          |  2 +-
 .../pandas/tests/data_type_ops/test_binary_ops.py    |  2 +-
 .../pandas/tests/data_type_ops/test_boolean_ops.py   |  2 +-
 .../tests/data_type_ops/test_categorical_ops.py      |  2 +-
 .../pandas/tests/data_type_ops/test_complex_ops.py   |  2 +-
 .../pandas/tests/data_type_ops/test_date_ops.py      |  2 +-
 .../tests/data_type_ops/test_datetime_ops.py         |  2 +-
 .../pandas/tests/data_type_ops/test_null_ops.py      |  2 +-
 .../pandas/tests/data_type_ops/test_num_ops.py       |  2 +-
 .../pandas/tests/data_type_ops/test_string_ops.py    |  2 +-
 .../tests/data_type_ops/test_timedelta_ops.py        |  2 +-
 .../pandas/tests/data_type_ops/test_udt_ops.py       |  2 +-
 python/pyspark/pandas/tests/indexes/test_base.py     |  2 +-
 .../pyspark/pandas/tests/indexes/test_category.py    |  2 +-
 .../pyspark/pandas/tests/indexes/test_datetime.py    |  2 +-
 .../pyspark/pandas/tests/indexes/test_timedelta.py   |  2 +-
 .../pyspark/pandas/tests/plot/test_frame_plot.py     |  2 +-
 .../tests/plot/test_frame_plot_matplotlib.py         |  2 +-
 .../pandas/tests/plot/test_frame_plot_plotly.py      |  2 +-
 .../pyspark/pandas/tests/plot/test_series_plot.py    |  2 +-
 .../tests/plot/test_series_plot_matplotlib.py        |  2 +-
 .../pandas/tests/plot/test_series_plot_plotly.py     |  2 +-
 python/pyspark/pandas/tests/test_categorical.py      |  4 +-
 python/pyspark/pandas/tests/test_config.py           |  2 +-
 python/pyspark/pandas/tests/test_csv.py              |  2 +-
 python/pyspark/pandas
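A minimal sketch of the guard-plus-skip pattern the commit message above prescribes. The names `should_test_connect` and `connect_requirement_message` are the ones the message cites; the probed package and the test class are hypothetical.

```python
import unittest

# Probe optional dependencies once, at import time, instead of letting
# test collection crash when they are missing.
try:
    import grpc  # noqa: F401  (hypothetical: real code probes several deps)

    should_test_connect = True
    connect_requirement_message = ""
except ImportError as e:
    should_test_connect = False
    connect_requirement_message = str(e)

if should_test_connect:
    # Guard connect imports so this module still imports without the deps.
    from pyspark.sql.connect.dataframe import DataFrame  # noqa: F401


@unittest.skipIf(not should_test_connect, connect_requirement_message)
class ConnectSmokeTest(unittest.TestCase):
    def test_deps_available(self) -> None:
        self.assertTrue(should_test_connect)
```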
[spark] branch master updated: [SPARK-41475][CONNECT] Fix lint-scala command error and typo
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 3084cc531e0 [SPARK-41475][CONNECT] Fix lint-scala command error and typo
3084cc531e0 is described below

commit 3084cc531e0e0f352e4f6341f3ceddc78e3e8b3a
Author: dengziming
AuthorDate: Fri Dec 9 21:26:50 2022 -0800

    [SPARK-41475][CONNECT] Fix lint-scala command error and typo

    ### What changes were proposed in this pull request?
    We separated connect into server and common modules, but failed to update the `lint-scala` tool.
    This PR also fixes a typo (`fase` -> `false`) and formats the code.

    ### Why are the changes needed?
    Without this fix, we can't check the Scala code format.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    No

    Closes #39012 from dengziming/minor-typo.

    Authored-by: dengziming
    Signed-off-by: Dongjoon Hyun
---
 .../main/scala/org/apache/spark/sql/connect/config/Connect.scala    | 7 ---
 .../org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala  | 6 ++
 dev/lint-scala                                                      | 4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
index 358cb3c8f79..60fdd964018 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
@@ -38,9 +38,10 @@ private[spark] object Connect {

   val CONNECT_GRPC_ARROW_MAX_BATCH_SIZE =
     ConfigBuilder("spark.connect.grpc.arrow.maxBatchSize")
-      .doc("When using Apache Arrow, limit the maximum size of one arrow batch that " +
-        "can be sent from server side to client side. Currently, we conservatively use 70% " +
-        "of it because the size is not accurate but estimated.")
+      .doc(
+        "When using Apache Arrow, limit the maximum size of one arrow batch that " +
+          "can be sent from server side to client side. Currently, we conservatively use 70% " +
+          "of it because the size is not accurate but estimated.")
       .version("3.4.0")
       .bytesConf(ByteUnit.MiB)
       .createWithDefaultString("4m")
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
index a40fe990307..ba5ceed4529 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -322,8 +322,7 @@ class SparkConnectPlanner(session: SparkSession) {
         None,
         rel.getVariableColumnName,
         Seq(rel.getValueColumnName),
-        transformRelation(rel.getInput)
-      )
+        transformRelation(rel.getInput))
     } else {
       val values = rel.getValuesList.asScala.toArray.map { expr =>
         Column(transformExpression(expr))
@@ -335,8 +334,7 @@ class SparkConnectPlanner(session: SparkSession) {
         None,
         rel.getVariableColumnName,
         Seq(rel.getValueColumnName),
-        transformRelation(rel.getInput)
-      )
+        transformRelation(rel.getInput))
     }
   }

diff --git a/dev/lint-scala b/dev/lint-scala
index ea3b98464b2..48ecf57ef47 100755
--- a/dev/lint-scala
+++ b/dev/lint-scala
@@ -29,14 +29,14 @@ ERRORS=$(./build/mvn \
     -Dscalafmt.skip=false \
     -Dscalafmt.validateOnly=true \
     -Dscalafmt.changedOnly=false \
-    -pl connector/connect \
+    -pl connector/connect/server \
     2>&1 | grep -e "^Requires formatting" \
 )

 if test ! -z "$ERRORS"; then
   echo -e "The scalafmt check failed on connector/connect at following occurrences:\n\n$ERRORS\n"
   echo "Before submitting your change, please make sure to format your code using the following command:"
-  echo "./build/mvn -Pscala-2.12 scalafmt:format -Dscalafmt.skip=fase -Dscalafmt.validateOnly=false -Dscalafmt.changedOnly=false -pl connector/connect"
+  echo "./build/mvn -Pscala-2.12 scalafmt:format -Dscalafmt.skip=false -Dscalafmt.validateOnly=false -Dscalafmt.changedOnly=false -pl connector/connect/server"
   exit 1
 else
   echo -e "Scalafmt checks passed."
[spark] branch master updated (435f6b1b358 -> c78f935185c)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from 435f6b1b358 [SPARK-41414][CONNECT][PYTHON] Implement date/timestamp functions
     add c78f935185c [SPARK-41474][PROTOBUF][BUILD] Exclude `proto` files from `spark-protobuf` jar

No new revisions were added by this update.

Summary of changes:
 connector/protobuf/pom.xml | 8 ++++++++
 1 file changed, 8 insertions(+)
[spark] branch master updated: [SPARK-41414][CONNECT][PYTHON] Implement date/timestamp functions
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 435f6b1b358 [SPARK-41414][CONNECT][PYTHON] Implement date/timestamp functions
435f6b1b358 is described below

commit 435f6b1b3588d8c3c719f0e23b91209dd5f7bdb9
Author: Xinrong Meng
AuthorDate: Sat Dec 10 09:16:18 2022 +0800

    [SPARK-41414][CONNECT][PYTHON] Implement date/timestamp functions

    ### What changes were proposed in this pull request?
    Implement date/timestamp functions on Spark Connect.

    Among them, the functions `to_timestamp, from_utc_timestamp, to_utc_timestamp, timestamp_seconds, current_timestamp, date_trunc` have inconsistent dtype issues that should be resolved in https://issues.apache.org/jira/browse/SPARK-41455.

    ### Why are the changes needed?
    For API coverage on Spark Connect.

    ### Does this PR introduce _any_ user-facing change?
    Yes. New functions API are supported.

    ### How was this patch tested?
    Unit tests.

    Closes #38946 from xinrong-meng/connect_date_ts_func.

    Authored-by: Xinrong Meng
    Signed-off-by: Ruifeng Zheng
---
 python/pyspark/sql/connect/functions.py            | 985 ++++++++++++++++-
 .../sql/tests/connect/test_connect_function.py     | 148 ++++
 2 files changed, 1132 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index c7a19d985ab..1e3eb7a2ea7 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -23,7 +23,7 @@ from pyspark.sql.connect.column import (
     SQLExpression,
 )

-from typing import Any, TYPE_CHECKING, Union, List, Optional, Tuple
+from typing import Any, TYPE_CHECKING, Union, List, overload, Optional, Tuple

 if TYPE_CHECKING:
     from pyspark.sql.connect._typing import ColumnOrName
@@ -4157,3 +4157,986 @@ def encode(col: "ColumnOrName", charset: str) -> Column:
     +----------------+
     """
     return _invoke_function("encode", _to_col(col), lit(charset))
+
+
+# Date/Timestamp functions
+# TODO(SPARK-41283): Resolve dtypes inconsistencies for:
+#   to_timestamp, from_utc_timestamp, to_utc_timestamp,
+#   timestamp_seconds, current_timestamp, date_trunc
+
+
+def current_date() -> Column:
+    """
+    Returns the current date at the start of query evaluation as a :class:`DateType` column.
+    All calls of current_date within the same query return the same value.
+
+    .. versionadded:: 3.4.0
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        current date.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(current_date()).show() # doctest: +SKIP
+    +--------------+
+    |current_date()|
+    +--------------+
+    |    2022-08-26|
+    +--------------+
+    """
+    return _invoke_function("current_date")
+
+
+def current_timestamp() -> Column:
+    """
+    Returns the current timestamp at the start of query evaluation as a :class:`TimestampType`
+    column. All calls of current_timestamp within the same query return the same value.
+
+    .. versionadded:: 3.4.0
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        current date and time.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(current_timestamp()).show(truncate=False) # doctest: +SKIP
+    +-----------------------+
+    |current_timestamp()    |
+    +-----------------------+
+    |2022-08-26 21:23:22.716|
+    +-----------------------+
+    """
+    return _invoke_function("current_timestamp")
+
+
+def localtimestamp() -> Column:
+    """
+    Returns the current timestamp without time zone at the start of query evaluation
+    as a timestamp without time zone column. All calls of localtimestamp within the
+    same query return the same value.
+
+    .. versionadded:: 3.4.0
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        current local date and time.
+
+    Examples
+    --------
+    >>> df = spark.range(1)
+    >>> df.select(localtimestamp()).show(truncate=False) # doctest: +SKIP
+    +-----------------------+
+    |localtimestamp()       |
+    +-----------------------+
+    |2022-08-26 21:28:34.639|
+    +-----------------------+
+    """
+    return _invoke_function("localtimestamp")
+
+
+def date_format(date: "ColumnOrName", format: str) -> Column:
+    """
+    Converts a date/timestamp/string to a value of string in the format specified by the date
+    format given by the second argument.
+
+    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+    pattern letters of `datetime pattern`_. can be used.
+
+    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+    .. versionadded:: 3.4.0
+
+    Notes
+    -----
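For context, a short usage sketch of the function family added here, written against the classic `pyspark.sql.functions` API that these Connect versions mirror. The local session setup is illustrative, and the output depends on the run date.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_format

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.range(1)
df.select(
    current_date().alias("today"),
    # Same pattern letters as in the docstring above, e.g. `dd.MM.yyyy`.
    date_format(current_date(), "dd.MM.yyyy").alias("formatted"),
).show()
```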
[spark] branch master updated: [SPARK-41329][CONNECT] Resolve circular imports in Spark Connect
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new deabae7465d [SPARK-41329][CONNECT] Resolve circular imports in Spark Connect
deabae7465d is described below

commit deabae7465db606b06c8e1cbaddf9cd67df6083c
Author: Hyukjin Kwon
AuthorDate: Sat Dec 10 09:33:20 2022 +0900

    [SPARK-41329][CONNECT] Resolve circular imports in Spark Connect

    ### What changes were proposed in this pull request?
    This PR proposes to resolve the circular import workarounds.

    ### Why are the changes needed?
    For better readability and maintenance.

    ### Does this PR introduce _any_ user-facing change?
    No, dev-only.

    ### How was this patch tested?
    CI in this PR should test it out.

    Closes #38994 from HyukjinKwon/SPARK-41329.

    Lead-authored-by: Hyukjin Kwon
    Co-authored-by: Hyukjin Kwon
    Signed-off-by: Hyukjin Kwon
---
 python/pyspark/sql/connect/column.py     | 26 +++++++++-----------
 python/pyspark/sql/connect/readwriter.py |  1 -
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
index f1a909b89fc..b84c27afb15 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -27,14 +27,10 @@ import pyspark.sql.connect.proto as proto
 from pyspark.sql.connect.types import pyspark_types_to_proto_types

 if TYPE_CHECKING:
-    from pyspark.sql.connect._typing import ColumnOrName
+    from pyspark.sql.connect._typing import ColumnOrName, PrimitiveType
     from pyspark.sql.connect.client import SparkConnectClient
     import pyspark.sql.connect.proto as proto

-# TODO(SPARK-41329): solve the circular import between _typing and this class
-# if we want to reuse _type.PrimitiveType
-PrimitiveType = Union[bool, float, int, str]
-

 def _func_op(name: str, doc: str = "") -> Callable[["Column"], "Column"]:
     def _(self: "Column") -> "Column":
@@ -554,7 +550,7 @@ class Column:
         return _func_op("not")(_bin_op("==")(self, other))

     # string methods
-    def contains(self, other: Union[PrimitiveType, "Column"]) -> "Column":
+    def contains(self, other: Union["PrimitiveType", "Column"]) -> "Column":
         """
         Contains the other element. Returns a boolean :class:`Column` based on a string match.
@@ -698,6 +694,9 @@ class Column:
         >>> df.select(df.name.substr(1, 3).alias("col")).collect()
         [Row(col='Ali'), Row(col='Bob')]
         """
+        from pyspark.sql.connect.function_builder import functions as F
+        from pyspark.sql.connect.functions import lit
+
         if type(startPos) != type(length):
             raise TypeError(
                 "startPos and length must be the same type. "
@@ -706,17 +705,16 @@ class Column:
                     length_t=type(length),
                 )
             )
-        from pyspark.sql.connect.function_builder import functions as F

         if isinstance(length, int):
-            length_exp = self._lit(length)
+            length_exp = lit(length)
         elif isinstance(length, Column):
             length_exp = length
         else:
             raise TypeError("Unsupported type for substr().")

         if isinstance(startPos, int):
-            start_exp = self._lit(startPos)
+            start_exp = lit(startPos)
         else:
             start_exp = startPos

@@ -726,8 +724,11 @@ class Column:
         """Returns a binary expression with the current column as the left side and the other
         expression as the right side.
         """
+        from pyspark.sql.connect._typing import PrimitiveType
+        from pyspark.sql.connect.functions import lit
+
         if isinstance(other, get_args(PrimitiveType)):
-            other = self._lit(other)
+            other = lit(other)
         return scalar_function("==", self, other)

     def to_plan(self, session: "SparkConnectClient") -> proto.Expression:
@@ -779,11 +780,6 @@ class Column:
         else:
             raise TypeError("unexpected type: %s" % type(dataType))

-    # TODO(SPARK-41329): solve the circular import between functions.py and
-    # this class if we want to reuse functions.lit
-    def _lit(self, x: Any) -> "Column":
-        return Column(LiteralExpression(x))
-
     def __repr__(self) -> str:
         return "Column<'%s'>" % self._expr.__repr__()

diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py
index 470417b6a28..778509bcf76 100644
--- a/python/pyspark/sql/connect/readwriter.py
+++ b/python/pyspark/sql/connect/readwriter.py
@@ -164,7 +164,6 @@ class DataFrameReader(OptionUtils):
         return self._df(plan)

     def _df(self, plan: LogicalPlan) -> "DataFrame":
-        # The import is n
[spark] branch master updated: [SPARK-41456][SQL] Improve the performance of try_cast
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 9bdaed187e8 [SPARK-41456][SQL] Improve the performance of try_cast
9bdaed187e8 is described below

commit 9bdaed187e8f8a5ef946785a14901bd188d608d2
Author: Gengliang Wang
AuthorDate: Fri Dec 9 15:50:37 2022 -0800

    [SPARK-41456][SQL] Improve the performance of try_cast

    ### What changes were proposed in this pull request?
    For try_cast and the legacy cast (cast with ANSI SQL mode off), there are conversions which have the same behavior:
    - cast(string as date/timestamp/TimestampNTZ)
    - cast(string as double/float/decimal)

    So, try_cast can be faster if it uses the evaluation code path of the legacy cast, instead of running the ANSI cast and catching exceptions.

    ### Why are the changes needed?
    For better performance of `try_cast(string as datetime|Fractional numeric)`.
    I also observed a performance regression for the method summary() after https://github.com/apache/spark/pull/35699 under ANSI SQL mode. The `try...catch` is actually unnecessary.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Existing UT

    Closes #38988 from gengliangwang/removeUnnessaryTry.

    Authored-by: Gengliang Wang
    Signed-off-by: Gengliang Wang
---
 .../apache/spark/sql/catalyst/expressions/Cast.scala | 20 +++++++++++++++----
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 2255a96430e..d7a3952e634 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -510,7 +510,7 @@ case class Cast(
   final override def nodePatternsInternal(): Seq[TreePattern] = Seq(CAST)

   def ansiEnabled: Boolean = {
-    evalMode == EvalMode.ANSI || evalMode == EvalMode.TRY
+    evalMode == EvalMode.ANSI || (evalMode == EvalMode.TRY && !canUseLegacyCastForTryCast)
   }

   // Whether this expression is used for `try_cast()`.
@@ -1270,7 +1270,7 @@ case class Cast(
   }

   private def cast(from: DataType, to: DataType): Any => Any = {
-    if (!isTryCast) {
+    if (!isTryCast || canUseLegacyCastForTryCast) {
       castInternal(from, to)
     } else {
       (input: Any) =>
@@ -1283,6 +1283,20 @@ case class Cast(
     }
   }

+  // Whether Spark SQL can evaluate the try_cast as the legacy cast, so that no `try...catch`
+  // is needed and the performance can be faster.
+  private lazy val canUseLegacyCastForTryCast: Boolean = {
+    if (!child.resolved) {
+      false
+    } else {
+      (child.dataType, dataType) match {
+        case (StringType, _: FractionalType) => true
+        case (StringType, _: DatetimeType) => true
+        case _ => false
+      }
+    }
+  }
+
   protected[this] lazy val cast: Any => Any = cast(child.dataType, dataType)

   protected override def nullSafeEval(input: Any): Any = cast(input)
@@ -1348,7 +1362,7 @@ case class Cast(
   protected[this] def castCode(ctx: CodegenContext, input: ExprValue, inputIsNull: ExprValue,
     result: ExprValue, resultIsNull: ExprValue, resultType: DataType, cast: CastFunction): Block = {
     val javaType = JavaCode.javaType(resultType)
-    val castCodeWithTryCatchIfNeeded = if (!isTryCast) {
+    val castCodeWithTryCatchIfNeeded = if (!isTryCast || canUseLegacyCastForTryCast) {
       s"${cast(input, result, resultIsNull)}"
     } else {
       s"""
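A small PySpark sketch (hypothetical local session) of the `try_cast` semantics that the fast path preserves: a string that fails to parse yields NULL rather than an error, and for string-to-fractional and string-to-datetime targets this no longer needs a per-row `try...catch`.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

spark.sql(
    "SELECT try_cast('1.5' AS DOUBLE) AS ok, try_cast('oops' AS DOUBLE) AS bad"
).show()
# `ok` is 1.5; `bad` is NULL because 'oops' cannot be parsed as a DOUBLE.
```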
[spark] branch master updated: [SPARK-41225][CONNECT][PYTHON][FOLLOW-UP] Disable unsupported functions
This is an automated email from the ASF dual-hosted git repository.

xinrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 29a70117b27 [SPARK-41225][CONNECT][PYTHON][FOLLOW-UP] Disable unsupported functions
29a70117b27 is described below

commit 29a70117b272582d11e7b7b8951dff1be91d3de7
Author: Martin Grund
AuthorDate: Fri Dec 9 14:55:50 2022 -0800

    [SPARK-41225][CONNECT][PYTHON][FOLLOW-UP] Disable unsupported functions

    ### What changes were proposed in this pull request?
    This patch adds method stubs for unsupported functions in the `Column` class of the Python client for Spark Connect; the stubs throw a `NotImplementedError` when called. This gives users a clear indication that these methods will be implemented in the future.

    ### Why are the changes needed?
    UX

    ### Does this PR introduce _any_ user-facing change?
    NO

    ### How was this patch tested?
    UT

    Closes #39009 from grundprinzip/SPARK-41225-v2.

    Authored-by: Martin Grund
    Signed-off-by: Xinrong Meng
---
 python/pyspark/sql/connect/column.py               | 36 ++++++++++++
 .../sql/tests/connect/test_connect_column.py       | 25 +++++++
 2 files changed, 61 insertions(+)

diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
index 63e95c851db..f1a909b89fc 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -786,3 +786,39 @@ class Column:

     def __repr__(self) -> str:
         return "Column<'%s'>" % self._expr.__repr__()
+
+    def otherwise(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("otherwise() is not yet implemented.")
+
+    def over(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("over() is not yet implemented.")
+
+    def isin(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("isin() is not yet implemented.")
+
+    def when(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("when() is not yet implemented.")
+
+    def getItem(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("getItem() is not yet implemented.")
+
+    def astype(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("astype() is not yet implemented.")
+
+    def between(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("between() is not yet implemented.")
+
+    def getField(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("getField() is not yet implemented.")
+
+    def withField(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("withField() is not yet implemented.")
+
+    def dropFields(self, *args: Any, **kwargs: Any) -> None:
+        raise NotImplementedError("dropFields() is not yet implemented.")
+
+    def __getitem__(self, k: Any) -> None:
+        raise NotImplementedError("apply() - __getitem__ is not yet implemented.")
+
+    def __iter__(self) -> None:
+        raise TypeError("Column is not iterable")
diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py
index c73f1b5b0c7..734b0bbf226 100644
--- a/python/pyspark/sql/tests/connect/test_connect_column.py
+++ b/python/pyspark/sql/tests/connect/test_connect_column.py
@@ -119,6 +119,31 @@ class SparkConnectTests(SparkConnectSQLTestCase):
             df.select(df.id.cast(x)).toPandas(), df2.select(df2.id.cast(x)).toPandas()
         )

+    def test_unsupported_functions(self):
+        # SPARK-41225: Disable unsupported functions.
+        c = self.connect.range(1).id
+        for f in (
+            "otherwise",
+            "over",
+            "isin",
+            "when",
+            "getItem",
+            "astype",
+            "between",
+            "getField",
+            "withField",
+            "dropFields",
+        ):
+            with self.assertRaises(NotImplementedError):
+                getattr(c, f)()
+
+        with self.assertRaises(NotImplementedError):
+            c["a"]
+
+        with self.assertRaises(TypeError):
+            for x in c:
+                pass
+

 if __name__ == "__main__":
     import unittest
[spark] branch branch-3.2 updated: [SPARK-41376][CORE][3.2] Correct the Netty preferDirectBufs check logic on executor start
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 3beab1f51af [SPARK-41376][CORE][3.2] Correct the Netty preferDirectBufs check logic on executor start
3beab1f51af is described below

commit 3beab1f51afd22cfe26f8a7aa4229580942ebdeb
Author: Cheng Pan
AuthorDate: Fri Dec 9 10:17:36 2022 -0800

    [SPARK-41376][CORE][3.2] Correct the Netty preferDirectBufs check logic on executor start

    ### What changes were proposed in this pull request?
    Backport #38901 to branch-3.2.

    Fix the condition for judging whether Netty prefers direct memory on executor start; the logic should match `org.apache.spark.network.client.TransportClientFactory`.

    ### Why are the changes needed?
    The check logic was added in SPARK-27991. The original intention was to avoid a potential Netty OOM issue when Netty uses direct memory to consume shuffle data, but the condition was not sufficient. This PR updates the logic to match `org.apache.spark.network.client.TransportClientFactory`.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Manual testing.

    Closes #38982 from pan3793/SPARK-41376-3.2.

    Authored-by: Cheng Pan
    Signed-off-by: Dongjoon Hyun
---
 .../java/org/apache/spark/network/util/NettyUtils.java      | 14 ++++++++++++++
 .../spark/executor/CoarseGrainedExecutorBackend.scala       |  5 ++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java b/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
index 0a4a6a7bffe..ac61aa39e5f 100644
--- a/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
+++ b/common/network-common/src/main/java/org/apache/spark/network/util/NettyUtils.java
@@ -171,4 +171,18 @@ public class NettyUtils {
       allowCache ? PooledByteBufAllocator.defaultUseCacheForAllThreads() : false
     );
   }
+
+  /**
+   * ByteBuf allocator prefers to allocate direct ByteBuf iff both Spark allows creating direct
+   * ByteBuf and Netty enables directBufferPreferred.
+   */
+  public static boolean preferDirectBufs(TransportConf conf) {
+    boolean allowDirectBufs;
+    if (conf.sharedByteBufAllocators()) {
+      allowDirectBufs = conf.preferDirectBufsForSharedByteBufAllocators();
+    } else {
+      allowDirectBufs = conf.preferDirectBufs();
+    }
+    return allowDirectBufs && PlatformDependent.directBufferPreferred();
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
index e3be6eb201c..7757fe1fc8a 100644
--- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
+++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -35,6 +35,8 @@ import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.deploy.worker.WorkerWatcher
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
+import org.apache.spark.network.netty.SparkTransportConf
+import org.apache.spark.network.util.NettyUtils
 import org.apache.spark.resource.ResourceInformation
 import org.apache.spark.resource.ResourceProfile
 import org.apache.spark.resource.ResourceProfile._
@@ -91,7 +93,8 @@ private[spark] class CoarseGrainedExecutorBackend(
     logInfo("Connecting to driver: " + driverUrl)
     try {
-      if (PlatformDependent.directBufferPreferred() &&
+      val shuffleClientTransportConf = SparkTransportConf.fromSparkConf(env.conf, "shuffle")
+      if (NettyUtils.preferDirectBufs(shuffleClientTransportConf) &&
           PlatformDependent.maxDirectMemory() < env.conf.get(MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM)) {
         throw new SparkException(s"Netty direct memory should at least be bigger than " +
           s"'${MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM.key}', but got " +
[spark] branch master updated: [SPARK-41466][BUILD] Change Scala Style configuration to catch AnyFunSuite instead of FunSuite
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new e371c53350b [SPARK-41466][BUILD] Change Scala Style configuration to catch AnyFunSuite instead of FunSuite
e371c53350b is described below

commit e371c53350b179fc095397a0cd111aa4085b8219
Author: Hyukjin Kwon
AuthorDate: Fri Dec 9 09:31:26 2022 -0800

    [SPARK-41466][BUILD] Change Scala Style configuration to catch AnyFunSuite instead of FunSuite

    ### What changes were proposed in this pull request?
    This PR is technically a followup of https://github.com/apache/spark/pull/29196 that changes the Scala Style configuration to catch `AnyFunSuite` instead of `FunSuite`.

    ### Why are the changes needed?
    To keep the original intention of the Scala style configuration.

    ### Does this PR introduce _any_ user-facing change?
    No, dev-only.

    ### How was this patch tested?
    Manually tested with `./dev/lint-scala`. CI in this PR should test it out too.

    Closes #39004 from HyukjinKwon/SPARK-41466.

    Authored-by: Hyukjin Kwon
    Signed-off-by: Dongjoon Hyun
---
 .../spark/sql/connect/planner/LiteralValueProtoConverterSuite.scala | 4 ++--
 core/src/test/scala/org/apache/spark/SparkFunSuite.scala            | 4 ++--
 scalastyle-config.xml                                               | 4 ++--
 .../spark/sql/execution/datasources/CommonFileDataSourceSuite.scala | 5 +++--
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralValueProtoConverterSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralValueProtoConverterSuite.scala
index dc8254c47f3..7c8ee6209ac 100644
--- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralValueProtoConverterSuite.scala
+++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralValueProtoConverterSuite.scala
@@ -17,11 +17,11 @@

 package org.apache.spark.sql.connect.planner

-import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite

 import org.apache.spark.sql.connect.planner.LiteralValueProtoConverter.{toCatalystValue, toConnectProtoValue}

-class LiteralValueProtoConverterSuite extends AnyFunSuite {
+class LiteralValueProtoConverterSuite extends AnyFunSuite { // scalastyle:ignore funsuite

   test("basic proto value and catalyst value conversion") {
     val values = Array(null, true, 1.toByte, 1.toShort, 1, 1L, 1.1d, 1.1f, "spark")
diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 7a08de9c181..27198039fdb 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -32,7 +32,7 @@ import org.apache.logging.log4j.core.{LogEvent, Logger, LoggerContext}
 import org.apache.logging.log4j.core.appender.AbstractAppender
 import org.apache.logging.log4j.core.config.Property
 import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, Failed, Outcome}
-import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite

 import org.apache.spark.deploy.LocalSparkCluster
 import org.apache.spark.internal.Logging
@@ -64,7 +64,7 @@ import org.apache.spark.util.{AccumulatorContext, Utils}
  * }
  */
 abstract class SparkFunSuite
-  extends AnyFunSuite
+  extends AnyFunSuite // scalastyle:ignore funsuite
   with BeforeAndAfterAll
   with BeforeAndAfterEach
  with ThreadAudit
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 4e2da275694..f34b5d55e42 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -135,9 +135,9 @@ This file is divided into 3 sections:
 (the surrounding XML markup in this hunk was stripped by the mail renderer)
-    ^FunSuite[A-Za-z]*$
+    ^AnyFunSuite[A-Za-z]*$
     Tests must extend org.apache.spark.SparkFunSuite instead.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CommonFileDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CommonFileDataSourceSuite.scala
index b7d0a7fc306..739f4c440be 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CommonFileDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CommonFileDataSourceSuite.scala
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.execution.datasources

-import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite

 import org.apache.spark.sql.{Dataset, Encoders, FakeFileSystemRequiringDSOption, SparkSession}
 import org.a
[spark] branch master updated (fc3c0f1008d -> 928eab666da)
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from fc3c0f1008d [SPARK-41450][BUILD] Fix shading in `core` module
     add 928eab666da [SPARK-41462][SQL] Date and timestamp type can up cast to TimestampNTZ

No new revisions were added by this update.

Summary of changes:
 .../scala/org/apache/spark/sql/catalyst/expressions/Cast.scala  | 3 +++
 .../apache/spark/sql/catalyst/expressions/CastSuiteBase.scala   | 9 +++++++++
 2 files changed, 12 insertions(+)
[spark] branch master updated (acdacf4f438 -> fc3c0f1008d)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from acdacf4f438 [SPARK-41402][SQL][CONNECT] Override prettyName of StringDecode
     add fc3c0f1008d [SPARK-41450][BUILD] Fix shading in `core` module

No new revisions were added by this update.

Summary of changes:
 core/pom.xml | 24 ++++++++++++++++++++++++
 pom.xml      | 11 -----------
 2 files changed, 24 insertions(+), 11 deletions(-)
[spark] branch master updated (be52d67fbe9 -> acdacf4f438)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from be52d67fbe9 [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service
     add acdacf4f438 [SPARK-41402][SQL][CONNECT] Override prettyName of StringDecode

No new revisions were added by this update.

Summary of changes:
 python/pyspark/sql/connect/functions.py            | 61 +++++++-----------
 python/pyspark/sql/functions.py                    | 10 ++--
 .../sql/tests/connect/test_connect_function.py     | 20 ++++++-
 .../catalyst/expressions/stringExpressions.scala   |  2 +
 4 files changed, 41 insertions(+), 52 deletions(-)
[spark] branch branch-3.2 updated: [SPARK-40270][PS][FOLLOWUP][3.2] Skip test_style when pandas <1.3.0
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 43402fdeb09 [SPARK-40270][PS][FOLLOWUP][3.2] Skip test_style when pandas <1.3.0
43402fdeb09 is described below

commit 43402fdeb0942e518ec7f5561ddf3690ae5cac27
Author: Yikun Jiang
AuthorDate: Fri Dec 9 22:15:48 2022 +0800

    [SPARK-40270][PS][FOLLOWUP][3.2] Skip test_style when pandas <1.3.0

    ### What changes were proposed in this pull request?
    According to https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html, `pandas.io.formats.style.Styler.to_latex` was introduced in pandas 1.3.0, so the check should be skipped on older pandas:
    ```
    ERROR [0.180s]: test_style (pyspark.pandas.tests.test_dataframe.DataFrameTest)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5795, in test_style
        check_style()
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5793, in check_style
        self.assert_eq(pdf_style.to_latex(), psdf_style.to_latex())
    AttributeError: 'Styler' object has no attribute 'to_latex'
    ```
    Related: https://github.com/apache/spark/commit/58375a86e6ff49c5bcee49939fbd98eb848ae59f

    ### Why are the changes needed?
    This test breaks the branch-3.2 PySpark tests (with Python 3.6 + pandas 1.1.x), so it is better to add the `skipIf` guard. See also https://github.com/apache/spark/pull/38982#issuecomment-1343923114.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    CI passed

    Closes #39008 from Yikun/branch-3.2-style-check.

    Authored-by: Yikun Jiang
    Signed-off-by: Yikun Jiang
---
 python/pyspark/pandas/tests/test_dataframe.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index b4187d59ae7..15cadbebdb6 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -5774,6 +5774,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             for value_psdf, value_pdf in zip(psdf, pdf):
                 self.assert_eq(value_psdf, value_pdf)

+    @unittest.skipIf(
+        LooseVersion(pd.__version__) < LooseVersion("1.3.0"),
+        "pandas support `Styler.to_latex` since 1.3.0",
+    )
     def test_style(self):
         # Currently, the `style` function returns a pandas object `Styler` as it is,
         # processing only the number of rows declared in `compute.max_rows`.
[spark] branch branch-3.3 updated: [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 88d20e4c71e [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service
88d20e4c71e is described below

commit 88d20e4c71e757103984316eaab283c4169aa38c
Author: Cheng Pan
AuthorDate: Fri Dec 9 08:14:17 2022 -0600

    [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service

    ### What changes were proposed in this pull request?
    Correctly transform the SPI services for the Yarn Shuffle Service by configuring `ServicesResourceTransformer`.

    ### Why are the changes needed?
    SPARK-12807 relocated the Jackson classes, but did not handle SPI services properly. It affects Spark 2.0 and above, so this PR is for 3.2/3.3/master.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Manually checked the output jar.

    Before:
    https://user-images.githubusercontent.com/26535726/206632421-acbec562-c600-4497-83a3-f9b2f6caba74.png

    After:
    https://user-images.githubusercontent.com/26535726/206632440-4c8ed745-dbc8-4b6e-a9e7-f285521aa8b4.png

    Closes #38989 from pan3793/SPARK-41458.

    Authored-by: Cheng Pan
    Signed-off-by: Sean Owen
    (cherry picked from commit be52d67fbe98110eeabf1b2a7c16741dceefdca6)
    Signed-off-by: Sean Owen
---
 common/network-yarn/pom.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index 81146a36c98..cded75b2f8e 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -136,6 +136,11 @@ shade
+          <transformers>
+            <transformer
+                implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer">
+            </transformer>
+          </transformers>
[spark] branch master updated: [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new be52d67fbe9 [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service
be52d67fbe9 is described below

commit be52d67fbe98110eeabf1b2a7c16741dceefdca6
Author: Cheng Pan
AuthorDate: Fri Dec 9 08:14:17 2022 -0600

    [SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI services for Yarn Shuffle Service

    ### What changes were proposed in this pull request?
    Correctly transform the SPI services for the Yarn Shuffle Service by configuring `ServicesResourceTransformer`.

    ### Why are the changes needed?
    SPARK-12807 relocated the Jackson classes, but did not handle SPI services properly. It affects Spark 2.0 and above, so this PR is for 3.2/3.3/master.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Manually checked the output jar.

    Before:
    https://user-images.githubusercontent.com/26535726/206632421-acbec562-c600-4497-83a3-f9b2f6caba74.png

    After:
    https://user-images.githubusercontent.com/26535726/206632440-4c8ed745-dbc8-4b6e-a9e7-f285521aa8b4.png

    Closes #38989 from pan3793/SPARK-41458.

    Authored-by: Cheng Pan
    Signed-off-by: Sean Owen
---
 common/network-yarn/pom.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index a4073969dbf..a77732bb8b8 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -136,6 +136,11 @@ shade
+          <transformers>
+            <transformer
+                implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer">
+            </transformer>
+          </transformers>
[spark] branch branch-3.3 updated: [SPARK-40270][PS][FOLLOWUP][3.3] Skip test_style when pandas <1.3.0
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new b6c6526e3b1 [SPARK-40270][PS][FOLLOWUP][3.3] Skip test_style when pandas <1.3.0
b6c6526e3b1 is described below

commit b6c6526e3b1c5bd32b010a38cb0f4faeba678e22
Author: Yikun Jiang
AuthorDate: Fri Dec 9 22:13:09 2022 +0800

    [SPARK-40270][PS][FOLLOWUP][3.3] Skip test_style when pandas <1.3.0

    ### What changes were proposed in this pull request?
    According to https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html, `pandas.io.formats.style.Styler.to_latex` was introduced in pandas 1.3.0, so the check should be skipped on older pandas:
    ```
    ERROR [0.180s]: test_style (pyspark.pandas.tests.test_dataframe.DataFrameTest)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5795, in test_style
        check_style()
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5793, in check_style
        self.assert_eq(pdf_style.to_latex(), psdf_style.to_latex())
    AttributeError: 'Styler' object has no attribute 'to_latex'
    ```
    Related: https://github.com/apache/spark/commit/58375a86e6ff49c5bcee49939fbd98eb848ae59f

    ### Why are the changes needed?
    This test breaks the branch-3.2 PySpark tests (with Python 3.6 + pandas 1.1.x), so it is better to add the `skipIf` guard. See also https://github.com/apache/spark/pull/38982#issuecomment-1343923114.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    - CI passed

    Closes #39007 from Yikun/branch-3.3-check.

    Authored-by: Yikun Jiang
    Signed-off-by: Yikun Jiang
---
 python/pyspark/pandas/tests/test_dataframe.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 0a7eda77564..0c23bf07a69 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -6375,6 +6375,10 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         psdf = ps.from_pandas(pdf)
         self.assert_eq(pdf.cov(), psdf.cov())

+    @unittest.skipIf(
+        LooseVersion(pd.__version__) < LooseVersion("1.3.0"),
+        "pandas support `Styler.to_latex` since 1.3.0",
+    )
     def test_style(self):
         # Currently, the `style` function returns a pandas object `Styler` as it is,
         # processing only the number of rows declared in `compute.max_rows`.
[spark] branch master updated: [SPARK-40270][PS][FOLLOWUP] Skip test_style when pandas <1.3.0
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new dd0bd0762b3 [SPARK-40270][PS][FOLLOWUP] Skip test_style when pandas <1.3.0
dd0bd0762b3 is described below

commit dd0bd0762b344ab34e1b08c9bbd2ac77b83856e0
Author: Yikun Jiang
AuthorDate: Fri Dec 9 22:11:03 2022 +0800

    [SPARK-40270][PS][FOLLOWUP] Skip test_style when pandas <1.3.0

    ### What changes were proposed in this pull request?
    According to https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html, `pandas.io.formats.style.Styler.to_latex` was introduced in pandas 1.3.0, so the check should be skipped on older pandas:
    ```
    ERROR [0.180s]: test_style (pyspark.pandas.tests.test_dataframe.DataFrameTest)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5795, in test_style
        check_style()
      File "/__w/spark/spark/python/pyspark/pandas/tests/test_dataframe.py", line 5793, in check_style
        self.assert_eq(pdf_style.to_latex(), psdf_style.to_latex())
    AttributeError: 'Styler' object has no attribute 'to_latex'
    ```
    Related: https://github.com/apache/spark/commit/58375a86e6ff49c5bcee49939fbd98eb848ae59f

    ### Why are the changes needed?
    This test breaks the branch-3.2 PySpark tests (with Python 3.6 + pandas 1.1.x), so it is better to add the `skipIf` guard. See also https://github.com/apache/spark/pull/38982#issuecomment-1343923114.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    - CI passed
    - Test on 3.2 branch: https://github.com/Yikun/spark/pull/194, https://github.com/Yikun/spark/actions/runs/3655564439/jobs/6177030747

    Closes #39002 from Yikun/skip-check.

    Authored-by: Yikun Jiang
    Signed-off-by: Yikun Jiang
---
 python/pyspark/pandas/tests/test_dataframe.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 4e80c680b6e..ded110c1231 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -7074,6 +7074,10 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         psdf = ps.from_pandas(pdf)
         self.assert_eq(pdf.cov(), psdf.cov())

+    @unittest.skipIf(
+        LooseVersion(pd.__version__) < LooseVersion("1.3.0"),
+        "pandas support `Styler.to_latex` since 1.3.0",
+    )
     def test_style(self):
         # Currently, the `style` function returns a pandas object `Styler` as it is,
         # processing only the number of rows declared in `compute.max_rows`.
[spark] branch master updated (fec210b36be -> 0cfda39e6d5)
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from fec210b36be [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
     add 0cfda39e6d5 [SPARK-41446][CONNECT][PYTHON] Make `createDataFrame` support schema and more input dataset types

No new revisions were added by this update.

Summary of changes:
 .../main/protobuf/spark/connect/relations.proto    |  12 ++
 .../sql/connect/planner/SparkConnectPlanner.scala  |  38 -
 python/pyspark/sql/connect/plan.py                 |  31 +++-
 python/pyspark/sql/connect/proto/relations_pb2.py  | 175 +++--
 python/pyspark/sql/connect/proto/relations_pb2.pyi |  32 +++-
 python/pyspark/sql/connect/session.py              | 114 --
 .../sql/tests/connect/test_connect_basic.py        |  88 ++-
 7 files changed, 378 insertions(+), 112 deletions(-)
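The summary above only lists the touched files. As a rough illustration of what the improved `createDataFrame` accepts, here is a sketch using the classic PySpark API, which the Connect client mirrors; it assumes a Spark build that includes this change, and the data and schema are made up for illustration.

```python
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A DDL-formatted schema string applied to plain Python rows.
df1 = spark.createDataFrame([(1, "a"), (2, "b")], schema="id INT, value STRING")
df1.printSchema()

# A pandas DataFrame as the input dataset, with the same schema applied.
pdf = pd.DataFrame({"id": [3, 4], "value": ["c", "d"]})
df2 = spark.createDataFrame(pdf, schema="id INT, value STRING")
df2.show()
```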
[spark] branch branch-3.2 updated: [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 556a3cac4b0 [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
556a3cac4b0 is described below

commit 556a3cac4b014c3905ab95da14a6a881123d09c7
Author: Bruce Robbins
AuthorDate: Fri Dec 9 21:44:45 2022 +0900

    [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row

    Change `InterpretedMutableProjection` to use `setDecimal` rather than `setNullAt` to set null values for decimals in unsafe rows.

    The following returns the wrong answer:

    ```
    set spark.sql.codegen.wholeStage=false;
    set spark.sql.codegen.factoryMode=NO_CODEGEN;

    select max(col1), max(col2) from values
    (cast(null as decimal(27,2)), cast(null as decimal(27,2))),
    (cast(77.77 as decimal(27,2)), cast(245.00 as decimal(27,2)))
    as data(col1, col2);

    +---------+---------+
    |max(col1)|max(col2)|
    +---------+---------+
    |null     |239.88   |
    +---------+---------+
    ```

    This is because `InterpretedMutableProjection` inappropriately uses `InternalRow#setNullAt` on unsafe rows to set null for decimal types with precision > `Decimal.MAX_LONG_DIGITS`. When `setNullAt` is used, the pointer to the decimal's storage area in the variable-length region gets zeroed out. Later, when `InterpretedMutableProjection` calls `setDecimal` on that field, `UnsafeRow#setDecimal` picks up the zero pointer and stores decimal data on top of the null-tracking bit set. Later updates to the null-tracking bit set (e.g., calls to `setNotNullAt`) further corrupt the decimal data (turning 245.00 into 239.88, for example). The stomping of the null-tracking bi [...]

    This bug can manifest for end-users after codegen fallback (say, if an expression's generated code fails to compile). [Codegen for mutable projection](https://github.com/apache/spark/blob/89b2ee27d258dec8fe265fa862846e800a374d8e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala#L1729) uses `mutableRow.setDecimal` for null decimal values regardless of precision or the type of `mutableRow`, so this PR does the same.

    No.

    New unit tests.

    Closes #38923 from bersprockets/unsafe_decimal_issue.

    Authored-by: Bruce Robbins
    Signed-off-by: Hyukjin Kwon
    (cherry picked from commit fec210b36be22f187b51b67970960692f75ac31f)
    Signed-off-by: Hyukjin Kwon
---
 .../expressions/InterpretedMutableProjection.scala |  3 +-
 .../expressions/MutableProjectionSuite.scala       | 62 ++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
index 91c9457af7d..4e129e96d1c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.DecimalType

 /**
@@ -72,7 +73,7 @@ class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mutable
   private[this] val fieldWriters: Array[Any => Unit] = validExprs.map { case (e, i) =>
     val writer = InternalRow.getWriter(i, e.dataType)
-    if (!e.nullable) {
+    if (!e.nullable || e.dataType.isInstanceOf[DecimalType]) {
       (v: Any) => writer(mutableRow, v)
     } else {
       (v: Any) => {

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
index 0f01bfbb894..e3f11283816 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
@@ -65,6 +65,68 @@ class MutableProjectionSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(SafeProjection.create(fixedLengthTypes)(projUnsafeRow) === inputRow)
   }

+  def testRows(
+      bufferSchema: StructType,
+      buffer: InternalRow,
+      scalaRows: Seq[Seq[Any]]): Unit = {
+    val bufferTypes = bufferSchema
[spark] branch branch-3.3 updated: [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new d39e7bac73d [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
d39e7bac73d is described below

commit d39e7bac73d6b4f12492cef5e8a31e406b2a5d3a
Author: Bruce Robbins
AuthorDate: Fri Dec 9 21:44:45 2022 +0900

    [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row

    Change `InterpretedMutableProjection` to use `setDecimal` rather than `setNullAt` to set null values for decimals in unsafe rows.

    The following returns the wrong answer:

    ```
    set spark.sql.codegen.wholeStage=false;
    set spark.sql.codegen.factoryMode=NO_CODEGEN;

    select max(col1), max(col2) from values
    (cast(null as decimal(27,2)), cast(null as decimal(27,2))),
    (cast(77.77 as decimal(27,2)), cast(245.00 as decimal(27,2)))
    as data(col1, col2);

    +---------+---------+
    |max(col1)|max(col2)|
    +---------+---------+
    |null     |239.88   |
    +---------+---------+
    ```

    This is because `InterpretedMutableProjection` inappropriately uses `InternalRow#setNullAt` on unsafe rows to set null for decimal types with precision > `Decimal.MAX_LONG_DIGITS`. When `setNullAt` is used, the pointer to the decimal's storage area in the variable-length region gets zeroed out. Later, when `InterpretedMutableProjection` calls `setDecimal` on that field, `UnsafeRow#setDecimal` picks up the zero pointer and stores decimal data on top of the null-tracking bit set. Later updates to the null-tracking bit set (e.g., calls to `setNotNullAt`) further corrupt the decimal data (turning 245.00 into 239.88, for example). The stomping of the null-tracking bi [...]

    This bug can manifest for end-users after codegen fallback (say, if an expression's generated code fails to compile). [Codegen for mutable projection](https://github.com/apache/spark/blob/89b2ee27d258dec8fe265fa862846e800a374d8e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala#L1729) uses `mutableRow.setDecimal` for null decimal values regardless of precision or the type of `mutableRow`, so this PR does the same.

    No.

    New unit tests.

    Closes #38923 from bersprockets/unsafe_decimal_issue.

    Authored-by: Bruce Robbins
    Signed-off-by: Hyukjin Kwon
    (cherry picked from commit fec210b36be22f187b51b67970960692f75ac31f)
    Signed-off-by: Hyukjin Kwon
---
 .../expressions/InterpretedMutableProjection.scala |  3 +-
 .../expressions/MutableProjectionSuite.scala       | 62 ++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
index 91c9457af7d..4e129e96d1c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.DecimalType

 /**
@@ -72,7 +73,7 @@ class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mutable
   private[this] val fieldWriters: Array[Any => Unit] = validExprs.map { case (e, i) =>
     val writer = InternalRow.getWriter(i, e.dataType)
-    if (!e.nullable) {
+    if (!e.nullable || e.dataType.isInstanceOf[DecimalType]) {
       (v: Any) => writer(mutableRow, v)
     } else {
       (v: Any) => {

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
index 0f01bfbb894..e3f11283816 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
@@ -65,6 +65,68 @@ class MutableProjectionSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(SafeProjection.create(fixedLengthTypes)(projUnsafeRow) === inputRow)
   }

+  def testRows(
+      bufferSchema: StructType,
+      buffer: InternalRow,
+      scalaRows: Seq[Seq[Any]]): Unit = {
+    val bufferTypes = bufferSchema
[spark] branch master updated: [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new fec210b36be [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row
fec210b36be is described below

commit fec210b36be22f187b51b67970960692f75ac31f
Author: Bruce Robbins
AuthorDate: Fri Dec 9 21:44:45 2022 +0900

    [SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDecimal` to set null values for decimals in an unsafe row

    ### What changes were proposed in this pull request?

    Change `InterpretedMutableProjection` to use `setDecimal` rather than `setNullAt` to set null values for decimals in unsafe rows.

    ### Why are the changes needed?

    The following returns the wrong answer:

    ```
    set spark.sql.codegen.wholeStage=false;
    set spark.sql.codegen.factoryMode=NO_CODEGEN;

    select max(col1), max(col2) from values
    (cast(null as decimal(27,2)), cast(null as decimal(27,2))),
    (cast(77.77 as decimal(27,2)), cast(245.00 as decimal(27,2)))
    as data(col1, col2);

    +---------+---------+
    |max(col1)|max(col2)|
    +---------+---------+
    |null     |239.88   |
    +---------+---------+
    ```

    This is because `InterpretedMutableProjection` inappropriately uses `InternalRow#setNullAt` on unsafe rows to set null for decimal types with precision > `Decimal.MAX_LONG_DIGITS`. When `setNullAt` is used, the pointer to the decimal's storage area in the variable-length region gets zeroed out. Later, when `InterpretedMutableProjection` calls `setDecimal` on that field, `UnsafeRow#setDecimal` picks up the zero pointer and stores decimal data on top of the null-tracking bit set. Later updates to the null-tracking bit set (e.g., calls to `setNotNullAt`) further corrupt the decimal data (turning 245.00 into 239.88, for example). The stomping of the null-tracking bi [...]

    This bug can manifest for end-users after codegen fallback (say, if an expression's generated code fails to compile). [Codegen for mutable projection](https://github.com/apache/spark/blob/89b2ee27d258dec8fe265fa862846e800a374d8e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala#L1729) uses `mutableRow.setDecimal` for null decimal values regardless of precision or the type of `mutableRow`, so this PR does the same.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    New unit tests.

    Closes #38923 from bersprockets/unsafe_decimal_issue.

    Authored-by: Bruce Robbins
    Signed-off-by: Hyukjin Kwon
---
 .../expressions/InterpretedMutableProjection.scala |  3 +-
 .../expressions/MutableProjectionSuite.scala       | 62 ++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
index 91c9457af7d..4e129e96d1c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala
@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.DecimalType

 /**
@@ -72,7 +73,7 @@ class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mutable
   private[this] val fieldWriters: Array[Any => Unit] = validExprs.map { case (e, i) =>
     val writer = InternalRow.getWriter(i, e.dataType)
-    if (!e.nullable) {
+    if (!e.nullable || e.dataType.isInstanceOf[DecimalType]) {
       (v: Any) => writer(mutableRow, v)
     } else {
       (v: Any) => {

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
index 0f01bfbb894..e3f11283816 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala
@@ -65,6 +65,68 @@ class MutableProjectionSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(SafeProjection.create(fixedLengthTypes)(projUnsafeRow) === inputRow)
   }

+  def testRows(
+      bufferSchema: StructType,
+      buffer:
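The SQL repro in the commit message can also be driven from PySpark. The following is a sketch of that, assuming both configs are session-settable (as the `set` statements above indicate) and that a build without the fix reproduces the corrupted result.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Force the interpreted, non-codegen path where the bug lived.
spark.conf.set("spark.sql.codegen.wholeStage", "false")
spark.conf.set("spark.sql.codegen.factoryMode", "NO_CODEGEN")

spark.sql("""
    SELECT max(col1), max(col2) FROM VALUES
      (CAST(NULL  AS DECIMAL(27,2)), CAST(NULL   AS DECIMAL(27,2))),
      (CAST(77.77 AS DECIMAL(27,2)), CAST(245.00 AS DECIMAL(27,2)))
      AS data(col1, col2)
""").show()
# Affected builds print 239.88 for max(col2); fixed builds print 245.00.
```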
[spark] 01/01: Update test_dataframe.py
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch branch-3.2-style-check
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 49d31b0d860da90cf2f4ec696b3220f24355f65e
Author: Yikun Jiang
AuthorDate: Fri Dec 9 19:46:01 2022 +0800

    Update test_dataframe.py
---
 python/pyspark/pandas/tests/test_dataframe.py | 4
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index b4187d59ae7..15cadbebdb6 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -5774,6 +5774,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
         for value_psdf, value_pdf in zip(psdf, pdf):
             self.assert_eq(value_psdf, value_pdf)

+    @unittest.skipIf(
+        LooseVersion(pd.__version__) < LooseVersion("1.3.0"),
+        "pandas support `Styler.to_latex` since 1.3.0",
+    )
     def test_style(self):
         # Currently, the `style` function returns a pandas object `Styler` as it is,
         # processing only the number of rows declared in `compute.max_rows`.
[spark] branch branch-3.2-style-check created (now 49d31b0d860)
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a change to branch branch-3.2-style-check
in repository https://gitbox.apache.org/repos/asf/spark.git

      at 49d31b0d860 Update test_dataframe.py

This branch includes the following new commits:

     new 49d31b0d860 Update test_dataframe.py

The 1 revision listed above as "new" is entirely new to this repository and will be described in a separate email. The revisions listed as "add" were already present in the repository and have only been added to this reference.
[spark] branch master updated: [MINOR][DOC] Fix typo in SqlBaseLexer.g4
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new bc32392a3c8 [MINOR][DOC] Fix typo in SqlBaseLexer.g4
bc32392a3c8 is described below

commit bc32392a3c89a117219cd3c33fc4707ce5ecc002
Author: jiaoqb
AuthorDate: Fri Dec 9 02:22:20 2022 -0800

    [MINOR][DOC] Fix typo in SqlBaseLexer.g4

    ### What changes were proposed in this pull request?

    Fix a typo in SqlBaseLexer.g4 (`ParseExcetion` -> `ParseException`).

    ### Why are the changes needed?

    Better documentation

    ### Does this PR introduce _any_ user-facing change?

    No

    ### How was this patch tested?

    N/A

    Closes #38990 from jiaoqingbo/typefix.

    Authored-by: jiaoqb
    Signed-off-by: Dongjoon Hyun
---
 .../main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index 41adbda7b10..38f52901aa2 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -18,7 +18,7 @@ lexer grammar SqlBaseLexer;

 @members {
   /**
-   * When true, parser should throw ParseExcetion for unclosed bracketed comment.
+   * When true, parser should throw ParseException for unclosed bracketed comment.
    */
   public boolean has_unclosed_bracketed_comment = false;
[spark] branch master updated (bd37b10c756 -> 83c12c2c0f8)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from bd37b10c756 [SPARK-41377][BUILD] Fix spark-version-info.properties not found on Windows
     add 83c12c2c0f8 [SPARK-39948][BUILD] Exclude hive-vector-code-gen dependency

No new revisions were added by this update.

Summary of changes:
 dev/deps/spark-deps-hadoop-2-hive-2.3                          | 2 --
 dev/deps/spark-deps-hadoop-3-hive-2.3                          | 2 --
 pom.xml                                                        | 4
 .../main/scala/org/apache/spark/sql/hive/client/package.scala  | 9 ++---
 4 files changed, 10 insertions(+), 7 deletions(-)
[spark] branch master updated: [SPARK-41377][BUILD] Fix spark-version-info.properties not found on Windows
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new bd37b10c756 [SPARK-41377][BUILD] Fix spark-version-info.properties not found on Windows
bd37b10c756 is described below

commit bd37b10c7568b0fe07c98da5a532c5ff84d702f6
Author: Gautham Banasandra
AuthorDate: Fri Dec 9 02:06:12 2022 -0800

    [SPARK-41377][BUILD] Fix spark-version-info.properties not found on Windows

    ### What changes were proposed in this pull request?

    This PR enhances the Maven build configuration to automatically detect the platform and switch between PowerShell on Windows and Bash on non-Windows OSes when generating the `spark-version-info.properties` file.

    ### Why are the changes needed?

    While building Spark, the `spark-version-info.properties` file [is generated using bash](https://github.com/apache/spark/blob/d62c18b7497997188ec587e1eb62e75c979c1c93/core/pom.xml#L560-L564). In a Windows environment, if Windows Subsystem for Linux (WSL) is installed, it overrides the other bash executables in the PATH, as noted in SPARK-40739. The bash in WSL has a different mounting configuration and thus [the target location specified for spark-version-info.properties](https [...]

    This PR fixes the issue by directing the build system to use the right shell according to the platform.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    I tested this by building on a Windows 10 PC.

    ```psh
    mvn -Pyarn '-Dhadoop.version=3.3.0' -DskipTests clean package
    ```

    Once the build finished, I verified that the `spark-version-info.properties` file was included in the spark-core jar.

    ![image](https://user-images.githubusercontent.com/10280768/205497898-80e53617-c991-460e-b04a-a3bdd4f298ae.png)

    I also ran the SparkPi application and verified that it ran successfully without any errors.

    ![image](https://user-images.githubusercontent.com/10280768/205499567-f6e8e10a-dcbb-45fb-b282-fc29ba58adee.png)

    Closes #38903 from GauthamBanasandra/spark-version-info-ps.

    Authored-by: Gautham Banasandra
    Signed-off-by: Dongjoon Hyun
---
 appveyor.yml               |  1 +
 build/spark-build-info.ps1 | 46 ++
 core/pom.xml               | 34 --
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 1a2aef0d3b8..fdb247d5d43 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -28,6 +28,7 @@ only_commits:
   files:
     - appveyor.yml
     - dev/appveyor-install-dependencies.ps1
+    - build/spark-build-info.ps1
     - R/
     - sql/core/src/main/scala/org/apache/spark/sql/api/r/
     - core/src/main/scala/org/apache/spark/api/r/

diff --git a/build/spark-build-info.ps1 b/build/spark-build-info.ps1
new file mode 100644
index 000..43db8823340
--- /dev/null
+++ b/build/spark-build-info.ps1
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script generates the build info for spark and places it into the spark-version-info.properties file.
+# Arguments:
+#   ResourceDir - The target directory where properties file would be created. [./core/target/extra-resources]
+#   SparkVersion - The current version of spark
+
+param(
+    # The resource directory.
+    [Parameter(Position = 0)]
+    [String]
+    $ResourceDir,
+
+    # The Spark version.
+    [Parameter(Position = 1)]
+    [String]
+    $SparkVersion
+)
+
+$null = New-Item -Type Directory -Force $ResourceDir
+$SparkBuildInfoPath = $ResourceDir.TrimEnd('\').TrimEnd('/') + '\spark-version-info.properties'
+
+$SparkBuildInfoContent =
+"version=$SparkVersion
+user=$($Env:USERNAME)
+revision=$(git rev-parse HEAD)
+branch=$(git rev-parse --abbrev-ref HEAD)
+date=$([DateTime]::UtcNow | Get-Date -UFormat +%Y-%m-%dT%H:%M:%SZ)
+url=$(git config --get remote.origin.url)"
+
+Set-Content -Path $SparkBuildInfoPath -Value $SparkBuildInfoContent

diff --git a/core/pom.xml b/core/pom.x
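For reference, both the Bash and PowerShell variants emit the same six keys. The following hedged Python sketch shows roughly what lands in `spark-version-info.properties`; the version string and the hard-coded `user` value are placeholders, not part of the actual build scripts.

```python
import subprocess
from datetime import datetime, timezone


def git(*args: str) -> str:
    # Thin wrapper over the same git queries the build scripts run.
    return subprocess.check_output(["git", *args], text=True).strip()


def build_info(version: str) -> str:
    # Mirrors the key set written by build/spark-build-info(.ps1).
    return "\n".join([
        f"version={version}",
        "user=builder",  # the real scripts use $USER / $Env:USERNAME
        f"revision={git('rev-parse', 'HEAD')}",
        f"branch={git('rev-parse', '--abbrev-ref', 'HEAD')}",
        f"date={datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}",
        f"url={git('config', '--get', 'remote.origin.url')}",
    ])


print(build_info("3.4.0-SNAPSHOT"))  # version argument is a placeholder
```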
[spark] branch master updated: [SPARK-41439][CONNECT][PYTHON] Implement `DataFrame.melt` and `DataFrame.unpivot`
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 3212fa96016 [SPARK-41439][CONNECT][PYTHON] Implement `DataFrame.melt` and `DataFrame.unpivot`
3212fa96016 is described below

commit 3212fa960169b1f1c29d63185aa96d535798fcc4
Author: Jiaan Geng
AuthorDate: Fri Dec 9 16:27:01 2022 +0800

    [SPARK-41439][CONNECT][PYTHON] Implement `DataFrame.melt` and `DataFrame.unpivot`

    ### What changes were proposed in this pull request?

    Implement `DataFrame.melt` and `DataFrame.unpivot` with a proto message:

    1. Implement `DataFrame.melt` and `DataFrame.unpivot` for the Scala API
    2. Implement `DataFrame.melt` and `DataFrame.unpivot` for the Python API

    ### Why are the changes needed?

    For Connect API coverage.

    ### Does this PR introduce _any_ user-facing change?

    No. New API.

    ### How was this patch tested?

    New test cases.

    Closes #38973 from beliefer/SPARK-41439.

    Authored-by: Jiaan Geng
    Signed-off-by: Ruifeng Zheng
---
 .../main/protobuf/spark/connect/relations.proto    |  19 +++
 .../org/apache/spark/sql/connect/dsl/package.scala |  47 ++
 .../sql/connect/planner/SparkConnectPlanner.scala  |  33 +++-
 .../connect/planner/SparkConnectProtoSuite.scala   |  28
 python/pyspark/sql/connect/dataframe.py            |  35
 python/pyspark/sql/connect/plan.py                 |  59 +++
 python/pyspark/sql/connect/proto/relations_pb2.py  | 182 +++--
 python/pyspark/sql/connect/proto/relations_pb2.pyi |  72
 .../sql/tests/connect/test_connect_plan_only.py    |  58 +++
 9 files changed, 448 insertions(+), 85 deletions(-)

diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
index ece8767c06c..30468501236 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -54,6 +54,7 @@ message Relation {
     Tail tail = 22;
     WithColumns with_columns = 23;
     Hint hint = 24;
+    Unpivot unpivot = 25;

     // NA functions
     NAFill fill_na = 90;
@@ -570,3 +571,21 @@ message Hint {
   // (Optional) Hint parameters.
   repeated Expression.Literal parameters = 3;
 }
+
+// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set.
+message Unpivot {
+  // (Required) The input relation.
+  Relation input = 1;
+
+  // (Required) Id columns.
+  repeated Expression ids = 2;
+
+  // (Optional) Value columns to unpivot.
+  repeated Expression values = 3;
+
+  // (Required) Name of the variable column.
+  string variable_column_name = 4;
+
+  // (Required) Name of the value column.
+  string value_column_name = 5;
+}

diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
index fb79243ba37..545c2aaaf04 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
@@ -719,6 +719,53 @@ package object dsl {
         .build()
     }

+    def unpivot(
+        ids: Seq[Expression],
+        values: Seq[Expression],
+        variableColumnName: String,
+        valueColumnName: String): Relation = {
+      Relation
+        .newBuilder()
+        .setUnpivot(
+          Unpivot
+            .newBuilder()
+            .setInput(logicalPlan)
+            .addAllIds(ids.asJava)
+            .addAllValues(values.asJava)
+            .setVariableColumnName(variableColumnName)
+            .setValueColumnName(valueColumnName))
+        .build()
+    }
+
+    def unpivot(
+        ids: Seq[Expression],
+        variableColumnName: String,
+        valueColumnName: String): Relation = {
+      Relation
+        .newBuilder()
+        .setUnpivot(
+          Unpivot
+            .newBuilder()
+            .setInput(logicalPlan)
+            .addAllIds(ids.asJava)
+            .setVariableColumnName(variableColumnName)
+            .setValueColumnName(valueColumnName))
+        .build()
+    }
+
+    def melt(
+        ids: Seq[Expression],
+        values: Seq[Expression],
+        variableColumnName: String,
+        valueColumnName: String): Relation =
+      unpivot(ids, values, variableColumnName, valueColumnName)
+
+    def melt(
+        ids: Seq[Expression],
+        variableColumnName: String,
+        valueColumnName: String): Relation =
+      unpivot(ids, variableColumnName, valueColumnName)
+
+    private def createSet
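To make the new API concrete, here is a usage sketch of `unpivot`/`melt` from PySpark. It assumes a Spark build (3.4+) where this API is available; the column names and data are made up for illustration, and the `show()` output in the comment is approximate.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

wide = spark.createDataFrame(
    [(1, 11, 1.1), (2, 12, 1.2)],
    ["id", "int", "double"],
)

# Wide-to-long: `id` stays as the identifier column, the remaining columns
# become (variable, value) rows; the value columns are upcast to a common
# type (double here). `melt` is an alias for `unpivot`.
long_df = wide.unpivot("id", ["int", "double"], "var", "val")
long_df.show()
# +---+------+----+
# | id|   var| val|
# +---+------+----+
# |  1|   int|11.0|
# |  1|double| 1.1|
# |  2|   int|12.0|
# |  2|double| 1.2|
# +---+------+----+
```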
[spark] branch master updated (1979169052e -> 24a588c2b78)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from 1979169052e Revert "[SPARK-41369][CONNECT] Remove unneeded connect server deps"
     add 24a588c2b78 [MINOR][CONNECT][DOCS] Document parallelism=1 in Spark Connect testing

No new revisions were added by this update.

Summary of changes:
 connector/connect/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)