svn commit: r69092 - in /dev/spark/v4.0.0-preview1-rc1-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/R/articles/ _site/api/R/articles/sparkr-vignettes_files/ _site/api/R/articles/sparkr-vignettes_
Author: wenchen Date: Fri May 10 16:44:08 2024 New Revision: 69092 Log: Apache Spark v4.0.0-preview1-rc1 docs [This commit notification would consist of 4810 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a6632ffa16f6 [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser a6632ffa16f6 is described below commit a6632ffa16f6907eba96e745920d571924bf4b63 Author: Vladimir Golubev AuthorDate: Sat May 11 00:37:54 2024 +0800 [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser # What changes were proposed in this pull request? New lightweight exception for control-flow between UnivocityParser and FailureSafeParser to speed-up malformed CSV parsing. This is a different way to implement these reverted changes: https://github.com/apache/spark/pull/46478 The previous implementation was more invasive - removing `cause` from `BadRecordException` could break upper code, which unwraps errors and checks the types of the causes. This implementation only touches `FailureSafeParser` and `UnivocityParser` since in the codebase they are always used together, unlike `JacksonParser` and `StaxXmlParser`. Removing stacktrace from `BadRecordException` is safe, since the cause itself has an adequate stacktrace (except pure control-flow cases). ### Why are the changes needed? Parsing in `PermissiveMode` is slow due to heavy exception construction (stacktrace filling + string template substitution in `SparkRuntimeException`) ### Does this PR introduce _any_ user-facing change? No, since `FailureSafeParser` unwraps `BadRecordException` and correctly rethrows user-facing exceptions in `FailFastMode` ### How was this patch tested? 
- `testOnly org.apache.spark.sql.catalyst.csv.UnivocityParserSuite` - Manually run csv benchmark - Manually checked correct and malformed csv in spark-shell (org.apache.spark.SparkException is thrown with the stacktrace) ### Was this patch authored or co-authored using generative AI tooling? No Closes #46500 from vladimirg-db/vladimirg-db/use-special-lighweight-exception-for-control-flow-between-univocity-parser-and-failure-safe-parser. Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/csv/UnivocityParser.scala | 5 +++-- .../sql/catalyst/util/BadRecordException.scala | 22 +++--- .../sql/catalyst/util/FailureSafeParser.scala | 11 +-- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index a5158d8a22c6..4d95097e1681 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -316,7 +316,7 @@ class UnivocityParser( throw BadRecordException( () => getCurrentInput, () => Array.empty, -QueryExecutionErrors.malformedCSVRecordError("")) +LazyBadRecordCauseWrapper(() => QueryExecutionErrors.malformedCSVRecordError(""))) } val currentInput = getCurrentInput @@ -326,7 +326,8 @@ class UnivocityParser( // However, we still have chance to parse some of the tokens. It continues to parses the // tokens normally and sets null when `ArrayIndexOutOfBoundsException` occurs for missing // tokens. 
- Some(QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) + Some(LazyBadRecordCauseWrapper( +() => QueryExecutionErrors.malformedCSVRecordError(currentInput.toString))) } else None // When the length of the returned tokens is identical to the length of the parsed schema, // we just need to: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala index 65a56c1064e4..654b0b8c73e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala @@ -67,16 +67,32 @@ case class PartialResultArrayException( extends Exception(cause) /** - * Exception thrown when the underlying parser meet a bad record and can't parse it. + * Exception thrown when the underlying parser met a bad record and can't parse it. + * The stacktrace is not collected for better preformance, and thus, this exception should + * not be used in a user-facing context. * @param record a function to return the record that cause the parser to fail * @param partialResults a fu
(spark) branch master updated: [SPARK-48146][SQL] Fix aggregate function in With expression child assertion
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7ef0440ef221 [SPARK-48146][SQL] Fix aggregate function in With expression child assertion 7ef0440ef221 is described below commit 7ef0440ef22161a6160f7b9000c70b26c84eecf7 Author: Kelvin Jiang AuthorDate: Fri May 10 22:39:15 2024 +0800 [SPARK-48146][SQL] Fix aggregate function in With expression child assertion ### What changes were proposed in this pull request? In https://github.com/apache/spark/pull/46034, there was a complicated edge case where common expression references in aggregate functions in the child of a `With` expression could become dangling. An assertion was added to avoid that case from happening, but the assertion wasn't fully accurate as a query like: ``` select id between max(if(id between 1 and 2, 2, 1)) over () and id from range(10) ``` would fail the assertion. This PR fixes the assertion to be more accurate. ### Why are the changes needed? This addresses a regression in https://github.com/apache/spark/pull/46034. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46443 from kelvinjian-db/SPARK-48146-agg. 
Authored-by: Kelvin Jiang Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/With.scala | 26 + .../optimizer/RewriteWithExpressionSuite.scala | 27 +- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala index 14deedd9c70f..29794b33641c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.trees.TreePattern.{AGGREGATE_EXPRESSION, COMMON_EXPR_REF, TreePattern, WITH_EXPRESSION} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, TreePattern, WITH_EXPRESSION} import org.apache.spark.sql.types.DataType /** @@ -27,9 +28,11 @@ import org.apache.spark.sql.types.DataType */ case class With(child: Expression, defs: Seq[CommonExpressionDef]) extends Expression with Unevaluable { - // We do not allow With to be created with an AggregateExpression in the child, as this would - // create a dangling CommonExpressionRef after rewriting it in RewriteWithExpression. - assert(!child.containsPattern(AGGREGATE_EXPRESSION)) + // We do not allow creating a With expression with an AggregateExpression that contains a + // reference to a common expression defined in that scope (note that it can contain another With + // expression with a common expression ref of the inner With). This is to prevent the creation of + // a dangling CommonExpressionRef after rewriting it in RewriteWithExpression. 
+ assert(!With.childContainsUnsupportedAggExpr(this)) override val nodePatterns: Seq[TreePattern] = Seq(WITH_EXPRESSION) override def dataType: DataType = child.dataType @@ -92,6 +95,21 @@ object With { val commonExprRefs = commonExprDefs.map(new CommonExpressionRef(_)) With(replaced(commonExprRefs), commonExprDefs) } + + private[sql] def childContainsUnsupportedAggExpr(withExpr: With): Boolean = { +lazy val commonExprIds = withExpr.defs.map(_.id).toSet +withExpr.child.exists { + case agg: AggregateExpression => +// Check that the aggregate expression does not contain a reference to a common expression +// in the outer With expression (it is ok if it contains a reference to a common expression +// for a nested With expression). +agg.exists { + case r: CommonExpressionRef => commonExprIds.contains(r.id) + case _ => false +} + case _ => false +} + } } case class CommonExpressionId(id: Long = CommonExpressionId.newId, canonicalized: Boolean = false) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala index d482b18d9331..8f023fa4156b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/
(spark) branch master updated (33cac4436e59 -> 2df494fd4e4e)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 33cac4436e59 [SPARK-47847][CORE] Deprecate `spark.network.remoteReadNioBufferConversion` add 2df494fd4e4e [SPARK-48158][SQL] Add collation support for XML expressions No new revisions were added by this update. Summary of changes: .../sql/catalyst/expressions/xmlExpressions.scala | 9 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 124 + 2 files changed, 129 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-48222][INFRA][DOCS] Sync Ruby Bundler to 2.4.22 and refresh Gem lock file
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9a2818820f11 [SPARK-48222][INFRA][DOCS] Sync Ruby Bundler to 2.4.22 and refresh Gem lock file 9a2818820f11 is described below commit 9a2818820f11f9bdcc042f4ab80850918911c68c Author: Nicholas Chammas AuthorDate: Fri May 10 09:58:16 2024 +0800 [SPARK-48222][INFRA][DOCS] Sync Ruby Bundler to 2.4.22 and refresh Gem lock file ### What changes were proposed in this pull request? Sync the version of Bundler that we are using across various scripts and documentation. Also refresh the Gem lock file. ### Why are the changes needed? We are seeing inconsistent build behavior, likely due to the inconsistent Bundler versions. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI + the preview release process. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46512 from nchammas/bundler-sync. 
Authored-by: Nicholas Chammas Signed-off-by: Wenchen Fan --- .github/workflows/build_and_test.yml | 3 +++ dev/create-release/spark-rm/Dockerfile | 2 +- docs/Gemfile.lock | 16 docs/README.md | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4a11823aee60..881fb8cb0674 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -872,6 +872,9 @@ jobs: python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 - name: Install dependencies for documentation generation run: | +# Keep the version of Bundler here in sync with the following locations: +# - dev/create-release/spark-rm/Dockerfile +# - docs/README.md gem install bundler -v 2.4.22 cd docs bundle install diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 8d5ca38ba88e..13f4112ca03d 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -38,7 +38,7 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true ARG APT_INSTALL="apt-get install --no-install-recommends -y" ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==2.0.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" -ARG GEM_PKGS="bundler:2.3.8" +ARG GEM_PKGS="bundler:2.4.22" # Install extra needed repos and refresh. 
# - CRAN repo diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 4e38f18703f3..e137f0f039b9 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -4,16 +4,16 @@ GEM addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) colorator (1.1.0) -concurrent-ruby (1.2.2) +concurrent-ruby (1.2.3) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) eventmachine (1.2.7) ffi (1.16.3) forwardable-extended (2.6.0) -google-protobuf (3.25.2) +google-protobuf (3.25.3) http_parser.rb (0.8.0) -i18n (1.14.1) +i18n (1.14.5) concurrent-ruby (~> 1.0) jekyll (4.3.3) addressable (~> 2.4) @@ -42,22 +42,22 @@ GEM kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.4) -listen (3.8.0) +listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.4.0) pathutil (0.16.2) forwardable-extended (~> 2.6) -public_suffix (5.0.4) -rake (13.1.0) +public_suffix (5.0.5) +rake (13.2.1) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) rexml (3.2.6) rouge (3.30.0) safe_yaml (1.0.5) -sass-embedded (1.69.7) - google-protobuf (~> 3.25) +sass-embedded (1.63.6) + google-protobuf (~> 3.23) rake (>= 13.0.0) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) diff --git a/docs/README.md b/docs/README.md index 414c8dbd8303..363f1c207636 100644 --- a/docs/README.md +++ b/docs/README.md @@ -36,7 +36,7 @@ You need to have [Ruby 3][ruby] and [Python 3][python] installed. Make sure the [python]: https://www.python.org/downloads/ ```sh -$ gem install bundler +$ gem install bundler -v 2.4.22 ``` After this all the required Ruby dependencies can be installed from the `docs/` directory
svn commit: r69065 - /dev/spark/v4.0.0-preview1-rc1-bin/
Author: wenchen Date: Thu May 9 16:31:11 2024 New Revision: 69065 Log: Apache Spark v4.0.0-preview1-rc1 Added: dev/spark/v4.0.0-preview1-rc1-bin/ dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz (with props) dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.asc dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.sha512 dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz (with props) dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz.asc dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz.sha512 dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-hadoop3.tgz (with props) dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-hadoop3.tgz.asc dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-hadoop3.tgz.sha512 dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-without-hadoop.tgz (with props) dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-without-hadoop.tgz.asc dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1-bin-without-hadoop.tgz.sha512 dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1.tgz (with props) dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1.tgz.asc dev/spark/v4.0.0-preview1-rc1-bin/spark-4.0.0-preview1.tgz.sha512 Added: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz == Binary file - no diff available. 
Propchange: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz -- svn:mime-type = application/octet-stream Added: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.asc == --- dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.asc (added) +++ dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.asc Thu May 9 16:31:11 2024 @@ -0,0 +1,17 @@ +-BEGIN PGP SIGNATURE- + +iQJHBAABCgAxFiEETclnbO+ag+mPygJ4TWYghDzYf1oFAmY8+e4THHdlbmNoZW5A +YXBhY2hlLm9yZwAKCRBNZiCEPNh/Wv78D/9aNsBANuVpIjYr+XkWYaimRLJ5IT0Z +qKehjJBuMBDaBMMN3iWconDHBiASQT0FTYGDBeYI72fLFSMKBna5+Lu22+KD/K6h +V8SZxPSQsAHQABYq9ha++XXyo1Vo+msPQ0pQAblmTrSpsvSWZmC8spzb5GbKYvK5 +kxr4Qt1XnHeGNJNToqGlbl/Hc2Etg5PkPBxMPBWMh7kLknMEscMNUf87JqCIa8LG +hMid/0lrrevEm8gkuu0ol9Vgz4P+dreKE9eCfmWOXCod04y8tJnVPs83wUOZfmKV +dHkELaMVwz3fa40QP77gK38K5i22aUgYk6dvhB+OgtatZ5tk0Dxp3AI2OObngEUm +4cGmQLwcses53vApwkExq427gS8td4sTE2G1D4+hSdEcm8Fj69w4Ado/DlIAHZob +KLV15qtNOyaIapT4GxBqoeqsw7tnRmxiP8K8UxFcPV/vZC1yQKIIULigPjttZKoW ++REE2N7ZyPvbvgItwjAL8hpCeYEkd7RDa7ofHAv6icC1qSsJZ9gxFM4rJvriI4g2 +tnYEvZduGpBunhlwVb0R3kAF5XoLIZQ5qm6kyWAzioc0gxzYVc3Rd+bXjm+vmopt +bXHOM6N2lLQwqnWlHsyjGVFugrkkRXZbQbIV6FynXpKaz5YtkUhUMkofz7mOYhBi ++1Z8nZ04B6YLbw== +=85FX +-END PGP SIGNATURE- Added: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.sha512 == --- dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.sha512 (added) +++ dev/spark/v4.0.0-preview1-rc1-bin/pyspark-4.0.0.dev1.tar.gz.sha512 Thu May 9 16:31:11 2024 @@ -0,0 +1 @@ +2509cf6473495b0cd5c132d87f5e1c33593fa7375ca01bcab1483093cea92bdb6ad7afc7c72095376b28fc5acdc71bb323935d17513f33ee5276c6991ff668d1 pyspark-4.0.0.dev1.tar.gz Added: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz == Binary file - no diff available. 
Propchange: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz -- svn:mime-type = application/octet-stream Added: dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz.asc == --- dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz.asc (added) +++ dev/spark/v4.0.0-preview1-rc1-bin/pyspark-connect-4.0.0.dev1.tar.gz.asc Thu May 9 16:31:11 2024 @@ -0,0 +1,17 @@ +-BEGIN PGP SIGNATURE- + +iQJGBAABCgAxFiEETclnbO+ag+mPygJ4TWYghDzYf1oFAmY8+fATHHdlbmNoZW5A +YXBhY2hlLm9yZwAKCRBNZiCEPNh/WoCMD/iZjkaGTUqt3jkIjWIUzpQo+kLn8//m +f+hwUtAguXvbMJXwBOz/Q/f+KvGk0tutsbd6rmBB6cHjH4GoZPp1x6iBitFAO47r +kHy/0xYkb70SPQCWIGQQpRv3g0uxTmpqL9H4YcIvexkV2wXG5VSwGvbSI4596n7l +x7M3rRmFzrxhcNIYLQdhNuat0mwuJFWe6R7Zk7UYFFishn9dNt8EOYx8vsGAuMP8 +Uy3+7oZQOAGqdQGSL7Ev4Pqve7MrrPgGXaixGukXibi707NCURnHTDcenPfoEEiQ +Hj83I3G+JrRhtsue/103a/GnHheUgwE8oEkefnUX7qC5tSn4T8lI2KpDBv9AL1pm +Bv0eXf5X5xEM4wvO7DCgbeEDPLg72jjt9X8zjAYx05HddvTuPjeKEL+Ga6G0ueTz +HRXHrgd1EFZ1znPZhWiSTmeqZTXdrb6wKTYt8Y6mk1oEGL3b0qE2LNkSED+4l40u +41MlV3pmZyjRGYZl29XZKf4isKYyjec7UbJSM5ok4zCRF0p8Gvj0EihGS4X6rYpW +9XxwjViKMIp7DCEcWjWpO6pJ8Ygb2Snh1UTFFgtzSVAoMqUgHnBHejJ4RA4ncHu6
(spark) branch master updated: [SPARK-47409][SQL] Add support for collation for StringTrim type of functions/expressions (for UTF8_BINARY & LCASE)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 21333f8c1fc0 [SPARK-47409][SQL] Add support for collation for StringTrim type of functions/expressions (for UTF8_BINARY & LCASE) 21333f8c1fc0 is described below commit 21333f8c1fc01756e6708ad6ccf21f585fcb881d Author: David Milicevic AuthorDate: Thu May 9 23:05:20 2024 +0800 [SPARK-47409][SQL] Add support for collation for StringTrim type of functions/expressions (for UTF8_BINARY & LCASE) Recreating [original PR](https://github.com/apache/spark/pull/45749) because code has been reorganized in [this PR](https://github.com/apache/spark/pull/45978). ### What changes were proposed in this pull request? This PR is created to add support for collations to StringTrim family of functions/expressions, specifically: - `StringTrim` - `StringTrimBoth` - `StringTrimLeft` - `StringTrimRight` Changes: - `CollationSupport.java` - Add new `StringTrim`, `StringTrimLeft` and `StringTrimRight` classes with corresponding logic. - `CollationAwareUTF8String` - add new `trim`, `trimLeft` and `trimRight` methods that actually implement trim logic. - `UTF8String.java` - expose some of the methods publicly. - `stringExpressions.scala` - Change input types. - Change eval and code gen logic. - `CollationTypeCasts.scala` - add `StringTrim*` expressions to `CollationTypeCasts` rules. ### Why are the changes needed? We are incrementally adding collation support to built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes: - User should now be able to use non-default collations in string trim functions. ### How was this patch tested? Already existing tests + new unit/e2e tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46206 from davidm-db/string-trim-functions. 
Authored-by: David Milicevic Signed-off-by: Wenchen Fan --- .../catalyst/util/CollationAwareUTF8String.java| 470 ++ .../spark/sql/catalyst/util/CollationSupport.java | 534 - .../org/apache/spark/unsafe/types/UTF8String.java | 2 +- .../spark/unsafe/types/CollationSupportSuite.java | 193 .../sql/catalyst/analysis/CollationTypeCasts.scala | 2 +- .../catalyst/expressions/stringExpressions.scala | 53 +- .../sql/CollationStringExpressionsSuite.scala | 161 ++- 7 files changed, 1054 insertions(+), 361 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java new file mode 100644 index ..ee0d611d7e65 --- /dev/null +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -0,0 +1,470 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.catalyst.util; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.StringSearch; +import com.ibm.icu.util.ULocale; + +import org.apache.spark.unsafe.UTF8StringBuilder; +import org.apache.spark.unsafe.types.UTF8String; + +import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; +import static org.apache.spark.unsafe.Platform.copyMemory; + +import java.util.HashMap; +import java.util.Map; + +/** + * Utility class for collation-aware UTF8String operations. + */ +public class CollationAwareUTF8String { + public static UTF8String replace(final UTF8String src, final UTF8String search, + final UTF8String replace, final int collationId) { +// This collation aware implementation is based on existing implementation on UTF8String +if (src.numBytes() == 0 || search.numBytes() == 0) { + return src; +} + +StringSearch stringSearch = CollationFactory.getStringSearch(src, search,
(spark) branch master updated: [SPARK-47803][FOLLOWUP] Check nulls when casting nested type to variant
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3fd38d4c07f6 [SPARK-47803][FOLLOWUP] Check nulls when casting nested type to variant 3fd38d4c07f6 is described below commit 3fd38d4c07f6c998ec8bb234796f83a6aecfc0d2 Author: Chenhao Li AuthorDate: Thu May 9 22:45:10 2024 +0800 [SPARK-47803][FOLLOWUP] Check nulls when casting nested type to variant ### What changes were proposed in this pull request? It adds null checks when accessing a nested element when casting a nested type to variant. It is necessary because the `get` API doesn't guarantee to return null when the slot is null. For example, `ColumnarArray.get` may return the default value of a primitive type if the slot is null. ### Why are the changes needed? It is a bug fix that is necessary for the cast-to-variant expression to work correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Two new unit tests. One directly uses `ColumnarArray` as the input of the cast. The other creates a real-world situation where `ColumnarArray` is the input of the cast (scan). Both of them would fail without the code change in this PR. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46486 from chenhao-db/fix_cast_nested_to_variant. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../variant/VariantExpressionEvalUtils.scala | 9 -- .../apache/spark/sql/VariantEndToEndSuite.scala| 33 -- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala index eb235eb854e0..f7f7097173bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala @@ -103,7 +103,8 @@ object VariantExpressionEvalUtils { val offsets = new java.util.ArrayList[java.lang.Integer](data.numElements()) for (i <- 0 until data.numElements()) { offsets.add(builder.getWritePos - start) - buildVariant(builder, data.get(i, elementType), elementType) + val element = if (data.isNullAt(i)) null else data.get(i, elementType) + buildVariant(builder, element, elementType) } builder.finishWritingArray(start, offsets) case MapType(StringType, valueType, _) => @@ -116,7 +117,8 @@ object VariantExpressionEvalUtils { val key = keys.getUTF8String(i).toString val id = builder.addKey(key) fields.add(new VariantBuilder.FieldEntry(key, id, builder.getWritePos - start)) - buildVariant(builder, values.get(i, valueType), valueType) + val value = if (values.isNullAt(i)) null else values.get(i, valueType) + buildVariant(builder, value, valueType) } builder.finishWritingObject(start, fields) case StructType(structFields) => @@ -127,7 +129,8 @@ object VariantExpressionEvalUtils { val key = structFields(i).name val id = builder.addKey(key) fields.add(new VariantBuilder.FieldEntry(key, id, builder.getWritePos - start)) - buildVariant(builder, data.get(i, structFields(i).dataType), structFields(i).dataType) + val value = if (data.isNullAt(i)) null else 
data.get(i, structFields(i).dataType) + buildVariant(builder, value, structFields(i).dataType) } builder.finishWritingObject(start, fields) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala index 3964bf3aedec..53be9d50d351 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala @@ -16,11 +16,13 @@ */ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.expressions.{CreateArray, CreateNamedStruct, JsonToStructs, Literal, StructsToJson} +import org.apache.spark.sql.catalyst.expressions.{Cast, CreateArray, CreateNamedStruct, JsonToStructs, Literal, StructsToJson} import org.apache.spark.sql.catalyst.expressions.variant.ParseJson import org.apache.spark.sql.execution.WholeStageCodegenExec +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.Va
(spark) branch master updated (045ec6a166c8 -> 34ee0d8414b2)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 045ec6a166c8 [SPARK-48208][SS] Skip providing memory usage metrics from RocksDB if bounded memory usage is enabled add 34ee0d8414b2 [SPARK-47421][SQL] Add collation support for URL expressions No new revisions were added by this update. Summary of changes: .../explain-results/function_url_decode.explain| 2 +- .../explain-results/function_url_encode.explain| 2 +- .../sql/catalyst/expressions/urlExpressions.scala | 19 ++-- .../spark/sql/CollationSQLExpressionsSuite.scala | 103 + 4 files changed, 115 insertions(+), 11 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (a4ab82b8f340 -> 91da4ac25148)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from a4ab82b8f340 [SPARK-48186][SQL] Add support for AbstractMapType add 91da4ac25148 [SPARK-47354][SQL] Add collation support for variant expressions No new revisions were added by this update. Summary of changes: .../function_is_variant_null.explain | 2 +- .../explain-results/function_parse_json.explain| 2 +- .../function_schema_of_variant.explain | 2 +- .../function_schema_of_variant_agg.explain | 2 +- .../function_try_parse_json.explain| 2 +- .../function_try_variant_get.explain | 2 +- .../explain-results/function_variant_get.explain | 2 +- .../expressions/variant/variantExpressions.scala | 23 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 293 - 9 files changed, 312 insertions(+), 18 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (6cc3dc2ef4d2 -> a4ab82b8f340)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 6cc3dc2ef4d2 [SPARK-48169][SPARK-48143][SQL] Revert BadRecordException optimizations add a4ab82b8f340 [SPARK-48186][SQL] Add support for AbstractMapType No new revisions were added by this update. Summary of changes: ...stractArrayType.scala => AbstractMapType.scala} | 20 - .../spark/sql/catalyst/expressions/ExprUtils.scala | 9 +++--- .../sql/catalyst/expressions/jsonExpressions.scala | 5 ++-- .../spark/sql/CollationSQLExpressionsSuite.scala | 34 ++ 4 files changed, 55 insertions(+), 13 deletions(-) copy sql/api/src/main/scala/org/apache/spark/sql/internal/types/{AbstractArrayType.scala => AbstractMapType.scala} (58%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-48169][SPARK-48143][SQL] Revert BadRecordException optimizations
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6cc3dc2ef4d2 [SPARK-48169][SPARK-48143][SQL] Revert BadRecordException optimizations 6cc3dc2ef4d2 is described below commit 6cc3dc2ef4d2ffbff7ffc400e723b97b462e1bab Author: Vladimir Golubev AuthorDate: Thu May 9 15:35:28 2024 +0800 [SPARK-48169][SPARK-48143][SQL] Revert BadRecordException optimizations ### What changes were proposed in this pull request? Revert BadRecordException optimizations for UnivocityParser, StaxXmlParser and JacksonParser ### Why are the changes needed? To reduce the blast radius - this will be implemented differently. There were two PRs by me recently: - https://github.com/apache/spark/pull/46438 - https://github.com/apache/spark/pull/46400 which introduced optimizations to speed-up control flow between UnivocityParser, StaxXmlParser and JacksonParser. However, these changes are quite unstable and may break any calling code, which relies on exception cause type, for example. Also, there may be some Spark plugins/extensions using that exception for user-facing errors ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? N/A ### Was this patch authored or co-authored using generative AI tooling? No Closes #46478 from vladimirg-db/vladimirg-db/revert-SPARK-48169-SPARK-48143. 
Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/csv/UnivocityParser.scala | 8 .../spark/sql/catalyst/json/JacksonParser.scala| 13 ++-- .../sql/catalyst/util/BadRecordException.scala | 13 +++- .../sql/catalyst/util/FailureSafeParser.scala | 2 +- .../spark/sql/catalyst/xml/StaxXmlParser.scala | 23 +++--- 5 files changed, 26 insertions(+), 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 8d06789a7512..a5158d8a22c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -316,17 +316,17 @@ class UnivocityParser( throw BadRecordException( () => getCurrentInput, () => Array.empty, -() => QueryExecutionErrors.malformedCSVRecordError("")) +QueryExecutionErrors.malformedCSVRecordError("")) } val currentInput = getCurrentInput -var badRecordException: Option[() => Throwable] = if (tokens.length != parsedSchema.length) { +var badRecordException: Option[Throwable] = if (tokens.length != parsedSchema.length) { // If the number of tokens doesn't match the schema, we should treat it as a malformed record. // However, we still have chance to parse some of the tokens. It continues to parses the // tokens normally and sets null when `ArrayIndexOutOfBoundsException` occurs for missing // tokens. 
- Some(() => QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) + Some(QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) } else None // When the length of the returned tokens is identical to the length of the parsed schema, // we just need to: @@ -348,7 +348,7 @@ class UnivocityParser( } catch { case e: SparkUpgradeException => throw e case NonFatal(e) => - badRecordException = badRecordException.orElse(Some(() => e)) + badRecordException = badRecordException.orElse(Some(e)) // Use the corresponding DEFAULT value associated with the column, if any. row.update(i, ResolveDefaultColumns.existenceDefaultValues(requiredSchema)(i)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 848c20ee36be..5e75ff6f6e1a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -613,7 +613,7 @@ class JacksonParser( // JSON parser currently doesn't support partial results for corrupted records. // For such records, all fields other than the field configured by // `columnNameOfCorruptRecord` are set to `null`. -throw BadRecordException(() => recordLiteral(record), cause = () => e) +throw BadRecordException(() => recordLiteral(record), () => Array.empty, e) cas
(spark) branch branch-3.5 updated: [SPARK-48197][SQL] Avoid assert error for invalid lambda function
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 541e1c4da131 [SPARK-48197][SQL] Avoid assert error for invalid lambda function 541e1c4da131 is described below commit 541e1c4da131ce737b9cf554028cf292bebbcf04 Author: Wenchen Fan AuthorDate: Thu May 9 10:56:21 2024 +0800 [SPARK-48197][SQL] Avoid assert error for invalid lambda function ### What changes were proposed in this pull request? `ExpressionBuilder` asserts all its input expressions to be resolved during lookup, which is not true as the analyzer rule `ResolveFunctions` can trigger function lookup even if the input expression contains unresolved lambda functions. This PR updates that assert to check non-lambda inputs only, and fail earlier if the input contains lambda functions. In the future, if we use `ExpressionBuilder` to register higher-order functions, we can relax it. ### Why are the changes needed? better error message ### Does this PR introduce _any_ user-facing change? no, only changes error message ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46475 from cloud-fan/minor. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 7e79e91dc8c531ee9135f0e32a9aa2e1f80c4bbf) Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/FunctionRegistry.scala | 9 - .../plans/logical/FunctionBuilderBase.scala| 2 ++ .../ansi/higher-order-functions.sql.out| 20 .../higher-order-functions.sql.out | 20 .../sql-tests/inputs/higher-order-functions.sql| 2 ++ .../results/ansi/higher-order-functions.sql.out| 22 ++ .../results/higher-order-functions.sql.out | 22 ++ 7 files changed, 96 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 558579cdb80a..aaf718fab941 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -930,7 +930,14 @@ object FunctionRegistry { since: Option[String] = None): (String, (ExpressionInfo, FunctionBuilder)) = { val info = FunctionRegistryBase.expressionInfo[T](name, since) val funcBuilder = (expressions: Seq[Expression]) => { - assert(expressions.forall(_.resolved), "function arguments must be resolved.") + val (lambdas, others) = expressions.partition(_.isInstanceOf[LambdaFunction]) + if (lambdas.nonEmpty && !builder.supportsLambda) { +throw new AnalysisException( + errorClass = "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + messageParameters = Map( +"class" -> builder.getClass.getCanonicalName)) + } + assert(others.forall(_.resolved), "function arguments must be resolved.") val rearrangedExpressions = rearrangeExpressions(name, builder, expressions) val expr = builder.build(name, rearrangedExpressions) if (setAlias) expr.setTagValue(FUNC_ALIAS, name) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala index 1088655f60cd..a901fa5a72c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala @@ -69,6 +69,8 @@ trait FunctionBuilderBase[T] { } def build(funcName: String, expressions: Seq[Expression]): T + + def supportsLambda: Boolean = false } object NamedParametersSupport { diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out index 08d3be615b31..3fafb9858e5a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out @@ -34,6 +34,26 @@ org.apache.spark.sql.AnalysisException } +-- !query +select ceil(x -> x) as v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID
(spark) branch master updated (337f980f0073 -> 7e79e91dc8c5)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 337f980f0073 [SPARK-48204][INFRA] Fix release script for Spark 4.0+ add 7e79e91dc8c5 [SPARK-48197][SQL] Avoid assert error for invalid lambda function No new revisions were added by this update. Summary of changes: .../sql/catalyst/analysis/FunctionRegistry.scala | 9 - .../plans/logical/FunctionBuilderBase.scala| 2 ++ .../ansi/higher-order-functions.sql.out| 20 .../higher-order-functions.sql.out | 20 .../sql-tests/inputs/higher-order-functions.sql| 2 ++ .../results/ansi/higher-order-functions.sql.out| 22 ++ .../results/higher-order-functions.sql.out | 22 ++ 7 files changed, 96 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) tag v4.0.0-preview1-rc1 created (now 7dcf77c739c3)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to tag v4.0.0-preview1-rc1 in repository https://gitbox.apache.org/repos/asf/spark.git at 7dcf77c739c3 (commit) This tag includes the following new commits: new 7dcf77c739c3 Preparing Spark release v4.0.0-preview1-rc1 The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) 01/01: Preparing Spark release v4.0.0-preview1-rc1
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to tag v4.0.0-preview1-rc1 in repository https://gitbox.apache.org/repos/asf/spark.git commit 7dcf77c739c3854260464d732dbfb9a0f54706e7 Author: Wenchen Fan AuthorDate: Thu May 9 02:32:06 2024 + Preparing Spark release v4.0.0-preview1-rc1 --- R/pkg/R/sparkR.R | 4 ++-- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml| 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml| 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- common/variant/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml| 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/profiler/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 6 +++--- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml| 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml| 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 46 files changed, 49 insertions(+), 49 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 0be7e5da24d2..478acf514ef3 100644 --- a/R/pkg/R/sparkR.R +++ 
b/R/pkg/R/sparkR.R @@ -456,8 +456,8 @@ sparkR.session <- function( # Check if version number of SparkSession matches version number of SparkR package jvmVersion <- callJMethod(sparkSession, "version") - # Remove -SNAPSHOT from jvm versions - jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE) + # Remove -preview1 from jvm versions + jvmVersionStrip <- gsub("-preview1", "", jvmVersion, fixed = TRUE) rPackageVersion <- paste0(packageVersion("SparkR")) if (jvmVersionStrip != rPackageVersion) { diff --git a/assembly/pom.xml b/assembly/pom.xml index 6c31ec745b5b..f6be8db52b54 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview1 ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3820d1b8e395..534c49987d28 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview1 ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index cdb5bd72158a..d8dff6996cec 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview1 ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 0f7036ef
(spark) tag v4.0.0-preview-rc1 created (now 9fec87d16a04)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to tag v4.0.0-preview-rc1 in repository https://gitbox.apache.org/repos/asf/spark.git at 9fec87d16a04 (commit) This tag includes the following new commits: new 9fec87d16a04 Preparing Spark release v4.0.0-preview-rc1 The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) 01/01: Preparing Spark release v4.0.0-preview-rc1
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to tag v4.0.0-preview-rc1 in repository https://gitbox.apache.org/repos/asf/spark.git commit 9fec87d16a0418759d835541557ad22f20940e9e Author: Wenchen Fan AuthorDate: Wed May 8 14:16:23 2024 + Preparing Spark release v4.0.0-preview-rc1 --- R/pkg/R/sparkR.R | 4 ++-- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml| 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml| 2 +- common/unsafe/pom.xml | 2 +- common/utils/pom.xml | 2 +- common/variant/pom.xml | 2 +- connector/avro/pom.xml | 2 +- connector/connect/client/jvm/pom.xml | 2 +- connector/connect/common/pom.xml | 2 +- connector/connect/server/pom.xml | 2 +- connector/docker-integration-tests/pom.xml | 2 +- connector/kafka-0-10-assembly/pom.xml | 2 +- connector/kafka-0-10-sql/pom.xml | 2 +- connector/kafka-0-10-token-provider/pom.xml| 2 +- connector/kafka-0-10/pom.xml | 2 +- connector/kinesis-asl-assembly/pom.xml | 2 +- connector/kinesis-asl/pom.xml | 2 +- connector/profiler/pom.xml | 2 +- connector/protobuf/pom.xml | 2 +- connector/spark-ganglia-lgpl/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 6 +++--- examples/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml| 2 +- python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/api/pom.xml| 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 46 files changed, 49 insertions(+), 49 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 0be7e5da24d2..e082f1b58841 100644 --- a/R/pkg/R/sparkR.R +++ 
b/R/pkg/R/sparkR.R @@ -456,8 +456,8 @@ sparkR.session <- function( # Check if version number of SparkSession matches version number of SparkR package jvmVersion <- callJMethod(sparkSession, "version") - # Remove -SNAPSHOT from jvm versions - jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE) + # Remove -preview from jvm versions + jvmVersionStrip <- gsub("-preview", "", jvmVersion, fixed = TRUE) rPackageVersion <- paste0(packageVersion("SparkR")) if (jvmVersionStrip != rPackageVersion) { diff --git a/assembly/pom.xml b/assembly/pom.xml index 6c31ec745b5b..219d172f2cda 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3820d1b8e395..f98ae9a0100e 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index cdb5bd72158a..9e06c8e518a5 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.13 -4.0.0-SNAPSHOT +4.0.0-preview ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 0f7036ef746c..3b
(spark) branch master updated (8950add773e6 -> 8d7081639ab4)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 8950add773e6 [SPARK-48188][SQL] Consistently use normalized plan for cache add 8d7081639ab4 [SPARK-48161][SQL] Add collation support for JSON expressions No new revisions were added by this update. Summary of changes: .../catalyst/expressions/complexTypeCreator.scala | 5 +- .../sql/catalyst/expressions/jsonExpressions.scala | 20 ++- .../spark/sql/catalyst/json/JacksonParser.scala| 2 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 198 + 4 files changed, 213 insertions(+), 12 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r69013 - /dev/spark/KEYS
Author: wenchen Date: Tue May 7 17:07:43 2024 New Revision: 69013 Log: Update KEYS Modified: dev/spark/KEYS Modified: dev/spark/KEYS == --- dev/spark/KEYS (original) +++ dev/spark/KEYS Tue May 7 17:07:43 2024 @@ -704,61 +704,62 @@ kyHyHY5kPG9HfDOSahPz =SDAz -END PGP PUBLIC KEY BLOCK- -pub 4096R/4F4FDC8A 2018-09-18 -uid Wenchen Fan (CODE SIGNING KEY) -sub 4096R/6F3F5B0E 2018-09-18 +pub rsa4096 2024-05-07 [SC] + 4DC9676CEF9A83E98FCA02784D6620843CD87F5A +uid Wenchen Fan (CODE SIGNING KEY) +sub rsa4096 2024-05-07 [E] -BEGIN PGP PUBLIC KEY BLOCK- -Version: GnuPG v1 -mQINBFugiYgBEAC4DsJBWF3VjWiKEiD8XNPRTg3Bnw52fe4bTB9Jvh/q0VStJjO7 -CSHZ1/P5h60zbS5UWLP2mt+c0FaW6wv7PxafCnd1MPENGBkttZbC4UjWDSbPp0vx -fkUfrAqflWvO1AaCveg2MlyQdLZ1HwVz+PDLWqE+Ev2p3Si4Jfx5P2O9FmWt8a/b -Wea/4gfy/5zFWRberQjt4CkSBuNU+cOo19/n32JJJYbRqrzFAGs/DJUIxNXC1qef -c2iB3dyff1mkLb9Vzd1RfhZaSNUElo67o4Vi6SswgvHxoE03wIcoJvBTafqLxy6p -mt5SAzOyvvmOVcLNqP9i5+c4sBrxvQ2ZEZrZt7dKfhbh4W8ged/TNWMoNOCX2usD -Fj17KrFAEaeqtEwRdwZMxGqKI/NxANkdPSxS4T/JQoi+N6LBJ88yzmeCquA8MT0b -/H4ziyjgrSRugCE6jcsbuObQsDxiqPSSXeWSjPoYq876JcqAgZzSYYdlGVw2J9Vb -46hhEqhGk+91vK6CtyuhKv5KXk1B3Rhhc5znKWcahD3cpISxwTSzN9OwQHEd8Ovv -x0WAhY3WOexrBekH7Sy00gjaHSAHFj3ReITfffWkv6t4TGLyohEOfgdxFvq03Fhd -p7bWDmux47jP6AUUjP0VXRsG9ev3ch+bbcbRlo15HPBtyehoPn4BellFAQARAQAB +mQINBGY6XpcBEADBeNz3IBYriwrPzMYJJO5u1DaWAJ4Sryx6PUZgvssrcqojYVTh +MjtlBkWRcNquAyDrVlU1vtq1yMq5KopQoAEi/l3xaEDZZ0IFAob6+GlGXEon2Jvf +0FXQsx+Df4nMVl7KPqh68T++Z4GkvK5wyyN9uaUTWL2deGeinVxTh6qWQT8YiCd5 +wof+Dk5IIzKQ5VIBhU/U9S0jo/pqhH4okcZGTyT2Q7sfg4eXl5+Y2OR334RkvTcX +uJjcnJ8BUbBSm1UhNg4OGBEJgi+lE1GEgw4juOfTAPh9fx8SCLhuX0m6Qc/y9bAK +Q4zejbF5F2Um9dqrZqg6Egp+nlzydn59hq9owSnQ6JdoA/PLcgoign0sghu9xGCR +GpgI2kS7Q8bu6dy7T0BfUerLZ1FHu7nCT2ZNSIh/Y2eOhuBhUr3llg8xa3PZZob/ +2sZE2dJ3g/qp2Nbo+s5Q5kELtuo6cZD0EISQwt68hGWIgxs0vtci2c2kQYFS0oqw +fGynEeDFZRHV3ET5rioYaoPi70Cnibght5ocL0t6sl0RQQVp6k2i1aofJbZA480N +ivuJ5agGaSRxmIDk6JlDsHJGxO9oC066ZLJiR6i0JUinGP7sw/nNmgup/AB+y4hW 
+9WdeAFyYmuYysDRRyE6z1MPDp1R00MyGxHNFDF64/JPY/nKKFdXp+aCazwARAQAB tDNXZW5jaGVuIEZhbiAoQ09ERSBTSUdOSU5HIEtFWSkgPHdlbmNoZW5AYXBhY2hl -Lm9yZz6JAjgEEwECACIFAlugiYgCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheA -AAoJEGuscolPT9yKvqUP/i34exSQNs9NcDvjOQhrvpjYCarL4mdQZOjIn9JWxeWr -3nkzC9ozEIrb1zt8pqhiYr6qJhmx2EJgIwZTZZ9O0qHFMmYhYn/9/KKidE0XN6t3 -dFcbtRB1PGlc9b34PZNfdhD8PWA/UB1QC0TdTRNKhrIGGIZocrkaBral6uMJZAyV -kbb+s21cRupPLM2wmU1k3U4WxnaIq2foErhaPC9+OEDAcLH/OxwiekJTCsvZypzE -1laxo21rX1kgYzeAuqP4BfX5ARyrfM3O31Gh8asrx1bXD4z7dHqJxdJjh7ycdJdT -VLcqy6LVsRanubiJ7cg4LkEz7Xxm3RLC3ZLbnYr72ljV5wzQ3r9gE+5rEnacZw7P -9fQkAY5W2fqgfSn6Zx2SwPGhXosMdlp4zAWH3aCZj+DAx8XNG0sm3RE465zGxI9w -jIE8V9RYhUoQnmfQA+lqUIrkmEneXYPUvct6F6B5tRNOXsceVG+c4O3o+ueJKEv3 -DvI4UyGD8q5k7IT24sOKa3CZmq3dsutfpecZdbrPT5MV13vrdJ0DZufpAi/mN3vL -obkYyKRwyObUCiG6K9REoUJbes6+oX7SDXUCE1AI6UJrSuSXHazY/FWUjZaZClPR -yxO51LjsEbkmzKggtZMljlAnTfRi/sEt7pC0bx1SCwoBKfLUbXclRJTtr1Ac6MiM -uQINBFugiYgBEADNXxoluY3P7UvGAjprc03pHmg+VHzDQCHZpQ21Qgg4cUgEsckd -J6Vd7CrqLbTuYrQc7vq1UKV/5HWo3d8nK4eeM77DYo7fbmy1gMZ2okHy/EpV5i6d -gArAQM/nbEiCB9grgiFrLdApfPzeHHGXSbrF3BWyIkc/5mJ+VuLC+FY3xWDvHNAf -G8m7zrV5BlKVZl+WYRd9LQEvhaZG74ettro2zkT7UGagdFZtNFzUAQb3hnbYvlpY -OTJu1zUEkeYDsAgp+nbuAbgkGcH2iohsozVx2kKPzmRLmYmVeh725AtHJCsDbyuS -863bf1+xEz1K7k2P7FMbf8R1I8qxOjTA6tCRSkSYxPzEgcYB/ShAtX6N/J/mubsP -ow5NKAsSfdvH/Xk66umAFu2b0V1B8gFvKifZ4GEmKkJkLJ0Pw5Bspsx56BDqQCaf -d1lJcmm7MEQTydgC37UhfJIISdnJ/MGjT8oYyKhxfcfjQ6Gp2/nY0O9akg43chOl -ZvDk9EmuX+oEmM2aMBSIcOwEPvOCaIN62D8m/PeVYzuYBCYEkcURLMN8NnxLhu2q -MYjdKG1tc5tkxyaxJQLTTvxV6xYcz1qZZbk/YKp2yAIfMFJElhPkqidB0pGNIncC -myAnbrgDzEPzRIH9KVrFMb4Gzq3l+/ZSPxSLwxAO9I0o8jmfB0QSpZAdqwARAQAB -iQIfBBgBAgAJBQJboImIAhsMAAoJEGuscolPT9yKeQ0P/jykTh5nrHDaldxZE3eu -Z6I7fJXJEV6i5tmve5Q827EnEaLK07XlaHfP+/sYy9fWs16sNTLHgoNNtfUn738J -W08UtEADGFp6DntZg0+h5CksDNXr4t06ndqyniBMw1aTClrfW/9O8240VlrCrveD -GQmFziY1DePKV9pR0A6xsOGhLtL057T5uJ64lPJewgbg7X7dD0OlHYgcNYn+XGAY -OJYHuk5Nchm47iaciU3kDuECLWmDqyMBTT+B5/hcrwR9zUc+NjCjVVdsxxZlSixw 
-TOQvYxTdqhMwMedUuWmTkJ1XDU6AJSx013ma9OC1tKqwmjaSUTQDZFKVPy6yL+YH -SqRuC0M0p151rnMwljs2hPwuOOEm82PVsGKNytXtilt5LheGwyBPHTnX1ai2kvUC -HDcwOSR/CVeRLsWi7JBTXsfkflEuwasCEags42R8HzSCGYZwm8cauHnBQ9BxZVCr -iEdCcXfL2dgEIeiI5FXDlNzeXKihatHUYGBeZG0OO4oKoRKieF+pRvEJv4X7HRiL -Tj8aAFzBcLb5RZ0I9PIILQyy16B4ZdjyDjrjeIx1hxO7J/ZRX2DA62/yxo0Qa0SF -qmd2/zxUPGuOvX/e79ATOM8KMQFbGikLTcjW9gcUuBnLPCu0WN2KF+ykz33y1rH9 -DFZYf6DnKNwDnBzBuAx2S+P0 -=a0mL +Lm9yZz6JAlEEEwEIADsWIQRNyWds75qD6Y/KAnhNZiCEPNh/WgUCZjpelwIbAwUL +CQgHAgIiAgYVCgkICwIEFgIDAQIeBwIXgAAKCRBNZiCEPNh/WkofD/9sI7J3i9Ck +NOlHpVnjAaHjyGX5cVA2dZGniJdLf5yOKOI6pu7dMW+NThsXO1Iv+BRYo7una6/Q +vUquKKxCXIN3vNmKIB1e9lj4MaIhCRmXUSQxjkVa9JW3P/F520Ct3VjiCZ5IjPv+ +g1hF/wrkuuoAFlcC/bfGWafkaZgszavSpCdp27mUXUNbvLW0dPJ3+ay4cDPuT1DI +6DhB8qpqN7gInDFACW2qtQ2KZh1JFGy5ZccQ9dB3t/B4BYiUie6a3eQWgKqLF1hw +8yHY3DkCVGfnXJk4+LMWqgazQxoB6oZjBvoQYtGOPXr1ZbmtiRHCDM5KmZ+QmIXB
(spark) branch master updated: [SPARK-47297][SQL] Add collation support for format expressions
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 148f5335427c [SPARK-47297][SQL] Add collation support for format expressions 148f5335427c is described below commit 148f5335427c3aea39cbcce967e18a3b35a88687 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Tue May 7 23:00:30 2024 +0800 [SPARK-47297][SQL] Add collation support for format expressions ### What changes were proposed in this pull request? Introduce collation awareness for format expressions: to_number, try_to_number, to_char, space. ### Why are the changes needed? Add collation support for format expressions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for format functions: to_number, try_to_number, to_char, space. ### How was this patch tested? E2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46423 from uros-db/format-expressions. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../expressions/numberFormatExpressions.scala | 14 ++- .../catalyst/expressions/stringExpressions.scala | 2 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 132 - 3 files changed, 141 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index 6d95d7e620a2..e914190c0645 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper import org.apache.spark.sql.catalyst.util.ToNumberParser import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -47,7 +49,8 @@ abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Bo DecimalType.USER_DEFAULT } - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() @@ -247,8 +250,9 @@ object ToCharacterBuilder extends ExpressionBuilder { inputExpr.dataType match { case _: DatetimeType => DateFormatClass(inputExpr, format) case _: BinaryType => - if (!(format.dataType == StringType && 
format.foldable)) { -throw QueryCompilationErrors.nonFoldableArgumentError(funcName, "format", StringType) + if (!(format.dataType.isInstanceOf[StringType] && format.foldable)) { +throw QueryCompilationErrors.nonFoldableArgumentError(funcName, "format", + format.dataType) } val fmt = format.eval() if (fmt == null) { @@ -279,8 +283,8 @@ case class ToCharacter(left: Expression, right: Expression) } } - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringType) + override def dataType: DataType = SQLConf.get.defaultStringType + override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringTypeAnyCollation) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() if (inputTypeCheck.isSuccess) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 0769c8e609ec..c2ea17de1953 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1906,7 +1906,7 @@ case class StringRepeat(str: Expression, times: Expression) case class StringSpace(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with
(spark) branch master updated: [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 326dbb447873 [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser 326dbb447873 is described below commit 326dbb4478732eb9b7a683511e69206f2b21bd37 Author: Vladimir Golubev AuthorDate: Tue May 7 20:28:50 2024 +0800 [SPARK-48143][SQL] Use lightweight exceptions for control-flow between UnivocityParser and FailureSafeParser ### What changes were proposed in this pull request? New lightweight exception for control-flow between UnivocityParser and FailureSafeParser to speed-up malformed CSV parsing ### Why are the changes needed? Parsing in `PermissiveMode` is slow due to heavy exception construction (stacktrace filling + string template substitution in `SparkRuntimeException`) ### Does this PR introduce _any_ user-facing change? No, since `FailureSafeParser` unwraps `BadRecordException` and correctly rethrows user-facing exceptions in `FailFastMode` ### How was this patch tested? - `testOnly org.apache.spark.sql.catalyst.csv.UnivocityParserSuite` - Manually run csv benchmark on DB benchmark workspace - Manually checked correct and malformed csv in spark-shell (org.apache.spark.SparkException is thrown with the stacktrace) ### Was this patch authored or co-authored using generative AI tooling? No Closes #46400 from vladimirg-db/vladimirg-db/speed-up-csv-parser. 
Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/csv/UnivocityParser.scala | 10 +- .../spark/sql/catalyst/json/JacksonParser.scala| 5 ++--- .../sql/catalyst/util/BadRecordException.scala | 23 ++ .../sql/catalyst/util/FailureSafeParser.scala | 2 +- .../spark/sql/catalyst/xml/StaxXmlParser.scala | 5 ++--- 5 files changed, 29 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index a5158d8a22c6..37d9143e5b5a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -316,17 +316,17 @@ class UnivocityParser( throw BadRecordException( () => getCurrentInput, () => Array.empty, -QueryExecutionErrors.malformedCSVRecordError("")) +() => QueryExecutionErrors.malformedCSVRecordError("")) } val currentInput = getCurrentInput -var badRecordException: Option[Throwable] = if (tokens.length != parsedSchema.length) { +var badRecordException: Option[() => Throwable] = if (tokens.length != parsedSchema.length) { // If the number of tokens doesn't match the schema, we should treat it as a malformed record. // However, we still have chance to parse some of the tokens. It continues to parses the // tokens normally and sets null when `ArrayIndexOutOfBoundsException` occurs for missing // tokens. 
- Some(QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) + Some(() => QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) } else None // When the length of the returned tokens is identical to the length of the parsed schema, // we just need to: @@ -348,7 +348,7 @@ class UnivocityParser( } catch { case e: SparkUpgradeException => throw e case NonFatal(e) => - badRecordException = badRecordException.orElse(Some(e)) + badRecordException = badRecordException.orElse(Some(() => e)) // Use the corresponding DEFAULT value associated with the column, if any. row.update(i, ResolveDefaultColumns.existenceDefaultValues(requiredSchema)(i)) } @@ -359,7 +359,7 @@ class UnivocityParser( } else { if (badRecordException.isDefined) { throw BadRecordException( - () => currentInput, () => Array(requiredRow.get), badRecordException.get) + () => currentInput, () => Array[InternalRow](requiredRow.get), badRecordException.get) } else { requiredRow } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index eadd0a4f8ab9..d1093a3b1be1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -613,7 +613,7 @@ class
(spark) branch master updated: [SPARK-47267][SQL] Add collation support for hash expressions
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 08c6bb9bf32f [SPARK-47267][SQL] Add collation support for hash expressions 08c6bb9bf32f is described below commit 08c6bb9bf32f31b5b9870d56cc4c16ab97616da6 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Tue May 7 17:13:34 2024 +0800 [SPARK-47267][SQL] Add collation support for hash expressions ### What changes were proposed in this pull request? Introduce collation awareness for hash expressions: MD5, SHA2, SHA1, CRC32, MURMUR3, XXHASH64. ### Why are the changes needed? Add collation support for hash expressions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for hash functions: md5, sha2, sha1, crc32, hash, xxhash64. ### How was this patch tested? E2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46422 from uros-db/hash-expressions. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/hash.scala | 6 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 179 + 2 files changed, 182 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 5089cea136a8..fa342f641509 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -63,7 +63,7 @@ import org.apache.spark.util.ArrayImplicits._ case class Md5(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(BinaryType) @@ -103,7 +103,7 @@ case class Md5(child: Expression) case class Sha2(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant with Serializable { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true override def inputTypes: Seq[DataType] = Seq(BinaryType, IntegerType) @@ -169,7 +169,7 @@ case class Sha2(left: Expression, right: Expression) case class Sha1(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(BinaryType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index fa82405109f1..596923d975a5 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -28,6 +28,185 @@ class CollationSQLExpressionsSuite extends QueryTest with SharedSparkSession { + test("Support Md5 hash expression with collation") { +case class Md5TestCase( + input: String, + collationName: String, + result: String +) + +val testCases = Seq( + Md5TestCase("Spark", "UTF8_BINARY", "8cde774d6f7333752ed72cacddb05126"), + Md5TestCase("Spark", "UTF8_BINARY_LCASE", "8cde774d6f7333752ed72cacddb05126"), + Md5TestCase("SQL", "UNICODE", "9778840a0100cb30c982876741b0b5a2"), + Md5TestCase("SQL", "UNICODE_CI", "9778840a0100cb30c982876741b0b5a2") +) + +// Supported collations +testCases.foreach(t => { + val query = +s""" + |select md5('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { +val testQuery = sql(query) +checkAnswer(testQuery, Row(t.result)) +val dataType = StringType(t.collationName) +assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } +}) + } + + test("Support Sha2 hash expression with collation") { +case class Sha2TestCase( + input: String, + collationName: String, + bitLength: Int, + result: Str
(spark) branch master updated: [SPARK-48166][SQL] Avoid using BadRecordException as user-facing error in VariantExpressionEvalUtils
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7f8ef96cea27 [SPARK-48166][SQL] Avoid using BadRecordException as user-facing error in VariantExpressionEvalUtils 7f8ef96cea27 is described below commit 7f8ef96cea27d52d0bdda3808c6c48534dcd8567 Author: Vladimir Golubev AuthorDate: Tue May 7 16:13:46 2024 +0800 [SPARK-48166][SQL] Avoid using BadRecordException as user-facing error in VariantExpressionEvalUtils ### What changes were proposed in this pull request? Stop using `BadRecordException` in a user-facing context. Currently it is thrown when the `parse_json` input is malformed. ### Why are the changes needed? `BadRecordException` is an internal exception designed for `FailureSafeParser`. ### Does this PR introduce _any_ user-facing change? Yes, `parse_json` will not expose `BadRecordException` in error. ### How was this patch tested? `testOnly org.apache.spark.sql.catalyst.expressions.variant.VariantExpressionEvalUtilsSuite` ### Was this patch authored or co-authored using generative AI tooling? No Closes #46428 from vladimirg-db/vladimirg-db/get-rid-of-bad-record-exception-in-variant-expression-eval-utils. 
Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala index f468e9745605..eb235eb854e0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions.variant import scala.util.control.NonFatal import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.util.{ArrayData, BadRecordException, MapData} +import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.types._ import org.apache.spark.types.variant.{Variant, VariantBuilder, VariantSizeLimitException, VariantUtil} @@ -48,7 +48,7 @@ object VariantExpressionEvalUtils { .variantSizeLimitError(VariantUtil.SIZE_LIMIT, "parse_json")) case NonFatal(e) => parseJsonFailure(QueryExecutionErrors.malformedRecordsDetectedInRecordParsingError( - input.toString, BadRecordException(() => input, cause = e))) + input.toString, e)) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-48027][SQL] InjectRuntimeFilter for multi-level join should check child join type
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b5e39bedab14 [SPARK-48027][SQL] InjectRuntimeFilter for multi-level join should check child join type b5e39bedab14 is described below commit b5e39bedab14a7fd800597ee0114b07448c1b0f9 Author: Angerszh AuthorDate: Tue May 7 14:47:40 2024 +0800 [SPARK-48027][SQL] InjectRuntimeFilter for multi-level join should check child join type ### What changes were proposed in this pull request? In our prod we meet a case ``` with refund_info as ( select b_key, 1 as b_type from default.table_b ), next_month_time as ( select /*+ broadcast(b, c) */ c_key ,1 as c_time FROM default.table_c ) select a.loan_id ,c.c_time ,b.type from ( select a_key from default.table_a2 union select a_key from default.table_a1 ) a left join refund_info b on a.loan_id = b.loan_id left join next_month_time c on a.loan_id = c.loan_id ; ``` In this query, it inject table_b as table_c's runtime bloom filter, but table_b join condition is LEFT OUTER, causing table_c missing data. Caused by ![image](https://github.com/apache/spark/assets/46485123/be45e211-23e4-4105-98b4-aa571c87665f) InjectRuntimeFilter.extractSelectiveFilterOverScan(), when handle join, since left plan (a left outer join b's a) is a UNION then the extract result is NONE, then zip left/right keys to extract from join's right, finnaly cause this issue. ### Why are the changes needed? Fix data correctness issue ### Does this PR introduce _any_ user-facing change? Yea, fix data incorrect issue ### How was this patch tested? 
For the existed PR, it fix the wrong case Before: It extract a LEFT_ANTI_JOIN's right child to the outside bf3its not correct ``` Join Inner, (c3#45926 = c1#45914) :- Join LeftAnti, (c1#45914 = c2#45920) : :- Filter isnotnull(c1#45914) : : +- Relation default.bf1[a1#45912,b1#45913,c1#45914,d1#45915,e1#45916,f1#45917] parquet : +- Project [c2#45920] : +- Filter ((isnotnull(a2#45918) AND (a2#45918 = 5)) AND isnotnull(c2#45920)) :+- Relation default.bf2[a2#45918,b2#45919,c2#45920,d2#45921,e2#45922,f2#45923] parquet +- Filter (isnotnull(c3#45926) AND might_contain(scalar-subquery#48719 [], xxhash64(c3#45926, 42))) : +- Aggregate [bloom_filter_agg(xxhash64(c2#45920, 42), 100, 8388608, 0, 0) AS bloomFilter#48718] : +- Project [c2#45920] :+- Filter ((isnotnull(a2#45918) AND (a2#45918 = 5)) AND isnotnull(c2#45920)) : +- Relation default.bf2[a2#45918,b2#45919,c2#45920,d2#45921,e2#45922,f2#45923] parquet +- Relation default.bf3[a3#45924,b3#45925,c3#45926,d3#45927,e3#45928,f3#45929] parquet ``` After: ``` Join Inner, (c3#45926 = c1#45914) :- Join LeftAnti, (c1#45914 = c2#45920) : :- Filter isnotnull(c1#45914) : : +- Relation default.bf1[a1#45912,b1#45913,c1#45914,d1#45915,e1#45916,f1#45917] parquet : +- Project [c2#45920] : +- Filter ((isnotnull(a2#45918) AND (a2#45918 = 5)) AND isnotnull(c2#45920)) :+- Relation default.bf2[a2#45918,b2#45919,c2#45920,d2#45921,e2#45922,f2#45923] parquet +- Filter (isnotnull(c3#45926)) +- Relation default.bf3[a3#45924,b3#45925,c3#45926,d3#45927,e3#45928,f3#45929] parquet ``` ### Was this patch authored or co-authored using generative AI tooling? NO Closes #46263 from AngersZh/SPARK-48027. 
Lead-authored-by: Angerszh Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../catalyst/optimizer/InjectRuntimeFilter.scala | 44 +++--- .../spark/sql/InjectRuntimeFilterSuite.scala | 4 +- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala index 9c150f1f3308..3bb7c4d1ceca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala @@ -120,7 +120,7 @@ object InjectRuntimeFilter extends Rule[LogicalPlan] with PredicateHelper with J hasHitSelectiveFilter = hasHitSelectiveFilter || isLikelySelective(condition), currentPlan, targetKey) - case ExtractEquiJoinKeys(_, lkeys, rkeys
(spark) branch master updated: [SPARK-47681][SQL][FOLLOWUP] Fix variant decimal handling
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d67752a8f3d7 [SPARK-47681][SQL][FOLLOWUP] Fix variant decimal handling d67752a8f3d7 is described below commit d67752a8f3d7c5bda1f56c940b5112c1d5d82d07 Author: Chenhao Li AuthorDate: Tue May 7 08:10:41 2024 +0800 [SPARK-47681][SQL][FOLLOWUP] Fix variant decimal handling ### What changes were proposed in this pull request? There are two issues with the current variant decimal handling: 1. The precision and scale of the `BigDecimal` returned by `getDecimal` is not checked. Based on the variant spec, they must be within the corresponding limit for DECIMAL4/8/16. An out-of-range decimal can lead to failure in the downstream Spark operations. 2. The current `schema_of_variant` implementation doesn't correctly handle the case where precision is smaller than scale. Spark's `DecimalType` requires `precision >= scale`. The Python side requires a similar fix for 1. During the fix, I found that Python error reporting was not correctly implemented (it was never tested either) and I also fixed it. ### Why are the changes needed? They are bug fixes and are required to process decimals correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46338 from chenhao-db/fix_variant_decimal. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../apache/spark/types/variant/VariantUtil.java| 14 - python/pyspark/errors/error-conditions.json| 5 ++ python/pyspark/sql/tests/test_types.py | 13 + python/pyspark/sql/variant_utils.py| 59 ++ .../expressions/variant/variantExpressions.scala | 3 +- .../variant/VariantExpressionSuite.scala | 3 ++ .../apache/spark/sql/VariantEndToEndSuite.scala| 1 + 7 files changed, 76 insertions(+), 22 deletions(-) diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java index e4e9cc8b4cfa..84e3a45e4b0e 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java @@ -392,6 +392,13 @@ public class VariantUtil { return Double.longBitsToDouble(readLong(value, pos + 1, 8)); } + // Check whether the precision and scale of the decimal are within the limit. + private static void checkDecimal(BigDecimal d, int maxPrecision) { +if (d.precision() > maxPrecision || d.scale() > maxPrecision) { + throw malformedVariant(); +} + } + // Get a decimal value from variant value `value[pos...]`. // Throw `MALFORMED_VARIANT` if the variant is malformed. public static BigDecimal getDecimal(byte[] value, int pos) { @@ -399,14 +406,18 @@ public class VariantUtil { int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL); -int scale = value[pos + 1]; +// Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be +// greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. 
+int scale = value[pos + 1] & 0xFF; BigDecimal result; switch (typeInfo) { case DECIMAL4: result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale); +checkDecimal(result, MAX_DECIMAL4_PRECISION); break; case DECIMAL8: result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale); +checkDecimal(result, MAX_DECIMAL8_PRECISION); break; case DECIMAL16: checkIndex(pos + 17, value.length); @@ -417,6 +428,7 @@ public class VariantUtil { bytes[i] = value[pos + 17 - i]; } result = new BigDecimal(new BigInteger(bytes), scale); +checkDecimal(result, MAX_DECIMAL16_PRECISION); break; default: throw unexpectedType(Type.DECIMAL); diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json index 7771791e41ca..906bf781e1bb 100644 --- a/python/pyspark/errors/error-conditions.json +++ b/python/pyspark/errors/error-conditions.json @@ -482,6 +482,11 @@ " and should be of the same length, got and ." ] }, + "MALFORMED_VARIANT" : { +"message" : [ + "Variant binary is malformed. Please check the data source is
(spark) branch branch-3.5 updated: [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 45befc07d2a0 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist 45befc07d2a0 is described below commit 45befc07d2a064ab2a279a113489ed5c66f7a69d Author: Gene Pang AuthorDate: Sun May 5 21:50:15 2024 +0800 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist ### What changes were proposed in this pull request? This is a followup to https://github.com/apache/spark/pull/46254 . Instead of using object arrays when nulls are present, continue to use primitive arrays when appropriate. This PR sets the null bits appropriately for the primitive array copy. Primitive arrays are faster than object arrays and won't create unnecessary objects. ### Why are the changes needed? This will improve performance and memory usage, when nulls are present in the `ColumnarArray`. ### Does this PR introduce _any_ user-facing change? This is expected to be faster when copying `ColumnarArray`. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46372 from gene-db/primitive-nulls. 
Authored-by: Gene Pang Signed-off-by: Wenchen Fan (cherry picked from commit bf2e25459fe46ca2b1d26e1c98c873923fc135e1) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/vectorized/ColumnarArray.java | 36 ++ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index c4de83cf8b82..1f8e679a4146 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -47,31 +47,43 @@ public final class ColumnarArray extends ArrayData { return length; } + /** + * Sets all the appropriate null bits in the input UnsafeArrayData. + * + * @param arrayData The UnsafeArrayData to set the null bits for + * @return The UnsafeArrayData with the null bits set + */ + private UnsafeArrayData setNullBits(UnsafeArrayData arrayData) { +if (data.hasNull()) { + for (int i = 0; i < length; i++) { +if (data.isNullAt(i)) { + arrayData.setNullAt(i); +} + } +} +return arrayData; + } + @Override public ArrayData copy() { DataType dt = data.dataType(); -if (data.hasNull()) { - // UnsafeArrayData cannot be used if there are any nulls. 
- return new GenericArrayData(toObjectArray(dt)).copy(); -} - if (dt instanceof BooleanType) { - return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toBooleanArray())); } else if (dt instanceof ByteType) { - return UnsafeArrayData.fromPrimitiveArray(toByteArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toByteArray())); } else if (dt instanceof ShortType) { - return UnsafeArrayData.fromPrimitiveArray(toShortArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toShortArray())); } else if (dt instanceof IntegerType || dt instanceof DateType || dt instanceof YearMonthIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toIntArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toIntArray())); } else if (dt instanceof LongType || dt instanceof TimestampType || dt instanceof DayTimeIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toLongArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toLongArray())); } else if (dt instanceof FloatType) { - return UnsafeArrayData.fromPrimitiveArray(toFloatArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toFloatArray())); } else if (dt instanceof DoubleType) { - return UnsafeArrayData.fromPrimitiveArray(toDoubleArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toDoubleArray())); } else { return new GenericArrayData(toObjectArray(dt)).copy(); // ensure the elements are copied. } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new bf2e25459fe4 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist bf2e25459fe4 is described below commit bf2e25459fe46ca2b1d26e1c98c873923fc135e1 Author: Gene Pang AuthorDate: Sun May 5 21:50:15 2024 +0800 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist ### What changes were proposed in this pull request? This is a followup to https://github.com/apache/spark/pull/46254 . Instead of using object arrays when nulls are present, continue to use primitive arrays when appropriate. This PR sets the null bits appropriately for the primitive array copy. Primitive arrays are faster than object arrays and won't create unnecessary objects. ### Why are the changes needed? This will improve performance and memory usage, when nulls are present in the `ColumnarArray`. ### Does this PR introduce _any_ user-facing change? This is expected to be faster when copying `ColumnarArray`. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46372 from gene-db/primitive-nulls. 
Authored-by: Gene Pang Signed-off-by: Wenchen Fan --- .../apache/spark/sql/vectorized/ColumnarArray.java | 36 ++ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index d92293b91870..721e6a60befe 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -49,31 +49,43 @@ public final class ColumnarArray extends ArrayData { return length; } + /** + * Sets all the appropriate null bits in the input UnsafeArrayData. + * + * @param arrayData The UnsafeArrayData to set the null bits for + * @return The UnsafeArrayData with the null bits set + */ + private UnsafeArrayData setNullBits(UnsafeArrayData arrayData) { +if (data.hasNull()) { + for (int i = 0; i < length; i++) { +if (data.isNullAt(i)) { + arrayData.setNullAt(i); +} + } +} +return arrayData; + } + @Override public ArrayData copy() { DataType dt = data.dataType(); -if (data.hasNull()) { - // UnsafeArrayData cannot be used if there are any nulls. 
- return new GenericArrayData(toObjectArray(dt)).copy(); -} - if (dt instanceof BooleanType) { - return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toBooleanArray())); } else if (dt instanceof ByteType) { - return UnsafeArrayData.fromPrimitiveArray(toByteArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toByteArray())); } else if (dt instanceof ShortType) { - return UnsafeArrayData.fromPrimitiveArray(toShortArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toShortArray())); } else if (dt instanceof IntegerType || dt instanceof DateType || dt instanceof YearMonthIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toIntArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toIntArray())); } else if (dt instanceof LongType || dt instanceof TimestampType || dt instanceof DayTimeIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toLongArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toLongArray())); } else if (dt instanceof FloatType) { - return UnsafeArrayData.fromPrimitiveArray(toFloatArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toFloatArray())); } else if (dt instanceof DoubleType) { - return UnsafeArrayData.fromPrimitiveArray(toDoubleArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toDoubleArray())); } else { return new GenericArrayData(toObjectArray(dt)).copy(); // ensure the elements are copied. } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47359][SQL] Support TRANSLATE function to work with collated strings
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0329479acb67 [SPARK-47359][SQL] Support TRANSLATE function to work with collated strings 0329479acb67 is described below commit 0329479acb6758c4d3e53d514ea832a181d31065 Author: Milan Dankovic AuthorDate: Tue Apr 30 22:28:56 2024 +0800 [SPARK-47359][SQL] Support TRANSLATE function to work with collated strings ### What changes were proposed in this pull request? Extend built-in string functions to support non-binary, non-lowercase collation for: `translate` ### Why are the changes needed? Update collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use COLLATE within arguments for built-in string function TRANSLATE in Spark SQL queries, using non-binary collations such as UNICODE_CI. ### How was this patch tested? Unit tests for queries using StringTranslate (CollationStringExpressionsSuite.scala). ### Was this patch authored or co-authored using generative AI tooling? No Closes #45820 from miland-db/miland-db/string-translate. 
Authored-by: Milan Dankovic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 85 ++ .../sql/catalyst/analysis/CollationTypeCasts.scala | 3 +- .../catalyst/expressions/stringExpressions.scala | 28 --- .../sql/CollationStringExpressionsSuite.scala | 74 +++ 4 files changed, 180 insertions(+), 10 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 9778ca31209e..b77671cee90b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -25,7 +25,9 @@ import org.apache.spark.unsafe.UTF8StringBuilder; import org.apache.spark.unsafe.types.UTF8String; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; @@ -483,6 +485,56 @@ public final class CollationSupport { } } + public static class StringTranslate { +public static UTF8String exec(final UTF8String source, Map dict, +final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(source, dict); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(source, dict); + } else { +return execICU(source, dict, collationId); + } +} +public static String genCode(final String source, final String dict, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.EndsWith.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", source, dict); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + 
"Lowercase(%s, %s)", source, dict); + } else { +return String.format(expr + "ICU(%s, %s, %d)", source, dict, collationId); + } +} +public static UTF8String execBinary(final UTF8String source, Map dict) { + return source.translate(dict); +} +public static UTF8String execLowercase(final UTF8String source, Map dict) { + String srcStr = source.toString(); + StringBuilder sb = new StringBuilder(); + int charCount = 0; + for (int k = 0; k < srcStr.length(); k += charCount) { +int codePoint = srcStr.codePointAt(k); +charCount = Character.charCount(codePoint); +String subStr = srcStr.substring(k, k + charCount); +String translated = dict.get(subStr.toLowerCase()); +if (null == translated) { + sb.append(subStr); +} else if (!"\0".equals(translated)) { + sb.append(translated); +} + } + return UTF8String.fromString(sb.toString()); +} +public static UTF8String execICU(final UTF8String source, Map dict, +final int collationId) { + return source.translate(CollationAwareUTF8String.getCollationAwareDict( +source, dict, collationId)); +} + } + // TODO: Add more collation-aware string expressions. /** @@ -808,6 +860,39 @@ public final class CollationSupport { } } +priva
(spark) branch master updated: [SPARK-48003][SQL] Add collation support for hll sketch aggregate
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3aea6c258bf3 [SPARK-48003][SQL] Add collation support for hll sketch aggregate 3aea6c258bf3 is described below commit 3aea6c258bf3541d7f53cd3914244f817ed36ff6 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Tue Apr 30 20:58:43 2024 +0800 [SPARK-48003][SQL] Add collation support for hll sketch aggregate ### What changes were proposed in this pull request? Introduce collation awareness for hll sketch aggregate. ### Why are the changes needed? Add collation support for hyperloglog expressions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for hyperloglog function: hll_sketch_agg. ### How was this patch tested? E2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46241 from uros-db/hll-agg. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationFactory.java | 14 ++ .../aggregate/datasketchesAggregates.scala| 8 ++-- .../scala/org/apache/spark/sql/CollationSuite.scala | 19 +++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 93691e28c692..863445b6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -25,6 +25,7 @@ import java.util.function.ToLongFunction; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; +import com.ibm.icu.text.CollationKey; import com.ibm.icu.text.Collator; import org.apache.spark.SparkException; @@ -270,4 +271,17 @@ public final class CollationFactory { int collationId = collationNameToId(collationName); return collationTable[collationId]; } + + public static UTF8String getCollationKey(UTF8String input, int collationId) { +Collation collation = fetchCollation(collationId); +if (collation.supportsBinaryEquality) { + return input; +} else if (collation.supportsLowercaseEquality) { + return input.toLowerCase(); +} else { + CollationKey collationKey = collation.collator.getCollationKey(input.toString()); + return UTF8String.fromBytes(collationKey.toByteArray()); +} + } + } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala index 02925f3625d2..2102428131f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala @@ -25,7 +25,9 @@ import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} import org.apache.spark.sql.catalyst.trees.BinaryLike +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types.{AbstractDataType, BinaryType, BooleanType, DataType, IntegerType, LongType, StringType, TypeCollection} import org.apache.spark.unsafe.types.UTF8String @@ -103,7 +105,7 @@ case class HllSketchAgg( override def prettyName: String = "hll_sketch_agg" override def inputTypes: Seq[AbstractDataType] = -Seq(TypeCollection(IntegerType, LongType, StringType, BinaryType), IntegerType) +Seq(TypeCollection(IntegerType, LongType, StringTypeAnyCollation, BinaryType), IntegerType) override def dataType: DataType = BinaryType @@ -137,7 +139,9 @@ case class HllSketchAgg( // TODO: implement support for decimal/datetime/interval types case IntegerType => sketch.update(v.asInstanceOf[Int]) case LongType => sketch.update(v.asInstanceOf[Long]) -case StringType => sketch.update(v.asInstanceOf[UTF8String].toString) +case st: StringType => + val cKey = CollationFactory.getCollationKey(v.asInstanceOf[UTF8String], st.collationId) +
(spark) branch master updated: [SPARK-47566][SQL] Support SubstringIndex function to work with collated strings
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 12a507464f10 [SPARK-47566][SQL] Support SubstringIndex function to work with collated strings 12a507464f10 is described below commit 12a507464f106d299511d16c2a436cbc0257bc8a Author: Milan Dankovic AuthorDate: Tue Apr 30 17:19:01 2024 +0800 [SPARK-47566][SQL] Support SubstringIndex function to work with collated strings ### What changes were proposed in this pull request? Extend built-in string functions to support non-binary, non-lowercase collation for: substring_index. ### Why are the changes needed? Update collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use COLLATE within arguments for built-in string function SUBSTRING_INDEX in Spark SQL queries, using non-binary collations such as UNICODE_CI. ### How was this patch tested? Unit tests for queries using SubstringIndex (`CollationStringExpressionsSuite.scala`). ### Was this patch authored or co-authored using generative AI tooling? No ### To consider: There is no check for collation match between string and delimiter, it will be introduced with Implicit Casting. We can remove the original `public UTF8String subStringIndex(UTF8String delim, int count)` method, and get the existing behavior using `subStringIndex(delim, count, 0)`. Closes #45725 from miland-db/miland-db/substringIndex-stringLocate. 
Authored-by: Milan Dankovic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 169 + .../org/apache/spark/unsafe/types/UTF8String.java | 28 +++- .../spark/unsafe/types/CollationSupportSuite.java | 83 ++ .../sql/catalyst/analysis/CollationTypeCasts.scala | 5 + .../catalyst/expressions/stringExpressions.scala | 15 +- .../sql/CollationStringExpressionsSuite.scala | 31 6 files changed, 323 insertions(+), 8 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 0c81b99de916..9778ca31209e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -28,6 +28,9 @@ import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; +import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; +import static org.apache.spark.unsafe.Platform.copyMemory; + /** * Static entry point for collation-aware expressions (StringExpressions, RegexpExpressions, and * other expressions that require custom collation support), as well as private utility methods for @@ -441,6 +444,45 @@ public final class CollationSupport { } } + public static class SubstringIndex { +public static UTF8String exec(final UTF8String string, final UTF8String delimiter, +final int count, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(string, delimiter, count); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(string, delimiter, count); + } else { +return execICU(string, delimiter, count, collationId); + } +} +public static String genCode(final String string, final String delimiter, +final int count, final int collationId) { + 
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.SubstringIndex.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s, %d)", string, delimiter, count); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s, %d)", string, delimiter, count); + } else { +return String.format(expr + "ICU(%s, %s, %d, %d)", string, delimiter, count, collationId); + } +} +public static UTF8String execBinary(final UTF8String string, final UTF8String delimiter, +final int count) { + return string.subStringIndex(delimiter, count); +} +public static UTF8String execLowercase(final UTF8String string, final UTF8String delimiter, +final int count) { + return CollationAwareUTF8String.lowercaseSubStringIndex(string, delimiter, count); +} +public static UTF8String execICU(final UTF8String string, final UTF8String delimi
(spark) branch master updated: [SPARK-48033][SQL] Fix `RuntimeReplaceable` expressions being used in default columns
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new da92293f9ce0 [SPARK-48033][SQL] Fix `RuntimeReplaceable` expressions being used in default columns da92293f9ce0 is described below commit da92293f9ce0be1ac283c4a5d769af550abf7031 Author: Richard Chen AuthorDate: Tue Apr 30 09:07:24 2024 +0800 [SPARK-48033][SQL] Fix `RuntimeReplaceable` expressions being used in default columns ### What changes were proposed in this pull request? Currently, default columns that have a default of a `RuntimeReplaceable` expression fail. This is because the `AlterTableCommand` constant folds before replacing expressions with the actual implementation. For example: ``` sql(s"CREATE TABLE t(v VARIANT DEFAULT parse_json('1')) USING PARQUET") sql("INSERT INTO t VALUES(DEFAULT)") ``` fails because `parse_json` is `RuntimeReplaceable` and is evaluated before the analyzer inserts the correct expression into the plan. To fix this, we run the `ReplaceExpressions` rule before `ConstantFolding` ### Why are the changes needed? This allows default columns to use expressions that are `RuntimeReplaceable`. This is especially important for Variant types because literal variants are difficult to create - `parse_json` will likely be used the majority of the time. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? added UT ### Was this patch authored or co-authored using generative AI tooling? no Closes #46269 from richardc-db/fix_default_cols_runtime_replaceable. 
Authored-by: Richard Chen Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala | 4 ++-- .../scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala | 8 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala index 7b00349a4f27..d73e2ca6bd9d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.{Literal => ExprLiteral} -import org.apache.spark.sql.catalyst.optimizer.ConstantFolding +import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ReplaceExpressions} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION @@ -289,7 +289,7 @@ object ResolveDefaultColumns extends QueryErrorsBase val analyzer: Analyzer = DefaultColumnAnalyzer val analyzed = analyzer.execute(Project(Seq(Alias(parsed, colName)()), OneRowRelation())) analyzer.checkAnalysis(analyzed) - ConstantFolding(analyzed) + ConstantFolding(ReplaceExpressions(analyzed)) } catch { case ex: AnalysisException => throw QueryCompilationErrors.defaultValuesUnresolvedExprError( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala index 48a9564ab8f9..bca147279993 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala @@ -279,4 +279,12 @@ class ResolveDefaultColumnsSuite extends QueryTest with SharedSparkSession { checkAnswer(sql("select CAST(c as STRING) from t"), Row("2018-11-17 13:33:33")) } } + + test("SPARK-48033: default columns using runtime replaceable expression works") { +withTable("t") { + sql("CREATE TABLE t(v VARIANT DEFAULT parse_json('1')) USING PARQUET") + sql("INSERT INTO t VALUES(DEFAULT)") + checkAnswer(sql("select v from t"), sql("select parse_json('1')").collect()) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (3fbcb26d8e99 -> fe05eb8fa3b2)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 3fbcb26d8e99 [SPARK-48016][SQL] Fix a bug in try_divide function when with decimals add fe05eb8fa3b2 [SPARK-47741] Added stack overflow handling in parser No new revisions were added by this update. Summary of changes: .../src/main/resources/error/error-conditions.json | 7 +++ .../spark/sql/errors/QueryParsingErrors.scala | 6 +++ .../sql/catalyst/parser/AbstractSqlParser.scala| 57 -- .../spark/sql/errors/QueryParsingErrorsSuite.scala | 16 ++ .../execution/ExecuteImmediateEndToEndSuite.scala | 29 ++- 5 files changed, 98 insertions(+), 17 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (3f15ad40640c -> d913d1b2662c)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 3f15ad40640c [SPARK-47994][SQL] Fix bug with CASE WHEN column filter push down in SQLServer add d913d1b2662c [SPARK-47148][SQL] Avoid to materialize AQE ExchangeQueryStageExec on the cancellation No new revisions were added by this update. Summary of changes: .../sql/execution/adaptive/QueryStageExec.scala| 39 --- .../execution/exchange/BroadcastExchangeExec.scala | 11 ++ .../spark/sql/execution/exchange/Exchange.scala| 13 +++ .../execution/exchange/ShuffleExchangeExec.scala | 29 - .../adaptive/AdaptiveQueryExecSuite.scala | 120 - 5 files changed, 182 insertions(+), 30 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47567][SQL] Support LOCATE function to work with collated strings
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7b1147a05a6c [SPARK-47567][SQL] Support LOCATE function to work with collated strings 7b1147a05a6c is described below commit 7b1147a05a6ca54276538d766c089980b9ee5d59 Author: Milan Dankovic AuthorDate: Mon Apr 29 17:24:36 2024 +0800 [SPARK-47567][SQL] Support LOCATE function to work with collated strings ### What changes were proposed in this pull request? Extend built-in string functions to support non-binary, non-lowercase collation for: locate ### Why are the changes needed? Update collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use COLLATE within arguments for built-in string function LOCATE in Spark SQL queries, using non-binary collations such as UNICODE_CI. ### How was this patch tested? Unit tests for queries using StringLocate (`CollationStringExpressionsSuite.scala`). ### Was this patch authored or co-authored using generative AI tooling? No Closes #45791 from miland-db/miland-db/string-locate. 
Authored-by: Milan Dankovic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 38 + .../spark/unsafe/types/CollationSupportSuite.java | 65 ++ .../sql/catalyst/analysis/CollationTypeCasts.scala | 4 ++ .../catalyst/expressions/stringExpressions.scala | 14 +++-- .../sql/CollationStringExpressionsSuite.scala | 34 +++ 5 files changed, 149 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 0fc37c169612..0c81b99de916 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -403,6 +403,44 @@ public final class CollationSupport { } } + public static class StringLocate { +public static int exec(final UTF8String string, final UTF8String substring, final int start, +final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(string, substring, start); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(string, substring, start); + } else { +return execICU(string, substring, start, collationId); + } +} +public static String genCode(final String string, final String substring, final int start, +final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringLocate.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s, %d)", string, substring, start); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s, %d)", string, substring, start); + } else { +return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, start, collationId); + } +} 
+public static int execBinary(final UTF8String string, final UTF8String substring, +final int start) { + return string.indexOf(substring, start); +} +public static int execLowercase(final UTF8String string, final UTF8String substring, +final int start) { + return string.toLowerCase().indexOf(substring.toLowerCase(), start); +} +public static int execICU(final UTF8String string, final UTF8String substring, final int start, + final int collationId) { + return CollationAwareUTF8String.indexOf(string, substring, start, collationId); +} + } + // TODO: Add more collation-aware string expressions. /** diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 6c79fc821317..030c7a7a1e3c 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -652,6 +652,71 @@ public class CollationSupportSuite { assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); } + private void assertLocate(String substring, String string, Integer start, Strin
(spark) branch master updated: [SPARK-47939][SQL] Implement a new Analyzer rule to move ParameterizedQuery inside ExplainCommand and DescribeQueryCommand
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0e52b59b540f [SPARK-47939][SQL] Implement a new Analyzer rule to move ParameterizedQuery inside ExplainCommand and DescribeQueryCommand 0e52b59b540f is described below commit 0e52b59b540fac85972453093805c76b4f71cb94 Author: Vladimir Golubev AuthorDate: Mon Apr 29 17:00:32 2024 +0800 [SPARK-47939][SQL] Implement a new Analyzer rule to move ParameterizedQuery inside ExplainCommand and DescribeQueryCommand ### What changes were proposed in this pull request? Mark `DescribeQueryCommand` and `ExplainCommand` as `SupervisingCommand` (they don't expose their wrapped nodes, but supervise them internally). Introduce a new Analyzer rule `MoveParameterizedQueriesDown`, which moves `ParameterizedQuery` inside `SupervisingCommand` for parameters to be resolved correctly. ### Why are the changes needed? Parameterized `EXPLAIN` and `DESCRIBE` queries: - `spark.sql("describe select ?", Array(1)).show();` - `spark.sql("explain select ?", Array(1)).show();` fail with `org.apache.spark.sql.catalyst.ExtendedAnalysisException: [UNBOUND_SQL_PARAMETER] Found the unbound parameter: _16. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`. SQLSTATE: 42P02; line 1 pos 16; 'Project [unresolvedalias(posparameter(16))] +- OneRowRelation` ### Does this PR introduce _any_ user-facing change? Yes, parameterized `EXPLAIN` and `DESCRIBE` should start working for users ### How was this patch tested? - Run `sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala` - Run `sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala` - New tests for `SQLQuerySuite` ### Was this patch authored or co-authored using generative AI tooling? 
No Closes #46209 from vladimirg-db/vladimirg-db/make-explain-and-describe-work-with-parameters. Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/analysis/Analyzer.scala | 1 + .../spark/sql/catalyst/analysis/parameters.scala | 64 -- .../spark/sql/catalyst/plans/logical/Command.scala | 11 ++ .../spark/sql/execution/command/commands.scala | 7 +- .../spark/sql/execution/command/tables.scala | 5 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 141 + 6 files changed, 218 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 4b753e1f28e5..c29432c916f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -325,6 +325,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor RewriteDeleteFromTable :: RewriteUpdateTable :: RewriteMergeIntoTable :: + MoveParameterizedQueriesDown :: BindParameters :: typeCoercionRules() ++ Seq( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala index f1cc44b270bc..5b365a0d49ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, CreateMap, CreateNamedStruct, Expression, LeafExpression, Literal, MapFromArrays, MapFromEntries, SubqueryExpression, Unevaluable, VariableReference} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import 
org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SupervisingCommand} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TreePattern.{PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_WITH} +import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMAND, PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_WITH} import org.apache.spark.sql.errors.QueryErrorsBase import org.apache.spark.sql.types.DataType @@ -104,12 +104,64 @@ case class PosParameterizedQuery(child: LogicalPlan, args: Seq[Expression]) copy(child = newChild) } +/** + * Base class for rules that process parameterized queries. + */ +abstract class ParameterizedQueryProcessor exte
(spark) branch branch-3.4 updated: [SPARK-47927][SQL] Fix nullability attribute in UDF decoder
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.4 by this push: new f4dc254ee0bd [SPARK-47927][SQL] Fix nullability attribute in UDF decoder f4dc254ee0bd is described below commit f4dc254ee0bde2e8cac7e3f5b1a22017be9d6fba Author: Emil Ejbyfeldt AuthorDate: Sun Apr 28 13:46:03 2024 +0800 [SPARK-47927][SQL] Fix nullability attribute in UDF decoder ### What changes were proposed in this pull request? This PR fixes a correctness issue by moving the batch that resolves udf decoders to after the `UpdateNullability` batch. This means we now derive a decoder with the updated attributes which fixes a correctness issue. I think the issue has existed since https://github.com/apache/spark/pull/28645 when udf support for case class arguments was added. So therefore this issue should be present in all currently supported versions. ### Why are the changes needed? Currently the following code ``` scala> val ds1 = Seq(1).toDS() | val ds2 = Seq[Int]().toDS() | val f = udf[Tuple1[Option[Int]],Tuple1[Option[Int]]](identity) | ds1.join(ds2, ds1("value") === ds2("value"), "left_outer").select(f(struct(ds2("value".collect() val ds1: org.apache.spark.sql.Dataset[Int] = [value: int] val ds2: org.apache.spark.sql.Dataset[Int] = [value: int] val f: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2481/0x7f7f50961f086b1a2c9f,StructType(StructField(_1,IntegerType,true)),List(Some(class[_1[0]: int])),Some(class[_1[0]: int]),None,true,true) val res0: Array[org.apache.spark.sql.Row] = Array([[0]]) ``` results in a row containing `0`; this is incorrect, as the value should be `null`. 
Removing the udf call ``` scala> ds1.join(ds2, ds1("value") === ds2("value"), "left_outer").select(struct(ds2("value"))).collect() val res1: Array[org.apache.spark.sql.Row] = Array([[null]]) ``` gives the correct value. ### Does this PR introduce _any_ user-facing change? Yes, fixes a correctness issue when using ScalaUDFs. ### How was this patch tested? Existing and new unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46156 from eejbyfeldt/SPARK-47927. Authored-by: Emil Ejbyfeldt Signed-off-by: Wenchen Fan (cherry picked from commit 8b8ea60bd4f22ea5763a77bac2d51f25d2479be9) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 4 ++-- sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala | 11 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 3b19b1a12e76..74061f2b8f21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -332,11 +332,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor new ResolveHints.RemoveAllHints), Batch("Nondeterministic", Once, PullOutNondeterministic), +Batch("UpdateNullability", Once, + UpdateAttributeNullability), Batch("UDF", Once, HandleNullInputsForUDF, ResolveEncodersInUDF), -Batch("UpdateNullability", Once, - UpdateAttributeNullability), Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 734fcebc80e7..2cee86be7d31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -1060,4 +1060,15 @@ class UDFSuite extends QueryTest with SharedSparkSession { }.getCause.getCause assert(e.isInstanceOf[java.lang.ArithmeticException]) } + + test("SPARK-47927: Correctly pass null values derived from join to UDF") { +val f = udf[Tuple1[Option[Int]], Tuple1[Option[Int]]](identity) +val ds1 = Seq(1).toDS() +val ds2 = Seq[Int]().toDS() + +checkAnswer( + ds1.join(ds2, ds1("value") === ds2("value"), "left_outer") +.select(f(struct(ds2("value").as("_1", + Row(Row(null))) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch branch-3.5 updated: [SPARK-47927][SQL] Fix nullability attribute in UDF decoder
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 33768f66d953 [SPARK-47927][SQL] Fix nullability attribute in UDF decoder 33768f66d953 is described below commit 33768f66d953159ffd2b2bd0ec89957a2ce2eca0 Author: Emil Ejbyfeldt AuthorDate: Sun Apr 28 13:46:03 2024 +0800 [SPARK-47927][SQL] Fix nullability attribute in UDF decoder This PR fixes a correctness issue by moving the batch that resolves udf decoders to after the `UpdateNullability` batch. This means we now derive a decoder with the updated attributes which fixes a correctness issue. I think the issue has existed since https://github.com/apache/spark/pull/28645 when udf support for case class arguments was added. So therefore this issue should be present in all currently supported versions. Currently the following code ``` scala> val ds1 = Seq(1).toDS() | val ds2 = Seq[Int]().toDS() | val f = udf[Tuple1[Option[Int]],Tuple1[Option[Int]]](identity) | ds1.join(ds2, ds1("value") === ds2("value"), "left_outer").select(f(struct(ds2("value".collect() val ds1: org.apache.spark.sql.Dataset[Int] = [value: int] val ds2: org.apache.spark.sql.Dataset[Int] = [value: int] val f: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2481/0x7f7f50961f086b1a2c9f,StructType(StructField(_1,IntegerType,true)),List(Some(class[_1[0]: int])),Some(class[_1[0]: int]),None,true,true) val res0: Array[org.apache.spark.sql.Row] = Array([[0]]) ``` results in a row containing `0`; this is incorrect, as the value should be `null`. 
Existing and new unit tests. No. Closes #46156 from eejbyfeldt/SPARK-47927. Authored-by: Emil Ejbyfeldt Signed-off-by: Wenchen Fan (cherry picked from commit 8b8ea60bd4f22ea5763a77bac2d51f25d2479be9) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 4 ++-- sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala | 11 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index eae150001249..93efa5e4a49c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -338,11 +338,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor new ResolveHints.RemoveAllHints), Batch("Nondeterministic", Once, PullOutNondeterministic), +Batch("UpdateNullability", Once, + UpdateAttributeNullability), Batch("UDF", Once, HandleNullInputsForUDF, ResolveEncodersInUDF), -Batch("UpdateNullability", Once, - UpdateAttributeNullability), Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index e54bda1acef5..56bc707450e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -1067,4 +1067,15 @@ class UDFSuite extends QueryTest with SharedSparkSession { .lookupFunctionInfo(FunctionIdentifier("dummyUDF")) assert(expressionInfo.getClassName.contains("org.apache.spark.sql.UDFRegistration$$Lambda")) } + + test("SPARK-47927: Correctly pass null values derived from join to UDF") { +val f = udf[Tuple1[Option[Int]], Tuple1[Option[Int]]](identity) +val ds1 = Seq(1).toDS() +val ds2 = 
Seq[Int]().toDS() + +checkAnswer( + ds1.join(ds2, ds1("value") === ds2("value"), "left_outer") +.select(f(struct(ds2("value").as("_1", + Row(Row(null))) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47927][SQL] Fix nullability attribute in UDF decoder
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 8b8ea60bd4f2 [SPARK-47927][SQL] Fix nullability attribute in UDF decoder 8b8ea60bd4f2 is described below commit 8b8ea60bd4f22ea5763a77bac2d51f25d2479be9 Author: Emil Ejbyfeldt AuthorDate: Sun Apr 28 13:46:03 2024 +0800 [SPARK-47927][SQL] Fix nullability attribute in UDF decoder ### What changes were proposed in this pull request? This PR fixes a correctness issue by moving the batch that resolves udf decoders to after the `UpdateNullability` batch. This means we now derive a decoder with the updated attributes which fixes a correctness issue. I think the issue has existed since https://github.com/apache/spark/pull/28645 when udf support case class arguments was added. So therefore this issue should be present in all currently supported versions. ### Why are the changes needed? Currently the following code ``` scala> val ds1 = Seq(1).toDS() | val ds2 = Seq[Int]().toDS() | val f = udf[Tuple1[Option[Int]],Tuple1[Option[Int]]](identity) | ds1.join(ds2, ds1("value") === ds2("value"), "left_outer").select(f(struct(ds2("value".collect() val ds1: org.apache.spark.sql.Dataset[Int] = [value: int] val ds2: org.apache.spark.sql.Dataset[Int] = [value: int] val f: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2481/0x7f7f50961f086b1a2c9f,StructType(StructField(_1,IntegerType,true)),List(Some(class[_1[0]: int])),Some(class[_1[0]: int]),None,true,true) val res0: Array[org.apache.spark.sql.Row] = Array([[0]]) ``` results in an row containing `0` this is incorrect as the value should be `null`. 
Removing the udf call ``` scala> ds1.join(ds2, ds1("value") === ds2("value"), "left_outer").select(struct(ds2("value"))).collect() val res1: Array[org.apache.spark.sql.Row] = Array([[null]]) ``` gives the correct value. ### Does this PR introduce _any_ user-facing change? Yes, fixes a correctness issue when using ScalaUDFs. ### How was this patch tested? Existing and new unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46156 from eejbyfeldt/SPARK-47927. Authored-by: Emil Ejbyfeldt Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 4 ++-- sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala | 11 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e666200a78d4..4b753e1f28e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -339,11 +339,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor new ResolveHints.RemoveAllHints), Batch("Nondeterministic", Once, PullOutNondeterministic), +Batch("UpdateNullability", Once, + UpdateAttributeNullability), Batch("UDF", Once, HandleNullInputsForUDF, ResolveEncodersInUDF), -Batch("UpdateNullability", Once, - UpdateAttributeNullability), Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 87ca3a07c4d5..fe47d6c68555 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -1183,4 +1183,15 @@ class UDFSuite extends QueryTest with SharedSparkSession 
{ df10.select(zip_with(col("array1"), col("array2"), (b1, b2) => reverseThenConcat2(b1, b2))) checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + + test("SPARK-47927: Correctly pass null values derived from join to UDF") { +val f = udf[Tuple1[Option[Int]], Tuple1[Option[Int]]](identity) +val ds1 = Seq(1).toDS() +val ds2 = Seq[Int]().toDS() + +checkAnswer( + ds1.join(ds2, ds1("value") === ds2("value"), "left_outer") +.select(f(struct(ds2("value").as("_1", + Row(Row(null))) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch branch-3.5 updated: [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 7a573b967138 [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls 7a573b967138 is described below commit 7a573b967138d64506b311207c8e3630b91a5afe Author: Gene Pang AuthorDate: Sun Apr 28 11:07:12 2024 +0800 [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls This fixes how `ColumnVector` handles copying arrays when the vector has a dictionary and null values. The possible issues with the previous implementation: - An `ArrayIndexOutOfBoundsException` may be thrown when the `ColumnVector` has nulls and dictionaries. This is because the dictionary id for `null` entries might be invalid and should not be used for `null` entries. - Copying a `ColumnarArray` (which contains a `ColumnVector`) is incorrect, if it contains `null` entries. This is because copying a primitive array does not take into account the `null` entries, so all the null entries get lost. These changes are needed to avoid `ArrayIndexOutOfBoundsException` and to produce correct results when copying `ColumnarArray`. The only user facing changes are to fix existing errors and incorrect results. Added new unit tests. No. Closes #46254 from gene-db/dictionary-nulls. 
Authored-by: Gene Pang Signed-off-by: Wenchen Fan (cherry picked from commit 76ce6b00e036a699ad172ba4b7d3f2632ab75332) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/vectorized/ColumnarArray.java | 5 + .../execution/vectorized/OffHeapColumnVector.java | 24 ++- .../execution/vectorized/OnHeapColumnVector.java | 24 ++- .../execution/vectorized/ColumnVectorSuite.scala | 174 + 4 files changed, 215 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index bd7c3d7c0fd4..c4de83cf8b82 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -51,6 +51,11 @@ public final class ColumnarArray extends ArrayData { public ArrayData copy() { DataType dt = data.dataType(); +if (data.hasNull()) { + // UnsafeArrayData cannot be used if there are any nulls. 
+ return new GenericArrayData(toObjectArray(dt)).copy(); +} + if (dt instanceof BooleanType) { return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); } else if (dt instanceof ByteType) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 46f241d92e6b..122f775c2b0e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -215,7 +215,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); } else { for (int i = 0; i < count; i++) { -array[i] = getByte(rowId + i); +if (!isNullAt(rowId + i)) { + array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } return array; @@ -276,7 +278,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId * 2L, array, Platform.SHORT_ARRAY_OFFSET, count * 2L); } else { for (int i = 0; i < count; i++) { -array[i] = getShort(rowId + i); +if (!isNullAt(rowId + i)) { + array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } return array; @@ -342,7 +346,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId * 4L, array, Platform.INT_ARRAY_OFFSET, count * 4L); } else { for (int i = 0; i < count; i++) { -array[i] = getInt(rowId + i); +if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } return array; @@ -420,7 +426,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId * 8L, array, Platform.LONG_ARRAY_OFFSET, count * 8L); } else { for (int i = 0; i < count; i++) { 
-array[i] = getLong(rowId + i); +if (!is
(spark) branch master updated: [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 76ce6b00e036 [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls 76ce6b00e036 is described below commit 76ce6b00e036a699ad172ba4b7d3f2632ab75332 Author: Gene Pang AuthorDate: Sun Apr 28 11:07:12 2024 +0800 [SPARK-48019] Fix incorrect behavior in ColumnVector/ColumnarArray with dictionary and nulls ### What changes were proposed in this pull request? This fixes how `ColumnVector` handles copying arrays when the vector has a dictionary and null values. The possible issues with the previous implementation: - An `ArrayIndexOutOfBoundsException` may be thrown when the `ColumnVector` has nulls and dictionaries. This is because the dictionary id for `null` entries might be invalid and should not be used for `null` entries. - Copying a `ColumnarArray` (which contains a `ColumnVector`) is incorrect, if it contains `null` entries. This is because copying a primitive array does not take into account the `null` entries, so all the null entries get lost. ### Why are the changes needed? These changes are needed to avoid `ArrayIndexOutOfBoundsException` and to produce correct results when copying `ColumnarArray`. ### Does this PR introduce _any_ user-facing change? The only user facing changes are to fix existing errors and incorrect results. ### How was this patch tested? Added new unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46254 from gene-db/dictionary-nulls. 
Authored-by: Gene Pang Signed-off-by: Wenchen Fan --- .../apache/spark/sql/vectorized/ColumnarArray.java | 5 + .../execution/vectorized/OffHeapColumnVector.java | 24 ++- .../execution/vectorized/OnHeapColumnVector.java | 24 ++- .../execution/vectorized/ColumnVectorSuite.scala | 174 + 4 files changed, 215 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 4163af9bfda5..d92293b91870 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -53,6 +53,11 @@ public final class ColumnarArray extends ArrayData { public ArrayData copy() { DataType dt = data.dataType(); +if (data.hasNull()) { + // UnsafeArrayData cannot be used if there are any nulls. + return new GenericArrayData(toObjectArray(dt)).copy(); +} + if (dt instanceof BooleanType) { return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); } else if (dt instanceof ByteType) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 2bb0b02d4c9c..1882d990bef5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -218,7 +218,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); } else { for (int i = 0; i < count; i++) { -array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +if (!isNullAt(rowId + i)) { + array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } return array; @@ -279,7 +281,9 @@ public 
final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId * 2L, array, Platform.SHORT_ARRAY_OFFSET, count * 2L); } else { for (int i = 0; i < count; i++) { -array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +if (!isNullAt(rowId + i)) { + array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } return array; @@ -345,7 +349,9 @@ public final class OffHeapColumnVector extends WritableColumnVector { Platform.copyMemory(null, data + rowId * 4L, array, Platform.INT_ARRAY_OFFSET, count * 4L); } else { for (int i = 0; i < count; i++) { -array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); +} } } ret
(spark) branch master updated: [SPARK-47476][SQL] Support REPLACE function to work with collated strings
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 07b84dd57e38 [SPARK-47476][SQL] Support REPLACE function to work with collated strings 07b84dd57e38 is described below commit 07b84dd57e38b6396bffaf6f756019e933512d32 Author: Milan Dankovic AuthorDate: Fri Apr 26 23:19:22 2024 +0800 [SPARK-47476][SQL] Support REPLACE function to work with collated strings ### What changes were proposed in this pull request? Extend built-in string functions to support non-binary, non-lowercase collation for: replace. ### Why are the changes needed? Update collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use COLLATE within arguments for built-in string function REPLACE in Spark SQL queries, using non-binary collations such as UNICODE_CI. ### How was this patch tested? Unit tests for queries using StringReplace (`CollationStringExpressionsSuite.scala`). ### Was this patch authored or co-authored using generative AI tooling? No ### Algorithm explanation - StringSearch.next() returns position of the first character of `search` string in the `source` source. We need to convert this position to position in bytes so we can perform replace operation correctly. - For UTF8_BINARY_LCASE collation there is no corresponding collator so we have to implement custom logic (`lowercaseReplace`). It is done by performing matching on **lowercase strings** (`source & search`) and using that information to do operations on the **original** `source` string. String building is performed in the same way as for other non-binary collations. Similar logic can be found in existing `int find(UTF8String str, int start)` & `int indexOf(UTF8String v, int start)` methods. Closes #45704 from miland-db/miland-db/string-replace. 
Lead-authored-by: Milan Dankovic Co-authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 140 + .../org/apache/spark/unsafe/types/UTF8String.java | 4 +- .../spark/unsafe/types/CollationSupportSuite.java | 38 ++ .../sql/catalyst/analysis/CollationTypeCasts.scala | 2 +- .../catalyst/expressions/stringExpressions.scala | 16 +-- .../sql/CollationStringExpressionsSuite.scala | 36 ++ 6 files changed, 226 insertions(+), 10 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 0c03faa0d23a..0fc37c169612 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -21,6 +21,7 @@ import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; +import org.apache.spark.unsafe.UTF8StringBuilder; import org.apache.spark.unsafe.types.UTF8String; import java.util.ArrayList; @@ -364,6 +365,44 @@ public final class CollationSupport { } } + public static class StringReplace { +public static UTF8String exec(final UTF8String src, final UTF8String search, +final UTF8String replace, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(src, search, replace); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(src, search, replace); + } else { +return execICU(src, search, replace, collationId); + } +} +public static String genCode(final String src, final String search, final String replace, +final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = 
"CollationSupport.StringReplace.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s, %s)", src, search, replace); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s, %s)", src, search, replace); + } else { +return String.format(expr + "ICU(%s, %s, %s, %d)", src, search, replace, collationId); + } +} +public static UTF8String execBinary(final UTF8String src, final UTF8String search, +final UTF8String replace) { + return src.replace(search, replace); +} +public static UTF8String
(spark) branch master updated: [SPARK-47351][SQL] Add collation support for StringToMap & Mask string expressions
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new afe310d617e5 [SPARK-47351][SQL] Add collation support for StringToMap & Mask string expressions afe310d617e5 is described below commit afe310d617e5d5e1fd79e7d42e2bbafe93c6d3a8 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Fri Apr 26 20:33:29 2024 +0800 [SPARK-47351][SQL] Add collation support for StringToMap & Mask string expressions ### What changes were proposed in this pull request? Introduce collation awareness for string expressions: str_to_map & mask. ### Why are the changes needed? Add collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for built-in string functions: str_to_map & mask. ### How was this patch tested? E2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46165 from uros-db/SPARK-47351. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/CollationTypeCasts.scala | 2 +- .../catalyst/expressions/complexTypeCreator.scala | 8 +- .../sql/catalyst/expressions/maskExpressions.scala | 44 +- .../spark/sql/CollationSQLExpressionsSuite.scala | 98 ++ 4 files changed, 129 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index 473d552b3d94..c7ca5607481d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -64,7 +64,7 @@ object CollationTypeCasts extends TypeCoercionRule { case otherExpr @ ( _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least | - _: Coalesce | _: BinaryExpression | _: ConcatWs) => + _: Coalesce | _: BinaryExpression | _: ConcatWs | _: Mask) => val newChildren = collateToSingleType(otherExpr.children) otherExpr.withNewChildren(newChildren) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 3eb6225b5426..c38b6cea9a0a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import 
org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.ArrayImplicits._ @@ -570,11 +571,12 @@ case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: E override def second: Expression = pairDelim override def third: Expression = keyValueDelim - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation, StringTypeAnyCollation) - override def dataType: DataType = MapType(StringType, StringType) + override def dataType: DataType = MapType(first.dataType, first.dataType) - private lazy val mapBuilder = new ArrayBasedMapBuilder(StringType, StringType) + private lazy val mapBuilder = new ArrayBasedMapBuilder(first.dataType, first.dataType) override def nullSafeEval( inputString: Any, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala index e5157685a9a6..c11357352c79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala @@ -24,7 +24,9 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter} import org.apache.spark.
(spark) branch master updated: [SPARK-47350][SQL] Add collation support for SplitPart string expression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6c40214f3d93 [SPARK-47350][SQL] Add collation support for SplitPart string expression 6c40214f3d93 is described below commit 6c40214f3d93907686ed731caa3d572a9fa93d53 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Fri Apr 26 19:57:11 2024 +0800 [SPARK-47350][SQL] Add collation support for SplitPart string expression ### What changes were proposed in this pull request? Introduce collation awareness for string expression: split_part. ### Why are the changes needed? Add collation support for built-in string function in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for built-in string function: split_part. ### How was this patch tested? Unit collation support tests and e2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46158 from uros-db/SPARK-47350. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 58 +++ .../spark/unsafe/types/CollationSupportSuite.java | 86 ++ .../catalyst/expressions/stringExpressions.scala | 15 ++-- .../sql/CollationStringExpressionsSuite.scala | 17 + 4 files changed, 170 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 70a3f5bd6136..0c03faa0d23a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -23,6 +23,8 @@ import com.ibm.icu.util.ULocale; import org.apache.spark.unsafe.types.UTF8String; +import java.util.ArrayList; +import java.util.List; import java.util.regex.Pattern; /** @@ -36,6 +38,62 @@ public final class CollationSupport { * Collation-aware string expressions. 
*/ + public static class StringSplitSQL { +public static UTF8String[] exec(final UTF8String s, final UTF8String d, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(s, d); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(s, d); + } else { +return execICU(s, d, collationId); + } +} +public static String genCode(final String s, final String d, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringSplitSQL.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", s, d); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", s, d); + } else { +return String.format(expr + "ICU(%s, %s, %d)", s, d, collationId); + } +} +public static UTF8String[] execBinary(final UTF8String string, final UTF8String delimiter) { + return string.splitSQL(delimiter, -1); +} +public static UTF8String[] execLowercase(final UTF8String string, final UTF8String delimiter) { + if (delimiter.numBytes() == 0) return new UTF8String[] { string }; + if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 }; + Pattern pattern = Pattern.compile(Pattern.quote(delimiter.toString()), +CollationSupport.lowercaseRegexFlags); + String[] splits = pattern.split(string.toString(), -1); + UTF8String[] res = new UTF8String[splits.length]; + for (int i = 0; i < res.length; i++) { +res[i] = UTF8String.fromString(splits[i]); + } + return res; +} +public static UTF8String[] execICU(final UTF8String string, final UTF8String delimiter, +final int collationId) { + if (delimiter.numBytes() == 0) return new UTF8String[] { string }; + if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 }; + List strings = new ArrayList<>(); + String target = 
string.toString(), pattern = delimiter.toString(); + StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); + int start = 0, end; + while ((end = stringSearch.next()) != StringSearch.DONE) { +strings.add(UTF8String.fr
(spark) branch master updated: [SPARK-47922][SQL] Implement the try_parse_json expression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 033ca3e7dd5b [SPARK-47922][SQL] Implement the try_parse_json expression 033ca3e7dd5b is described below commit 033ca3e7dd5bd9ceadd9219c2bde105bea301e70 Author: Harsh Motwani AuthorDate: Fri Apr 26 14:35:59 2024 +0800 [SPARK-47922][SQL] Implement the try_parse_json expression ### What changes were proposed in this pull request? This pull request implements the `try_parse_json` that runs `parse_json` on string expressions to extract variants. However, if `parse_json` throws an exception on a row, the value `null` is returned. ### Why are the changes needed? Sometimes, columns containing JSON strings may contain some invalid inputs that should be ignored instead of having the whole execution failed because of it. ### Does this PR introduce _any_ user-facing change? Yes, it allows users to run the `try_parse_json` expression. ### How was this patch tested? Unit tests to check if `try_parse_json` works just like `parse_json` on valid inputs, returns `null` on invalid inputs, and fails on incorrect input data types. ### Was this patch authored or co-authored using generative AI tooling? No Closes #46141 from harshmotw-db/try_parse_json. 
Authored-by: Harsh Motwani Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/functions.scala | 12 .../apache/spark/sql/PlanGenerationTestSuite.scala | 4 ++ .../function_is_variant_null.explain | 2 +- .../explain-results/function_parse_json.explain| 2 +- .../function_schema_of_variant.explain | 2 +- .../function_schema_of_variant_agg.explain | 2 +- ...son.explain => function_try_parse_json.explain} | 2 +- .../function_try_variant_get.explain | 2 +- .../explain-results/function_variant_get.explain | 2 +- .../queries/function_try_parse_json.json | 25 .../queries/function_try_parse_json.proto.bin | Bin 0 -> 183 bytes .../source/reference/pyspark.sql/functions.rst | 1 + python/pyspark/sql/connect/functions/builtin.py| 7 +++ python/pyspark/sql/functions/builtin.py| 34 +- python/pyspark/sql/tests/test_functions.py | 8 +++ .../sql/catalyst/analysis/FunctionRegistry.scala | 3 +- .../variant/VariantExpressionEvalUtils.scala | 16 +++-- .../expressions/variant/variantExpressions.scala | 69 - .../variant/VariantExpressionEvalUtilsSuite.scala | 9 ++- .../scala/org/apache/spark/sql/functions.scala | 11 .../sql-functions/sql-expression-schema.md | 3 +- .../apache/spark/sql/VariantEndToEndSuite.scala| 35 +++ .../scala/org/apache/spark/sql/VariantSuite.scala | 9 +++ 23 files changed, 229 insertions(+), 31 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 5a7880c87431..6471d15b63ab 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -7000,6 +7000,18 @@ object functions { fnWithOptions("from_json", options, e, schema) } + /** + * Parses a JSON string and constructs a Variant value. Returns null if the input string is not + * a valid JSON value. 
+ * + * @param json + * a string column that contains JSON data. + * + * @group variant_funcs + * @since 4.0.0 + */ + def try_parse_json(json: Column): Column = Column.fn("try_parse_json", json) + /** * Parses a JSON string and constructs a Variant value. * diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 1005561b24ac..ebf4ee0e9073 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -2489,6 +2489,10 @@ class PlanGenerationTestSuite Collections.singletonMap("allowNumericLeadingZeros", "true")) } + functionTest("try_parse_json") { +fn.try_parse_json(fn.col("g")) + } + functionTest("to_json") { fn.to_json(fn.col("d"), Map(("timestampFormat", "dd/MM/"))) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/fu
(spark) branch master updated (c6aaa18e6cfd -> b4624bf4be28)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from c6aaa18e6cfd Revert "[SPARK-45302][PYTHON] Remove PID communication between Pythonworkers when no demon is used" add b4624bf4be28 [SPARK-47414][SQL] Lowercase collation support for regexp expressions No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/util/CollationSupport.java | 21 +- .../sql/catalyst/analysis/CollationTypeCasts.scala | 7 +- .../catalyst/expressions/regexpExpressions.scala | 92 --- .../CollationRegexpExpressionsSuite.scala | 170 ...nsSuite.scala => CollationSQLRegexpSuite.scala} | 292 ++--- 5 files changed, 456 insertions(+), 126 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala rename sql/core/src/test/scala/org/apache/spark/sql/{CollationRegexpExpressionsSuite.scala => CollationSQLRegexpSuite.scala} (50%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47958][TESTS] Change LocalSchedulerBackend to notify scheduler of executor on start
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 09ed09cb18e7 [SPARK-47958][TESTS] Change LocalSchedulerBackend to notify scheduler of executor on start 09ed09cb18e7 is described below commit 09ed09cb18e749689503d0f8cf9abfe52582e7ad Author: Davin Tjong AuthorDate: Wed Apr 24 19:41:09 2024 +0800 [SPARK-47958][TESTS] Change LocalSchedulerBackend to notify scheduler of executor on start ### What changes were proposed in this pull request? Changing to call `reviveOffers` on start (after the local executor is set up) so that the task scheduler knows about it. This matches behavior in CoarseGrainedSchedulerBackend, which will call an equivalent method on executor registration. ### Why are the changes needed? When using LocalSchedulerBackend, the task scheduler will not know about the executor until a task is run, which can lead to unexpected behavior in tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Running existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46187 from davintjong-db/local-executor-fix. 
Authored-by: Davin Tjong Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala index a00fe2a06899..298669327a39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala @@ -142,6 +142,7 @@ private[spark] class LocalSchedulerBackend( Map.empty))) launcherBackend.setAppId(appId) launcherBackend.setState(SparkAppHandle.State.RUNNING) +reviveOffers() } override def stop(): Unit = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47764][CORE][SQL] Cleanup shuffle dependencies based on ShuffleCleanupMode
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c44493db1dd7 [SPARK-47764][CORE][SQL] Cleanup shuffle dependencies based on ShuffleCleanupMode c44493db1dd7 is described below commit c44493db1dd7bd56dc41ade19138563b6b76529b Author: Bo Zhang AuthorDate: Wed Apr 24 16:14:06 2024 +0800 [SPARK-47764][CORE][SQL] Cleanup shuffle dependencies based on ShuffleCleanupMode ### What changes were proposed in this pull request? This change adds a new trait, `ShuffleCleanupMode` under `QueryExecution`, and two new configs, `spark.sql.shuffleDependency.skipMigration.enabled` and `spark.sql.shuffleDependency.fileCleanup.enabled`. For Spark Connect query executions, `ShuffleCleanupMode` is controlled by the two new configs, and shuffle dependency cleanup are performed accordingly. When `spark.sql.shuffleDependency.fileCleanup.enabled` is `true`, shuffle dependency files will be cleaned up at the end of query executions. When `spark.sql.shuffleDependency.skipMigration.enabled` is `true`, shuffle dependencies will be skipped at the shuffle data migration for node decommissions. ### Why are the changes needed? This is to: 1. speed up shuffle data migration at decommissions and 2. possibly (when file cleanup mode is enabled) release disk space occupied by unused shuffle files. ### Does this PR introduce _any_ user-facing change? Yes. This change adds two new configs, `spark.sql.shuffleDependency.skipMigration.enabled` and `spark.sql.shuffleDependency.fileCleanup.enabled` to control the cleanup behaviors. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #45930 from bozhang2820/spark-47764. 
Authored-by: Bo Zhang Signed-off-by: Wenchen Fan --- .../execution/SparkConnectPlanExecution.scala | 15 +++- .../spark/shuffle/IndexShuffleBlockResolver.scala | 12 +- .../apache/spark/shuffle/MigratableResolver.scala | 5 +++ .../org/apache/spark/storage/BlockManager.scala| 2 +- project/MimaExcludes.scala | 7 +++- .../org/apache/spark/sql/internal/SQLConf.scala| 16 .../main/scala/org/apache/spark/sql/Dataset.scala | 20 +- .../spark/sql/execution/QueryExecution.scala | 19 +- .../apache/spark/sql/execution/SQLExecution.scala | 24 +++- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 9 - .../execution/exchange/ShuffleExchangeExec.scala | 7 .../spark/sql/SparkSessionExtensionSuite.scala | 1 + .../spark/sql/execution/QueryExecutionSuite.scala | 43 ++ 13 files changed, 169 insertions(+), 11 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala index 23390bf7aba8..32cdae7bae56 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala @@ -35,8 +35,9 @@ import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_ import org.apache.spark.sql.connect.planner.SparkConnectPlanner import org.apache.spark.sql.connect.service.ExecuteHolder import org.apache.spark.sql.connect.utils.MetricGenerator -import org.apache.spark.sql.execution.{LocalTableScanExec, SQLExecution} +import org.apache.spark.sql.execution.{DoNotCleanup, LocalTableScanExec, RemoveShuffleFiles, SkipMigration, SQLExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.util.ThreadUtils @@ 
-58,11 +59,21 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) } val planner = new SparkConnectPlanner(executeHolder) val tracker = executeHolder.eventsManager.createQueryPlanningTracker() +val conf = session.sessionState.conf +val shuffleCleanupMode = + if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED)) { +RemoveShuffleFiles + } else if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED)) { +SkipMigration + } else { +DoNotCleanup + } val dataframe = Dataset.ofRows( sessionHolder.session, planner.transformRelation(request.getPlan.getRoot), -tracker) +tracker, +shuffleCleanupMode) responseObserver.onNext(createSchemaResponse
(spark) branch master updated: [SPARK-47692][SQL] Fix default StringType meaning in implicit casting
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 461ffa1a681b [SPARK-47692][SQL] Fix default StringType meaning in implicit casting 461ffa1a681b is described below commit 461ffa1a681b3d2fd2b0e32f22a45e30b45ba707 Author: Mihailo Milosevic AuthorDate: Wed Apr 24 16:12:20 2024 +0800 [SPARK-47692][SQL] Fix default StringType meaning in implicit casting ### What changes were proposed in this pull request? Addition of priority flag to StringType. ### Why are the changes needed? In order to follow casting rules for collations, we need to know whether StringType is considered default, implicit or explicit. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? Implicit tests in CollationSuite. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45819 from mihailom-db/SPARK-47692. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../sql/internal/types/AbstractStringType.scala| 3 +- .../sql/catalyst/analysis/CollationTypeCasts.scala | 30 ++ .../spark/sql/catalyst/analysis/TypeCoercion.scala | 5 +- .../org/apache/spark/sql/CollationSuite.scala | 66 +- 4 files changed, 88 insertions(+), 16 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala index 6403295fe20c..0828c2d6fc10 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala @@ -17,13 +17,14 @@ package org.apache.spark.sql.internal.types +import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} /** * StringTypeCollated is an abstract class for StringType with collation support. 
*/ abstract class AbstractStringType extends AbstractDataType { - override private[sql] def defaultConcreteType: DataType = StringType + override private[sql] def defaultConcreteType: DataType = SqlApiConf.get.defaultStringType override private[sql] def simpleString: String = "string" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index 3affd91dd3b8..c6232a870dff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least, Overlay, StringLPad, StringRPad} +import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least, Literal, Overlay, StringLPad, StringRPad} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @@ -48,9 +48,9 @@ object CollationTypeCasts extends TypeCoercionRule { case eltExpr: Elt => eltExpr.withNewChildren(eltExpr.children.head +: collateToSingleType(eltExpr.children.tail)) -case overlay: Overlay => - overlay.withNewChildren(collateToSingleType(Seq(overlay.input, overlay.replace)) -++ Seq(overlay.pos, overlay.len)) +case overlayExpr: Overlay => + overlayExpr.withNewChildren(collateToSingleType(Seq(overlayExpr.input, overlayExpr.replace)) +++ 
Seq(overlayExpr.pos, overlayExpr.len)) case stringPadExpr @ (_: StringRPad | _: StringLPad) => val Seq(str, len, pad) = stringPadExpr.children @@ -108,7 +108,12 @@ object CollationTypeCasts extends TypeCoercionRule { * complex DataTypes with collated StringTypes (e.g. ArrayType) */ def getOutputCollation(expr: Seq[Expression]): StringType = { -val explicitTypes = expr.filter(_.isInstanceOf[Collate]) +val explicitTypes = expr.filter { +case _: Collate => true +case cast: Cast if cast.getTagValue(Cast.USER_SPECIFIED_CAST).isDefined => + cast.dataType.isInstanceOf[StringType] +case _ =
(spark) branch master updated: [SPARK-47418][SQL] Add hand-crafted implementations for lowercase unicode-aware contains, startsWith and endsWith and optimize UTF8_BINARY_LCASE
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 890f78d03020 [SPARK-47418][SQL] Add hand-crafted implementations for lowercase unicode-aware contains, startsWith and endsWith and optimize UTF8_BINARY_LCASE 890f78d03020 is described below commit 890f78d03020f905b732054c78748d8d21a69fcf Author: Vladimir Golubev AuthorDate: Wed Apr 24 15:59:54 2024 +0800 [SPARK-47418][SQL] Add hand-crafted implementations for lowercase unicode-aware contains, startsWith and endsWith and optimize UTF8_BINARY_LCASE ### What changes were proposed in this pull request? Added hand-crafted implementations of unicode-aware lower-case `contains`, `startsWith`, `endsWith` to optimize UTF8_BINARY_LCASE for ASCII-only strings. ### Why are the changes needed? `UTF8String.toLowerCase()`, which is used for the aforementioned collation-aware functions, has an optimization for full-ascii strings, but still always allocates a new object. In this PR I introduced loop-based implementations, which fall-back to `toLowerCase()` in case they meet a non-asci character. ### Does this PR introduce _any_ user-facing change? No, these functions should behave exactly as: - `lhs.containsInLowerCase(rhs)` == `lhs.toLowerCase().contains(rhs.toLowerCase())` - `lhs.startsWithInLowerCase(rhs)` == `lhs.toLowerCase().startsWith(rhs.toLowerCase())` - `lhs.endsWithInLowerCase(rhs)` == `lhs.toLowerCase().endsWith(rhs.toLowerCase())` ### How was this patch tested? Added new test cases to `org.apache.spark.unsafe.types.CollationSupportSuite` and `org.apache.spark.unsafe.types.UTF8StringSuite`, including several unicode lowercase specific. Also I've run `CollationBenchmark` on GHA for JDK 17 and JDK 21 and have updated the data. ### Was this patch authored or co-authored using generative AI tooling? 
No Closes #46181 from vladimirg-db/vladimirg-db/add-hand-crafted-string-function-implementations-for-utf8-binary-lcase-collations. Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 6 +- .../org/apache/spark/unsafe/types/UTF8String.java | 143 +++-- .../spark/unsafe/types/CollationSupportSuite.java | 34 + .../apache/spark/unsafe/types/UTF8StringSuite.java | 105 +++ .../CollationBenchmark-jdk21-results.txt | 60 - sql/core/benchmarks/CollationBenchmark-results.txt | 60 - .../CollationNonASCIIBenchmark-jdk21-results.txt | 60 - .../CollationNonASCIIBenchmark-results.txt | 60 - 8 files changed, 396 insertions(+), 132 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index b28321230840..3e4973f5c187 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -60,7 +60,7 @@ public final class CollationSupport { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.toLowerCase().contains(r.toLowerCase()); + return l.containsInLowerCase(r); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -98,7 +98,7 @@ public final class CollationSupport { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.toLowerCase().startsWith(r.toLowerCase()); + return l.startsWithInLowerCase(r); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -135,7 +135,7 @@ public final class CollationSupport { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return 
l.toLowerCase().endsWith(r.toLowerCase()); + return l.endsWithInLowerCase(r); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 2009f1d20442..8ceeddb0c3dd 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -341,6 +341,44 @@ public final class UTF8String implements Comparable, Externalizable, return false; } + /** + * Returns whether
(spark) branch master updated: [SPARK-47873][SQL] Write collated strings to Hive metastore using the regular string type
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9d9f292ac941 [SPARK-47873][SQL] Write collated strings to Hive metastore using the regular string type 9d9f292ac941 is described below commit 9d9f292ac9415269f604f14cb87dc8129b0bfb0c Author: Stefan Kandic AuthorDate: Tue Apr 23 21:40:47 2024 +0800 [SPARK-47873][SQL] Write collated strings to Hive metastore using the regular string type ### What changes were proposed in this pull request? When writing table schema to hive stop writing collated strings as `string COLLATE name` but instead just write them as regular `string` type as hive doesn't support collations. Since we write the original schema as json to table properties in hive, we will be able to read the collation back. Also when reading back the table from the catalog, aside from ignoring case and nullability we should now also ignore any differences in string types as well. ### Why are the changes needed? In order to not break hive compatibility with external engines using hive that would otherwise fail to parse this new type. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46083 from stefankandic/writeCollatedStringsHive. 
Authored-by: Stefan Kandic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/types/DataTypeUtils.scala | 27 +++- .../org/apache/spark/sql/util/SchemaUtils.scala| 17 + .../spark/sql/hive/HiveExternalCatalog.scala | 74 ++ .../spark/sql/hive/HiveExternalCatalogSuite.scala | 43 - 4 files changed, 134 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala index cf8e903f03a3..f8bb1077a080 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy.{ANSI, STRICT} -import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, Decimal, DecimalType, MapType, NullType, StructField, StructType, UserDefinedType} +import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, Decimal, DecimalType, MapType, NullType, StringType, StructField, StructType, UserDefinedType} import org.apache.spark.sql.types.DecimalType.{forType, fromDecimal} object DataTypeUtils { @@ -47,6 +47,31 @@ object DataTypeUtils { DataType.equalsIgnoreCaseAndNullability(from, to) } + /** + * Compares two types, ignoring nullability of ArrayType, MapType, StructType, ignoring case + * sensitivity of field names in StructType as well as differences in collation for String types. 
+ */ + def equalsIgnoreCaseNullabilityAndCollation(from: DataType, to: DataType): Boolean = { +(from, to) match { + case (ArrayType(fromElement, _), ArrayType(toElement, _)) => +equalsIgnoreCaseNullabilityAndCollation(fromElement, toElement) + + case (MapType(fromKey, fromValue, _), MapType(toKey, toValue, _)) => +equalsIgnoreCaseNullabilityAndCollation(fromKey, toKey) && + equalsIgnoreCaseNullabilityAndCollation(fromValue, toValue) + + case (StructType(fromFields), StructType(toFields)) => +fromFields.length == toFields.length && + fromFields.zip(toFields).forall { case (l, r) => +l.name.equalsIgnoreCase(r.name) && + equalsIgnoreCaseNullabilityAndCollation(l.dataType, r.dataType) + } + + case (_: StringType, _: StringType) => true + case (fromDataType, toDataType) => fromDataType == toDataType +} + } + private val SparkGeneratedName = """col\d+""".r private def isSparkGeneratedName(name: String): Boolean = name match { case SparkGeneratedName(_*) => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala index 9c1e78190448..1e0bac331dc7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -
(spark) branch master updated: [SPARK-47352][SQL] Fix Upper, Lower, InitCap collation awareness
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b9f2270f5b0b [SPARK-47352][SQL] Fix Upper, Lower, InitCap collation awareness b9f2270f5b0b is described below commit b9f2270f5b0ba6ea1fb1cdf3225fa626ab91540b Author: Mihailo Milosevic AuthorDate: Tue Apr 23 16:28:33 2024 +0800 [SPARK-47352][SQL] Fix Upper, Lower, InitCap collation awareness ### What changes were proposed in this pull request? Add support for Locale-aware expressions. ### Why are the changes needed? This is needed as some future collations might use different Locales than the default. ### Does this PR introduce _any_ user-facing change? Yes, we follow ICU implementations for collations that are non-native. ### How was this patch tested? Tests for Upper, Lower and InitCap already exist. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46104 from mihailom-db/SPARK-47352. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationSupport.java | 108 +++ .../spark/unsafe/types/CollationSupportSuite.java | 151 + .../catalyst/expressions/stringExpressions.scala | 24 ++-- 3 files changed, 271 insertions(+), 12 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index d54e297413f4..b28321230840 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -16,7 +16,10 @@ */ package org.apache.spark.sql.catalyst.util; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.StringSearch; +import com.ibm.icu.util.ULocale; import org.apache.spark.unsafe.types.UTF8String; @@ -144,6 +147,93 @@ public final class CollationSupport { } } + public static class Upper { +public static UTF8String exec(final UTF8String v, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { +return execUTF8(v); + } else { +return execICU(v, collationId); + } +} +public static String genCode(final String v, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.Upper.exec"; + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { +return String.format(expr + "UTF8(%s)", v); + } else { +return String.format(expr + "ICU(%s, %d)", v, collationId); + } +} +public static UTF8String execUTF8(final UTF8String v) { + return v.toUpperCase(); +} +public static UTF8String execICU(final UTF8String v, final int collationId) { + return 
UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId)); +} + } + + public static class Lower { +public static UTF8String exec(final UTF8String v, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { +return execUTF8(v); + } else { +return execICU(v, collationId); + } +} +public static String genCode(final String v, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); +String expr = "CollationSupport.Lower.exec"; + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { +return String.format(expr + "UTF8(%s)", v); + } else { +return String.format(expr + "ICU(%s, %d)", v, collationId); + } +} +public static UTF8String execUTF8(final UTF8String v) { + return v.toLowerCase(); +} +public static UTF8String execICU(final UTF8String v, final int collationId) { + return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId)); +} + } + + public static class InitCap { +public static UTF8String exec(final UTF8String v, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { +return execUTF8(v); + } else { +return e
(spark) branch master updated: [SPARK-47412][SQL] Add Collation Support for LPad/RPad
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 885e98ecbe64 [SPARK-47412][SQL] Add Collation Support for LPad/RPad 885e98ecbe64 is described below commit 885e98ecbe64ea01dbf542d46aeac706f2761a05 Author: GideonPotok AuthorDate: Tue Apr 23 14:22:39 2024 +0800 [SPARK-47412][SQL] Add Collation Support for LPad/RPad Add collation support for LPAD and RPAD ### What changes were proposed in this pull request? Add collation support for LPAD and RPAD ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? Unit tests and spark-shell ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46041 from GideonPotok/spark_47412_collation_lpad_rpad. Authored-by: GideonPotok Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/CollationTypeCasts.scala | 7 +- .../catalyst/expressions/stringExpressions.scala | 6 +- .../sql/CollationStringExpressionsSuite.scala | 74 ++ 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index cffdd2872224..3affd91dd3b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, 
InSubquery, Least, Overlay} +import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least, Overlay, StringLPad, StringRPad} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @@ -52,6 +52,11 @@ object CollationTypeCasts extends TypeCoercionRule { overlay.withNewChildren(collateToSingleType(Seq(overlay.input, overlay.replace)) ++ Seq(overlay.pos, overlay.len)) +case stringPadExpr @ (_: StringRPad | _: StringLPad) => + val Seq(str, len, pad) = stringPadExpr.children + val Seq(newStr, newPad) = collateToSingleType(Seq(str, pad)) + stringPadExpr.withNewChildren(Seq(newStr, len, newPad)) + case otherExpr @ ( _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least | _: Coalesce | _: BinaryExpression | _: ConcatWs) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 2b7703ed82b3..cd21a6f5fdc2 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1586,7 +1586,8 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression) override def third: Expression = pad override def dataType: DataType = str.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, IntegerType, StringTypeAnyCollation) override def nullSafeEval(string: Any, len: Any, pad: Any): Any = { string.asInstanceOf[UTF8String].lpad(len.asInstanceOf[Int], 
pad.asInstanceOf[UTF8String]) @@ -1665,7 +1666,8 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera override def third: Expression = pad override def dataType: DataType = str.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, IntegerType, StringTypeAnyCollation) override def nullSafeEval(string: Any, len: Any, pad: Any): Any = { string.asInstanceOf[UTF8String].rpad(len.asInstanceOf[Int], pad.asInstanceOf[UTF8String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spa
(spark) branch master updated: [SPARK-47633][SQL] Include right-side plan output in `LateralJoin#allAttributes` for more consistent canonicalization
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new eba6364dbb36 [SPARK-47633][SQL] Include right-side plan output in `LateralJoin#allAttributes` for more consistent canonicalization eba6364dbb36 is described below commit eba6364dbb36b7009ff79a3cd8638aae6eec280f Author: Bruce Robbins AuthorDate: Tue Apr 23 14:09:01 2024 +0800 [SPARK-47633][SQL] Include right-side plan output in `LateralJoin#allAttributes` for more consistent canonicalization ### What changes were proposed in this pull request? Modify `LateralJoin` to include right-side plan output in `allAttributes`. ### Why are the changes needed? In the following example, the view v1 is cached, but a query of v1 does not use the cache: ``` CREATE or REPLACE TEMP VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2); CREATE or REPLACE TEMP VIEW t2(c1, c2) AS VALUES (0, 1), (1, 2); create or replace temp view v1 as select * from t1 join lateral ( select c1 as a, c2 as b from t2) on c1 = a; cache table v1; explain select * from v1; == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- BroadcastHashJoin [c1#180], [a#173], Inner, BuildRight, false :- LocalTableScan [c1#180, c2#181] +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [plan_id=113] +- LocalTableScan [a#173, b#174] ``` The canonicalized version of the `LateralJoin` node is not consistent when there is a join condition. For example, for the above query, the join condition is canonicalized as follows: ``` Before canonicalization: Some((c1#174 = a#167)) After canonicalization: Some((none#0 = none#167)) ``` You can see that the `exprId` for the second operand of `EqualTo` is not normalized (it remains 167). That's because the attribute `a` from the right-side plan is not included `allAttributes`. 
This PR adds right-side attributes to `allAttributes` so that references to right-side attributes in the join condition are normalized during canonicalization. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45763 from bersprockets/lj_canonical_issue. Authored-by: Bruce Robbins Signed-off-by: Wenchen Fan --- .../plans/logical/basicLogicalOperators.scala | 2 ++ .../scala/org/apache/spark/sql/CachedTableSuite.scala | 19 +++ 2 files changed, 21 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 7c36e3bc79af..4fd640afe3b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -2057,6 +2057,8 @@ case class LateralJoin( joinType: JoinType, condition: Option[Expression]) extends UnaryNode { + override lazy val allAttributes: AttributeSeq = left.output ++ right.plan.output + require(Seq(Inner, LeftOuter, Cross).contains(joinType), s"Unsupported lateral join type $joinType") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 7865e7f1f864..0ad9ceefc419 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -1770,4 +1770,23 @@ class CachedTableSuite extends QueryTest with SQLTestUtils withSQLConf(SQLConf.DEFAULT_CACHE_STORAGE_LEVEL.key -> "DISK") {} } } + + test("SPARK-47633: Cache hit for lateral join with join condition") { +withTempView("t", "q1") { + sql("create or replace temp view t(c1, 
c2) as values (0, 1), (1, 2)") + val query = """select * +|from t +|join lateral ( +| select c1 as a, c2 as b +| from t) +|on c1 = a; +|""".stripMargin + sql(s"cache table q1 as $query") + val df = sql(query) + checkAnswer(df, +Row(0, 1, 0, 1) :: Row(1, 2, 1, 2) :: Nil) + assert(getNumInMemoryRelations(df) == 1) +} + + } } -
(spark) branch master updated: [SPARK-47411][SQL] Support StringInstr & FindInSet functions to work with collated strings
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 256fc51508e4 [SPARK-47411][SQL] Support StringInstr & FindInSet functions to work with collated strings 256fc51508e4 is described below commit 256fc51508e4eac3efc4746ef0ef92132bc40643 Author: Milan Dankovic AuthorDate: Mon Apr 22 20:45:43 2024 +0800 [SPARK-47411][SQL] Support StringInstr & FindInSet functions to work with collated strings ### What changes were proposed in this pull request? Extend built-in string functions to support non-binary, non-lowercase collation for: instr & find_in_set. ### Why are the changes needed? Update collation support for built-in string functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use COLLATE within arguments for built-in string functions INSTR and FIND_IN_SET in Spark SQL queries, using non-binary collations such as UNICODE_CI. ### How was this patch tested? Unit tests for queries using "collate" (CollationSuite). ### Was this patch authored or co-authored using generative AI tooling? No Closes #45643 from miland-db/miland-db/substr-functions. 
Authored-by: Milan Dankovic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationFactory.java | 17 +++- .../spark/sql/catalyst/util/CollationSupport.java | 112 + .../spark/unsafe/types/CollationSupportSuite.java | 82 +++ .../catalyst/expressions/stringExpressions.scala | 28 -- .../sql/CollationStringExpressionsSuite.scala | 62 5 files changed, 288 insertions(+), 13 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 9786c559da44..93691e28c692 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -196,10 +196,21 @@ public final class CollationFactory { final UTF8String targetUTF8String, final UTF8String patternUTF8String, final int collationId) { -String pattern = patternUTF8String.toString(); -CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString()); +return getStringSearch(targetUTF8String.toString(), patternUTF8String.toString(), collationId); + } + + /** + * Returns a StringSearch object for the given pattern and target strings, under collation + * rules corresponding to the given collationId. The external ICU library StringSearch object can + * be used to find occurrences of the pattern in the target string, while respecting collation. 
+ */ + public static StringSearch getStringSearch( + final String targetString, + final String patternString, + final int collationId) { +CharacterIterator target = new StringCharacterIterator(targetString); Collator collator = CollationFactory.fetchCollation(collationId).collator; -return new StringSearch(pattern, target, (RuleBasedCollator) collator); +return new StringSearch(patternString, target, (RuleBasedCollator) collator); } /** diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index f54e6b162a93..d54e297413f4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -144,6 +144,76 @@ public final class CollationSupport { } } + public static class FindInSet { +public static int exec(final UTF8String word, final UTF8String set, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(word, set); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(word, set); + } else { +return execICU(word, set, collationId); + } +} +public static String genCode(final String word, final String set, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", word, set); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + &quo
(spark) branch master updated (2fb31dea1c53 -> b20356ef55a2)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 2fb31dea1c53 [SPARK-47930][BUILD] Upgrade RoaringBitmap to 1.0.6 add b20356ef55a2 [SPARK-47900] Fix check for implicit (UTF8_BINARY) collation No new revisions were added by this update. Summary of changes: sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala | 6 +- .../scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala | 4 ++-- .../src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala | 6 +++--- .../apache/spark/sql/execution/datasources/DataSourceUtils.scala| 2 +- sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala | 6 +++--- 5 files changed, 14 insertions(+), 10 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (2d0b56c3eac6 -> e1432ef6405a)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 2d0b56c3eac6 [SPARK-47932][SQL][TESTS] Avoid using legacy commons-lang add e1432ef6405a [SPARK-47413][SQL] - add support to substr/left/right for collations No new revisions were added by this update. Summary of changes: .../catalyst/expressions/stringExpressions.scala | 8 ++-- .../sql/CollationStringExpressionsSuite.scala | 49 ++ 2 files changed, 53 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (9f34b8eca2f3 -> 458f70bd5213)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 9f34b8eca2f3 [SPARK-47845][SQL][PYTHON][CONNECT] Support Column type in split function for scala and python add 458f70bd5213 [SPARK-47902][SQL] Making Compute Current Time* expressions foldable No new revisions were added by this update. Summary of changes: .../sql/catalyst/expressions/Expression.scala | 25 +++--- .../catalyst/expressions/datetimeExpressions.scala | 6 +++--- .../analysis/ExpressionTypeCheckingSuite.scala | 5 + .../BinaryComparisonSimplificationSuite.scala | 2 ++ 4 files changed, 27 insertions(+), 11 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (bb5ded8f1bb0 -> 76fb196a8ab7)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from bb5ded8f1bb0 [SPARK-47371][SQL][FOLLOWUP] XML: Stop ignoring CDATA within row tags add 76fb196a8ab7 [SPARK-46935][DOCS] Consolidate error documentation No new revisions were added by this update. Summary of changes: .github/workflows/build_and_test.yml |2 + .../org/apache/spark/SparkThrowableSuite.scala | 222 -- docs/README.md | 24 +- docs/_data/menu-sql.yaml | 45 - docs/_plugins/build_api_docs.rb|8 + docs/css/custom.css| 23 + .../sql-error-conditions-as-of-join-error-class.md | 41 - ...-cannot-create-data-source-table-error-class.md | 37 - ...nditions-cannot-load-state-store-error-class.md | 74 - ...r-conditions-cannot-update-field-error-class.md | 53 - ...ditions-cannot-write-state-store-error-class.md | 37 - ...s-collection-size-limit-exceeded-error-class.md | 45 - ...lex-expression-unsupported-input-error-class.md | 41 - docs/sql-error-conditions-connect-error-class.md | 53 - ...reate-view-column-arity-mismatch-error-class.md | 45 - ...ror-conditions-datatype-mismatch-error-class.md | 258 -- ...ate-routine-parameter-assignment-error-class.md | 41 - ...conditions-expect-table-not-view-error-class.md | 41 - ...conditions-expect-view-not-table-error-class.md | 41 - ...sql-error-conditions-failed-jdbc-error-class.md | 89 - ...ions-incompatible-data-for-table-error-class.md | 73 - ...tions-incomplete-type-definition-error-class.md | 45 - ...onsistent-behavior-cross-version-error-class.md | 68 - ...ons-insert-column-arity-mismatch-error-class.md | 45 - ...ions-insufficient-table-property-error-class.md | 41 - ...-internal-error-metadata-catalog-error-class.md | 57 - ...rror-conditions-invalid-boundary-error-class.md | 41 - ...-error-conditions-invalid-cursor-error-class.md | 49 - ...conditions-invalid-default-value-error-class.md | 49 - ...-error-conditions-invalid-format-error-class.md | 77 - 
...-error-conditions-invalid-handle-error-class.md | 61 - ...-conditions-invalid-inline-table-error-class.md | 49 - ...id-inverse-distribution-function-error-class.md | 45 - ...ons-invalid-lambda-function-call-error-class.md | 45 - ...ns-invalid-limit-like-expression-error-class.md | 49 - ...ditions-invalid-observed-metrics-error-class.md | 61 - ...error-conditions-invalid-options-error-class.md | 41 - ...nditions-invalid-parameter-value-error-class.md | 85 - ...ions-invalid-partition-operation-error-class.md | 41 - ...-error-conditions-invalid-schema-error-class.md | 46 - ...or-conditions-invalid-sql-syntax-error-class.md | 114 - ...ions-invalid-subquery-expression-error-class.md | 37 - ...valid-time-travel-timestamp-expr-error-class.md | 49 - ...tions-invalid-write-distribution-error-class.md | 45 - ...ions-malformed-record-in-parsing-error-class.md | 46 - ...or-conditions-missing-attributes-error-class.md | 42 - ...conditions-not-a-constant-string-error-class.md | 45 - ...r-conditions-not-allowed-in-from-error-class.md | 45 - ...ns-not-null-constraint-violation-error-class.md | 41 - ...ns-not-supported-in-jdbc-catalog-error-class.md | 41 - docs/sql-error-conditions-sqlstates.md | 741 - ...itions-stds-invalid-option-value-error-class.md | 45 - ...ror-conditions-unresolved-column-error-class.md | 41 - ...rror-conditions-unresolved-field-error-class.md | 41 - ...or-conditions-unresolved-map-key-error-class.md | 41 - ...-conditions-unsupported-add-file-error-class.md | 41 - ...itions-unsupported-default-value-error-class.md | 41 - ...ditions-unsupported-deserializer-error-class.md | 41 - ...r-conditions-unsupported-feature-error-class.md | 229 -- ...conditions-unsupported-generator-error-class.md | 49 - ...or-conditions-unsupported-insert-error-class.md | 53 - ...ions-unsupported-merge-condition-error-class.md | 45 - ...conditions-unsupported-overwrite-error-class.md | 41 - ...conditions-unsupported-save-mode-error-class.md | 41 - 
...ted-subquery-expression-category-error-class.md | 95 - ...-error-conditions-wrong-num-args-error-class.md | 41 - docs/sql-error-conditions.md | 2890 +--- docs/util/build-error-docs.py | 152 + 68 files changed, 210 insertions(+), 7251 deletions(-) delete mode 100644 docs/sql-error-conditions-as-of-join-error-class.md delete mode 100644 docs/sql-error-conditions-cannot-create-data-source-table-error-class.md delete mode 100644 docs/sql-error-conditions-cannot-load-state-store-error-class.md delete mode 100644 docs/sql-error-conditions-cannot-update-field-error-class.md delete mode 100644 docs/sql
(spark) branch branch-3.5 updated: [SPARK-47463][SQL][3.5] Use V2Predicate to wrap expression with return type of boolean
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new e439e29a7bf1 [SPARK-47463][SQL][3.5] Use V2Predicate to wrap expression with return type of boolean e439e29a7bf1 is described below commit e439e29a7bf10ffccf68b2ec11f6e1e06e747d06 Author: Zhen Wang <643348...@qq.com> AuthorDate: Thu Apr 18 21:24:10 2024 +0800 [SPARK-47463][SQL][3.5] Use V2Predicate to wrap expression with return type of boolean Backports #45589 to 3.5 ### What changes were proposed in this pull request? Use V2Predicate to wrap If expr when building v2 expressions. ### Why are the changes needed? The `PushFoldableIntoBranches` optimizer may fold predicate into (if / case) branches and `V2ExpressionBuilder` wraps `If` as `GeneralScalarExpression`, which causes the assertion in `PushablePredicate.unapply` to fail. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? added unit test ### Was this patch authored or co-authored using generative AI tooling? No Closes #46074 from wForget/SPARK-47463_3.5. 
Authored-by: Zhen Wang <643348...@qq.com> Signed-off-by: Wenchen Fan --- .../sql/catalyst/util/V2ExpressionBuilder.scala| 159 +++-- .../spark/sql/connector/DataSourceV2Suite.scala| 10 ++ 2 files changed, 97 insertions(+), 72 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala index 947a5e9f383f..c7bca751e56e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.connector.expressions.{Cast => V2Cast, Expression => import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Avg, Count, CountStar, GeneralAggregateFunc, Max, Min, Sum, UserDefinedAggregateFunc} import org.apache.spark.sql.connector.expressions.filter.{AlwaysFalse, AlwaysTrue, And => V2And, Not => V2Not, Or => V2Or, Predicate => V2Predicate} import org.apache.spark.sql.execution.datasources.PushableExpression -import org.apache.spark.sql.types.{BooleanType, IntegerType, StringType} +import org.apache.spark.sql.types.{BooleanType, DataType, IntegerType, StringType} /** * The builder to generate V2 expressions from catalyst expressions. 
@@ -96,45 +96,45 @@ class V2ExpressionBuilder(e: Expression, isPredicate: Boolean = false) { generateExpression(child).map(v => new V2Cast(v, dataType)) case AggregateExpression(aggregateFunction, Complete, isDistinct, None, _) => generateAggregateFunc(aggregateFunction, isDistinct) -case Abs(child, true) => generateExpressionWithName("ABS", Seq(child)) -case Coalesce(children) => generateExpressionWithName("COALESCE", children) -case Greatest(children) => generateExpressionWithName("GREATEST", children) -case Least(children) => generateExpressionWithName("LEAST", children) -case Rand(child, hideSeed) => +case Abs(_, true) => generateExpressionWithName("ABS", expr, isPredicate) +case _: Coalesce => generateExpressionWithName("COALESCE", expr, isPredicate) +case _: Greatest => generateExpressionWithName("GREATEST", expr, isPredicate) +case _: Least => generateExpressionWithName("LEAST", expr, isPredicate) +case Rand(_, hideSeed) => if (hideSeed) { Some(new GeneralScalarExpression("RAND", Array.empty[V2Expression])) } else { -generateExpressionWithName("RAND", Seq(child)) +generateExpressionWithName("RAND", expr, isPredicate) } -case log: Logarithm => generateExpressionWithName("LOG", log.children) -case Log10(child) => generateExpressionWithName("LOG10", Seq(child)) -case Log2(child) => generateExpressionWithName("LOG2", Seq(child)) -case Log(child) => generateExpressionWithName("LN", Seq(child)) -case Exp(child) => generateExpressionWithName("EXP", Seq(child)) -case pow: Pow => generateExpressionWithName("POWER", pow.children) -case Sqrt(child) => generateExpressionWithName("SQRT", Seq(child)) -case Floor(child) => generateExpressionWithName("FLOOR", Seq(child)) -case Ceil(child) => generateExpressionWithName("CEIL", Seq(child)) -case round: Round => generateExpressionW
(spark) branch branch-3.4 updated: [SPARK-47895][SQL] group by all should be idempotent
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.4 by this push: new 11c1c520fe57 [SPARK-47895][SQL] group by all should be idempotent 11c1c520fe57 is described below commit 11c1c520fe57bdd728c65d05346bf3aece8dbac9 Author: Wenchen Fan AuthorDate: Thu Apr 18 16:33:47 2024 +0800 [SPARK-47895][SQL] group by all should be idempotent ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43797 . GROUP BY ALL has the same bug and this PR applies the same fix to GROUP BY ALL ### Why are the changes needed? For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46113 from cloud-fan/group-all. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit b5bb75ca240a98ae5651e5cb429fd4bd31b7bb8a) Signed-off-by: Wenchen Fan --- .../analysis/ResolveReferencesInAggregate.scala| 16 ++-- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 18 ++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index 1a9ed4ce16eb..c9d7670b2d3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, IntegerLiteral, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE, UNRESOLVED_ATTRIBUTE} @@ -134,7 +134,19 @@ object ResolveReferencesInAggregate extends SQLConfHelper groupExprs } else { // This is a valid GROUP BY ALL aggregate. -expandedGroupExprs.get +expandedGroupExprs.get.zipWithIndex.map { case (expr, index) => + trimAliases(expr) match { +// HACK ALERT: If the expanded grouping expression is an integer literal, don't use it +// but use an integer literal of the index. 
The reason is we may repeatedly +// analyze the plan, and the original integer literal may cause failures +// with a later GROUP BY ordinal resolution. GROUP BY constant is +// meaningless so whatever value does not matter here. +case IntegerLiteral(_) => + // GROUP BY ordinal uses 1-based index. + Literal(index + 1) +case _ => expr + } +} } } else { groupExprs diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 953b2c8bb101..39cf298aec43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } + + test("SPARK-47895: group by all repeated analysis") { +val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze +comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) +) + +val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any +// Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. +val copiedPlan = plan.transform { + case _: LocalRelation => testRelati
(spark) branch branch-3.5 updated: [SPARK-47895][SQL] group by all should be idempotent
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 7aea21eae377 [SPARK-47895][SQL] group by all should be idempotent 7aea21eae377 is described below commit 7aea21eae377321633d3ddd34898e9a5ea43 Author: Wenchen Fan AuthorDate: Thu Apr 18 16:33:47 2024 +0800 [SPARK-47895][SQL] group by all should be idempotent ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43797 . GROUP BY ALL has the same bug and this PR applies the same fix to GROUP BY ALL ### Why are the changes needed? For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46113 from cloud-fan/group-all. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit b5bb75ca240a98ae5651e5cb429fd4bd31b7bb8a) Signed-off-by: Wenchen Fan --- .../analysis/ResolveReferencesInAggregate.scala| 16 ++-- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 18 ++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index 09ae87b071fd..a03d5438ff6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, IntegerLiteral, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE, UNRESOLVED_ATTRIBUTE} @@ -134,7 +134,19 @@ object ResolveReferencesInAggregate extends SQLConfHelper groupExprs } else { // This is a valid GROUP BY ALL aggregate. -expandedGroupExprs.get +expandedGroupExprs.get.zipWithIndex.map { case (expr, index) => + trimAliases(expr) match { +// HACK ALERT: If the expanded grouping expression is an integer literal, don't use it +// but use an integer literal of the index. 
The reason is we may repeatedly +// analyze the plan, and the original integer literal may cause failures +// with a later GROUP BY ordinal resolution. GROUP BY constant is +// meaningless so whatever value does not matter here. +case IntegerLiteral(_) => + // GROUP BY ordinal uses 1-based index. + Literal(index + 1) +case _ => expr + } +} } } else { groupExprs diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 953b2c8bb101..39cf298aec43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } + + test("SPARK-47895: group by all repeated analysis") { +val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze +comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) +) + +val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any +// Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. +val copiedPlan = plan.transform { + case _: LocalRelation => testRelati
(spark) branch master updated: [SPARK-47895][SQL] group by all should be idempotent
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b5bb75ca240a [SPARK-47895][SQL] group by all should be idempotent b5bb75ca240a is described below commit b5bb75ca240a98ae5651e5cb429fd4bd31b7bb8a Author: Wenchen Fan AuthorDate: Thu Apr 18 16:33:47 2024 +0800 [SPARK-47895][SQL] group by all should be idempotent ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43797 . GROUP BY ALL has the same bug and this PR applies the same fix to GROUP BY ALL ### Why are the changes needed? For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46113 from cloud-fan/group-all. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../analysis/ResolveReferencesInAggregate.scala| 16 ++-- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 18 ++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index 4f5a11835c33..7ea90854932e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, IntegerLiteral, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE, UNRESOLVED_ATTRIBUTE} @@ -136,7 +136,19 @@ class ResolveReferencesInAggregate(val catalogManager: CatalogManager) extends S groupExprs } else { // This is a valid GROUP BY ALL aggregate. -expandedGroupExprs.get +expandedGroupExprs.get.zipWithIndex.map { case (expr, index) => + trimAliases(expr) match { +// HACK ALERT: If the expanded grouping expression is an integer literal, don't use it +// but use an integer literal of the index. The reason is we may repeatedly +// analyze the plan, and the original integer literal may cause failures +// with a later GROUP BY ordinal resolution. 
GROUP BY constant is +// meaningless so whatever value does not matter here. +case IntegerLiteral(_) => + // GROUP BY ordinal uses 1-based index. + Literal(index + 1) +case _ => expr + } +} } } else { groupExprs diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 953b2c8bb101..39cf298aec43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } + + test("SPARK-47895: group by all repeated analysis") { +val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze +comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) +) + +val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any +// Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. +val copiedPlan = plan.transform { + case _: LocalRelation => testRelationWithData +} +comparePlans( + copiedPlan.analyze, // repeated analys
(spark) branch master updated (c4b0c260bb13 -> 60e75e6e0275)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from c4b0c260bb13 [SPARK-47839][SQL] Fix aggregate bug in RewriteWithExpression add 60e75e6e0275 [SPARK-47810][SQL] Replace equivalent expression to <=> in join condition No new revisions were added by this update. Summary of changes: .../catalyst/optimizer/OptimizeJoinCondition.scala | 45 .../spark/sql/catalyst/optimizer/Optimizer.scala | 1 + .../sql/catalyst/rules/RuleIdCollection.scala | 1 + .../optimizer/OptimizeJoinConditionSuite.scala | 49 ++ .../org/apache/spark/sql/DataFrameJoinSuite.scala | 10 + 5 files changed, 106 insertions(+) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinCondition.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinConditionSuite.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47839][SQL] Fix aggregate bug in RewriteWithExpression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c4b0c260bb13 [SPARK-47839][SQL] Fix aggregate bug in RewriteWithExpression c4b0c260bb13 is described below commit c4b0c260bb139f61901d5bd5f1d94dddaefc9207 Author: Kelvin Jiang AuthorDate: Thu Apr 18 09:56:10 2024 +0800 [SPARK-47839][SQL] Fix aggregate bug in RewriteWithExpression ### What changes were proposed in this pull request? - Fixes a bug where `RewriteWithExpression` can rewrite an `Aggregate` into an invalid one. The fix is done by separating out the "result expressions" from the "aggregate expressions" in the `Aggregate` node, and rewriting them separately. - Some QOL improvements around `With`: - Fix aliases created by `With` expression to use the `CommonExpressionId` to avoid duplicate aliases (added a conf to fall back to old behaviour, which is useful to keep the IDs consistent for golden files tests) - Implemented `QueryPlan.transformUpWithSubqueriesAndPruning` that the new logic depends on ### Why are the changes needed? See [JIRA ticket](https://issues.apache.org/jira/browse/SPARK-47839) for more details on the bug that this fixes. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added new unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46034 from kelvinjian-db/SPARK-47839-with-aggregate. 
Authored-by: Kelvin Jiang Signed-off-by: Wenchen Fan --- .../explain-results/function_count_if.explain | 7 +- .../sql/connect/ProtoToParsedPlanTestSuite.scala | 1 + .../spark/sql/catalyst/expressions/With.scala | 6 +- .../catalyst/optimizer/RewriteWithExpression.scala | 70 +-- .../spark/sql/catalyst/plans/QueryPlan.scala | 24 +++ .../org/apache/spark/sql/internal/SQLConf.scala| 11 + .../optimizer/RewriteWithExpressionSuite.scala | 231 - 7 files changed, 281 insertions(+), 69 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain index f2ada15eccb7..a9fd2eeb669a 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain @@ -1,3 +1,4 @@ -Aggregate [count(if ((_common_expr_0#0 = false)) null else _common_expr_0#0) AS count_if((a > 0))#0L] -+- Project [id#0L, a#0, b#0, d#0, e#0, f#0, g#0, (a#0 > 0) AS _common_expr_0#0] - +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] +Project [_aggregateexpression#0L AS count_if((a > 0))#0L] ++- Aggregate [count(if ((_common_expr_0#0 = false)) null else _common_expr_0#0) AS _aggregateexpression#0L] + +- Project [id#0L, a#0, b#0, d#0, e#0, f#0, g#0, (a#0 > 0) AS _common_expr_0#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala index cc9decb4c98b..d404779d7a92 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala @@ 
-126,6 +126,7 @@ class ProtoToParsedPlanTestSuite Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES.key, "org.apache.spark.sql.connect.plugin.ExampleExpressionPlugin") .set(org.apache.spark.sql.internal.SQLConf.ANSI_ENABLED.key, false.toString) + .set(org.apache.spark.sql.internal.SQLConf.USE_COMMON_EXPR_ID_FOR_ALIAS.key, false.toString) } protected val suiteBaseResourcePath = commonResourcePath.resolve("query-tests") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala index 2745b663639f..14deedd9c70f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, TreePattern, WITH_EXPRESSION} +import org.apache.spark.sql.catalyst.trees.TreePattern.{AGGREGATE_EXPRESSION, CO
(spark) branch master updated (37afc3fff8c6 -> 61a7901a2c83)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 37afc3fff8c6 [SPARK-47803][FOLLOWUP] Fix cast binary/decimal to variant add 61a7901a2c83 [SPARK-47846][SQL] Add support for Variant type in from_json expression No new revisions were added by this update. Summary of changes: .../src/main/resources/error/error-conditions.json | 2 +- ...ror-conditions-datatype-mismatch-error-class.md | 2 +- .../sql/catalyst/expressions/jsonExpressions.scala | 10 +++-- .../apache/spark/sql/VariantEndToEndSuite.scala| 47 +- 4 files changed, 54 insertions(+), 7 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47803][FOLLOWUP] Fix cast binary/decimal to variant
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 37afc3fff8c6 [SPARK-47803][FOLLOWUP] Fix cast binary/decimal to variant 37afc3fff8c6 is described below commit 37afc3fff8c65b612c1242f1fc9a66a2e04639ad Author: Chenhao Li AuthorDate: Thu Apr 18 09:23:45 2024 +0800 [SPARK-47803][FOLLOWUP] Fix cast binary/decimal to variant ### What changes were proposed in this pull request? This PR fixes issues introduced in https://github.com/apache/spark/pull/45989: - `VariantBuilder.appendBinary` incorrectly uses the type tag for the string type. - `VariantExpressionEvalUtils.buildVariant` misses the decimal types. ### Why are the changes needed? It is a bug fix and allows Spark to read a map schema with variant value (for example, `map`) correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit tests. We ensure that at least all supported types are covered (scalar types, array, map, struct, variant). ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46109 from chenhao-db/fix_cast_to_variant. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../org/apache/spark/types/variant/VariantBuilder.java | 2 +- .../variant/VariantExpressionEvalUtils.scala | 3 ++- .../expressions/variant/VariantExpressionSuite.scala | 18 ++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java index ea7a7674baf5..2afba81d192e 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java @@ -223,7 +223,7 @@ public class VariantBuilder { public void appendBinary(byte[] binary) { checkCapacity(1 + U32_SIZE + binary.length); -writeBuffer[writePos++] = primitiveHeader(LONG_STR); +writeBuffer[writePos++] = primitiveHeader(BINARY); writeLong(writeBuffer, writePos, binary.length, U32_SIZE); writePos += U32_SIZE; System.arraycopy(binary, 0, writeBuffer, writePos, binary.length); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala index 4d1d70055f5e..ea90bb88a906 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala @@ -76,7 +76,8 @@ object VariantExpressionEvalUtils { case LongType => builder.appendLong(input.asInstanceOf[Long]) case FloatType => builder.appendFloat(input.asInstanceOf[Float]) case DoubleType => builder.appendDouble(input.asInstanceOf[Double]) - case StringType => builder.appendString(input.asInstanceOf[UTF8String].toString) + case _: DecimalType => builder.appendDecimal(input.asInstanceOf[Decimal].toJavaBigDecimal) + 
case _: StringType => builder.appendString(input.asInstanceOf[UTF8String].toString) case BinaryType => builder.appendBinary(input.asInstanceOf[Array[Byte]]) case DateType => builder.appendDate(input.asInstanceOf[Int]) case TimestampType => builder.appendTimestamp(input.asInstanceOf[Long]) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala index 9aa1dcd2ef95..1f9eec862bbe 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala @@ -807,9 +807,27 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } check(null.asInstanceOf[String], null) +// The following tests cover all allowed scalar types. for (input <- Seq[Any](false, true, 0.toByte, 1.toShort, 2, 3L, 4.0F, 5.0D)) { check(input, input.toString) } +for (precision <- Seq(9, 18, 38)) { + val input = BigDecimal("9" * precision) + check(Literal.create(input, DecimalType(precision, 0)), input.toString) +} +check("", "\"\"") +check("x" * 128, "\&q
(spark) branch master updated: [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 21d8bbdc59d6 [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser 21d8bbdc59d6 is described below commit 21d8bbdc59d6525d0573c7e624c3b2640ac15795 Author: Chenhao Li AuthorDate: Thu Apr 18 09:17:17 2024 +0800 [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser ### What changes were proposed in this pull request? This PR fixes an issue introduced in https://github.com/apache/spark/pull/46071. When parsing a JSON object as a map or struct, the `JacksonParser` only peeks the `FIELD_NAME` token without consuming it. `VariantBuilder.parseJson` will then fail because the current token is `FIELD_NAME` rather than the starting token of the value. Previous tests with struct schemas didn't fail because the parsing error was caught and the parser would then consume the field name, and the field value wo [...] ### Why are the changes needed? It is a bug fix and allows Spark to read a map schema with variant value (for example, `map`) correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? A new unit test. It would fail without the changes. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46107 from chenhao-db/fix_json_scan_variant. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/json/JacksonParser.scala | 6 ++ .../test/scala/org/apache/spark/sql/VariantSuite.scala | 18 ++ 2 files changed, 24 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index f8318aa7ce0a..eadd0a4f8ab9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -116,6 +116,12 @@ class JacksonParser( } protected final def parseVariant(parser: JsonParser): VariantVal = { +// Skips `FIELD_NAME` at the beginning. This check is adapted from `parseJsonToken`, but we +// cannot directly use the function here because it also handles the `VALUE_NULL` token and +// returns null (representing a SQL NULL). Instead, we want to return a variant null. +if (parser.getCurrentToken == FIELD_NAME) { + parser.nextToken() +} try { val v = VariantBuilder.parseJson(parser) new VariantVal(v.getValue, v.getMetadata) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala index d2d12920b68a..0dd9d35f9b4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala @@ -338,6 +338,24 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } + test("json scan with map schema") { +withTempDir { dir => + val file = new File(dir, "file.json") + val content = Seq( +"true", +"""{"v": null}""", +"""{"v": {"a": 1, "b": null}}""" + ).mkString("\n").getBytes(StandardCharsets.UTF_8) + Files.write(file.toPath, content) + checkAnswer( +spark.read.format("json").schema("v map") + .load(file.getAbsolutePath) + .selectExpr("to_json(v)"), +Seq(Row(null), Row(null), 
Row("""{"a":1,"b":null}""")) + ) +} + } + test("group/order/join variant are disabled") { var ex = intercept[AnalysisException] { spark.sql("select parse_json('') group by 1") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47360][SQL] Collation support: Overlay, FormatString, Length, BitLength, OctetLength, SoundEx, Luhncheck
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f99b1b1dd14d [SPARK-47360][SQL] Collation support: Overlay, FormatString, Length, BitLength, OctetLength, SoundEx, Luhncheck f99b1b1dd14d is described below commit f99b1b1dd14d10c5a7d5a851b4add64c10ce62f6 Author: Nikola Mandic AuthorDate: Thu Apr 18 00:55:40 2024 +0800 [SPARK-47360][SQL] Collation support: Overlay, FormatString, Length, BitLength, OctetLength, SoundEx, Luhncheck ### What changes were proposed in this pull request? Add collation support for batch of string expressions mentioned in PR title. ### Why are the changes needed? Add collations support in string functions. ### Does this PR introduce _any_ user-facing change? Yes, it changes behavior of string functions when string parameters have collation. ### How was this patch tested? Added checks to `CollationStringExpressionsSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46003 from nikolamand-db/SPARK-47360. 
Authored-by: Nikola Mandic Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/CollationTypeCasts.scala | 6 +- .../spark/sql/catalyst/encoders/EncoderUtils.scala | 1 + .../catalyst/expressions/stringExpressions.scala | 38 --- .../sql/CollationStringExpressionsSuite.scala | 119 + 4 files changed, 146 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index 795e8a696b01..cffdd2872224 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least} +import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least, Overlay} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @@ -48,6 +48,10 @@ object CollationTypeCasts extends TypeCoercionRule { case eltExpr: Elt => eltExpr.withNewChildren(eltExpr.children.head +: collateToSingleType(eltExpr.children.tail)) +case overlay: Overlay => + overlay.withNewChildren(collateToSingleType(Seq(overlay.input, overlay.replace)) +++ Seq(overlay.pos, overlay.len)) + case otherExpr @ ( _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least | _: Coalesce | _: 
BinaryExpression | _: ConcatWs) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala index 20f86a32c1a1..81743251bada 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala @@ -77,6 +77,7 @@ object EncoderUtils { case _: DecimalType => classOf[Decimal] case _: DayTimeIntervalType => classOf[PhysicalLongType.InternalType] case _: YearMonthIntervalType => classOf[PhysicalIntegerType.InternalType] + case _: StringType => classOf[UTF8String] case _: StructType => classOf[InternalRow] case _: ArrayType => classOf[ArrayData] case _: MapType => classOf[MapData] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b3029302c03d..3c9888940221 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -804,8 +804,9 @@ case class Overlay(input: Expression, replace: Expression, pos: Expression, len: override def dataType: DataType = input.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(TypeCol
(spark) branch master updated: [SPARK-47416][SQL] Add new functions to CollationBenchmark
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1b65b2aebf4e [SPARK-47416][SQL] Add new functions to CollationBenchmark 1b65b2aebf4e is described below commit 1b65b2aebf4eacb005629f26a019cef66c454710 Author: Vladimir Golubev AuthorDate: Wed Apr 17 23:25:34 2024 +0800 [SPARK-47416][SQL] Add new functions to CollationBenchmark ### What changes were proposed in this pull request? Added new benchmarks for contains, startsWith, endsWith prior to improving the implementation for the UTF8_BINARY_LCASE collation. ### Why are the changes needed? To see exact improvements after the implementation of https://issues.apache.org/jira/browse/SPARK-47418 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21 ### Was this patch authored or co-authored using generative AI tooling? No Closes #46078 from vladimirg-db/vladimirg-db/add-new-string-functions-to-collation-bencmark. 
Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../CollationBenchmark-jdk21-results.txt | 27 sql/core/benchmarks/CollationBenchmark-results.txt | 27 .../CollationNonASCIIBenchmark-jdk21-results.txt | 27 .../CollationNonASCIIBenchmark-results.txt | 27 .../execution/benchmark/CollationBenchmark.scala | 141 + 5 files changed, 224 insertions(+), 25 deletions(-) diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index 32cbbc74e911..24605e051dbb 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -25,3 +25,30 @@ UNICODE 180133 180137 UTF8_BINARY 10476 10477 1 0.0 104757.4 1.1X UNICODE_CI 148171 148190 28 0.0 1481705.6 0.1X +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative + +UTF8_BINARY_LCASE 49257 49280 32 0.0 492574.0 1.0X +UNICODE 18253 18293 57 0.0 182530.8 2.7X +UTF8_BINARY 20199 20247 68 0.0 201987.8 2.4X +UNICODE_CI 882302 882576 387 0.0 8823023.9 0.1X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative + +UTF8_BINARY_LCASE 45015 45024 13 0.0 450153.7 1.0X +UNICODE 17425 17455 43 0.0 174247.1 2.6X +UTF8_BINARY 19237 19268 44 0.0 192371.4 2.3X +UNICODE_CI 954993 955680 971 0.0 9549930.3 0.0X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative + +UTF8_BINARY_LCASE 45919 45966 67 0.0 459187.0 1.0X +UNICODE 17697 17713 23 0.0 176970.4 2.6X +UTF8_BINARY 19448 19449 2 0.0 194479.6 2.4X +UNICODE_CI 962916 963010 133 0.0 9629158.5 0.0X + 
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index 4028b0f005a3
(spark) branch master updated: [SPARK-47765][SQL][FOLLOWUP] Disable SET COLLATION when collations are disabled
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e2e6710fc8ea [SPARK-47765][SQL][FOLLOWUP] Disable SET COLLATION when collations are disabled e2e6710fc8ea is described below commit e2e6710fc8ea8dde90d0b36d969ce2900a732205 Author: Mihailo Milosevic AuthorDate: Wed Apr 17 22:24:57 2024 +0800 [SPARK-47765][SQL][FOLLOWUP] Disable SET COLLATION when collations are disabled ### What changes were proposed in this pull request? Disable SET COLLATION when collations are diabled. ### Why are the changes needed? We do not want users to use syntax that is not supported when collations are disabled. ### Does this PR introduce _any_ user-facing change? Yes, blocks users from using SET COLLATION. ### How was this patch tested? Test in `SQLConfSuite` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46103 from mihailom-db/FOLLOWUP-SPARK-47765. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/execution/SparkSqlParser.scala | 3 +++ .../test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala | 8 2 files changed, 11 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 8192be269993..c7f5e41b7879 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -150,6 +150,9 @@ class SparkSqlAstBuilder extends AstBuilder { * }}} */ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { +if (!SQLConf.get.collationEnabled) { + throw QueryCompilationErrors.collationNotEnabledError() +} val key = SQLConf.DEFAULT_COLLATION.key SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 18a06e83c076..213dfd32c869 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -521,6 +521,14 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { "confName" -> "spark.sql.session.collation.default", "proposal" -> "UNICODE_CI" )) + +withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") { + checkError( +exception = intercept[AnalysisException](sql(s"SET COLLATION UNICODE_CI")), +errorClass = "UNSUPPORTED_FEATURE.COLLATION", +parameters = Map.empty + ) +} } test("SPARK-43028: config not found error") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (e327c1220abf -> 1b53f122d56c)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from e327c1220abf [SPARK-47884][INFRA] Switch ANSI SQL CI job to NON-ANSI SQL CI job add 1b53f122d56c [SPARK-47863][SQL] Fix startsWith & endsWith collation-aware implementation for ICU No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/util/CollationSupport.java | 23 +- .../spark/unsafe/types/CollationSupportSuite.java | 11 +++ 2 files changed, 20 insertions(+), 14 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47822][SQL] Prohibit Hash Expressions from hashing the Variant Data Type
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4e754f778fdc [SPARK-47822][SQL] Prohibit Hash Expressions from hashing the Variant Data Type 4e754f778fdc is described below commit 4e754f778fdc9628bc8af671553f2d85ce8ac32d Author: Harsh Motwani AuthorDate: Wed Apr 17 15:57:17 2024 +0800 [SPARK-47822][SQL] Prohibit Hash Expressions from hashing the Variant Data Type ### What changes were proposed in this pull request? I am prohibiting hash functions from hashing VariantType elements. ### Why are the changes needed? Hashing hasn't been formally implemented on VariantType elements so the current implementation crashes during execution. ### Does this PR introduce _any_ user-facing change? Earlier, when trying to hash Variant data, Spark would crash during execution. Now, the query itself wouldn't compile. ### How was this patch tested? Additional unit test in ExpressionTypeCheckingSuite ### Was this patch authored or co-authored using generative AI tooling? No Closes #46017 from harshmotw-db/hash_variant. Authored-by: Harsh Motwani Signed-off-by: Wenchen Fan --- common/utils/src/main/resources/error/error-conditions.json | 5 + docs/sql-error-conditions-datatype-mismatch-error-class.md| 4 .../org/apache/spark/sql/catalyst/expressions/hash.scala | 8 .../sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala | 11 +++ 4 files changed, 28 insertions(+) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 9636ddbf73bc..54415f80fee0 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -736,6 +736,11 @@ "Input to the function cannot contain elements of the \"MAP\" type. 
In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"." ] }, + "HASH_VARIANT_TYPE" : { +"message" : [ + "Input to the function cannot contain elements of the \"VARIANT\" type yet." +] + }, "INPUT_SIZE_NOT_ONE" : { "message" : [ "Length of should be 1." diff --git a/docs/sql-error-conditions-datatype-mismatch-error-class.md b/docs/sql-error-conditions-datatype-mismatch-error-class.md index 1d18836ac9e7..971319e3e0fe 100644 --- a/docs/sql-error-conditions-datatype-mismatch-error-class.md +++ b/docs/sql-error-conditions-datatype-mismatch-error-class.md @@ -100,6 +100,10 @@ Filter expression `` of type `` is not a boolean. Input to the function `` cannot contain elements of the "MAP" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on "MAP" elements. To restore previous behavior set "spark.sql.legacy.allowHashOnMapType" to "true". +## HASH_VARIANT_TYPE + +Input to the function `` cannot contain elements of the "VARIANT" type yet. + ## INPUT_SIZE_NOT_ONE Length of `` should be 1. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 436efa892416..5089cea136a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -271,6 +271,10 @@ abstract class HashExpression[E] extends Expression { dt.existsRecursively(_.isInstanceOf[MapType]) } + private def hasVariantType(dt: DataType): Boolean = { +dt.existsRecursively(_.isInstanceOf[VariantType]) + } + override def checkInputDataTypes(): TypeCheckResult = { if (children.length < 1) { throw QueryCompilationErrors.wrongNumArgsError( @@ -281,6 +285,10 @@ abstract class HashExpression[E] extends Expression { DataTypeMismatch( errorSubClass = "HASH_MAP_TYPE", messageParameters = Map("functionName" -> toSQLId(prettyName))) +} else if (children.exists(child => hasVariantType(child.dataType))) { + DataTypeMismatch( +errorSubClass = "HASH_VARIANT_TYPE", +messageParameters = Map("functionName" -> toSQLId(prettyName))) } else { TypeCheckResult.TypeCheckSuccess } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/cat
(spark) branch master updated (4913c344756d -> 76882d0c13db)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 4913c344756d [SPARK-47867][SQL] Support variant in JSON scan add 76882d0c13db [SPARK-47821][SQL] Implement is_variant_null expression No new revisions were added by this update. Summary of changes: .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../variant/VariantExpressionEvalUtils.scala | 11 ++ .../expressions/variant/variantExpressions.scala | 41 ++ .../variant/VariantExpressionEvalUtilsSuite.scala | 38 .../sql-functions/sql-expression-schema.md | 1 + .../apache/spark/sql/VariantEndToEndSuite.scala| 17 + 6 files changed, 109 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47867][SQL] Support variant in JSON scan
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4913c344756d [SPARK-47867][SQL] Support variant in JSON scan 4913c344756d is described below commit 4913c344756df8c0e382eea762386c1ec797d7f4 Author: Chenhao Li AuthorDate: Wed Apr 17 15:47:25 2024 +0800 [SPARK-47867][SQL] Support variant in JSON scan ### What changes were proposed in this pull request? This PR adds support for the variant type in the JSON scan. As part of this PR we introduce one new JSON option: `spark.read.format("json").option("singleVariantColumn", "colName")`. Setting this option specifies that each JSON document should be ingested into a single variant column called `colName`. When this option is used, the user must not specify a schema, and the schema is inferred as `colName variant`. ### Example 1 (multiple variant fields) JSON files can be ingested into variant fields, e.g. ``` spark.read.format("json").schema("i int, var variant, arr ARRAY").load("a.json").show(false) ``` for a file with the following data: ``` {"i": 1, "var": {"d": "+94875-04-12", "string":"string1","int":1,"array":[1,2,3],"dict": {"key": "value1"}}, "arr": [{"a": 1}, {"b": 2}, {"c": 3, "d": [1, 2, 3]}]} {"i": 2, "var": {"string":"string2","int":2,"array":[2,4,6],"dict": {"key": "value2"}}} {} {"i": 3} ``` ### Example 2 (one variant field) Here's another example with a single variant field: ``` spark.read.format("json").schema("var variant").load("a.json").show(false) ``` for a file with the following data: ``` {"var": {"d": "+94875-04-12", "string":"string1","int":1,"array":[1,2,3],"dict": {"key": "value1"}}} {"var": {"string":"string2","int":2,"array":[2,4,6],"dict": {"key": "value2"}}} {} ``` ### Example 3 (singleVariantColumn option) Each JSON document can also be ingested into a single variant column, e.g. 
``` spark.read.format("json").option("singleVariantColumn", "var").load("a.json").show(false) ``` for a file with the following data: ``` {"i": 1, "var": {"d": "+94875-04-12", "string":"string1","int":1,"array":[1,2,3],"dict": {"key": "value1"}}, "arr": [{"a": 1}, {"b": 2}, {"c": 3, "d": [1, 2, 3]}]} {"i": 2, "var": {"string":"string2","int":2,"array":[2,4,6],"dict": {"key": "value2"}}} {} {"i": 3} ``` ### Why are the changes needed? It allows Spark to ingest variant values directly from the JSON data source. Previously, the `parse_json` expression can only operate on a string column that is already in an existing table. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? Unit tests that verify the result and error reporting in JSON scan. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46071 from chenhao-db/json_scan_variant. Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../src/main/resources/error/error-conditions.json | 6 ++ docs/sql-error-conditions.md | 6 ++ .../spark/sql/catalyst/json/JSONOptions.scala | 6 ++ .../spark/sql/catalyst/json/JacksonParser.scala| 18 +- .../spark/sql/errors/QueryCompilationErrors.scala | 6 ++ .../org/apache/spark/sql/DataFrameReader.scala | 14 + .../datasources/json/JsonFileFormat.scala | 8 ++- .../scala/org/apache/spark/sql/VariantSuite.scala | 66 ++ .../sql/execution/datasources/json/JsonSuite.scala | 3 +- 9 files changed, 128 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e1c8c881f98f..9636ddbf73bc 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/er
(spark) branch master updated: [SPARK-47417][SQL] Collation support: Ascii, Chr, Base64, UnBase64, Decode, StringDecode, Encode, ToBinary, FormatNumber, Sentences
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ee2673f2e948 [SPARK-47417][SQL] Collation support: Ascii, Chr, Base64, UnBase64, Decode, StringDecode, Encode, ToBinary, FormatNumber, Sentences ee2673f2e948 is described below commit ee2673f2e94811022f6a3d9a03ad119f7a8e5d65 Author: Nikola Mandic AuthorDate: Tue Apr 16 23:09:23 2024 +0800 [SPARK-47417][SQL] Collation support: Ascii, Chr, Base64, UnBase64, Decode, StringDecode, Encode, ToBinary, FormatNumber, Sentences ### What changes were proposed in this pull request? `Chr` and `Base64` are skipped as they don't accept input string types and don't need to be updated. Other functions are updated to accept collated strings as inputs. ### Why are the changes needed? Add collations support in string functions. ### Does this PR introduce _any_ user-facing change? Yes, it changes behavior of string functions when string parameters have collation. ### How was this patch tested? Add checks to `CollationStringExpressionsSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45933 from nikolamand-db/SPARK-47417-47418-47420. 
Authored-by: Nikola Mandic Signed-off-by: Wenchen Fan --- .../catalyst/expressions/stringExpressions.scala | 40 +-- .../expressions/StringExpressionsSuite.scala | 3 +- .../sql/CollationStringExpressionsSuite.scala | 77 +- 3 files changed, 99 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 4fe57b4f8f02..b3029302c03d 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2352,7 +2352,7 @@ case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = IntegerType - override def inputTypes: Seq[DataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) protected override def nullSafeEval(string: Any): Any = { // only pick the first character to reduce the `toString` cost @@ -2398,7 +2398,7 @@ case class Ascii(child: Expression) case class Chr(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(LongType) protected override def nullSafeEval(lon: Any): Any = { @@ -2447,7 +2447,7 @@ case class Chr(child: Expression) case class Base64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(BinaryType) protected override def nullSafeEval(bytes: Any): Any = { @@ -2480,7 +2480,7 @@ case class UnBase64(child: Expression, 
failOnError: Boolean = false) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = BinaryType - override def inputTypes: Seq[DataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) def this(expr: Expression) = this(expr, false) @@ -2672,8 +2672,8 @@ case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Bo override def left: Expression = bin override def right: Expression = charset - override def dataType: DataType = StringType - override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType) + override def dataType: DataType = SQLConf.get.defaultStringType + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation) private val supportedCharsets = Set( "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") @@ -2750,7 +2750,8 @@ case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) override def left: Expression = str override def right: Expression = charset override def dataType: DataType = BinaryType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) private val supportedCharsets = S
(spark) branch master updated: [SPARK-47356][SQL] Add support for ConcatWs & Elt (all collations)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4dad2170b05c [SPARK-47356][SQL] Add support for ConcatWs & Elt (all collations) 4dad2170b05c is described below commit 4dad2170b05c04faf1da550ab3fb8c52a61b8be7 Author: Mihailo Milosevic AuthorDate: Tue Apr 16 21:21:24 2024 +0800 [SPARK-47356][SQL] Add support for ConcatWs & Elt (all collations) ### What changes were proposed in this pull request? Addition of support for ConcatWs and Elt expressions. ### Why are the changes needed? We need to enable these functions to support collations in order to scope all functions. ### Does this PR introduce _any_ user-facing change? Yes, both expressions will no longer return an error when called with collated strings. ### How was this patch tested? Addition of tests to `CollationStringExpressionsSuite` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46061 from mihailom-db/SPARK-47356. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/CollationTypeCasts.scala | 5 ++- .../catalyst/expressions/stringExpressions.scala | 25 ++-- .../sql/CollationStringExpressionsSuite.scala | 46 -- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index 1a14b4227de8..795e8a696b01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Expression, Greatest, If, In, InSubquery, Least} +import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @@ -45,6 +45,9 @@ object CollationTypeCasts extends TypeCoercionRule { caseWhenExpr.elseValue.map(e => castStringType(e, outputStringType).getOrElse(e)) CaseWhen(newBranches, newElseValue) +case eltExpr: Elt => + eltExpr.withNewChildren(eltExpr.children.head +: collateToSingleType(eltExpr.children.tail)) + case otherExpr @ ( _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least | _: Coalesce | _: BinaryExpression | _: ConcatWs) => diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 34e8f3f40859..4fe57b4f8f02 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO import org.apache.spark.sql.catalyst.util.{ArrayData, CollationSupport, GenericArrayData, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.StringTypeAnyCollation +import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.array.ByteArrayMethods @@ -79,11 +79,12 @@ case class ConcatWs(children: Seq[Expression]) /** The 1st child (separator) is str, and rest are either str or array of str. */ override def inputTypes: Seq[AbstractDataType] = { -val arrayOrStr = TypeCollection(ArrayType(StringType), StringType) -StringType +: Seq.fill(children.size - 1)(arrayOrStr) +val arrayOrStr = + TypeCollection(AbstractArrayType(StringTypeAnyCollation), StringTypeAnyCollation) +StringTypeAnyCollation +: Seq.fill(children.size - 1)(arrayOrStr) } - override def dataType: DataType = StringType + override def dataType: DataType = children.head.dataType
(spark) branch master updated: [SPARK-47420][SQL] Fix test output
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6762d1f8beaf [SPARK-47420][SQL] Fix test output 6762d1f8beaf is described below commit 6762d1f8beafc2ae3a406fa24e62e2b6f93dfdb7 Author: Vladimir Golubev AuthorDate: Tue Apr 16 13:50:47 2024 +0800 [SPARK-47420][SQL] Fix test output Make "AssertionFailedError: expected: but was: " to be rendered correctly ### What changes were proposed in this pull request? The assertion was being rendered the other way around ### Why are the changes needed? To avoid confusion during test checks ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? `testOnly org.apache.spark.unsafe.types.CollationSupportSuite` ### Was this patch authored or co-authored using generative AI tooling? No Closes #46058 from vladimirg-db/vladimirg-db/fix-test-output. Authored-by: Vladimir Golubev Signed-off-by: Wenchen Fan --- .../apache/spark/unsafe/types/CollationSupportSuite.java| 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index bfb696c35fff..099a13a025e7 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -30,12 +30,12 @@ public class CollationSupportSuite { * Collation-aware string expressions. 
*/ - private void assertContains(String pattern, String target, String collationName, boolean value) + private void assertContains(String pattern, String target, String collationName, boolean expected) throws SparkException { UTF8String l = UTF8String.fromString(pattern); UTF8String r = UTF8String.fromString(target); int collationId = CollationFactory.collationNameToId(collationName); -assertEquals(CollationSupport.Contains.exec(l, r, collationId), value); +assertEquals(expected, CollationSupport.Contains.exec(l, r, collationId)); } @Test @@ -103,12 +103,13 @@ public class CollationSupportSuite { assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); } - private void assertStartsWith(String pattern, String prefix, String collationName, boolean value) + private void assertStartsWith( + String pattern, String prefix, String collationName, boolean expected) throws SparkException { UTF8String l = UTF8String.fromString(pattern); UTF8String r = UTF8String.fromString(prefix); int collationId = CollationFactory.collationNameToId(collationName); -assertEquals(CollationSupport.StartsWith.exec(l, r, collationId), value); +assertEquals(expected, CollationSupport.StartsWith.exec(l, r, collationId)); } @Test @@ -176,12 +177,12 @@ public class CollationSupportSuite { assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); } - private void assertEndsWith(String pattern, String suffix, String collationName, boolean value) + private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected) throws SparkException { UTF8String l = UTF8String.fromString(pattern); UTF8String r = UTF8String.fromString(suffix); int collationId = CollationFactory.collationNameToId(collationName); -assertEquals(CollationSupport.EndsWith.exec(l, r, collationId), value); +assertEquals(expected, CollationSupport.EndsWith.exec(l, r, collationId)); } @Test - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47769][SQL] Add schema_of_variant_agg expression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 348b1bcff75c [SPARK-47769][SQL] Add schema_of_variant_agg expression 348b1bcff75c is described below commit 348b1bcff75cd6eb951c5792cfe8a65cbe8aba73 Author: Chenhao Li AuthorDate: Tue Apr 16 13:45:37 2024 +0800 [SPARK-47769][SQL] Add schema_of_variant_agg expression ### What changes were proposed in this pull request? This PR adds a new `schema_of_variant_agg` expression. It returns the merged schema in the SQL format of a variant column. Compared to `schema_of_variant`, which is a scalar expression and returns one schema for one row, the `schema_of_variant_agg` expression merges the schema of all rows. Usage examples: ``` > SELECT schema_of_variant_agg(parse_json(j)) FROM VALUES ('1'), ('2'), ('3') AS tab(j); BIGINT > SELECT schema_of_variant_agg(parse_json(j)) FROM VALUES ('{"a": 1}'), ('{"b": true}'), ('{"c": 1.23}') AS tab(j); STRUCT ``` ### Why are the changes needed? This expression can help the user explore the content of variant values. ### Does this PR introduce _any_ user-facing change? Yes. A new SQL expression is added. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45934 from chenhao-db/schema_of_variant_agg. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../expressions/variant/variantExpressions.scala | 66 ++ .../sql-functions/sql-expression-schema.md | 1 + .../apache/spark/sql/VariantEndToEndSuite.scala| 42 ++ 4 files changed, 110 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 9447ea63b51f..c56d04b570e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -825,6 +825,7 @@ object FunctionRegistry { expressionBuilder("variant_get", VariantGetExpressionBuilder), expressionBuilder("try_variant_get", TryVariantGetExpressionBuilder), expression[SchemaOfVariant]("schema_of_variant"), +expression[SchemaOfVariantAgg]("schema_of_variant_agg"), // cast expression[Cast]("cast"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala index 8b09bf5f7de0..cab75e1996ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala @@ -27,11 +27,13 @@ import org.apache.spark.sql.catalyst.analysis.ExpressionBuilder import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.expressions.codegen._ import 
org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.json.JsonInferSchema import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, VARIANT_GET} +import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, QueryExecutionErrors} @@ -615,3 +617,67 @@ object SchemaOfVariant { def mergeSchema(t1: DataType, t2: DataType): DataType = JsonInferSchema.compatibleType(t1, t2, VariantType) } + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(v) - Returns the merged schema in the SQL format of a variant column.", + examples = """ +Examples: + > SELECT _FUNC_(parse_json(j)) FROM VALUES ('1'), ('2'), ('3') AS tab(j); + BIGINT + > SELECT _FUNC_(parse_json(j)) FROM VALUES ('{"a": 1}'), ('{"b": true}'), ('{"c": 1.23}')
(spark) branch master updated: [SPARK-47463][SQL] Use V2Predicate to wrap expression with return type of boolean
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new fa3ef03a0734 [SPARK-47463][SQL] Use V2Predicate to wrap expression with return type of boolean fa3ef03a0734 is described below commit fa3ef03a073407966765544c936a9c65401e955a Author: Zhen Wang <643348...@qq.com> AuthorDate: Tue Apr 16 13:41:53 2024 +0800 [SPARK-47463][SQL] Use V2Predicate to wrap expression with return type of boolean ### What changes were proposed in this pull request? Use V2Predicate to wrap If expr when building v2 expressions. ### Why are the changes needed? The `PushFoldableIntoBranches` optimizer may fold predicate into (if / case) branches and `V2ExpressionBuilder` wraps `If` as `GeneralScalarExpression`, which causes the assertion in `PushablePredicate.unapply` to fail. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? added unit test ### Was this patch authored or co-authored using generative AI tooling? No Closes #45589 from wForget/SPARK-47463. 
Authored-by: Zhen Wang <643348...@qq.com> Signed-off-by: Wenchen Fan --- .../sql/catalyst/util/V2ExpressionBuilder.scala| 159 +++-- .../spark/sql/connector/DataSourceV2Suite.scala| 10 ++ 2 files changed, 97 insertions(+), 72 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala index 3942d193a328..398f21e01b80 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.connector.expressions.{Cast => V2Cast, Expression => import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Avg, Count, CountStar, GeneralAggregateFunc, Max, Min, Sum, UserDefinedAggregateFunc} import org.apache.spark.sql.connector.expressions.filter.{AlwaysFalse, AlwaysTrue, And => V2And, Not => V2Not, Or => V2Or, Predicate => V2Predicate} import org.apache.spark.sql.execution.datasources.PushableExpression -import org.apache.spark.sql.types.{BooleanType, IntegerType, StringType} +import org.apache.spark.sql.types.{BooleanType, DataType, IntegerType, StringType} /** * The builder to generate V2 expressions from catalyst expressions. 
@@ -98,45 +98,45 @@ class V2ExpressionBuilder(e: Expression, isPredicate: Boolean = false) { generateExpression(child).map(v => new V2Cast(v, dataType)) case AggregateExpression(aggregateFunction, Complete, isDistinct, None, _) => generateAggregateFunc(aggregateFunction, isDistinct) -case Abs(child, true) => generateExpressionWithName("ABS", Seq(child)) -case Coalesce(children) => generateExpressionWithName("COALESCE", children) -case Greatest(children) => generateExpressionWithName("GREATEST", children) -case Least(children) => generateExpressionWithName("LEAST", children) -case Rand(child, hideSeed) => +case Abs(_, true) => generateExpressionWithName("ABS", expr, isPredicate) +case _: Coalesce => generateExpressionWithName("COALESCE", expr, isPredicate) +case _: Greatest => generateExpressionWithName("GREATEST", expr, isPredicate) +case _: Least => generateExpressionWithName("LEAST", expr, isPredicate) +case Rand(_, hideSeed) => if (hideSeed) { Some(new GeneralScalarExpression("RAND", Array.empty[V2Expression])) } else { -generateExpressionWithName("RAND", Seq(child)) +generateExpressionWithName("RAND", expr, isPredicate) } -case log: Logarithm => generateExpressionWithName("LOG", log.children) -case Log10(child) => generateExpressionWithName("LOG10", Seq(child)) -case Log2(child) => generateExpressionWithName("LOG2", Seq(child)) -case Log(child) => generateExpressionWithName("LN", Seq(child)) -case Exp(child) => generateExpressionWithName("EXP", Seq(child)) -case pow: Pow => generateExpressionWithName("POWER", pow.children) -case Sqrt(child) => generateExpressionWithName("SQRT", Seq(child)) -case Floor(child) => generateExpressionWithName("FLOOR", Seq(child)) -case Ceil(child) => generateExpressionWithName("CEIL", Seq(child)) -case round: Round => generateExpressionWithName("ROUND", round.childre
(spark) branch master updated (be080703688f -> c5b8e60e0d59)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from be080703688f [SPARK-47673][SS] Implementing TTL for ListState add c5b8e60e0d59 [SPARK-46810][DOCS] Align error class terminology with SQL standard No new revisions were added by this update. Summary of changes: common/utils/src/main/resources/error/README.md| 147 +- .../src/main/resources/error/error-categories.json | 90 - .../src/main/resources/error/error-classes.json| 8235 +--- .../{error-classes.json => error-conditions.json} |0 .../org/apache/spark/SparkThrowableHelper.scala|5 +- ...or-classes.json => kafka-error-conditions.json} |0 .../spark/sql/kafka010/KafkaExceptions.scala |5 +- .../org/apache/spark/SparkThrowableSuite.scala | 24 +- 8 files changed, 212 insertions(+), 8294 deletions(-) delete mode 100644 common/utils/src/main/resources/error/error-categories.json copy common/utils/src/main/resources/error/{error-classes.json => error-conditions.json} (100%) rename connector/kafka-0-10-sql/src/main/resources/error/{kafka-error-classes.json => kafka-error-conditions.json} (100%) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (b8354bbe53c0 -> f3a6ca9e2c47)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from b8354bbe53c0 [SPARK-47851][CONNECT][DOCS] Document pyspark-connect package add f3a6ca9e2c47 [SPARK-47357][SQL] Add support for Upper, Lower, InitCap (all collations) No new revisions were added by this update. Summary of changes: .../catalyst/expressions/stringExpressions.scala | 4 +- .../sql/CollationStringExpressionsSuite.scala | 55 -- 2 files changed, 54 insertions(+), 5 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47803][SQL] Support cast to variant
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new dd4c7fc6ae91 [SPARK-47803][SQL] Support cast to variant dd4c7fc6ae91 is described below commit dd4c7fc6ae91ec1f2a348cf4633bff10717cde22 Author: Chenhao Li AuthorDate: Mon Apr 15 13:16:30 2024 +0800 [SPARK-47803][SQL] Support cast to variant ### What changes were proposed in this pull request? This PR allows casting another type into the variant type. The changes can be divided into two major parts: - The `VariantBuilder` class is greatly refactored. Many of its APIs are exposed so that Spark can use them to build a variant value without JSON parsing. - The actual implementation of the cast. ### Why are the changes needed? It provides a convenient way to build variant values from other Spark values. Before this PR, `parse_json` is the only SQL function that can produce variant values. If users want to do so, they may have to use `parse_json(to_json(input))`, which is inefficient and disallowed if the input has a scalar type. ### Does this PR introduce _any_ user-facing change? Yes. Casting to variant was previously not allowed but is now allowed. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45989 from chenhao-db/cast_to_variant. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../src/main/resources/error/error-classes.json| 6 + .../org/apache/spark/types/variant/Variant.java| 6 +- .../apache/spark/types/variant/VariantBuilder.java | 423 ++--- docs/sql-error-conditions.md | 6 + .../spark/sql/catalyst/expressions/Cast.scala | 7 + .../variant/VariantExpressionEvalUtils.scala | 71 +++- .../variant/VariantExpressionSuite.scala | 26 ++ 7 files changed, 406 insertions(+), 139 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 2a00edb9a4df..e1c8c881f98f 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -4725,6 +4725,12 @@ ], "sqlState" : "22023" }, + "VARIANT_DUPLICATE_KEY" : { +"message" : [ + "Failed to build variant because of a duplicate object key ``." +], +"sqlState" : "22023" + }, "VARIANT_SIZE_LIMIT" : { "message" : [ "Cannot build variant bigger than in .", diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java b/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java index 4aeb2c6e1435..a705daaf323b 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java @@ -41,12 +41,12 @@ import static org.apache.spark.types.variant.VariantUtil.*; * define a new class to avoid depending on or modifying Spark. */ public final class Variant { - private final byte[] value; - private final byte[] metadata; + final byte[] value; + final byte[] metadata; // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary // when reading a sub-variant in the array/object element. 
- private final int pos; + final int pos; public Variant(byte[] value, byte[] metadata) { this(value, metadata, 0); diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java index 21a12cbe9d71..ea7a7674baf5 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java @@ -17,6 +17,11 @@ package org.apache.spark.types.variant; +import org.apache.spark.QueryContext; +import org.apache.spark.SparkRuntimeException; +import scala.collection.immutable.Map; +import scala.collection.immutable.Map$; + import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; @@ -61,7 +66,7 @@ public class VariantBuilder { } // Build the variant metadata from `dictionaryKeys` and return the variant result. - private Variant result() { + public Variant result() { int numKeys = dictionaryKeys.size(); // Use long to avoid overflow in accumulating lengths. long dictionaryStringSize = 0; @@ -100,6
(spark) branch master updated: [SPARK-47765][SQL] Add SET COLLATION to parser rules
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new de00ac8a05ae [SPARK-47765][SQL] Add SET COLLATION to parser rules de00ac8a05ae is described below commit de00ac8a05aedb3a150c8c10f76d1fe5496b1df3 Author: Mihailo Milosevic AuthorDate: Fri Apr 12 22:25:06 2024 +0800 [SPARK-47765][SQL] Add SET COLLATION to parser rules ### What changes were proposed in this pull request? Addition of a new statement SET COLLATION collationName. ### Why are the changes needed? Requested by srielau in order to follow other principles for session level defaults (e.g. SET TIME ZONE). ### Does this PR introduce _any_ user-facing change? Users now can use SET COLLATION statement to change session level default collation. ### How was this patch tested? Test added to `CollationSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45946 from mihailom-db/SPARK-47765. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationFactory.java| 17 + .../src/main/resources/error/error-classes.json | 5 + .../apache/spark/internal/config/ConfigBuilder.scala | 4 ++-- ...rror-conditions-invalid-conf-value-error-class.md | 4 docs/sql-ref-ansi-compliance.md | 1 + .../apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 | 1 + .../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 ++ .../org/apache/spark/sql/internal/SQLConf.scala | 8 +++- .../resources/ansi-sql-2016-reserved-keywords.txt| 1 + .../apache/spark/sql/execution/SparkSqlParser.scala | 12 .../sql-tests/results/ansi/keywords.sql.out | 2 ++ .../resources/sql-tests/results/keywords.sql.out | 1 + .../org/apache/spark/sql/internal/SQLConfSuite.scala | 20 +++- .../ThriftServerWithSparkContextSuite.scala | 2 +- 14 files changed, 75 insertions(+), 5 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index ff7bc450f851..9786c559da44 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -202,6 +202,23 @@ public final class CollationFactory { return new StringSearch(pattern, target, (RuleBasedCollator) collator); } + /** + * Returns if the given collationName is valid one. 
+ */ + public static boolean isValidCollation(String collationName) { +return collationNameToIdMap.containsKey(collationName.toUpperCase()); + } + + /** + * Returns closest valid name to collationName + */ + public static String getClosestCollation(String collationName) { +Collation suggestion = Collections.min(List.of(collationTable), Comparator.comparingInt( +c -> UTF8String.fromString(c.collationName).levenshteinDistance( +UTF8String.fromString(collationName.toUpperCase(); +return suggestion.collationName; + } + /** * Returns a collation-unaware StringSearch object for the given pattern and target strings. * While this object does not respect collation, it can be used to find occurrences of the pattern diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 7b13fa4278e4..2a00edb9a4df 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1881,6 +1881,11 @@ "The value '' in the config \"\" is invalid." ], "subClass" : { + "DEFAULT_COLLATION" : { +"message" : [ + "Cannot resolve the given default collation. Did you mean ''?" +] + }, "TIME_ZONE" : { "message" : [ "Cannot resolve the given timezone." diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 303d856ca2c5..1f19e9444d38 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -117,12 +117,12 @@ private[spark] class TypedConfigBuilder[T]( def checkValue( validator: T => Boolean, errorClass: String, - parameters: Map[String, String]): TypedConfigBuilder[T] = { + parameters: T => Map[String, Strin
(spark) branch master updated: [SPARK-47800][SQL] Create new method for identifier to tableIdentifier conversion
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 27987536be38 [SPARK-47800][SQL] Create new method for identifier to tableIdentifier conversion 27987536be38 is described below commit 27987536be3810c3e61767d7abd33f3886411c5c Author: Uros Stankovic AuthorDate: Fri Apr 12 14:48:43 2024 +0800 [SPARK-47800][SQL] Create new method for identifier to tableIdentifier conversion Introducing new method for converting catalog identifier (since Spark 3.0.0, newer API) to table identifier (older API) ### Why are the changes needed? Code is cleaner and DataSourceV2Strategy is not responsible for conversion. New method can be reused also. Conversion is a little bit improved also (schema is not required anymore, it can miss) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? No test made, since it is minor refactoring ### Was this patch authored or co-authored using generative AI tooling? No Closes #45985 from urosstan-db/SPARK-47800-v2-Identifier-to-table-identifier-method. 
Authored-by: Uros Stankovic Signed-off-by: Wenchen Fan --- .../sql/connector/catalog/CatalogV2Implicits.scala | 20 .../datasources/v2/DataSourceV2Strategy.scala| 9 ++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala index bf4cd2eedc83..65bdae85be12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Implicits.scala @@ -164,6 +164,26 @@ private[sql] object CatalogV2Implicits { case _ => throw QueryCompilationErrors.identifierTooManyNamePartsError(original) } +/** + * Tries to convert catalog identifier to the table identifier. Table identifier does not + * support multiple namespaces (nested namespaces), so if identifier contains nested namespace, + * conversion cannot be done + * @param catalogName Catalog name. 
Identifier represents just one object in catalog, so it has + *no catalog name needed for table identifier creation + * @return Table identifier if conversion can be done, None otherwise + */ +def asTableIdentifierOpt(catalogName: Option[String]): Option[TableIdentifier] = { + ident.namespace().toImmutableArraySeq match { +case Seq(singleNamespace) => + Some(TableIdentifier(ident.name(), Some(singleNamespace), catalogName)) +case Seq() => + // If namespace is not given, catalog will not be used + Some(TableIdentifier(ident.name())) +case _ => + None + } +} + def asFunctionIdentifier: FunctionIdentifier = ident.namespace() match { case ns if ns.isEmpty => FunctionIdentifier(ident.name()) case Array(dbName) => FunctionIdentifier(ident.name(), Some(dbName)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index e7960f8b61ae..828d737f93fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -24,7 +24,6 @@ import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, Strategy} -import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{ResolvedIdentifier, ResolvedNamespace, ResolvedPartitionSpec, ResolvedTable} import org.apache.spark.sql.catalyst.catalog.CatalogUtils import org.apache.spark.sql.catalyst.expressions @@ -118,12 +117,8 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat val rdd = v1Relation.buildScan() val unsafeRowRDD = DataSourceStrategy.toCatalystRDD(v1Relation, output, rdd) - val tableIdentifier = v2Relation.identifier.map(_.asMultipartIdentifier) match { 
-case Some(Seq(schema, tableName)) => - Some(new TableIdentifier(tableName, Some(schema), v2Relation.catalog.map(_.name( -case _ => - None - } + val catalogName = v2Relation.catalog.map(_.name()) + val tableIdentifier = v2Relation.identifier.flatMap(_.asTableIdentifierOpt(catalogNa
(spark) branch master updated: [SPARK-47410][SQL] Refactor UTF8String and CollationFactory
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 31036274fc1c [SPARK-47410][SQL] Refactor UTF8String and CollationFactory 31036274fc1c is described below commit 31036274fc1c8013c6428735659959f46afea5d8 Author: Uros Bojanic <157381213+uros...@users.noreply.github.com> AuthorDate: Thu Apr 11 22:21:21 2024 +0800 [SPARK-47410][SQL] Refactor UTF8String and CollationFactory ### What changes were proposed in this pull request? This PR introduces comprehensive support for collation-aware expressions in Spark, focusing on improving code structure, clarity, and testing coverage for various expressions (including: Contains, StartsWith, EndsWith). ### Why are the changes needed? The changes are essential to improve the maintainability and readability of collation-related code in Spark expressions. By restructuring and centralizing collation support through, we simplify the addition of new collation-aware operations and ensure consistent testing across different collation types. ### Does this PR introduce _any_ user-facing change? No, this PR is focused on internal refactoring and testing enhancements for collation-aware expression support. ### How was this patch tested? Unit tests in CollationSupportSuite.java E2E tests in CollationStringExpressionsSuite.scala ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #45978 from uros-db/SPARK-47410. 
Authored-by: Uros Bojanic <157381213+uros...@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationFactory.java | 54 +- .../spark/sql/catalyst/util/CollationSupport.java | 174 ++ .../org/apache/spark/unsafe/types/UTF8String.java | 54 -- .../spark/unsafe/types/CollationSupportSuite.java | 266 + .../unsafe/types/UTF8StringWithCollationSuite.java | 103 .../expressions/codegen/CodeGenerator.scala| 3 +- .../catalyst/expressions/stringExpressions.scala | 41 +- .../sql/CollationRegexpExpressionsSuite.scala | 616 + .../sql/CollationStringExpressionsSuite.scala | 179 -- .../org/apache/spark/sql/CollationSuite.scala | 84 --- 10 files changed, 874 insertions(+), 700 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 72a6e574707f..ff7bc450f851 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -78,6 +78,14 @@ public final class CollationFactory { */ public final boolean supportsBinaryOrdering; +/** + * Support for Lowercase Equality implies that it is possible to check equality on + * byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments. + * This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark + * expressions, as this particular collation is not supported by the external ICU library. 
+ */ +public final boolean supportsLowercaseEquality; + public Collation( String collationName, Collator collator, @@ -85,7 +93,8 @@ public final class CollationFactory { String version, ToLongFunction hashFunction, boolean supportsBinaryEquality, -boolean supportsBinaryOrdering) { +boolean supportsBinaryOrdering, +boolean supportsLowercaseEquality) { this.collationName = collationName; this.collator = collator; this.comparator = comparator; @@ -93,9 +102,12 @@ public final class CollationFactory { this.hashFunction = hashFunction; this.supportsBinaryEquality = supportsBinaryEquality; this.supportsBinaryOrdering = supportsBinaryOrdering; + this.supportsLowercaseEquality = supportsLowercaseEquality; // De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality assert(!supportsBinaryOrdering || supportsBinaryEquality); + // No Collation can simultaneously support binary equality and lowercase equality + assert(!supportsBinaryEquality || !supportsLowercaseEquality); if (supportsBinaryEquality) { this.equalsFunction = UTF8String::equals; @@ -112,7 +124,8 @@ public final class CollationFactory { Collator collator, String version, boolean supportsBinaryEquality, -boolean supportsBinaryOrdering) { +boolean supportsBinaryOrdering, +boolean supportsLowercaseEquality) { this( collati
(spark) branch master updated: [SPARK-47617][SQL] Add TPC-DS testing infrastructure for collations
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6e371e1df50e [SPARK-47617][SQL] Add TPC-DS testing infrastructure for collations 6e371e1df50e is described below commit 6e371e1df50e35d807065015525772c3c02a5995 Author: Nikola Mandic AuthorDate: Thu Apr 11 21:08:17 2024 +0800 [SPARK-47617][SQL] Add TPC-DS testing infrastructure for collations ### What changes were proposed in this pull request? We can utilize TPC-DS testing infrastructure already present in Spark. The idea is to vary TPC-DS table string columns by adding multiple collations with different ordering rules and case sensitivity, producing new tables. These tables should yield the same results against predefined TPC-DS queries for certain batches of collations. For example, when comparing query runs on table where columns are first collated as `UTF8_BINARY` and then as `UTF8_BINARY_LCASE`, we should be getting sa [...] Introduce new query suite which tests the described behavior with available collations (utf8_binary and unicode) combined with case conversions (lowercase, uppercase, randomized case for fuzzy testing). ### Why are the changes needed? Improve collations testing coverage. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added TPC-DS collations query suite. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45739 from nikolamand-db/SPARK-47617. 
Lead-authored-by: Nikola Mandic Co-authored-by: Stefan Kandic Signed-off-by: Wenchen Fan --- .github/workflows/build_and_test.yml | 3 + .../scala/org/apache/spark/sql/TPCDSBase.scala | 2 +- .../spark/sql/TPCDSCollationQueryTestSuite.scala | 262 + .../scala/org/apache/spark/sql/TPCDSSchema.scala | 3 +- 4 files changed, 268 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e505be7d4d98..832826333f09 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -937,6 +937,9 @@ jobs: SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.forceApplyShuffledHashJoin=true +- name: Run TPC-DS queries on collated data + run: | +SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSBase.scala index b6d46d279f4c..d4b70ae0d478 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSBase.scala @@ -34,7 +34,7 @@ trait TPCDSBase extends TPCBase with TPCDSSchema { "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") - protected val excludedTpcdsQueries: Set[String] = if (regenerateGoldenFiles) { + protected def excludedTpcdsQueries: Set[String] = if (regenerateGoldenFiles) { Set() } else { // Since `tpcdsQueriesV2_7_0` has almost the same queries with these ones below, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala new file mode 100644 index ..a84dd9645bcc --- /dev/null +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language go
(spark) branch master updated: [SPARK-47736][SQL] Add support for AbstractArrayType
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a8b919f924db [SPARK-47736][SQL] Add support for AbstractArrayType a8b919f924db is described below commit a8b919f924db1e2818b2b0de49762292ae20c17c Author: Mihailo Milosevic AuthorDate: Thu Apr 11 15:39:42 2024 +0800 [SPARK-47736][SQL] Add support for AbstractArrayType ### What changes were proposed in this pull request? Addition of abstract arraytype which accepts StringTypeCollated as elementType. Changes in this PR https://github.com/apache/spark/pull/45693 work for ArrayJoin, but will not work in general for other functions. This PR introduces a change to give an interface for all functions. Merge only after #45693. ### Why are the changes needed? This is needed in order to enable functions to use collated arrays. ### Does this PR introduce _any_ user-facing change? Yes, collation functions will work. ### How was this patch tested? Test for array_join added to `CollationSuite` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45891 from mihailom-db/SPARK-47736. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../sql/internal/types/AbstractArrayType.scala | 37 ++ .../sql/internal/types/AbstractStringType.scala} | 10 +++--- .../sql/catalyst/analysis/AnsiTypeCoercion.scala | 3 +- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 6 +++- .../expressions/collationExpressions.scala | 1 + .../expressions/collectionOperations.scala | 5 +-- .../catalyst/expressions/stringExpressions.scala | 1 + .../org/apache/spark/sql/CollationSuite.scala | 3 ++ 8 files changed, 57 insertions(+), 9 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractArrayType.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractArrayType.scala new file mode 100644 index ..406449a33727 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractArrayType.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.internal.types + +import org.apache.spark.sql.types.{AbstractDataType, ArrayType, DataType} + + +/** + * Use AbstractArrayType(AbstractDataType) for defining expected types for expression parameters. 
+ */ +case class AbstractArrayType(elementType: AbstractDataType) extends AbstractDataType { + + override private[sql] def defaultConcreteType: DataType = +ArrayType(elementType.defaultConcreteType, containsNull = true) + + override private[sql] def acceptsType(other: DataType): Boolean = { +other.isInstanceOf[ArrayType] && + elementType.acceptsType(other.asInstanceOf[ArrayType].elementType) + } + + override private[spark] def simpleString: String = s"array<${elementType.simpleString}>" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala similarity index 86% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala rename to sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala index 67b65859e6bb..6403295fe20c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala @@ -15,14 +15,14 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.expressions +package org.apache.spark.sql.internal.types import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} /** * StringTypeCollated is an abstract class for StringType with collation support. */ -abstract class StringTypeCollated extends AbstractDataType
(spark) branch master updated: [SPARK-47001][SQL] Pushdown verification in optimizer
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d0605bf3bf7b [SPARK-47001][SQL] Pushdown verification in optimizer d0605bf3bf7b is described below commit d0605bf3bf7baf4e00924923cee70f729f3aa635 Author: Holden Karau AuthorDate: Thu Apr 11 10:38:39 2024 +0800 [SPARK-47001][SQL] Pushdown verification in optimizer ### What changes were proposed in this pull request? Changes how we evaluate & candidate elements for filter pushdown past unions. ### Why are the changes needed? Unions type promotion combined with a reference to the head child dataframe can result in errors. ### Does this PR introduce _any_ user-facing change? Yes: slightly more filters will be pushed down (these would have previously thrown an exception). ### How was this patch tested? New test added. ### Was this patch authored or co-authored using generative AI tooling? No Closes #45146 from holdenk/SPARK-47001-pushdown-verification-in-optimizer. Authored-by: Holden Karau Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 10 +++-- .../catalyst/optimizer/FilterPushdownSuite.scala | 26 +- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 3a4002127df1..cacde9f5a712 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1824,22 +1824,28 @@ object PushPredicateThroughNonJoin extends Rule[LogicalPlan] with PredicateHelpe if (pushDown.nonEmpty) { val pushDownCond = pushDown.reduceLeft(And) +// The union is the child of the filter so it's children are grandchildren. 
+// Moves filters down to the grandchild if there is an element in the grand child's +// output which is semantically equal to the filter being evaluated. val output = union.output val newGrandChildren = union.children.map { grandchild => val newCond = pushDownCond transform { -case e if output.exists(_.semanticEquals(e)) => - grandchild.output(output.indexWhere(_.semanticEquals(e))) +case a: Attribute if output.exists(_.exprId == a.exprId) => + grandchild.output(output.indexWhere(_.exprId == a.exprId)) } assert(newCond.references.subsetOf(grandchild.outputSet)) Filter(newCond, grandchild) } val newUnion = union.withNewChildren(newGrandChildren) if (stayUp.nonEmpty) { + // If there is any filter we can't push evaluate them post union Filter(stayUp.reduceLeft(And), newUnion) } else { + // If we pushed all filters then just return the new union. newUnion } } else { +// If we can't push anything just return the initial filter. filter } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index bd2ac28a049f..03e65412d166 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.types.{IntegerType, StringType} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval class FilterPushdownSuite extends PlanTest { @@ -882,6 +882,30 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("union filter pushdown w/reference to grand-child field") { +val nonNullableArray = 
StructField("a", ArrayType(IntegerType, false)) +val bField = StructField("b", IntegerType) +val testRelationNonNull = LocalRelation(nonNullableArray, bField) +val testRelationNull = LocalRelation($"c".array(IntegerType), $"d".int) + +val nonNullArrayRef = AttributeReference("a", ArrayType(IntegerType, false))( + testRelationNonNull.output(0).exprId, List()) + + +val originalQuery = Union(Seq(testRelationNonNull, testRelati
(spark) branch master updated: [SPARK-47274][PYTHON][SQL] Provide more useful context for PySpark DataFrame API errors
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 86ae0d2bc198 [SPARK-47274][PYTHON][SQL] Provide more useful context for PySpark DataFrame API errors 86ae0d2bc198 is described below commit 86ae0d2bc19832f5bf5d872491cdede800427691 Author: Haejoon Lee AuthorDate: Thu Apr 11 09:41:31 2024 +0800 [SPARK-47274][PYTHON][SQL] Provide more useful context for PySpark DataFrame API errors ### What changes were proposed in this pull request? This PR introduces an enhancement to the error messages generated by PySpark's DataFrame API, adding detailed context about the location within the user's PySpark code where the error occurred. This directly adds a PySpark user call site information into `DataFrameQueryContext` added from https://github.com/apache/spark/pull/43334, aiming to provide PySpark users with the same level of detailed error context for better usability and debugging efficiency for DataFrame APIs. This PR also introduces `QueryContext.pysparkCallSite` and `QueryContext.pysparkFragment` to get a PySpark information from the query context easily. This PR also enhances the functionality of `check_error` so that it can test the query context if it exists. ### Why are the changes needed? To improve a debuggability. Errors originating from PySpark operations can be difficult to debug with limited context in the error messages. While improvements on the JVM side have been made to offer detailed error contexts, PySpark errors often lack this level of detail. ### Does this PR introduce _any_ user-facing change? No API changes, but error messages will include a reference to the exact line of user code that triggered the error, in addition to the existing descriptive error message. 
For example, consider the following PySpark code snippet that triggers a `DIVIDE_BY_ZERO` error: ```python 1 spark.conf.set("spark.sql.ansi.enabled", True) 2 3 df = spark.range(10) 4 df.select(df.id / 0).show() ``` **Before:** ``` pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012 == DataFrame == "divide" was called from java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ``` **After:** ``` pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012 == DataFrame == "divide" was called from /.../spark/python/test_pyspark_error.py:4 ``` Now the error message points out the exact problematic code path with file name and line number that user writes. ## Points to the actual problem site instead of the site where the action was called Even when action calling after multiple transform operations are mixed, the exact problematic site can be provided to the user: **In:** ```python 1 spark.conf.set("spark.sql.ansi.enabled", True) 2 df = spark.range(10) 3 4 df1 = df.withColumn("div_ten", df.id / 10) 5 df2 = df1.withColumn("plus_four", df.id + 4) 6 7 # This is problematic divide operation that occurs DIVIDE_BY_ZERO. 8 df3 = df2.withColumn("div_zero", df.id / 0) 9 df4 = df3.withColumn("minus_five", df.id / 5) 10 11 df4.collect() ``` **Out:** ``` pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. 
SQLSTATE: 22012 == DataFrame == "divide" was called from /.../spark/python/test_pyspark_error.py:8 ``` ### How was this patch tested? Added UTs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45377 from itholic/error_context_for_dataframe_api. Authored-by: Haejoon Lee Signed-off-by: Wenchen Fan --- python/pyspark/errors/exceptions/captured.py | 8 + python/pyspark/sql/column.py | 37 +- .../sql/tests/connect/test_parity_dataframe.py | 4 + python/pyspark/sql/tests/test_dataframe.py | 485 +
(spark) branch master updated (8da9a70f701a -> b637bb92a2f8)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 8da9a70f701a [SPARK-47725][INFRA][FOLLOW-UP] Do not run scheduled job in forked repository add b637bb92a2f8 [SPARK-47802][SQL] Revert (*) from meaning struct(*) back to meaning * No new revisions were added by this update. Summary of changes: docs/sql-migration-guide.md| 1 - .../spark/sql/catalyst/parser/SqlBaseParser.g4 | 7 +-- .../spark/sql/catalyst/parser/AstBuilder.scala | 16 + .../org/apache/spark/sql/internal/SQLConf.scala| 13 .../analyzer-results/selectExcept.sql.out | 72 +- .../resources/sql-tests/inputs/selectExcept.sql| 21 +-- .../sql-tests/results/selectExcept.sql.out | 68 +--- 7 files changed, 8 insertions(+), 190 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 627f6082edca [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation 627f6082edca is described below commit 627f6082edca0507439f0c736e179caf55e6a01d Author: Nikola Mandic AuthorDate: Wed Apr 10 23:23:32 2024 +0800 [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation ### What changes were proposed in this pull request? Current collation [benchmarks](https://github.com/apache/spark/blob/e9f204ae93061a862e4da52c128eaf3512a66c7b/sql/core/benchmarks/CollationBenchmark-results.txt) indicate that `UTF8_BINARY_LCASE` collation comparisons are order of magnitude slower (~7-10x) than plain binary comparisons. Improve the performance by optimizing lowercase comparison function for `UTF8String` instances instead of performing full lowercase conversion before binary comparison. Optimization is based on similar method used in `toLowerCase` where we check character by character if conversion is valid under ASCII and fallback to slow comparison of native strings. In latter case, we only take into consideration suffixes that are left to compare. Benchmarks from `CollationBenchmark` ran locally show substantial performance increase: ``` [info] collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative [info] -- [info] UTF8_BINARY_LCASE7199 7209 14 0.0 71988.8 1.0X [info] UNICODE 3925 3929 5 0.0 39250.4 1.8X [info] UTF8_BINARY 3935 3950 21 0.0 39351.2 1.8X [info] UNICODE_CI 45248 514048706 0.0 452484.7 0.2X ``` ### Why are the changes needed? To improve performance of comparisons of strings under UTF8_BINARY_LCASE collation. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Added unit tests to `UTF8StringSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45816 from nikolamand-db/SPARK-47693. Authored-by: Nikola Mandic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/util/CollationFactory.java | 2 +- .../org/apache/spark/unsafe/types/UTF8String.java | 96 +- .../apache/spark/unsafe/types/UTF8StringSuite.java | 23 ++ .../CollationBenchmark-jdk21-results.txt | 30 +++ sql/core/benchmarks/CollationBenchmark-results.txt | 30 +++ .../CollationNonASCIIBenchmark-jdk21-results.txt | 27 ++ .../CollationNonASCIIBenchmark-results.txt | 27 ++ .../execution/benchmark/CollationBenchmark.scala | 84 +-- 8 files changed, 223 insertions(+), 96 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 119508a37e71..72a6e574707f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -148,7 +148,7 @@ public final class CollationFactory { collationTable[1] = new Collation( "UTF8_BINARY_LCASE", null, - (s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()), + UTF8String::compareLowerCase, "1.0", (s) -> (long)s.toLowerCase().hashCode(), false, diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c5dfb91f06c6..2006efb07a04 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -424,21 +424,16 @@ public final class UTF8String implements Comparable, Externalizable, if (numBytes == 0) { return EMPTY_UTF8; } - -byte[] bytes = new byte[numBytes]; -bytes[0] = (byte) 
Character.toTitleCase(getByte(0)); +// Optimization - do char level uppercase conversion in case of chars in ASCII range for (int i = 0; i < numBytes; i++
(spark) branch master updated: [SPARK-47775][SQL] Support remaining scalar types in the variant spec
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d817c9a60f51 [SPARK-47775][SQL] Support remaining scalar types in the variant spec d817c9a60f51 is described below commit d817c9a60f51ef8035c8d2b37a995976ae54aa47 Author: Chenhao Li AuthorDate: Wed Apr 10 22:51:17 2024 +0800 [SPARK-47775][SQL] Support remaining scalar types in the variant spec ### What changes were proposed in this pull request? This PR adds support for the remaining scalar types defined in the variant spec (DATE, TIMESTAMP, TIMESTAMP_NTZ, FLOAT, BINARY). The current `parse_json` expression doesn't produce these types, but we need them when we support casting a corresponding Spark type into the variant type. ### Why are the changes needed? This PR can be considered as a preparation for the cast-to-variant feature and will make the latter PR smaller. ### Does this PR introduce _any_ user-facing change? Yes. Existing variant expressions can decode more variant scalar types. ### How was this patch tested? Unit tests. We manually construct variant values with these new scalar types and test the existing variant expressions on them. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45945 from chenhao-db/support_atomic_types. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../org/apache/spark/unsafe/types/VariantVal.java | 8 +- .../org/apache/spark/types/variant/Variant.java| 69 - .../apache/spark/types/variant/VariantUtil.java| 71 - .../spark/sql/catalyst/expressions/Cast.scala | 7 +- .../expressions/variant/variantExpressions.scala | 84 +++- .../spark/sql/catalyst/json/JacksonGenerator.scala | 2 +- .../variant/VariantExpressionSuite.scala | 112 + 7 files changed, 314 insertions(+), 39 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/VariantVal.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/VariantVal.java index 652c05daf344..a441bab4ac41 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/VariantVal.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/VariantVal.java @@ -21,6 +21,8 @@ import org.apache.spark.unsafe.Platform; import org.apache.spark.types.variant.Variant; import java.io.Serializable; +import java.time.ZoneId; +import java.time.ZoneOffset; import java.util.Arrays; /** @@ -99,13 +101,17 @@ public class VariantVal implements Serializable { '}'; } + public String toJson(ZoneId zoneId) { +return new Variant(value, metadata).toJson(zoneId); + } + /** * @return A human-readable representation of the Variant value. It is always a JSON string at * this moment. 
*/ @Override public String toString() { -return new Variant(value, metadata).toJson(); +return toJson(ZoneOffset.UTC); } /** diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java b/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java index 8340aadd261f..4aeb2c6e1435 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java @@ -23,7 +23,16 @@ import com.fasterxml.jackson.core.JsonGenerator; import java.io.CharArrayWriter; import java.io.IOException; import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoUnit; import java.util.Arrays; +import java.util.Base64; +import java.util.Locale; import static org.apache.spark.types.variant.VariantUtil.*; @@ -89,6 +98,16 @@ public final class Variant { return VariantUtil.getDecimal(value, pos); } + // Get a float value from the variant. + public float getFloat() { +return VariantUtil.getFloat(value, pos); + } + + // Get a binary value from the variant. + public byte[] getBinary() { +return VariantUtil.getBinary(value, pos); + } + // Get a string value from the variant. public String getString() { return VariantUtil.getString(value, pos); @@ -188,9 +207,9 @@ public final class Variant { // Stringify the variant in JSON format. // Throw `MALFORMED_VARIANT` if the variant is malformed. - public String toJson() { + public String toJson(ZoneId zoneId) { StringBuilder sb = new StringBuilder(); -toJsonImpl(value, metadata, pos, sb); +toJsonImpl(value, metadata, pos, sb, zoneId); return sb.toString
(spark) branch master updated: [SPARK-47680][SQL] Add variant_explode expression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4cb364e6f615 [SPARK-47680][SQL] Add variant_explode expression 4cb364e6f615 is described below commit 4cb364e6f615512811b3001597d0cf98a7a30b00 Author: Chenhao Li AuthorDate: Wed Apr 10 22:47:43 2024 +0800 [SPARK-47680][SQL] Add variant_explode expression ### What changes were proposed in this pull request? This PR adds a new `VariantExplode` expression. It separates a variant object/array into multiple rows containing its fields/elements. Its result schema is `struct`. `pos` is the position of the field/element in its parent object/array, and `value` is the field/element value. `key` is the field name when exploding a variant object, or is NULL when exploding a variant array. It ignores any input that is not a variant array/object, including SQL NULL, [...] It is exposed as two SQL expressions, `variant_explode` and `variant_explode_outer`. The only difference is that whenever `variant_explode` produces zero output row for an input row, `variant_explode_outer` will produce one output row containing `{NULL, NULL, NULL}`. Usage examples: ``` > SELECT variant_explode(parse_json('["hello", "world"]')); 0 NULL"hello" 1 NULL"world" > SELECT variant_explode(parse_json('{"a": true, "b": 3.14}')); 0 a true 1 b 3.14 ``` ### Why are the changes needed? This expression allows the user to process variant array and object more conveniently. ### Does this PR introduce _any_ user-facing change? Yes. A new SQL expression is added. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45805 from chenhao-db/variant_explode. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/FunctionRegistry.scala | 4 +- .../expressions/variant/variantExpressions.scala | 83 ++ .../scala/org/apache/spark/sql/VariantSuite.scala | 26 +++ 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 99ae3adde44f..9447ea63b51f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -1096,7 +1096,9 @@ object TableFunctionRegistry { generator[PosExplode]("posexplode"), generator[PosExplode]("posexplode_outer", outer = true), generator[Stack]("stack"), -generator[SQLKeywords]("sql_keywords") +generator[SQLKeywords]("sql_keywords"), +generator[VariantExplode]("variant_explode"), +generator[VariantExplode]("variant_explode_outer", outer = true) ) val builtin: SimpleTableFunctionRegistry = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala index 7d1a3cf00d2b..c5e316dc6c8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions.variant import scala.util.parsing.combinator.RegexParsers import org.apache.spark.SparkRuntimeException +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.ExpressionBuilder import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import 
org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch @@ -419,6 +420,88 @@ object VariantGetExpressionBuilder extends VariantGetExpressionBuilderBase(true) // scalastyle:on line.size.limit object TryVariantGetExpressionBuilder extends VariantGetExpressionBuilderBase(false) +// scalastyle:off line.size.limit line.contains.tab +@ExpressionDescription( + usage = "_FUNC_(expr) - It separates a variant object/array into multiple rows containing its fields/elements. Its result schema is `struct`. `pos` is the position of the field/element in its parent object/array, and `value` is the field/element value. `key` is the field name when exploding a variant object, or is NULL when explodi
(spark) branch master updated: [SPARK-47786] SELECT DISTINCT (*) should not become SELECT DISTINCT struct(*) (revert to previous behavior)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e8e7dcce4ac7 [SPARK-47786] SELECT DISTINCT (*) should not become SELECT DISTINCT struct(*) (revert to previous behavior) e8e7dcce4ac7 is described below commit e8e7dcce4ac76f494781ccd7712634b73a6dbe14 Author: Serge Rielau AuthorDate: Wed Apr 10 10:29:32 2024 +0800 [SPARK-47786] SELECT DISTINCT (*) should not become SELECT DISTINCT struct(*) (revert to previous behavior) ### What changes were proposed in this pull request? We special case SELECT DISTINCT (*) to become SELECT DISTINCT * This prevents (*) to be treated as struct(*). We used to ignore parens around stars (*) everywhere, but that is inconsistent with e.g. (c1, c2). However there seems to be a reasonable number of users treating DISTINCT as a function. ### Why are the changes needed? Prevent regression for a common weird case. ### Does this PR introduce _any_ user-facing change? No, It undoes a user changing change ### How was this patch tested? Existing QA, added new tests to selectExcept.sql ### Was this patch authored or co-authored using generative AI tooling? No Closes #45970 from srielau/SPARK-47786-distinct-star-fix. 
Authored-by: Serge Rielau Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBaseParser.g4 | 7 +++- .../spark/sql/catalyst/parser/AstBuilder.scala | 9 + .../analyzer-results/selectExcept.sql.out | 41 ++ .../resources/sql-tests/inputs/selectExcept.sql| 12 +++ .../sql-tests/results/selectExcept.sql.out | 32 + 5 files changed, 100 insertions(+), 1 deletion(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 6e79d4af2f5e..8ff1d3f95301 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -575,8 +575,13 @@ transformClause (RECORDREADER recordReader=stringLit)? ; +parenthesizedStar +: LEFT_PAREN ASTERISK RIGHT_PAREN +; + selectClause -: SELECT (hints+=hint)* setQuantifier? namedExpressionSeq +: SELECT (hints+=hint)* setQuantifier parenthesizedStar +| SELECT (hints+=hint)* setQuantifier? namedExpressionSeq ; setClause diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index b4ba2c1caa22..46bf765b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2595,6 +2595,15 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { UnresolvedExtractValue(expression(ctx.value), expression(ctx.index)) } + /** + * Create an expression for an expression between parentheses. This is need because the ANTLR + * visitor cannot automatically convert the nested context into an expression. 
+ */ + override def visitParenthesizedStar( + ctx: ParenthesizedStarContext): Seq[Expression] = withOrigin(ctx) { +Seq(UnresolvedStar(None)) + } + /** * Create an expression for an expression between parentheses. This is need because the ANTLR * visitor cannot automatically convert the nested context into an expression. diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out index 28c6a77d2333..340712399d4f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out @@ -548,13 +548,54 @@ Project [c1#x, c2#x, c3#x, c4#x, c5#x] +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x] +-- !query +SELECT DISTINCT * FROM v1 +-- !query analysis +Distinct ++- Project [c1#x, c2#x, c3#x, c4#x, c5#x] + +- SubqueryAlias v1 + +- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x]) + +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x, cast(c3#x as void) AS c3#x, cast(c4#x as int) AS c4#x, cast(c5#x as int) AS c5#x] ++- SubqueryAlias T + +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x] + + +-- !query +SELECT DISTINCT(*) FROM v1 +-- !query analysis +Distinct ++- SubqueryAlias v1 + +- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x]) + +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x, cast(c3#x
(spark) branch master updated (08c49637795f -> 319edfdc5cd6)
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 08c49637795f [SPARK-47770][INFRA] Fix `GenerateMIMAIgnore.isPackagePrivateModule` to return `false` instead of failing add 319edfdc5cd6 [SPARK-47682][SQL] Support cast from variant No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/types/UpCastRule.scala| 1 + .../spark/sql/catalyst/expressions/Cast.scala | 26 +- .../expressions/variant/variantExpressions.scala | 13 + .../variant/VariantExpressionSuite.scala | 59 +++--- 4 files changed, 90 insertions(+), 9 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-47504][SQL] Resolve AbstractDataType simpleStrings for StringTypeCollated
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new abb7b046d50d [SPARK-47504][SQL] Resolve AbstractDataType simpleStrings for StringTypeCollated abb7b046d50d is described below commit abb7b046d50d3aff527834f2ed83618eaeb65a89 Author: Mihailo Milosevic AuthorDate: Tue Apr 9 00:05:37 2024 +0800 [SPARK-47504][SQL] Resolve AbstractDataType simpleStrings for StringTypeCollated ### What changes were proposed in this pull request? Renaming simpleString in StringTypeAnyCollation. This PR should only be merged after https://github.com/apache/spark/pull/45383 is merged. ### Why are the changes needed? [SPARK-47296](https://github.com/apache/spark/pull/45422) introduced a change to fail all unsupported functions. Because of this change expected inputTypes in ExpectsInputTypes had to be changed. This change introduced a change on user side which will print "STRING_ANY_COLLATION" in places where before we printed "STRING" when an error occurred. Concretely if we get an input of Int where StringTypeAnyCollation was expected, we will throw this faulty message for users. ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? Existing tests were changed back to "STRING" notation. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45694 from mihailom-db/SPARK-47504. 
Authored-by: Mihailo Milosevic Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/StringTypeCollated.scala| 4 +--- .../sql/catalyst/expressions/StringExpressionsSuite.scala | 2 +- .../src/test/scala/org/apache/spark/sql/CollationSuite.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 10 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala index 2f66e5795634..67b65859e6bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/StringTypeCollated.scala @@ -24,13 +24,13 @@ import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} */ abstract class StringTypeCollated extends AbstractDataType { override private[sql] def defaultConcreteType: DataType = StringType + override private[sql] def simpleString: String = "string" } /** * Use StringTypeBinary for expressions supporting only binary collation. */ case object StringTypeBinary extends StringTypeCollated { - override private[sql] def simpleString: String = "string_binary" override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[StringType] && other.asInstanceOf[StringType].supportsBinaryEquality } @@ -39,7 +39,6 @@ case object StringTypeBinary extends StringTypeCollated { * Use StringTypeBinaryLcase for expressions supporting only binary and lowercase collation. 
*/ case object StringTypeBinaryLcase extends StringTypeCollated { - override private[sql] def simpleString: String = "string_binary_lcase" override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[StringType] && (other.asInstanceOf[StringType].supportsBinaryEquality || other.asInstanceOf[StringType].isUTF8BinaryLcaseCollation) @@ -49,6 +48,5 @@ case object StringTypeBinaryLcase extends StringTypeCollated { * Use StringTypeAnyCollation for expressions supporting all possible collation types. */ case object StringTypeAnyCollation extends StringTypeCollated { - override private[sql] def simpleString: String = "string_any_collation" override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[StringType] } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index cda9676ca58b..1fbd1ac9a29f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -70,7 +70,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { errorSubClass = "UNEXPECTED_INPUT_TYPE", messageParameters = Map( "paramIndex" -> ordinalNumber(0), - "requiredType" -> &q
(spark) branch master updated: [SPARK-47681][SQL] Add schema_of_variant expression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 134a13928965 [SPARK-47681][SQL] Add schema_of_variant expression 134a13928965 is described below commit 134a13928965e9818393511eadd504b9f1679766 Author: Chenhao Li AuthorDate: Tue Apr 9 00:04:02 2024 +0800 [SPARK-47681][SQL] Add schema_of_variant expression ### What changes were proposed in this pull request? This PR adds a new `SchemaOfVariant` expression. It returns schema in the SQL format of a variant. Usage examples: ``` > SELECT schema_of_variant(parse_json('null')); VOID > SELECT schema_of_variant(parse_json('[{"b":true,"a":0}]')); ARRAY> ``` ### Why are the changes needed? This expression can help the user explore the content of variant values. ### Does this PR introduce _any_ user-facing change? Yes. A new SQL expression is added. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45806 from chenhao-db/variant_schema. 
Authored-by: Chenhao Li Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../spark/sql/catalyst/encoders/EncoderUtils.scala | 7 +- .../expressions/variant/variantExpressions.scala | 86 ++ .../spark/sql/catalyst/json/JsonInferSchema.scala | 18 +++-- .../sql-functions/sql-expression-schema.md | 1 + .../apache/spark/sql/VariantEndToEndSuite.scala| 31 6 files changed, 134 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index ecba8b263c41..bbc063c32103 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -822,6 +822,7 @@ object FunctionRegistry { expression[ParseJson]("parse_json"), expressionBuilder("variant_get", VariantGetExpressionBuilder), expressionBuilder("try_variant_get", TryVariantGetExpressionBuilder), +expression[SchemaOfVariant]("schema_of_variant"), // cast expression[Cast]("cast"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala index 45598b6a66f2..20f86a32c1a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala @@ -23,8 +23,8 @@ import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, C import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.types.{PhysicalBinaryType, PhysicalIntegerType, PhysicalLongType} import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} -import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, 
CalendarIntervalType, DataType, DateType, DayTimeIntervalType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, ObjectType, ShortType, StringType, StructType, TimestampNTZType, TimestampType, UserDefinedType, YearMonthIntervalType} -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, CalendarIntervalType, DataType, DateType, DayTimeIntervalType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, ObjectType, ShortType, StringType, StructType, TimestampNTZType, TimestampType, UserDefinedType, VariantType, YearMonthIntervalType} +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String, VariantVal} /** * Helper class for Generating [[ExpressionEncoder]]s. @@ -122,7 +122,8 @@ object EncoderUtils { TimestampType -> classOf[PhysicalLongType.InternalType], TimestampNTZType -> classOf[PhysicalLongType.InternalType], BinaryType -> classOf[PhysicalBinaryType.InternalType], -CalendarIntervalType -> classOf[CalendarInterval] +CalendarIntervalType -> classOf[CalendarInterval], +VariantType -> classOf[VariantVal] ) val typeBoxedJavaMapping: Map[DataType, Class[_]] = Map[DataType, Class[_]]( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalys
(spark) branch master updated: [SPARK-47657][SQL] Implement collation filter push down support per file source
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new eb8e99721714 [SPARK-47657][SQL] Implement collation filter push down support per file source eb8e99721714 is described below commit eb8e99721714eeac14978f0cb6a2dc35251a5d23 Author: Stefan Kandic AuthorDate: Mon Apr 8 12:17:38 2024 +0800 [SPARK-47657][SQL] Implement collation filter push down support per file source ### What changes were proposed in this pull request? Previously in #45262 we completely disabled filter pushdown for any expression referencing non utf8 binary collated columns. However, we should make this more fine-grained so that individual data sources can decide to support pushing down these filters if they can. ### Why are the changes needed? To enable collation filter push down for an individual data source. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? With previously added unit test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45782 from stefankandic/newPushdownLogic. 
Authored-by: Stefan Kandic Signed-off-by: Wenchen Fan --- .../execution/datasources/DataSourceUtils.scala| 9 ++- .../sql/execution/datasources/FileFormat.scala | 6 ++ .../execution/datasources/FileSourceStrategy.scala | 3 +- .../datasources/PruneFileSourcePartitions.scala| 4 +- .../execution/datasources/v2/FileScanBuilder.scala | 9 ++- .../spark/sql/FileBasedDataSourceSuite.scala | 85 -- 6 files changed, 70 insertions(+), 46 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala index 38567c16fd1f..0db5de724340 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala @@ -284,12 +284,15 @@ object DataSourceUtils extends PredicateHelper { * Determines whether a filter should be pushed down to the data source or not. * * @param expression The filter expression to be evaluated. + * @param isCollationPushDownSupported Whether the data source supports collation push down. * @return A boolean indicating whether the filter should be pushed down or not. 
*/ - def shouldPushFilter(expression: Expression): Boolean = { -expression.deterministic && !expression.exists { + def shouldPushFilter(expression: Expression, isCollationPushDownSupported: Boolean): Boolean = { +if (!expression.deterministic) return false + +isCollationPushDownSupported || !expression.exists { case childExpression @ (_: Attribute | _: GetStructField) => -// don't push down filters for types with non-default collation +// don't push down filters for types with non-binary sortable collation // as it could lead to incorrect results SchemaUtils.hasNonBinarySortableCollatedString(childExpression.dataType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index 36c59950fe20..0785b0cbe9e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -223,6 +223,12 @@ trait FileFormat { */ def fileConstantMetadataExtractors: Map[String, PartitionedFile => Any] = FileFormat.BASE_METADATA_EXTRACTORS + + /** + * Returns whether the file format supports filter push down + * for non utf8 binary collated columns. 
+ */ + def supportsCollationPushDown: Boolean = false } object FileFormat { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index e4b66d72eaf8..f2dcbe26104f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -160,7 +160,8 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging { // - filters that need to be evaluated again after the scan val filterSet = ExpressionSet(filters) - val filtersToPush = filters.filter(f => DataSourceUtils.shouldPushFilter(f)) + val filtersToPush = filters.filter(f => + DataSourceUtil
(spark) branch master updated: [SPARK-47713][SQL][CONNECT] Fix a self-join failure
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3a39ac231fe4 [SPARK-47713][SQL][CONNECT] Fix a self-join failure 3a39ac231fe4 is described below commit 3a39ac231fe43332bc242ac582f30bb57c739927 Author: Ruifeng Zheng AuthorDate: Mon Apr 8 12:06:03 2024 +0800 [SPARK-47713][SQL][CONNECT] Fix a self-join failure ### What changes were proposed in this pull request? update the logic to resolve column in spark connect ### Why are the changes needed? ``` df = spark.createDataFrame([(1, 2), (3, 4)], schema=["a", "b"]) df2 = df.select(df.a.alias("aa"), df.b) df3 = df2.join(df, df2.b == df.b) AnalysisException: [AMBIGUOUS_COLUMN_REFERENCE] Column "b" is ambiguous. It's because you joined several DataFrame together, and some of these DataFrames are the same. This column points to one of the DataFrames but Spark is unable to figure out which one. Please alias the DataFrames with different names via `DataFrame.alias` before joining them, and specify the column using qualified name, e.g. `df.alias("a").join(df.alias("b"), col("a.id") > col("b.id"))`. SQLSTATE: 42702 ``` ### Does this PR introduce _any_ user-facing change? yes, above query can run successfully after this PR This PR only affects Spark Connect, won't affect Classic Spark. ### How was this patch tested? added tests ### Was this patch authored or co-authored using generative AI tooling? no Closes #45846 from zhengruifeng/fix_connect_self_join_depth. 
Authored-by: Ruifeng Zheng Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/ClientE2ETestSuite.scala | 11 --- .../sql/tests/connect/test_connect_basic.py| 9 + python/pyspark/sql/tests/test_dataframe.py | 7 .../catalyst/analysis/ColumnResolutionHelper.scala | 38 +- 4 files changed, 45 insertions(+), 20 deletions(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index 95ee69d2a47d..a0729adb8960 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -940,11 +940,12 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM } assert(e3.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE")) -val e4 = intercept[AnalysisException] { - // df1("i") is ambiguous as df1 appears in both join sides (df1_filter contains df1). 
- df1.join(df1_filter, df1("i") === 1).collect() -} -assert(e4.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE")) +// TODO(SPARK-47749): Dataframe.collect should accept duplicated column names +assert( + // df1.join(df1_filter, df1("i") === 1) fails in classic spark due to: + // org.apache.spark.sql.AnalysisException: Column i#24 are ambiguous + df1.join(df1_filter, df1("i") === 1).columns === +Array("i", "j", "i", "j")) checkSameResult( Seq(Row("a")), diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 3b8e8165b4bf..16e9a577451f 100755 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -1155,6 +1155,15 @@ class SparkConnectBasicTests(SparkConnectSQLTestCase): set(spark_df.select("id").crossJoin(other=spark_df.select("name")).toPandas()), ) +def test_self_join(self): +# SPARK-47713: this query fails in classic spark +df1 = self.connect.createDataFrame([(1, "a")], schema=["i", "j"]) +df1_filter = df1.filter(df1.i > 0) +df2 = df1.join(df1_filter, df1.i == 1) +self.assertEqual(df2.count(), 1) +self.assertEqual(df2.columns, ["i", "j", "i", "j"]) +self.assertEqual(list(df2.first()), [1, "a", 1, "a"]) + def test_with_metadata(self): cdf = self.connect.createDataFrame(data=[(2, "Alice"), (5, "Bob")], schema=["age", "name"]) self.assertEqual(cdf.schema["age"].metadata, {}) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py