[spark] branch master updated: [SPARK-45344][CORE][SQL] Remove all Scala version string check
This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2cc1ee4d3a0 [SPARK-45344][CORE][SQL] Remove all Scala version string check 2cc1ee4d3a0 is described below commit 2cc1ee4d3a05a641d7a245f015ef824d8f7bae8b Author: yangjie01 AuthorDate: Thu Sep 28 11:06:04 2023 +0800 [SPARK-45344][CORE][SQL] Remove all Scala version string check ### What changes were proposed in this pull request? This PR removes all the no longer needed Scala version string checks. ### Why are the changes needed? These version checks are no longer needed. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #43133 from LuciferYang/SPARK-45344. Authored-by: yangjie01 Signed-off-by: yangjie01 --- .../apache/spark/serializer/KryoSerializer.scala | 5 +--- .../deploy/rest/SubmitRestProtocolSuite.scala | 32 +- .../spark/serializer/KryoSerializerSuite.scala | 1 - .../spark/sql/hive/HiveSparkSubmitSuite.scala | 3 +- 4 files changed, 3 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 126f5a0ca3b..60af1abe943 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -26,7 +26,6 @@ import javax.annotation.Nullable import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag -import scala.util.Properties import scala.util.control.NonFatal import com.esotericsoftware.kryo.{Kryo, KryoException, Serializer => KryoClassSerializer} @@ -229,9 +228,7 @@ class KryoSerializer(conf: SparkConf) kryo.register(None.getClass) kryo.register(Nil.getClass) -if (Properties.versionNumberString.startsWith("2.13")) { - kryo.register(Utils.classForName("scala.collection.immutable.ArraySeq$ofRef")) -} + kryo.register(Utils.classForName("scala.collection.immutable.ArraySeq$ofRef")) kryo.register(Utils.classForName("scala.collection.immutable.$colon$colon")) kryo.register(Utils.classForName("scala.collection.immutable.Map$EmptyMap$")) kryo.register(Utils.classForName("scala.math.Ordering$Reverse")) diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala index 9fdbf485e17..9eb51725831 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.deploy.rest import java.lang.Boolean -import scala.util.Properties.versionNumberString - import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SparkConf, SparkFunSuite} @@ -235,34 +233,7 @@ class SubmitRestProtocolSuite extends SparkFunSuite { |} """.stripMargin - private lazy val submitDriverRequestJson = if (versionNumberString.startsWith("2.12")) { -s""" - |{ - | "action" : "CreateSubmissionRequest", - | "appArgs" : [ "two slices", "a hint of cinnamon" ], - | "appResource" : "honey-walnut-cherry.jar", - | "clientSparkVersion" : "1.2.3", - | "environmentVariables" : { - |"PATH" : "/dev/null" - | }, - | "mainClass" : 
"org.apache.spark.examples.SparkPie", - | "sparkProperties" : { - |"spark.archives" : "fireballs.zip", - |"spark.driver.extraLibraryPath" : "pickle.jar", - |"spark.jars" : "mayonnaise.jar,ketchup.jar", - |"spark.driver.supervise" : "false", - |"spark.app.name" : "SparkPie", - |"spark.cores.max" : "1", - |"spark.driver.memory" : "${Utils.DEFAULT_DRIVER_MEM_MB}m", - |"spark.files" : "fireball.png", - |"spark.driver.cores" : "180", - |"spark.driver.extraJavaOptions" : " -Dslices=5 -Dcolor=mostly_red", - |"spark.executor.memory" : "256m", - |"spark.driver.extraClassPath" : "food-coloring.jar" - | } - |} -""".stripMargin - } else { + private lazy val submitDriverRequestJson = s""" |{ | "action" : "CreateSubmissionRequest", @@ -289,7 +260,6 @@ class SubmitRestProtocolSuite extends SparkFunSuite { | } |} """.stripMargin - } private val submitDriverResponseJson = """ diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/s
[spark] branch master updated: [SPARK-45353][PYTHON][DOCS] Refine docstring of `create_map/slice/array_join`
This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6d2ffaa4ea8 [SPARK-45353][PYTHON][DOCS] Refine docstring of `create_map/slice/array_join` 6d2ffaa4ea8 is described below commit 6d2ffaa4ea87679ce527512f11d04d136a1d536a Author: yangjie01 AuthorDate: Thu Sep 28 11:03:47 2023 +0800 [SPARK-45353][PYTHON][DOCS] Refine docstring of `create_map/slice/array_join` ### What changes were proposed in this pull request? This pr refine docstring of `create_map/slice/array_join` and add some new examples. ### Why are the changes needed? To improve PySpark documentation ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #43145 from LuciferYang/collection-functions-2. Authored-by: yangjie01 Signed-off-by: yangjie01 --- python/pyspark/sql/functions.py | 191 ++-- 1 file changed, 163 insertions(+), 28 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index f54ce66e39f..04968440e39 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -11684,7 +11684,12 @@ def create_map(__cols: Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]] def create_map( *cols: Union["ColumnOrName", Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]]] ) -> Column: -"""Creates a new map column. +""" +Map function: Creates a new map column from an even number of input columns or +column references. The input columns are grouped into key-value pairs to form a map. +For instance, the input (key1, value1, key2, value2, ...) would produce a map that +associates key1 with value1, key2 with value2, and so on. The function supports +grouping columns as a list as well. .. versionadded:: 2.0.0 @@ -11694,16 +11699,54 @@ def create_map( Parameters -- cols : :class:`~pyspark.sql.Column` or str -column names or :class:`~pyspark.sql.Column`\\s that are -grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...). +The input column names or :class:`~pyspark.sql.Column` objects grouped into +key-value pairs. These can also be expressed as a list of columns. + +Returns +--- +:class:`~pyspark.sql.Column` +A new Column of Map type, where each value is a map formed from the corresponding +key-value pairs provided in the input arguments. Examples +Example 1: Basic usage of create_map function. + +>>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) ->>> df.select(create_map('name', 'age').alias("map")).collect() -[Row(map={'Alice': 2}), Row(map={'Bob': 5})] ->>> df.select(create_map([df.name, df.age]).alias("map")).collect() -[Row(map={'Alice': 2}), Row(map={'Bob': 5})] +>>> df.select(sf.create_map('name', 'age')).show() ++--+ +|map(name, age)| ++--+ +| {Alice -> 2}| +|{Bob -> 5}| ++--+ + +Example 2: Usage of create_map function with a list of columns. + +>>> from pyspark.sql import functions as sf +>>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) +>>> df.select(sf.create_map([df.name, df.age])).show() ++--+ +|map(name, age)| ++--+ +| {Alice -> 2}| +|{Bob -> 5}| ++--+ + +Example 3: Usage of create_map function with more than one key-value pair. 
+ +>>> from pyspark.sql import functions as sf +>>> df = spark.createDataFrame([("Alice", 2, "female"), +... ("Bob", 5, "male")], ("name", "age", "gender")) +>>> df.select(sf.create_map(sf.lit('name'), df['name'], +... sf.lit('age'), df['age'])).show(truncate=False) ++-+ +|map(name, name, age, age)| ++-+ +|{name -> Alice, age -> 2}| +|{name -> Bob, age -> 5} | ++-+ """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] # type: ignore[assignment] @@ -12002,8 +12045,9 @@ def slice( x: "ColumnOrName", start: Union["ColumnOrName", int], length: Union["ColumnOrName", int] ) -> Column: """ -Collection function: returns an array containing all the elements in `x` from index `start` -(array indices start at 1, or from the end if `start` is negative) with the specified `length`. +Array function: Returns a new array column
[spark] branch master updated (6a9d35f766d -> c5967310740)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 6a9d35f766d [SPARK-45354][SQL] Resolve functions bottom-up add c5967310740 [SPARK-2][MESOS] Remove Mesos support No new revisions were added by this update. Summary of changes: .github/labeler.yml| 3 - .github/workflows/benchmark.yml| 2 +- .github/workflows/build_and_test.yml | 4 +- .github/workflows/maven_test.yml | 12 +- LICENSE-binary | 3 +- NOTICE-binary | 3 - R/pkg/tests/fulltests/test_sparkR.R| 6 +- README.md | 2 +- assembly/pom.xml | 10 - .../shuffle/protocol/BlockTransferMessage.java | 4 - .../shuffle/protocol/mesos/RegisterDriver.java | 77 -- .../protocol/mesos/ShuffleServiceHeartbeat.java| 53 -- conf/spark-env.sh.template | 1 - .../main/scala/org/apache/spark/SparkConf.scala| 2 +- .../main/scala/org/apache/spark/SparkContext.scala | 14 +- .../apache/spark/api/java/JavaSparkContext.scala | 10 +- .../org/apache/spark/deploy/PythonRunner.scala | 1 - .../org/apache/spark/deploy/SparkSubmit.scala | 73 +- .../apache/spark/deploy/SparkSubmitArguments.scala | 8 +- .../spark/deploy/history/HistoryServer.scala | 2 +- .../spark/deploy/rest/RestSubmissionClient.scala | 4 +- .../org/apache/spark/deploy/security/README.md | 2 +- .../scala/org/apache/spark/executor/Executor.scala | 5 +- .../org/apache/spark/internal/config/package.scala | 9 +- .../org/apache/spark/metrics/MetricsSystem.scala | 3 - .../apache/spark/resource/ResourceProfile.scala| 4 +- .../apache/spark/scheduler/SchedulerBackend.scala | 2 +- .../apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../apache/spark/scheduler/TaskSetManager.scala| 1 - .../cluster/CoarseGrainedSchedulerBackend.scala| 5 +- .../main/scala/org/apache/spark/util/Utils.scala | 13 +- .../org/apache/spark/SecurityManagerSuite.scala| 2 +- .../org/apache/spark/deploy/SparkSubmitSuite.scala | 22 - .../deploy/rest/StandaloneRestSubmitSuite.scala| 6 - dev/create-release/release-build.sh| 2 +- dev/create-release/releaseutils.py | 1 - dev/deps/spark-deps-hadoop-3-hive-2.3 | 1 - dev/lint-java | 2 +- dev/mima | 2 +- dev/sbt-checkstyle | 2 +- dev/scalastyle | 2 +- dev/sparktestsupport/modules.py| 8 - dev/test-dependencies.sh | 2 +- docs/_config.yml | 3 +- docs/_layouts/global.html | 1 - docs/building-spark.md | 6 +- docs/cluster-overview.md | 8 +- docs/configuration.md | 34 +- docs/core-migration-guide.md | 2 + docs/hardware-provisioning.md | 3 +- docs/index.md | 3 - docs/job-scheduling.md | 23 +- docs/monitoring.md | 8 - docs/rdd-programming-guide.md | 2 +- docs/running-on-mesos.md | 901 docs/security.md | 26 +- docs/spark-standalone.md | 2 +- docs/streaming-programming-guide.md| 21 +- docs/submitting-applications.md| 16 - .../spark/launcher/AbstractCommandBuilder.java | 1 - .../spark/launcher/SparkClassCommandBuilder.java | 13 +- .../launcher/SparkSubmitCommandBuilderSuite.java | 4 - pom.xml| 7 - project/SparkBuild.scala | 4 +- python/README.md | 2 +- python/docs/source/user_guide/python_packaging.rst | 2 +- python/pyspark/context.py | 2 +- .../scala/org/apache/spark/repl/ReplSuite.scala| 24 - .../deploy/k8s/features/LocalDirsFeatureStep.scala | 2 +- resource-managers/mesos/pom.xml| 128 --- .../mesos/MesosExternalBlockStoreClient.java | 124 --- ...g.apache.spark.scheduler.ExternalClusterManager | 18 - .../deploy/mesos/MesosClusterDispatcher.scala | 136 .../mesos/MesosClusterDispatcherArguments.scala| 149 .../deploy/mesos/MesosDriverDescription.scala | 70 -- 
.../deploy/mesos/MesosExternalShuffleService.scala | 1
[spark] branch branch-3.4 updated: [SPARK-44034][TESTS][3.4] Add a new test group for sql module
This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.4 by this push: new 44c79d9600f [SPARK-44034][TESTS][3.4] Add a new test group for sql module 44c79d9600f is described below commit 44c79d9600f14d1bb9db7ccb2b8b8e5a6d40459e Author: yangjie01 AuthorDate: Thu Sep 28 10:10:51 2023 +0800 [SPARK-44034][TESTS][3.4] Add a new test group for sql module ### What changes were proposed in this pull request? The purpose of this pr is to add a new test tag `SlowSQLTest` to the sql module, and identified some Suites with test cases more than 3 seconds, and apply it to GA testing task to reduce the testing pressure of the `sql others` group. ### Why are the changes needed? For a long time, the sql module UTs has only two groups: `slow` and `others`. The test cases in group `slow` are fixed, while the number of test cases in group `others` continues to increase, which has had a certain impact on the testing duration and stability of group `others`. So this PR proposes to add a new testing group to share the testing pressure of `sql others` group, which has made the testing time of the three groups more average, and hope it can improve the stability of the GA task. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Should monitor GA ### Was this patch authored or co-authored using generative AI tooling? No Closes #43141 from LuciferYang/SPARK-44034-34. Authored-by: yangjie01 Signed-off-by: yangjie01 --- .github/workflows/build_and_test.yml| 8 +++- .../java/org/apache/spark/tags/SlowSQLTest.java | 21 ++--- .../spark/sql/ApproximatePercentileQuerySuite.scala | 2 ++ .../org/apache/spark/sql/CachedTableSuite.scala | 2 ++ .../apache/spark/sql/DataFrameAsOfJoinSuite.scala | 2 ++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 2 ++ .../spark/sql/DataFrameWindowFunctionsSuite.scala | 2 ++ .../org/apache/spark/sql/DatasetCacheSuite.scala| 3 ++- .../test/scala/org/apache/spark/sql/JoinSuite.scala | 2 ++ .../WriteDistributionAndOrderingSuite.scala | 2 ++ .../sql/execution/BroadcastExchangeSuite.scala | 2 ++ .../execution/OptimizeMetadataOnlyQuerySuite.scala | 2 ++ .../spark/sql/execution/QueryExecutionSuite.scala | 3 ++- .../execution/adaptive/AdaptiveQueryExecSuite.scala | 2 ++ .../FileSourceAggregatePushDownSuite.scala | 5 + .../execution/datasources/V1WriteCommandSuite.scala | 2 ++ .../parquet/ParquetRebaseDatetimeSuite.scala| 3 +++ .../datasources/parquet/ParquetRowIndexSuite.scala | 2 ++ .../execution/streaming/state/RocksDBSuite.scala| 2 ++ .../sql/execution/ui/AllExecutionsPageSuite.scala | 2 ++ .../spark/sql/expressions/ExpressionInfoSuite.scala | 2 ++ .../spark/sql/sources/BucketedReadSuite.scala | 2 ++ .../spark/sql/sources/BucketedWriteSuite.scala | 2 ++ .../DisableUnnecessaryBucketedScanSuite.scala | 3 +++ .../streaming/AcceptsLatestSeenOffsetSuite.scala| 2 ++ .../DeprecatedStreamingAggregationSuite.scala | 2 ++ .../sql/streaming/EventTimeWatermarkSuite.scala | 2 ++ .../spark/sql/streaming/FileStreamSinkSuite.scala | 3 +++ .../spark/sql/streaming/FileStreamSourceSuite.scala | 5 - .../spark/sql/streaming/FileStreamStressSuite.scala | 2 ++ ...apGroupsInPandasWithStateDistributionSuite.scala | 2 ++ .../FlatMapGroupsInPandasWithStateSuite.scala | 2 ++ .../FlatMapGroupsWithStateDistributionSuite.scala | 2 ++ .../sql/streaming/FlatMapGroupsWithStateSuite.scala | 2 ++ 
...latMapGroupsWithStateWithInitialStateSuite.scala | 2 ++ .../sql/streaming/MemorySourceStressSuite.scala | 2 ++ .../sql/streaming/MultiStatefulOperatorsSuite.scala | 2 ++ .../apache/spark/sql/streaming/StreamSuite.scala| 2 ++ .../sql/streaming/StreamingAggregationSuite.scala | 2 ++ .../sql/streaming/StreamingDeduplicationSuite.scala | 2 ++ .../spark/sql/streaming/StreamingJoinSuite.scala| 5 + .../sql/streaming/StreamingQueryListenerSuite.scala | 2 ++ .../sql/streaming/StreamingQueryManagerSuite.scala | 2 ++ .../spark/sql/streaming/StreamingQuerySuite.scala | 2 ++ .../StreamingSessionWindowDistributionSuite.scala | 2 ++ .../sql/streaming/StreamingSessionWindowSuite.scala | 2 ++ ...treamingStateStoreFormatCompatibilitySuite.scala | 2 ++ .../sql/streaming/TriggerAvailableNowSuite.scala| 2 ++ .../ContinuousQueryStatusAndProgressSuite.scala | 2 ++ .../sql/streaming/continuous/ContinuousSuite.scala | 4 .../sources/StreamingDataSourceV2Suite.scala| 3 +++ .../test/DataStreamReaderWriterSuite.scala | 2 ++ .../streaming/test/DataStreamTable
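For context, tagging a suite is the per-suite "+ 2" change listed above: one import plus one class-level annotation. A hedged sketch of what that looks like (the suite name and test body are illustrative, not taken from the patch):

```scala
import org.apache.spark.SparkFunSuite
import org.apache.spark.tags.SlowSQLTest

// The class-level tag lets the CI build schedule this suite into the new test group
// instead of the increasingly crowded `sql others` group.
@SlowSQLTest
class SomeSlowQuerySuite extends SparkFunSuite {
  test("a query that takes more than ~3 seconds") {
    // ... slow assertions here ...
  }
}
```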
[GitHub] [spark-website] allisonwang-db commented on pull request #480: Fix UI issue for `published` docs about Switch languages consistently across docs for all code snippets
allisonwang-db commented on PR #480: URL: https://github.com/apache/spark-website/pull/480#issuecomment-1738335482 Thanks @panbingkun for backporting this fix!
[spark] branch master updated: [SPARK-45354][SQL] Resolve functions bottom-up
This is an automated email from the ASF dual-hosted git repository. ptoth pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6a9d35f766d [SPARK-45354][SQL] Resolve functions bottom-up 6a9d35f766d is described below commit 6a9d35f766dcc323a2f20326fbfa4675fe51b8d7 Author: Peter Toth AuthorDate: Wed Sep 27 21:40:16 2023 +0200 [SPARK-45354][SQL] Resolve functions bottom-up ### What changes were proposed in this pull request? This PR proposes bottum-up resolution in `ResolveFunctions`, which is much faster (requires less number of resolution rounds) if we have deeply nested `UnresolvedFunctions`. These structures are more likely to occur after https://github.com/apache/spark/pull/42864. ### Why are the changes needed? Performance optimization. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing UTs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43146 from peter-toth/SPARK-45354-resolve-functions-bottom-up. Authored-by: Peter Toth Signed-off-by: Peter Toth --- .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out | 6 +++--- sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index aac85e19721..67a958d73f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2207,7 +2207,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor Project(aliases, u.child) case q: LogicalPlan => -q.transformExpressionsWithPruning( +q.transformExpressionsUpWithPruning( _.containsAnyPattern(UNRESOLVED_FUNCTION, GENERATOR), ruleId) { case u @ UnresolvedFunction(nameParts, arguments, _, _, _) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out index 27b9786160c..eca4a8ba02f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out @@ -72,9 +72,9 @@ org.apache.spark.sql.AnalysisException "queryContext" : [ { "objectType" : "", "objectName" : "", -"startIndex" : 95, -"stopIndex" : 117, -"fragment" : "default.udaf1(int_col1)" +"startIndex" : 8, +"stopIndex" : 35, +"fragment" : "default.udaf1(udf(int_col1))" } ] } diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out index 228a31ba257..f33e4b40395 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out @@ -73,9 +73,9 @@ org.apache.spark.sql.AnalysisException "queryContext" : [ { "objectType" : "", "objectName" : "", -"startIndex" : 95, -"stopIndex" : 117, -"fragment" : "default.udaf1(int_col1)" +"startIndex" : 8, +"stopIndex" : 35, +"fragment" : "default.udaf1(udf(int_col1))" } ] } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org
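The intuition behind this change is easiest to see on a toy tree. If a rule can fire on a node only once that node's children are already resolved, a bottom-up traversal finishes a nested chain in a single pass, while a top-down traversal resolves only one level per pass. A self-contained, REPL-pasteable sketch (illustrative only, not Catalyst code):

```scala
sealed trait Expr { def children: Seq[Expr] }
case class Unresolved(children: Seq[Expr]) extends Expr
case class Resolved(children: Seq[Expr]) extends Expr

// The "rule": a node may be resolved only once all of its children are resolved.
def rule(e: Expr): Expr = e match {
  case Unresolved(cs) if cs.forall(_.isInstanceOf[Resolved]) => Resolved(cs)
  case other => other
}

// Apply the rule after transforming the children (bottom-up) ...
def transformUp(e: Expr, r: Expr => Expr): Expr = {
  val withNewChildren = e match {
    case Unresolved(cs) => Unresolved(cs.map(transformUp(_, r)))
    case Resolved(cs)   => Resolved(cs.map(transformUp(_, r)))
  }
  r(withNewChildren)
}

// ... versus before transforming the children (top-down).
def transformDown(e: Expr, r: Expr => Expr): Expr = r(e) match {
  case Unresolved(cs) => Unresolved(cs.map(transformDown(_, r)))
  case Resolved(cs)   => Resolved(cs.map(transformDown(_, r)))
}

// A chain of three nested unresolved "functions":
val nested: Expr = Unresolved(Seq(Unresolved(Seq(Unresolved(Nil)))))

println(transformUp(nested, rule))   // fully Resolved after a single pass
println(transformDown(nested, rule)) // only the innermost node resolved; more passes needed
```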
[spark] branch master updated: [SPARK-44838][SQL] raise_error improvement
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9109d7037f4 [SPARK-44838][SQL] raise_error improvement 9109d7037f4 is described below commit 9109d7037f44158e72d14019eb33f9c7b8838868 Author: srielau AuthorDate: Wed Sep 27 10:02:44 2023 -0700 [SPARK-44838][SQL] raise_error improvement ### What changes were proposed in this pull request? Extend the raise_error() function to a two-argument version: raise_error(errorClassStr, errorParamMap) This new form will accept any error class defined in error-classes.json and require Map to provide values for the parameters in the error classes template. Externally an error raised via raise_error() is indistinguishable from an error raised from within the Spark engine. The single-parameter raise_error(str) will raise USER_RAISED_EXCEPTION (SQLSTATE P0001 - borrowed from PostgreSQL). USER_RAISED_EXCEPTION text is: "" which will be filled in with the str - value. We will also provide `spark.sql.legacy.raiseErrorWithoutErrorClass` (default: false) to revert to the old behavior for the single-parameter version. Naturally assert_true() will also return `USER_RAISED_EXCEPTION`. Examples ``` SELECT raise_error('VIEW_NOT_FOUND', map('relationName', '`v1`'); [VIEW_NOT_FOUND] The view `v1` cannot be found. Verify the spelling ... SELECT raise_error('Error!'); [USER_RAISED_EXCEPTION] Error! SELECT assert_true(1 < 0); [USER_RAISED_EXCEPTION] '(1 < 0)' is not true! SELECT assert_true(1 < 0, 'bad!') [USER_RAISED_EXCEPTION] bad! ``` ### Why are the changes needed? This change moves raise_error() and assert_true() to the new error frame work. It greatly expands the ability of users to raise error messages which can be intercepted via SQLSTATE and/or error class. ### Does this PR introduce _any_ user-facing change? Yes, the result of assert_true() changes and raise_error() gains a new signature. ### How was this patch tested? Run existing QA and add new tests for assert_true and raise_error ### Was this patch authored or co-authored using generative AI tooling? No Closes #42985 from srielau/SPARK-44838-raise_error. 
Lead-authored-by: srielau Co-authored-by: Serge Rielau Co-authored-by: Wenchen Fan Signed-off-by: Gengliang Wang --- .../src/main/resources/error/error-classes.json| 26 +++ .../org/apache/spark/ErrorClassesJSONReader.scala | 18 ++ .../org/apache/spark/SparkThrowableHelper.scala| 10 +- .../scala/org/apache/spark/sql/functions.scala | 8 + .../function_assert_true_with_message.explain | 2 +- .../explain-results/function_raise_error.explain | 2 +- .../org/apache/spark/SparkThrowableSuite.scala | 4 +- docs/sql-error-conditions.md | 20 ++ python/pyspark/sql/tests/test_functions.py | 4 +- .../spark/sql/catalyst/expressions/misc.scala | 71 --- .../spark/sql/errors/QueryExecutionErrors.scala| 49 - .../org/apache/spark/sql/internal/SQLConf.scala| 14 ++ .../expressions/ExpressionEvalHelper.scala | 4 + .../expressions/MiscExpressionsSuite.scala | 10 +- .../catalyst/optimizer/ConstantFoldingSuite.scala | 2 +- .../scala/org/apache/spark/sql/functions.scala | 8 + .../sql-functions/sql-expression-schema.md | 2 +- .../analyzer-results/misc-functions.sql.out| 86 +++- .../resources/sql-tests/inputs/misc-functions.sql | 22 +++ .../sql-tests/results/misc-functions.sql.out | 220 +++-- .../apache/spark/sql/ColumnExpressionSuite.scala | 26 ++- .../spark/sql/execution/ui/UISeleniumSuite.scala | 9 +- .../sql/expressions/ExpressionInfoSuite.scala | 1 + 23 files changed, 551 insertions(+), 67 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index dd0190c3462..0882e387176 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -3502,6 +3502,26 @@ "3. set \"spark.sql.legacy.allowUntypedScalaUDF\" to \"true\" and use this API with caution." ] }, + "USER_RAISED_EXCEPTION" : { +"message" : [ + "" +], +"sqlState" : "P0001" + }, + "USER_RAISED_EXCEPTION_PARAMETER_MISMATCH" : { +"message" : [ + "The `raise_error()` function was used to raise error class: which expects parameters: .", + "The provided parameters do not match the expected parameters.", + "Please make sure to provide all expected parameters." +], +"sqlS
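A hedged sketch of what this means for applications, assuming a spark-shell `SparkSession` named `spark`: errors raised by `raise_error()` now carry an error class and SQLSTATE and can be intercepted through the `SparkThrowable` interface (`getErrorClass`/`getSqlState`). Depending on where the expression is evaluated, the error may surface wrapped in another exception, so this sketch walks the cause chain:

```scala
import scala.util.control.NonFatal
import org.apache.spark.SparkThrowable

// Walk the cause chain until a SparkThrowable (and hence an error class) is found.
def errorClassOf(t: Throwable): Option[String] = t match {
  case st: SparkThrowable => Option(st.getErrorClass)
  case _ if t.getCause != null => errorClassOf(t.getCause)
  case _ => None
}

try {
  spark.sql("SELECT raise_error('Error!')").collect()
} catch {
  // Expected to report USER_RAISED_EXCEPTION under the new behaviour.
  case NonFatal(e) => println(s"error class: ${errorClassOf(e)}")
}
```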
[spark] branch master updated: [SPARK-44539][BUILD] Upgrade RoaringBitmap to 1.0.0
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 8399dd321af [SPARK-44539][BUILD] Upgrade RoaringBitmap to 1.0.0 8399dd321af is described below commit 8399dd321afce0cb0051501de55da296595fdf53 Author: panbingkun AuthorDate: Wed Sep 27 11:53:53 2023 -0500 [SPARK-44539][BUILD] Upgrade RoaringBitmap to 1.0.0 ### What changes were proposed in this pull request? - The pr aims to upgrade RoaringBitmap from 0.9.45 to 1.0.0. - From version 1.0.0, the `ArraysShim` class has been moved from `shims-x.x.x.jar` jar to `RoaringBitmap-x.x.x.jar` jar, so we no longer need to rely on it. ### Why are the changes needed? - The newest brings some improvments, eg: Add zero-garbage deserialiser for ByteBuffer to RoaringBitmap by shikharid in https://github.com/RoaringBitmap/RoaringBitmap/pull/650 More specialized method for value decrementation by xtonik in https://github.com/RoaringBitmap/RoaringBitmap/pull/640 Duplicated small array sort routine by xtonik in https://github.com/RoaringBitmap/RoaringBitmap/pull/638 Avoid intermediate byte array creation by xtonik in https://github.com/RoaringBitmap/RoaringBitmap/pull/635 Useless back and forth BD bytes conversion by xtonik in https://github.com/RoaringBitmap/RoaringBitmap/pull/636 - The full release notes: https://github.com/RoaringBitmap/RoaringBitmap/releases/tag/1.0.0 https://github.com/RoaringBitmap/RoaringBitmap/releases/tag/0.9.49 https://github.com/RoaringBitmap/RoaringBitmap/releases/tag/0.9.48 https://github.com/RoaringBitmap/RoaringBitmap/releases/tag/0.9.47 https://github.com/RoaringBitmap/RoaringBitmap/releases/tag/0.9.46 ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass GA. Closes #42143 from panbingkun/SPARK-44539. 
Authored-by: panbingkun Signed-off-by: Sean Owen --- core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt | 6 +++--- core/benchmarks/MapStatusesConvertBenchmark-results.txt | 8 dev/deps/spark-deps-hadoop-3-hive-2.3 | 3 +-- pom.xml | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt index 48dbc8e0241..416aaf5b7aa 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt @@ -6,8 +6,8 @@ OpenJDK 64-Bit Server VM 21+35 on Linux 5.15.0-1046-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative -Num Maps: 5 Fetch partitions:500813900 129 0.0 812807240.0 1.0X -Num Maps: 5 Fetch partitions:1000 2226 2238 17 0.0 2226321250.0 0.4X -Num Maps: 5 Fetch partitions:1500 3149 3300 133 0.0 3148506179.0 0.3X +Num Maps: 5 Fetch partitions:500899949 74 0.0 898941184.0 1.0X +Num Maps: 5 Fetch partitions:1000 1947 2043 115 0.0 1947362412.0 0.5X +Num Maps: 5 Fetch partitions:1500 3079 3122 75 0.0 3078809212.0 0.3X diff --git a/core/benchmarks/MapStatusesConvertBenchmark-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-results.txt index 5ed55c839eb..bd87f4876e4 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-results.txt @@ -3,11 +3,11 @@ MapStatuses Convert Benchmark OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms)Rate(M/s) Per Row(ns) Relative -Num Maps: 5 Fetch partitions:500 1127 1138 13 0.0 1127479807.0 1.0X -Num Maps: 5 Fetch partitions:1000 2146 2183 49 0.0 2146214882.0 0.5X -Num Maps: 5000
[spark] branch master updated: [SPARK-45343][DOCS] Clarify behavior of multiLine in CSV options
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ab92cae78e3 [SPARK-45343][DOCS] Clarify behavior of multiLine in CSV options ab92cae78e3 is described below commit ab92cae78e3cdf58ba96b0b98e7958287c2d5cd1 Author: Bill Schneider AuthorDate: Wed Sep 27 08:25:02 2023 -0500 [SPARK-45343][DOCS] Clarify behavior of multiLine in CSV options ### What changes were proposed in this pull request? this is a documentation-only change to clarify CSV `multiLine` option: https://issues.apache.org/jira/browse/SPARK-45343 ### Why are the changes needed? documentation clarity ### Does this PR introduce _any_ user-facing change? Documentation only ### How was this patch tested? N/A, documentation only ### Was this patch authored or co-authored using generative AI tooling? Documentation only Closes #43132 from wrschneider/SPARK-45343-csv-multiline-doc-clarification. Lead-authored-by: Bill Schneider Co-authored-by: Bill Schneider Signed-off-by: Sean Owen --- docs/sql-data-sources-csv.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 31167f55143..721563d1681 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -213,7 +213,7 @@ Data source options of CSV can be set via: multiLine false -Parse one record, which may span multiple lines, per file. CSV built-in functions ignore this option. +Allows a row to span multiple lines, by parsing line breaks within quoted values as part of the value itself. CSV built-in functions ignore this option. read - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
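To make the reworded option text concrete: with `multiLine` enabled, a quoted field value may itself contain newlines and is still read back as a single row. A minimal sketch, assuming a spark-shell `SparkSession` named `spark` and an illustrative file `people.csv`:

```scala
// people.csv (illustrative contents):
//   name,bio
//   "Alice","first line
//   second line"
val df = spark.read
  .option("header", "true")
  .option("multiLine", "true")   // parse line breaks inside quoted values as data
  .csv("people.csv")
df.show(truncate = false)        // one row whose bio value spans two lines
```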
[spark] branch master updated (9f817379c68 -> b7763a7eae2)
This is an automated email from the ASF dual-hosted git repository. srowen pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 9f817379c68 [SPARK-45341][CORE] Correct the title level in the comments of KVStore.java to make `sbt doc` run successfully with Java 17 add b7763a7eae2 [SPARK-45338][CORE][SQL] Replace `scala.collection.JavaConverters` to `scala.jdk.CollectionConverters` No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/ErrorClassesJSONReader.scala | 2 +- .../src/main/scala/org/apache/spark/SparkException.scala | 2 +- .../scala/org/apache/spark/SparkThrowableHelper.scala| 2 +- .../main/scala/org/apache/spark/internal/Logging.scala | 2 +- .../org/apache/spark/sql/avro/AvroDeserializer.scala | 2 +- .../org/apache/spark/sql/avro/AvroOutputWriter.scala | 2 +- .../scala/org/apache/spark/sql/avro/AvroSerializer.scala | 2 +- .../main/scala/org/apache/spark/sql/avro/AvroUtils.scala | 2 +- .../org/apache/spark/sql/avro/SchemaConverters.scala | 2 +- .../main/scala/org/apache/spark/sql/avro/functions.scala | 2 +- .../scala/org/apache/spark/sql/v2/avro/AvroScan.scala| 2 +- .../scala/org/apache/spark/sql/v2/avro/AvroTable.scala | 2 +- .../org/apache/spark/sql/avro/AvroFunctionsSuite.scala | 2 +- .../test/scala/org/apache/spark/sql/avro/AvroSuite.scala | 2 +- .../jvm/src/main/scala/org/apache/spark/sql/Column.scala | 2 +- .../org/apache/spark/sql/DataFrameNaFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameReader.scala | 2 +- .../org/apache/spark/sql/DataFrameStatFunctions.scala| 2 +- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameWriterV2.scala | 2 +- .../src/main/scala/org/apache/spark/sql/Dataset.scala| 2 +- .../org/apache/spark/sql/KeyValueGroupedDataset.scala| 2 +- .../org/apache/spark/sql/RelationalGroupedDataset.scala | 2 +- .../main/scala/org/apache/spark/sql/SparkSession.scala | 2 +- .../main/scala/org/apache/spark/sql/avro/functions.scala | 2 +- .../scala/org/apache/spark/sql/catalog/Catalog.scala | 2 +- .../spark/sql/expressions/UserDefinedFunction.scala | 2 +- .../org/apache/spark/sql/expressions/WindowSpec.scala| 2 +- .../src/main/scala/org/apache/spark/sql/functions.scala | 2 +- .../scala/org/apache/spark/sql/protobuf/functions.scala | 2 +- .../apache/spark/sql/streaming/DataStreamReader.scala| 2 +- .../apache/spark/sql/streaming/DataStreamWriter.scala| 2 +- .../org/apache/spark/sql/streaming/StreamingQuery.scala | 2 +- .../spark/sql/streaming/StreamingQueryManager.scala | 2 +- .../scala/org/apache/spark/sql/streaming/progress.scala | 2 +- .../scala/org/apache/spark/sql/ClientE2ETestSuite.scala | 2 +- .../scala/org/apache/spark/sql/ColumnTestSuite.scala | 2 +- .../org/apache/spark/sql/DataFrameNaFunctionSuite.scala | 2 +- .../scala/org/apache/spark/sql/FunctionTestSuite.scala | 2 +- .../org/apache/spark/sql/PlanGenerationTestSuite.scala | 2 +- .../spark/sql/UserDefinedFunctionE2ETestSuite.scala | 2 +- .../apache/spark/sql/connect/client/ArtifactSuite.scala | 2 +- .../sql/connect/client/SparkConnectClientSuite.scala | 2 +- .../spark/sql/streaming/ClientStreamingQuerySuite.scala | 2 +- .../spark/sql/connect/client/ArtifactManager.scala | 2 +- .../apache/spark/sql/connect/client/ClassFinder.scala| 2 +- .../connect/client/CustomSparkConnectBlockingStub.scala | 2 +- .../client/ExecutePlanResponseReattachableIterator.scala | 2 +- .../sql/connect/client/GrpcExceptionConverter.scala | 2 +- 
.../spark/sql/connect/client/SparkConnectClient.scala| 2 +- .../sql/connect/client/arrow/ArrowEncoderUtils.scala | 2 +- .../spark/sql/connect/client/arrow/ArrowSerializer.scala | 2 +- .../sql/connect/common/LiteralValueProtoConverter.scala | 2 +- .../org/apache/spark/sql/connect/common/ProtoUtils.scala | 2 +- .../org/apache/spark/sql/connect/common/UdfUtils.scala | 2 +- .../apache/spark/sql/connect/SparkConnectPlugin.scala| 2 +- .../connect/artifact/SparkConnectArtifactManager.scala | 2 +- .../scala/org/apache/spark/sql/connect/dsl/package.scala | 2 +- .../connect/execution/SparkConnectPlanExecution.scala| 2 +- .../spark/sql/connect/planner/SparkConnectPlanner.scala | 2 +- .../connect/planner/StreamingForeachBatchHelper.scala| 2 +- .../apache/spark/sql/connect/service/ExecuteHolder.scala | 2 +- .../apache/spark/sql/connect/service/SessionHolder.scala | 2 +- .../sql/connect/service/SparkConnectAnalyzeHandler.scala | 2 +- .../service/SparkConnectArtifactStatusesHandler.scala| 2 +- .../sql/connect/service/SparkConnectConfigHandler.scala | 2 +- .../connect/service/SparkConnectExecutionManager.scala | 2 +- .../connect/service/SparkConnectInterruptHandler.scala | 2 +- ..
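The shape of this migration is purely mechanical: the import changes while the `.asScala` / `.asJava` extension methods keep working. A small standalone sketch (toy values, not taken from the patch):

```scala
// Before (deprecated on Scala 2.13): import scala.collection.JavaConverters._
import scala.jdk.CollectionConverters._

object ConvertersSketch {
  def main(args: Array[String]): Unit = {
    val javaList = java.util.Arrays.asList("a", "b", "c")
    val scalaSeq = javaList.asScala.toSeq   // Java -> Scala
    val backToJava = scalaSeq.asJava        // Scala -> Java
    println(s"$scalaSeq / $backToJava")
  }
}
```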
[spark] branch master updated: [SPARK-45341][CORE] Correct the title level in the comments of KVStore.java to make `sbt doc` run successfully with Java 17
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9f817379c68 [SPARK-45341][CORE] Correct the title level in the comments of KVStore.java to make `sbt doc` run successfully with Java 17 9f817379c68 is described below commit 9f817379c68e551680e60900f1d61b70e1b62960 Author: yangjie01 AuthorDate: Wed Sep 27 08:21:05 2023 -0500 [SPARK-45341][CORE] Correct the title level in the comments of KVStore.java to make `sbt doc` run successfully with Java 17 ### What changes were proposed in this pull request? This pr aims to correct the title level in the comments of `KVStore.java` to make `sbt doc` run successfully with Java 17. ### Why are the changes needed? Make the `sbt doc` command execute successfully with Java 17 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Manually check. run `build/sbt clean doc -Phadoop-3 -Phadoop-cloud -Pmesos -Pyarn -Pkinesis-asl -Phive-thriftserver -Pspark-ganglia-lgpl -Pkubernetes -Phive -Pvolcano` **Before** ``` [error] /Users/yangjie01/SourceCode/git/spark-mine-sbt/Picked up JAVA_TOOL_OPTIONS:-Duser.language=en [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/ArrayWrappers.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVIndex.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBTypeInfo.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/UnsupportedStoreVersionException.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreIterator.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStore.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreView.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java... [error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java... 
[error] Loading source file /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java... [error] Constructing Javadoc information... [error] Building index for all the packages and classes... [error] Standard Doclet version 17.0.8+7-LTS [error] Building tree for all the packages and classes... [error] /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStore.java:32:1: error: heading used out of sequence: , compared to implicit preceding heading: [error] * Serialization [error]^Generating /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/target/scala-2.13/api/org/apache/spark/util/kvstore/InMemoryStore.html... [error] Generating /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/target/scala-2.13/api/org/apache/spark/util/kvstore/KVIndex.html... [error] Generating /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/target/scala-2.13/api/org/apache/spark/util/kvstore/KVStore.html... [error] Generating /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore/target/scala-2.13/api/org/apache/spark/util/kvstore/KVStoreIterator.html... [error] Generating /Users/yangjie01/SourceCode/git/spark-mine-sbt/common/kvstore
[spark] branch master updated: [SPARK-45337][CORE] Refactor `AbstractCommandBuilder#getScalaVersion` to remove the check for Scala 2.12
This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2da6d1a8731 [SPARK-45337][CORE] Refactor `AbstractCommandBuilder#getScalaVersion` to remove the check for Scala 2.12 2da6d1a8731 is described below commit 2da6d1a8731431d03e8ffce2141d74241f0410dc Author: yangjie01 AuthorDate: Wed Sep 27 18:34:09 2023 +0800 [SPARK-45337][CORE] Refactor `AbstractCommandBuilder#getScalaVersion` to remove the check for Scala 2.12 ### What changes were proposed in this pull request? This pr refactors `AbstractCommandBuilder#getScalaVersion` function as follows: 1. comment out the code for multiple Scala versions support , making it easier to reintroduce when Scala 3 is supported. 2. Change to directly return `"2.13"` when `scala213.isDirectory()` is true ### Why are the changes needed? Remove Scala 2.12 check ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #43125 from LuciferYang/SPARK-45337. Authored-by: yangjie01 Signed-off-by: yangjie01 --- .../spark/launcher/AbstractCommandBuilder.java | 24 +- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index b75410e11a5..2bbeac25c06 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -230,17 +230,21 @@ abstract class AbstractCommandBuilder { return scala; } String sparkHome = getSparkHome(); -File scala212 = new File(sparkHome, "launcher/target/scala-2.12"); File scala213 = new File(sparkHome, "launcher/target/scala-2.13"); -checkState(!scala212.isDirectory() || !scala213.isDirectory(), - "Presence of build for multiple Scala versions detected.\n" + - "Either clean one of them or set SPARK_SCALA_VERSION in your environment."); -if (scala213.isDirectory()) { - return "2.13"; -} else { - checkState(scala212.isDirectory(), "Cannot find any build directories."); - return "2.12"; -} +checkState(scala213.isDirectory(), "Cannot find any build directories."); +return "2.13"; +// String sparkHome = getSparkHome(); +// File scala212 = new File(sparkHome, "launcher/target/scala-2.12"); +// File scala213 = new File(sparkHome, "launcher/target/scala-2.13"); +// checkState(!scala212.isDirectory() || !scala213.isDirectory(), +// "Presence of build for multiple Scala versions detected.\n" + +// "Either clean one of them or set SPARK_SCALA_VERSION in your environment."); +// if (scala213.isDirectory()) { +// return "2.13"; +// } else { +// checkState(scala212.isDirectory(), "Cannot find any build directories."); +// return "2.12"; +// } } String getSparkHome() { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-45268][PYTHON][DOCS] python function categories should be consistent with SQL References
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6e6ee613a0d [SPARK-45268][PYTHON][DOCS] python function categories should be consistent with SQL References 6e6ee613a0d is described below commit 6e6ee613a0d36c56d2e522510897818ba1754d56 Author: Ruifeng Zheng AuthorDate: Wed Sep 27 16:58:50 2023 +0800 [SPARK-45268][PYTHON][DOCS] python function categories should be consistent with SQL References ### What changes were proposed in this pull request? re-org python function categories ### Why are the changes needed? should be consistent with SQL expression groups ### Does this PR introduce _any_ user-facing change? yes ### How was this patch tested? manually check ### Was this patch authored or co-authored using generative AI tooling? no Closes #43045 from zhengruifeng/fix_function_doc. Lead-authored-by: Ruifeng Zheng Co-authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- .../source/reference/pyspark.sql/functions.rst | 562 - 1 file changed, 315 insertions(+), 247 deletions(-) diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index 43615d690e8..362be812d83 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -31,43 +31,71 @@ Normal Functions .. autosummary:: :toctree: api/ +broadcast +call_function col column lit -broadcast +expr + + +Conditional Functions +- +.. autosummary:: +:toctree: api/ + coalesce -input_file_name -isnan -isnull -monotonically_increasing_id -named_struct +ifnull nanvl -rand -randn -spark_partition_id +nullif +nvl +nvl2 when -bitwise_not -bitwiseNOT -expr -greatest -least -Math Functions +Predicate Functions +--- +.. autosummary:: +:toctree: api/ + +ilike +isnan +isnotnull +isnull +like +regexp +regexp_like +rlike + + +Sort Functions -- .. autosummary:: :toctree: api/ -sqrt +asc +asc_nulls_first +asc_nulls_last +desc +desc_nulls_first +desc_nulls_last + + +Mathematical Functions +-- +.. autosummary:: +:toctree: api/ + abs acos acosh asin asinh atan -atanh atan2 +atanh bin +bround cbrt ceil ceiling @@ -76,14 +104,16 @@ Math Functions cosh cot csc +degrees e exp expm1 factorial floor +greatest hex -unhex hypot +least ln log log10 @@ -96,36 +126,117 @@ Math Functions positive pow power +radians +rand +randn rint round -bround sec -shiftleft -shiftright -shiftrightunsigned sign signum sin sinh +sqrt tan tanh toDegrees +toRadians try_add -try_avg try_divide try_multiply try_subtract -try_sum +unhex +width_bucket + + +String Functions + +.. autosummary:: +:toctree: api/ + +ascii +base64 +bit_length +btrim +char +char_length +character_length +concat_ws +contains +decode +elt +encode +endswith +find_in_set +format_number +format_string +initcap +instr +lcase +left +length +levenshtein +locate +lower +lpad +ltrim +mask +octet_length +overlay +position +printf +regexp_count +regexp_extract +regexp_extract_all +regexp_instr +regexp_replace +regexp_substr +repeat +replace +right +rpad +rtrim +sentences +soundex +split +split_part +startswith +substr +substring +substring_index +to_binary +to_char +to_number +to_varchar +translate +trim try_to_binary try_to_number -degrees -toRadians -radians -width_bucket +ucase +unbase64 +upper -Datetime Functions --- +Bitwise Functions +- +.. 
autosummary:: +:toctree: api/ + +bit_count +bit_get +bitwiseNOT +bitwise_not +getbit +shiftLeft +shiftleft +shiftRight +shiftright +shiftRightUnsigned +shiftrightunsigned + + +Date and Timestamp Functions + .. autosummary:: :toctree: api/ @@ -139,24 +250,23 @@ Datetime Functions date_diff date_format date_from_unix_date +date_part date_sub date_trunc dateadd d
[spark] branch master updated: [MINOR][PYTHON][CONNECT] Check `self._session` before call `self._session.client`
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b348cd12a33 [MINOR][PYTHON][CONNECT] Check `self._session` before call `self._session.client` b348cd12a33 is described below commit b348cd12a331c9ffc463cd632b2c36dbab2d9208 Author: Ruifeng Zheng AuthorDate: Wed Sep 27 16:54:53 2023 +0800 [MINOR][PYTHON][CONNECT] Check `self._session` before call `self._session.client` ### What changes were proposed in this pull request? In `DataFrame.{schema, explain}`, check `self._session` before call `self._session.client` I don't find other similar places ### Why are the changes needed? we should check `self._session` before call `self._session.client` ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #43143 from zhengruifeng/connect_schema_check_session. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/sql/connect/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index 5e2623336a2..081eabd9837 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -1773,9 +1773,9 @@ class DataFrame: @property def schema(self) -> StructType: if self._plan is not None: -query = self._plan.to_proto(self._session.client) if self._session is None: raise Exception("Cannot analyze without SparkSession.") +query = self._plan.to_proto(self._session.client) return self._session.client.schema(query) else: raise Exception("Empty plan.") @@ -1903,9 +1903,9 @@ class DataFrame: explain_mode = cast(str, extended) if self._plan is not None: -query = self._plan.to_proto(self._session.client) if self._session is None: raise Exception("Cannot analyze without SparkSession.") +query = self._plan.to_proto(self._session.client) return self._session.client.explain_string(query, explain_mode) else: return "" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new e6e0c074636 [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression e6e0c074636 is described below commit e6e0c074636c1a43fa5957f906a881469515393a Author: zeruibao AuthorDate: Wed Sep 27 16:42:35 2023 +0800 [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression ### What changes were proposed in this pull request? My last PR https://github.com/apache/spark/pull/41052 causes AVRO read performance regression since I change the code structure. I turn one match case into a nested match case. So I fix the Avro data type conversion issues in anther way to avoid this regression. Original Change: We introduce the SQLConf `spark.sql.legacy.avro.allowReadingWithIncompatibleSchema` to prevent reading interval types as date or timestamp types to avoid getting corrupt dates as well as reading decimal types with incorrect precision. ### Why are the changes needed? We found the following issues with open source Avro: - Interval types can be read as date or timestamp types that would lead to wildly different results For example, `Duration.ofDays(1).plusSeconds(1)` will be read as `1972-09-27`, which is weird. - Decimal types can be read with lower precision, that leads to data being read as `null` instead of suggesting that a wider decimal format should be provided ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Old unit test Closes #42503 from zeruibao/SPARK-4380-real-fix-regression. Lead-authored-by: zeruibao Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit f8c87f03297e2770e2944e8e8fe097b75f9e8fea) Signed-off-by: Wenchen Fan --- .../src/main/resources/error/error-classes.json| 5 + .../apache/spark/sql/avro/AvroDeserializer.scala | 46 -- .../org/apache/spark/sql/avro/AvroSuite.scala | 158 + docs/sql-error-conditions.md | 6 + docs/sql-migration-guide.md| 1 + .../spark/sql/errors/QueryCompilationErrors.scala | 16 +++ .../org/apache/spark/sql/internal/SQLConf.scala| 12 ++ 7 files changed, 235 insertions(+), 9 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 477fe9b3f61..9bc65ae32a2 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -69,6 +69,11 @@ } } }, + "AVRO_INCOMPATIBLE_READ_TYPE" : { +"message" : [ + "Cannot convert Avro to SQL because the original encoded data type is , however you're trying to read the field as , which would lead to an incorrect answer. To allow reading this field, enable the SQL configuration: \"spark.sql.legacy.avro.allowIncompatibleSchema\"." +] + }, "BATCH_METADATA_NOT_FOUND" : { "message" : [ "Unable to find batch ." 
diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala index a78ee89a3e9..e82116eec1e 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala @@ -35,8 +35,9 @@ import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArr import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.datasources.DataSourceUtils -import org.apache.spark.sql.internal.LegacyBehaviorPolicy +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -117,6 +118,10 @@ private[sql] class AvroDeserializer( val incompatibleMsg = errorPrefix + s"schema is incompatible (avroType = $avroType, sqlType = ${catalystType.sql})" +val realDataType = SchemaConverters.toSqlType(avroType).dataType +val confKey = SQLConf.LEGACY_AVRO_ALLOW_INCOMPATIBLE_SCHEMA +val preventReadingIncorrectType = !SQLConf.get.getConf(confKey) + (avroType.getType, catalystType) match { case (NULL, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal) @@ -128,9 +133,19
[spark] branch master updated: [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f8c87f03297 [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression f8c87f03297 is described below commit f8c87f03297e2770e2944e8e8fe097b75f9e8fea Author: zeruibao AuthorDate: Wed Sep 27 16:42:35 2023 +0800 [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression ### What changes were proposed in this pull request? My last PR https://github.com/apache/spark/pull/41052 causes AVRO read performance regression since I change the code structure. I turn one match case into a nested match case. So I fix the Avro data type conversion issues in anther way to avoid this regression. Original Change: We introduce the SQLConf `spark.sql.legacy.avro.allowReadingWithIncompatibleSchema` to prevent reading interval types as date or timestamp types to avoid getting corrupt dates as well as reading decimal types with incorrect precision. ### Why are the changes needed? We found the following issues with open source Avro: - Interval types can be read as date or timestamp types that would lead to wildly different results For example, `Duration.ofDays(1).plusSeconds(1)` will be read as `1972-09-27`, which is weird. - Decimal types can be read with lower precision, that leads to data being read as `null` instead of suggesting that a wider decimal format should be provided ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Old unit test Closes #42503 from zeruibao/SPARK-4380-real-fix-regression. Lead-authored-by: zeruibao Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../src/main/resources/error/error-classes.json| 5 + .../apache/spark/sql/avro/AvroDeserializer.scala | 46 -- .../org/apache/spark/sql/avro/AvroSuite.scala | 158 + docs/sql-error-conditions.md | 8 +- docs/sql-migration-guide.md| 1 + .../spark/sql/errors/QueryCompilationErrors.scala | 16 +++ .../org/apache/spark/sql/internal/SQLConf.scala| 12 ++ 7 files changed, 235 insertions(+), 11 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 5d827c67482..dd0190c3462 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -75,6 +75,11 @@ } } }, + "AVRO_INCOMPATIBLE_READ_TYPE" : { +"message" : [ + "Cannot convert Avro to SQL because the original encoded data type is , however you're trying to read the field as , which would lead to an incorrect answer. To allow reading this field, enable the SQL configuration: \"spark.sql.legacy.avro.allowIncompatibleSchema\"." +] + }, "BATCH_METADATA_NOT_FOUND" : { "message" : [ "Unable to find batch ." 
diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala index a78ee89a3e9..e82116eec1e 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala @@ -35,8 +35,9 @@ import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArr import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.datasources.DataSourceUtils -import org.apache.spark.sql.internal.LegacyBehaviorPolicy +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -117,6 +118,10 @@ private[sql] class AvroDeserializer( val incompatibleMsg = errorPrefix + s"schema is incompatible (avroType = $avroType, sqlType = ${catalystType.sql})" +val realDataType = SchemaConverters.toSqlType(avroType).dataType +val confKey = SQLConf.LEGACY_AVRO_ALLOW_INCOMPATIBLE_SCHEMA +val preventReadingIncorrectType = !SQLConf.get.getConf(confKey) + (avroType.getType, catalystType) match { case (NULL, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal) @@ -128,9 +133,19 @@ private[sql] class AvroDeserializer( case (INT, IntegerType) => (updater, ordinal, value) =>
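The guard added above is easiest to see from the user side. Below is a minimal, hypothetical sketch: the Avro file path and field name are made up, it assumes the spark-avro package is on the classpath, and it uses the legacy conf key as spelled in the error message (the PR description spells it slightly differently). With the fix, forcing an incompatible read type should surface the new AVRO_INCOMPATIBLE_READ_TYPE error instead of silently returning corrupt values.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType

spark = SparkSession.builder.getOrCreate()

# Hypothetical file: its "dur" field was encoded as an Avro interval/duration,
# so forcing it to be read as a date is exactly the corrupting case described above.
forced_schema = StructType([StructField("dur", DateType())])

try:
    spark.read.format("avro").schema(forced_schema).load("/tmp/durations.avro").show()
except Exception as e:
    print(e)  # expected to mention AVRO_INCOMPATIBLE_READ_TYPE after this change

# Opting back into the old, lossy behaviour (legacy flag named as in the error message):
spark.conf.set("spark.sql.legacy.avro.allowIncompatibleSchema", "true")
```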
[spark] branch branch-3.5 updated: [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new eb6b68daa4e [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema eb6b68daa4e is described below commit eb6b68daa4ef237d92575f799bafd97b1c3615b5 Author: Wenchen Fan AuthorDate: Wed Sep 27 16:00:11 2023 +0800 [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema ### What changes were proposed in this pull request? Currently, when we infer schema from Parquet files and try to merge the schema, the merge is always case-sensitive. A check then fails later which tries to make sure the data schema of the Parquet fields does not have duplicated columns, in a case-insensitive way (the default). This PR fixes the problem and makes the schema merging respect the case sensitivity flag. ### Why are the changes needed? Bug fix. ### Does this PR introduce _any_ user-facing change? Yes, Spark can now read some Parquet files that it previously could not. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #43134 from cloud-fan/merge-schema. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 1cbc424ae2acaf4d82f928cfea2767c81425305e) Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/types/StructType.scala| 28 +++--- .../execution/datasources/SchemaMergeUtils.scala | 5 ++-- .../datasources/parquet/ParquetSchemaSuite.scala | 21 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala index 8edc7cf370b..8fd7f47b346 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import java.util.Locale + import scala.collection.{mutable, Map} import scala.util.Try import scala.util.control.NonFatal @@ -476,8 +478,8 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * 4. Otherwise, `this` and `that` are considered as conflicting schemas and an exception would be *thrown. 
*/ - private[sql] def merge(that: StructType): StructType = -StructType.merge(this, that).asInstanceOf[StructType] + private[sql] def merge(that: StructType, caseSensitive: Boolean = true): StructType = +StructType.merge(this, that, caseSensitive).asInstanceOf[StructType] override private[spark] def asNullable: StructType = { val newFields = fields.map { @@ -561,16 +563,20 @@ object StructType extends AbstractDataType { StructType(newFields) }) - private[sql] def merge(left: DataType, right: DataType): DataType = + private[sql] def merge(left: DataType, right: DataType, caseSensitive: Boolean = true): DataType = mergeInternal(left, right, (s1: StructType, s2: StructType) => { val leftFields = s1.fields val rightFields = s2.fields val newFields = mutable.ArrayBuffer.empty[StructField] - val rightMapped = fieldsMap(rightFields) + def normalize(name: String): String = { +if (caseSensitive) name else name.toLowerCase(Locale.ROOT) + } + + val rightMapped = fieldsMap(rightFields, caseSensitive) leftFields.foreach { case leftField @ StructField(leftName, leftType, leftNullable, _) => - rightMapped.get(leftName) + rightMapped.get(normalize(leftName)) .map { case rightField @ StructField(rightName, rightType, rightNullable, _) => try { leftField.copy( @@ -588,9 +594,9 @@ object StructType extends AbstractDataType { .foreach(newFields += _) } - val leftMapped = fieldsMap(leftFields) + val leftMapped = fieldsMap(leftFields, caseSensitive) rightFields -.filterNot(f => leftMapped.get(f.name).nonEmpty) +.filterNot(f => leftMapped.contains(normalize(f.name))) .foreach { f => newFields += f } @@ -643,11 +649,15 @@ object StructType extends AbstractDataType { throw DataTypeErrors.cannotMergeIncompatibleDataTypesError(left, right) } - private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = { + private[sql] def fieldsMap( + fields: Array[StructField], + caseSensitive: Boolean = true): Map[String, StructField] = { // Mimics the optimization of breakOut, not present in Scala 2.13, while working in 2.12 val map = mutable.Map[String, S
[spark] branch master updated: [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1cbc424ae2a [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema 1cbc424ae2a is described below commit 1cbc424ae2acaf4d82f928cfea2767c81425305e Author: Wenchen Fan AuthorDate: Wed Sep 27 16:00:11 2023 +0800 [SPARK-45346][SQL] Parquet schema inference should respect case sensitive flag when merging schema ### What changes were proposed in this pull request? Currently, when we infer schema from Parquet files and try to merge the schema, the merge is always case-sensitive. A check then fails later which tries to make sure the data schema of the Parquet fields does not have duplicated columns, in a case-insensitive way (the default). This PR fixes the problem and makes the schema merging respect the case sensitivity flag. ### Why are the changes needed? Bug fix. ### Does this PR introduce _any_ user-facing change? Yes, Spark can now read some Parquet files that it previously could not. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #43134 from cloud-fan/merge-schema. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/types/StructType.scala| 28 +++--- .../execution/datasources/SchemaMergeUtils.scala | 5 ++-- .../datasources/parquet/ParquetSchemaSuite.scala | 21 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala index 8edc7cf370b..8fd7f47b346 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import java.util.Locale + import scala.collection.{mutable, Map} import scala.util.Try import scala.util.control.NonFatal @@ -476,8 +478,8 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * 4. Otherwise, `this` and `that` are considered as conflicting schemas and an exception would be *thrown. 
*/ - private[sql] def merge(that: StructType): StructType = -StructType.merge(this, that).asInstanceOf[StructType] + private[sql] def merge(that: StructType, caseSensitive: Boolean = true): StructType = +StructType.merge(this, that, caseSensitive).asInstanceOf[StructType] override private[spark] def asNullable: StructType = { val newFields = fields.map { @@ -561,16 +563,20 @@ object StructType extends AbstractDataType { StructType(newFields) }) - private[sql] def merge(left: DataType, right: DataType): DataType = + private[sql] def merge(left: DataType, right: DataType, caseSensitive: Boolean = true): DataType = mergeInternal(left, right, (s1: StructType, s2: StructType) => { val leftFields = s1.fields val rightFields = s2.fields val newFields = mutable.ArrayBuffer.empty[StructField] - val rightMapped = fieldsMap(rightFields) + def normalize(name: String): String = { +if (caseSensitive) name else name.toLowerCase(Locale.ROOT) + } + + val rightMapped = fieldsMap(rightFields, caseSensitive) leftFields.foreach { case leftField @ StructField(leftName, leftType, leftNullable, _) => - rightMapped.get(leftName) + rightMapped.get(normalize(leftName)) .map { case rightField @ StructField(rightName, rightType, rightNullable, _) => try { leftField.copy( @@ -588,9 +594,9 @@ object StructType extends AbstractDataType { .foreach(newFields += _) } - val leftMapped = fieldsMap(leftFields) + val leftMapped = fieldsMap(leftFields, caseSensitive) rightFields -.filterNot(f => leftMapped.get(f.name).nonEmpty) +.filterNot(f => leftMapped.contains(normalize(f.name))) .foreach { f => newFields += f } @@ -643,11 +649,15 @@ object StructType extends AbstractDataType { throw DataTypeErrors.cannotMergeIncompatibleDataTypesError(left, right) } - private[sql] def fieldsMap(fields: Array[StructField]): Map[String, StructField] = { + private[sql] def fieldsMap( + fields: Array[StructField], + caseSensitive: Boolean = true): Map[String, StructField] = { // Mimics the optimization of breakOut, not present in Scala 2.13, while working in 2.12 val map = mutable.Map[String, StructField]() map.sizeHint(fields.length) -fields.foreach(s => map.put(s.name, s)) +fields.foreach {
[spark] branch master updated: [SPARK-43662][PS][CONNECT] Support merge_asof in Spark Connect
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 400d4ded64e [SPARK-43662][PS][CONNECT] Support merge_asof in Spark Connect 400d4ded64e is described below commit 400d4ded64efdf62b75f2bcdc6025d474d6395ee Author: Takuya UESHIN AuthorDate: Wed Sep 27 15:09:47 2023 +0800 [SPARK-43662][PS][CONNECT] Support merge_asof in Spark Connect ### What changes were proposed in this pull request? Supports `merge_asof` in Spark Connect. ### Why are the changes needed? `merge_asof` is missing in Spark Connect. Ref: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.merge_asof.html ### Does this PR introduce _any_ user-facing change? Yes, `merge_asof` is available in Spark Connect. ### How was this patch tested? The parity tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43137 from ueshin/issues/SPARK-43662/merge_asof. Authored-by: Takuya UESHIN Signed-off-by: Ruifeng Zheng --- .../main/protobuf/spark/connect/relations.proto| 42 .../sql/connect/planner/SparkConnectPlanner.scala | 37 +++ .../pandas/tests/connect/test_parity_reshape.py| 4 +- python/pyspark/sql/connect/dataframe.py| 41 +++ python/pyspark/sql/connect/plan.py | 94 +++ python/pyspark/sql/connect/proto/relations_pb2.py | 278 +++-- python/pyspark/sql/connect/proto/relations_pb2.pyi | 123 + python/pyspark/sql/dataframe.py| 3 + 8 files changed, 481 insertions(+), 141 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto index 0cf08431d46..deb33978386 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -72,6 +72,7 @@ message Relation { CachedLocalRelation cached_local_relation = 36; CachedRemoteRelation cached_remote_relation = 37; CommonInlineUserDefinedTableFunction common_inline_user_defined_table_function = 38; +AsOfJoin as_of_join = 39; // NA functions NAFill fill_na = 90; @@ -1009,3 +1010,44 @@ message Parse { PARSE_FORMAT_JSON = 2; } } + +// Relation of type [[AsOfJoin]]. +// +// `left` and `right` must be present. +message AsOfJoin { + // (Required) Left input relation for a Join. + Relation left = 1; + + // (Required) Right input relation for a Join. + Relation right = 2; + + // (Required) Field to join on in left DataFrame + Expression left_as_of = 3; + + // (Required) Field to join on in right DataFrame + Expression right_as_of = 4; + + // (Optional) The join condition. Could be unset when `using_columns` is utilized. + // + // This field does not co-exist with using_columns. + Expression join_expr = 5; + + // Optional. using_columns provides a list of columns that should present on both sides of + // the join inputs that this Join will join on. For example A JOIN B USING col_name is + // equivalent to A JOIN B on A.col_name = B.col_name. + // + // This field does not co-exist with join_condition. + repeated string using_columns = 6; + + // (Required) The join type. + string join_type = 7; + + // (Optional) The asof tolerance within this range. + Expression tolerance = 8; + + // (Required) Whether allow matching with the same value or not. 
+ bool allow_exact_matches = 9; + + // (Required) Whether to search for prior, subsequent, or closest matches. + string direction = 10; +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index dda7a713fa0..e6b5d89e1ab 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -110,6 +110,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { case proto.Relation.RelTypeCase.OFFSET => transformOffset(rel.getOffset) case proto.Relation.RelTypeCase.TAIL => transformTail(rel.getTail) case proto.Relation.RelTypeCase.JOIN => transformJoinOrJoinWith(rel.getJoin) + case proto.Relation.RelTypeCase.AS_OF_JOIN => transformAsOfJoin(rel.getAsOfJoin) case proto.Relation.RelTypeCase.DEDUPLICATE => transformDeduplicate(rel.getDeduplicate) case proto.Relation.RelTypeCase.SET_OP => transformSetOperation(rel.get
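For context, here is a minimal pandas-on-Spark usage sketch of the API this plumbing enables; the data and parameter values are illustrative, and the keyword arguments correspond to the new AsOfJoin proto fields (`tolerance`, `allow_exact_matches`, `direction`):

```python
import pyspark.pandas as ps

# For every trade, pick the most recent quote at or before its time
# (direction="backward" is the default for an as-of join).
quotes = ps.DataFrame({"time": [1, 2, 3, 5], "bid": [100.0, 101.0, 102.0, 104.0]})
trades = ps.DataFrame({"time": [2, 4, 6], "qty": [10, 20, 30]})

merged = ps.merge_asof(
    trades,
    quotes,
    on="time",
    tolerance=2,               # only accept quotes within 2 time units
    allow_exact_matches=True,  # a quote at exactly the trade time may match
    direction="backward",
)
print(merged.sort_values("time").to_pandas())
```

With Spark Connect, the same call is now translated into the AsOfJoin relation defined in the proto above and planned by SparkConnectPlanner, rather than failing as an unsupported relation type.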
[spark] branch master updated: [SPARK-45308][PS][TESTS] Enable `GroupbySplitApplyTests.test_split_apply_combine_on_series` for pandas 2.0.0
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0b057c807f1 [SPARK-45308][PS][TESTS] Enable `GroupbySplitApplyTests.test_split_apply_combine_on_series` for pandas 2.0.0 0b057c807f1 is described below commit 0b057c807f193ef9d09bb973411b06bf33438987 Author: Haejoon Lee AuthorDate: Wed Sep 27 15:06:41 2023 +0800 [SPARK-45308][PS][TESTS] Enable `GroupbySplitApplyTests.test_split_apply_combine_on_series` for pandas 2.0.0 ### What changes were proposed in this pull request? This PR proposes to enable `GroupbySplitApplyTests.test_split_apply_combine_on_series`. ### Why are the changes needed? Similar to https://github.com/apache/spark/pull/43002, this test has been skipped since the Pandas 2.0.0 upgrade, but the root cause of the test failure is classified as a regression in Pandas. So we can manually make the test pass for now and will update the test when the Pandas regression is resolved. ### Does this PR introduce _any_ user-facing change? No, it's test-only. ### How was this patch tested? Updated the existing test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43096 from itholic/SPARK-45308. Authored-by: Haejoon Lee Signed-off-by: Ruifeng Zheng --- python/pyspark/pandas/tests/groupby/test_split_apply.py | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py index 070fa01a868..a3ef8c73de4 100644 --- a/python/pyspark/pandas/tests/groupby/test_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py @@ -40,18 +40,18 @@ class GroupbySplitApplyMixin: def psdf(self): return ps.from_pandas(self.pdf) -@unittest.skipIf( -LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), -"TODO(SPARK-43445): Enable GroupBySlowTests.test_split_apply_combine_on_series " -"for pandas 2.0.0.", -) def test_split_apply_combine_on_series(self): +# TODO(SPARK-45228): Enabling string type columns for `test_split_apply_combine_on_series` +# when Pandas regression is fixed +# There is a regression in Pandas 2.1.0, +# so we should manually cast to float until the regression is fixed. +# See https://github.com/pandas-dev/pandas/issues/55194. pdf = pd.DataFrame( { "a": [1, 2, 6, 4, 4, 6, 4, 3, 7], "b": [4, 2, 7, 3, 3, 1, 1, 1, 2], "c": [4, 2, 7, 3, None, 1, 1, 1, 2], -"d": list("abcdefght"), +# "d": list("abcdefght"), }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9], )
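A simplified sketch of the kind of comparison the re-enabled test performs, with the manual float cast the comment above refers to; the real suite exercises many more aggregations, and this only shows the shape of the workaround:

```python
import pandas as pd
import pyspark.pandas as ps

pdf = pd.DataFrame(
    {
        "a": [1, 2, 6, 4, 4, 6, 4, 3, 7],
        "b": [4, 2, 7, 3, 3, 1, 1, 1, 2],
        "c": [4, 2, 7, 3, None, 1, 1, 1, 2],
    },
    index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)
psdf = ps.from_pandas(pdf)

# Casting both results to float sidesteps the dtype difference that the
# pandas 2.1.0 regression (pandas-dev/pandas#55194) would otherwise expose.
left = psdf.groupby("a")["b"].sum().sort_index().to_pandas().astype(float)
right = pdf.groupby("a")["b"].sum().sort_index().astype(float)
pd.testing.assert_series_equal(left, right)
```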