This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e2e6e09a24db [SPARK-47634][SQL] Add legacy support for disabling map key normalization e2e6e09a24db is described below commit e2e6e09a24dbb828b55ee35f60da22c3df4ac2c5 Author: Stevo Mitric <stevo.mit...@databricks.com> AuthorDate: Tue Apr 2 16:28:27 2024 +0800 [SPARK-47634][SQL] Add legacy support for disabling map key normalization ### What changes were proposed in this pull request? Added a `DISABLE_MAP_KEY_NORMALIZATION` option in `SQLConf` to allow for legacy creation of a map without key normalization (keys `0.0` and `-0.0`) in `ArrayBasedMapBuilder`. ### Why are the changes needed? As a legacy fallback option. ### Does this PR introduce _any_ user-facing change? New `DISABLE_MAP_KEY_NORMALIZATION` config option. ### How was this patch tested? New UT proposed in this PR ### Was this patch authored or co-authored using generative AI tooling? No Closes #45760 from stevomitric/stevomitric/normalize-conf. Authored-by: Stevo Mitric <stevo.mit...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- docs/sql-migration-guide.md | 1 + .../spark/sql/catalyst/util/ArrayBasedMapBuilder.scala | 12 +++++++----- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++++++ .../spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala | 10 ++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index b22665487c7b..13d6702c4cf9 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -24,6 +24,7 @@ license: | ## Upgrading from Spark SQL 3.5 to 4.0 +- Since Spark 4.0, the default behaviour when inserting elements in a map is changed to first normalize keys -0.0 to 0.0. The affected SQL functions are `create_map`, `map_from_arrays`, `map_from_entries`, and `map_concat`. To restore the previous behaviour, set `spark.sql.legacy.disableMapKeyNormalization` to `true`. - Since Spark 4.0, the default value of `spark.sql.maxSinglePartitionBytes` is changed from `Long.MaxValue` to `128m`. To restore the previous behavior, set `spark.sql.maxSinglePartitionBytes` to `9223372036854775807`(`Long.MaxValue`). - Since Spark 4.0, any read of SQL tables takes into consideration the SQL configs `spark.sql.files.ignoreCorruptFiles`/`spark.sql.files.ignoreMissingFiles` instead of the core config `spark.files.ignoreCorruptFiles`/`spark.files.ignoreMissingFiles`. - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala index d13c3c6026a2..a2d41ebf04e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala @@ -53,11 +53,13 @@ class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Seria private val mapKeyDedupPolicy = SQLConf.get.getConf(SQLConf.MAP_KEY_DEDUP_POLICY) - private lazy val keyNormalizer: Any => Any = keyType match { - case FloatType => NormalizeFloatingNumbers.FLOAT_NORMALIZER - case DoubleType => NormalizeFloatingNumbers.DOUBLE_NORMALIZER - case _ => identity - } + private lazy val keyNormalizer: Any => Any = + (SQLConf.get.getConf(SQLConf.DISABLE_MAP_KEY_NORMALIZATION), keyType) match { + case (false, FloatType) => NormalizeFloatingNumbers.FLOAT_NORMALIZER + case (false, DoubleType) => NormalizeFloatingNumbers.DOUBLE_NORMALIZER + case _ => identity + } + def put(key: Any, value: Any): Unit = { if (key == null) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index fc7425ce2bea..af4498274620 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3247,6 +3247,16 @@ object SQLConf { .stringConf .createWithDefault("") + val DISABLE_MAP_KEY_NORMALIZATION = + buildConf("spark.sql.legacy.disableMapKeyNormalization") + .internal() + .doc("Disables key normalization when creating a map with `ArrayBasedMapBuilder`. When " + + "set to `true` it will prevent key normalization when building a map, which will " + + "allow for values such as `-0.0` and `0.0` to be present as distinct keys.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val FASTFAIL_ON_FILEFORMAT_OUTPUT = buildConf("spark.sql.execution.fastFailOnFileFormatOutput") .internal() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala index 3c8c49ee7fec..1d3fb835f5a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala @@ -72,6 +72,16 @@ class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper { ) } + test ("disable map key normalization") { + withSQLConf(SQLConf.DISABLE_MAP_KEY_NORMALIZATION.key -> "true") { + val builder = new ArrayBasedMapBuilder(DoubleType, IntegerType) + builder.put(0.0, 1) + builder.put(-0.0, 1) + val map = builder.build() + assert(map.numElements() == 2) + } + } + test("successful map normalization on build") { val builder = new ArrayBasedMapBuilder(DoubleType, IntegerType) builder.put(-0.0, 1) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org