This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 207d675110e6 [SPARK-48211][SQL] DB2: Read SMALLINT as ShortType
207d675110e6 is described below

commit 207d675110e6fa699a434e81296f6f050eb0304b
Author: Kent Yao <y...@apache.org>
AuthorDate: Thu May 9 17:27:04 2024 +0800

    [SPARK-48211][SQL] DB2: Read SMALLINT as ShortType

    ### What changes were proposed in this pull request?

    This PR supports reading SMALLINT from DB2 as ShortType.

    ### Why are the changes needed?

    - 15 bits is sufficient: DB2's SMALLINT is a 16-bit signed integer, which fits ShortType exactly
    - we already write ShortType as SMALLINT
    - we already read SMALLINT from the other built-in JDBC sources as ShortType

    ### Does this PR introduce _any_ user-facing change?

    Yes, a migration guide entry is added for this change.

    ### How was this patch tested?

    Changed existing tests.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #46497 from yaooqinn/SPARK-48211.

    Authored-by: Kent Yao <y...@apache.org>
    Signed-off-by: Kent Yao <y...@apache.org>
---
 .../spark/sql/jdbc/DB2IntegrationSuite.scala       | 69 +++++++++++++---------
 docs/sql-migration-guide.md                        |  1 +
 .../org/apache/spark/sql/internal/SQLConf.scala    | 11 ++++
 .../org/apache/spark/sql/jdbc/DB2Dialect.scala     |  3 +
 4 files changed, 56 insertions(+), 28 deletions(-)
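For orientation (a sketch, not part of this commit), here is what the new mapping means for a plain DB2 read; the JDBC URL, table, and credentials below are hypothetical placeholders:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("db2-smallint-demo").getOrCreate()

    // Placeholder connection details; point these at a real DB2 instance.
    val df = spark.read
      .format("jdbc")
      .option("url", "jdbc:db2://localhost:50000/sampledb")
      .option("dbtable", "smallint_tbl") // a table with a SMALLINT column
      .option("user", "db2inst1")
      .option("password", "secret")
      .load()

    // With this commit the SMALLINT column resolves to ShortType; with
    // spark.sql.legacy.db2.numericMapping.enabled=true it stays IntegerType.
    df.printSchema()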
diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
index cedb33d491fb..aca174cce194 100644
--- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
+++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
@@ -25,6 +25,7 @@ import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.sql.{Row, SaveMode}
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{BooleanType, ByteType, ShortType, StructType}
 import org.apache.spark.tags.DockerTest
 
@@ -77,32 +78,44 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite {
   }
 
   test("Numeric types") {
-    val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties)
-    val rows = df.collect()
-    assert(rows.length == 1)
-    val types = rows(0).toSeq.map(x => x.getClass.toString)
-    assert(types.length == 10)
-    assert(types(0).equals("class java.lang.Integer"))
-    assert(types(1).equals("class java.lang.Integer"))
-    assert(types(2).equals("class java.lang.Long"))
-    assert(types(3).equals("class java.math.BigDecimal"))
-    assert(types(4).equals("class java.lang.Double"))
-    assert(types(5).equals("class java.lang.Double"))
-    assert(types(6).equals("class java.lang.Float"))
-    assert(types(7).equals("class java.math.BigDecimal"))
-    assert(types(8).equals("class java.math.BigDecimal"))
-    assert(types(9).equals("class java.math.BigDecimal"))
-    assert(rows(0).getInt(0) == 17)
-    assert(rows(0).getInt(1) == 77777)
-    assert(rows(0).getLong(2) == 922337203685477580L)
-    val bd = new BigDecimal("123456745.56789012345000000000")
-    assert(rows(0).getAs[BigDecimal](3).equals(bd))
-    assert(rows(0).getDouble(4) == 42.75)
-    assert(rows(0).getDouble(5) == 5.4E-70)
-    assert(rows(0).getFloat(6) == 3.4028234663852886e+38)
-    assert(rows(0).getDecimal(7) == new BigDecimal("4.299900000000000000"))
-    assert(rows(0).getDecimal(8) == new BigDecimal("99999999999999990000.000000000000000000"))
-    assert(rows(0).getDecimal(9) == new BigDecimal("1234567891234567.123456789123456789"))
+    Seq(true, false).foreach { legacy =>
+      withSQLConf(SQLConf.LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED.key -> legacy.toString) {
+        val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties)
+        val rows = df.collect()
+        assert(rows.length == 1)
+        val types = rows(0).toSeq.map(x => x.getClass.toString)
+        assert(types.length == 10)
+        if (legacy) {
+          assert(types(0).equals("class java.lang.Integer"))
+        } else {
+          assert(types(0).equals("class java.lang.Short"))
+        }
+        assert(types(1).equals("class java.lang.Integer"))
+        assert(types(2).equals("class java.lang.Long"))
+        assert(types(3).equals("class java.math.BigDecimal"))
+        assert(types(4).equals("class java.lang.Double"))
+        assert(types(5).equals("class java.lang.Double"))
+        assert(types(6).equals("class java.lang.Float"))
+        assert(types(7).equals("class java.math.BigDecimal"))
+        assert(types(8).equals("class java.math.BigDecimal"))
+        assert(types(9).equals("class java.math.BigDecimal"))
+        if (legacy) {
+          assert(rows(0).getInt(0) == 17)
+        } else {
+          assert(rows(0).getShort(0) == 17)
+        }
+        assert(rows(0).getInt(1) == 77777)
+        assert(rows(0).getLong(2) == 922337203685477580L)
+        val bd = new BigDecimal("123456745.56789012345000000000")
+        assert(rows(0).getAs[BigDecimal](3).equals(bd))
+        assert(rows(0).getDouble(4) == 42.75)
+        assert(rows(0).getDouble(5) == 5.4E-70)
+        assert(rows(0).getFloat(6) == 3.4028234663852886e+38)
+        assert(rows(0).getDecimal(7) == new BigDecimal("4.299900000000000000"))
+        assert(rows(0).getDecimal(8) == new BigDecimal("99999999999999990000.000000000000000000"))
+        assert(rows(0).getDecimal(9) == new BigDecimal("1234567891234567.123456789123456789"))
+      }
+    }
   }
 
   test("Date types") {
@@ -154,8 +167,8 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite {
       new StructType().add("c1", ShortType).add("b", ByteType).add("c3", BooleanType))
     df4.write.jdbc(jdbcUrl, "otherscopy", new Properties)
     val rows = sqlContext.read.jdbc(jdbcUrl, "otherscopy", new Properties).collect()
-    assert(rows(0).getInt(0) == 1)
-    assert(rows(0).getInt(1) == 20)
+    assert(rows(0).getShort(0) == 1)
+    assert(rows(0).getShort(1) == 20)
     assert(rows(0).getString(2) == "1")
   }
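The updated test asserts on the runtime classes of collected rows. As a side note (a sketch, not part of the commit), the same mapping can be checked on the DataFrame schema before any data is collected, reusing the suite's jdbcUrl and "numbers" table:

    import java.util.Properties

    import org.apache.spark.sql.types.ShortType

    // Assumes the suite's jdbcUrl and the "numbers" table are in scope.
    val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties)
    // With the new mapping, the first column (a DB2 SMALLINT) is ShortType.
    assert(df.schema.fields(0).dataType == ShortType)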
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index bd6604cb69c0..8b55fb48b8b5 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -50,6 +50,7 @@ license: |
 - Since Spark 4.0, Oracle JDBC datasource will write TimestampType as TIMESTAMP WITH LOCAL TIME ZONE, while in Spark 3.5 and previous, write as TIMESTAMP. To restore the previous behavior, set `spark.sql.legacy.oracle.timestampMapping.enabled` to `true`.
 - Since Spark 4.0, MsSQL Server JDBC datasource will read TINYINT as ShortType, while in Spark 3.5 and previous, read as IntegerType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.numericMapping.enabled` to `true`.
 - Since Spark 4.0, MsSQL Server JDBC datasource will read DATETIMEOFFSET as TimestampType, while in Spark 3.5 and previous, read as StringType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled` to `true`.
+- Since Spark 4.0, DB2 JDBC datasource will read SMALLINT as ShortType, while in Spark 3.5 and previous, it was read as IntegerType. To restore the previous behavior, set `spark.sql.legacy.db2.numericMapping.enabled` to `true`.
 - Since Spark 4.0, the default value for `spark.sql.legacy.ctePrecedencePolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an error, inner CTE definitions take precedence over outer definitions.
 - Since Spark 4.0, the default value for `spark.sql.legacy.timeParserPolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an `INCONSISTENT_BEHAVIOR_CROSS_VERSION` error, `CANNOT_PARSE_TIMESTAMP` will be raised if ANSI mode is enabled. `NULL` will be returned if ANSI mode is disabled. See [Datetime Patterns for Formatting and Parsing](sql-ref-datetime-pattern.html).
 - Since Spark 4.0, a bug falsely allowing `!` instead of `NOT` when `!` is not a prefix operator has been fixed. Clauses such as `expr ! IN (...)`, `expr ! BETWEEN ...`, or `col ! NULL` now raise syntax errors. To restore the previous behavior, set `spark.sql.legacy.bangEqualsNot` to `true`.
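For users affected by the new DB2 entry above, a minimal sketch of opting back into the old mapping for a session (assuming an active SparkSession named spark):

    // Restore the pre-4.0 behavior for this session: DB2 SMALLINT -> IntegerType.
    spark.conf.set("spark.sql.legacy.db2.numericMapping.enabled", "true")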
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index df75985043d0..54aa87260534 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4222,6 +4222,14 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED =
+    buildConf("spark.sql.legacy.db2.numericMapping.enabled")
+      .internal()
+      .doc("When true, SMALLINT maps to IntegerType in DB2; otherwise, ShortType.")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val CSV_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.csv.filterPushdown.enabled")
     .doc("When true, enable filter pushdown to CSV datasource.")
     .version("3.0.0")
@@ -5339,6 +5347,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
   def legacyOracleTimestampMappingEnabled: Boolean =
     getConf(LEGACY_ORACLE_TIMESTAMP_MAPPING_ENABLED)
 
+  def legacyDB2numericMappingEnabled: Boolean =
+    getConf(LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED)
+
   override def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value = {
     LegacyBehaviorPolicy.withName(getConf(SQLConf.LEGACY_TIME_PARSER_POLICY))
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala
index 31a7c783ba60..cc596a5f0185 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.NonEmptyNamespaceException
 import org.apache.spark.sql.connector.catalog.Identifier
 import org.apache.spark.sql.connector.expressions.Expression
 import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 private case class DB2Dialect() extends JdbcDialect {
@@ -86,6 +87,8 @@ private case class DB2Dialect() extends JdbcDialect {
       typeName: String,
       size: Int,
       md: MetadataBuilder): Option[DataType] = sqlType match {
+    case Types.SMALLINT if !SQLConf.get.legacyDB2numericMappingEnabled =>
+      Option(ShortType)
     case Types.REAL => Option(FloatType)
     case Types.OTHER =>
       typeName match {
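The dialect change closes the read/write asymmetry mentioned in the PR description: ShortType is written to DB2 as SMALLINT, and SMALLINT now reads back as ShortType. A round-trip sketch, assuming an active SparkSession named spark and a reachable DB2 instance behind a placeholder jdbcUrl and a scratch table:

    import java.util.Properties

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{ShortType, StructType}

    val jdbcUrl = "jdbc:db2://localhost:50000/sampledb" // placeholder
    val schema = new StructType().add("c1", ShortType)
    val df = spark.createDataFrame(
      spark.sparkContext.parallelize(Seq(Row(1.toShort))), schema)

    // ShortType is written to DB2 as SMALLINT ...
    df.write.jdbc(jdbcUrl, "shorts_roundtrip", new Properties)

    // ... and, with the getCatalystType case above, read back as ShortType
    // rather than being widened to IntegerType.
    val back = spark.read.jdbc(jdbcUrl, "shorts_roundtrip", new Properties)
    assert(back.schema.head.dataType == ShortType)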