sunxiaoguang commented on code in PR #49453:
URL: https://github.com/apache/spark/pull/49453#discussion_r1929552100
##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala:
##########
@@ -241,6 +241,61 @@ class MySQLIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCTest
assert(rows10(0).getString(0) === "amy")
assert(rows10(1).getString(0) === "alex")
}
+
+ test("SPARK-50793: MySQL JDBC Connector failed to cast some types") {
+ val tableName = catalogName + ".test_cast_function"
+ withTable(tableName) {
+ val stringValue = "0"
+ val stringLiteral = "'0'"
+ val longValue = 0L
+ val binaryValue = Array[Byte](0x30)
+ val binaryLiteral = "x'30'"
+ val doubleValue = 0.0
+ val doubleLiteral = "0.0"
+ // CREATE table to use types defined in Spark SQL
+ sql(
+ s"CREATE TABLE $tableName (string_col STRING, long_col LONG, " +
+ "binary_col BINARY, double_col DOUBLE)")
+ sql(
+ s"INSERT INTO $tableName VALUES($stringLiteral, $longValue,
$binaryLiteral, $doubleValue)")
+
+ def testCast(
+ castType: String,
+ sourceCol: String,
+ targetCol: String,
+ targetDataType: DataType,
+ targetValue: Any): Unit = {
+ val sql = s"SELECT CAST($sourceCol AS $castType) AS target " +
+ s"FROM $tableName WHERE CAST($sourceCol AS $castType) = $targetCol"
+ val df = spark.sql(sql)
+ castType match {
+ case "SHORT" | "INTEGER" =>
+ checkError(
+ exception = intercept[SparkException] {
+ df.collect()
+ },
+ condition = null)
+ case _ =>
+ checkFilterPushed(df)
+ checkAnswer(df, Seq(Row(targetValue)))
+ val expectedTypes = Array(targetDataType)
+ val resultTypes = df.schema.fields.map(_.dataType)
+ assert(resultTypes === expectedTypes, s"Failed to cast $sourceCol
to $castType")
+ }
+ }
+
+ testCast("BINARY", "string_col", "binary_col", BinaryType, binaryValue);
+ testCast("SHORT", "string_col", "long_col", LongType, longValue)
+ testCast("INTEGER", "string_col", "long_col", LongType, longValue)
+ testCast("LONG", "string_col", "long_col", LongType, longValue)
+ // We use stringLiteral to make sure both values are using the same
collation
+ testCast("STRING", "long_col", stringLiteral, StringType, stringValue)
Review Comment:
Since the test runs with both `MariaDB connector/j 2.7.12` and `MySQL
connector/j 9.1.0` we need this to make the behavior of test consistent. It's a
lot of detail. Simply put, we don't want to change the environment and changing
the behavior of test setup, so we need to make the new test works with existing
test setup while still validating the test cases.
And the following is the detail for everyone's reference. The reason they
are incompatible is that for the test setup, `MariaDB connector/j 2.7.12` uses
`utf8mb4_unicode_ci` for `collation_connection`, on the other hand `MySQL
connector/j 9.1.0` uses `utf8mb4_0900_ai_ci` for `collation_connection`. The
test would fail without using `stringLiteral` at here like this.
<img width="1920" alt="image"
src="https://github.com/user-attachments/assets/6e4454b5-ffe0-483e-b506-8a5f6201719b"
/>
[The implementation to
decide](https://github.com/mariadb-corporation/mariadb-connector-j/blob/175b47fa386c3a4acac827ae0a00214ef812c297/src/main/java/org/mariadb/jdbc/internal/protocol/AbstractConnectProtocol.java#L1229)
`connection_collation` in `MariaDB connector/j 2.7.12` is like this. The
serverLanguage is 255 for test setup of `MySQL server 9.1.0`, therefore the
collation chosen is 224 which is `utf8mb4_unicode_ci`
```java
/**
* Default collation used for string exchanges with server.
*
* @param serverLanguage server default collation
* @return collation byte
*/
private byte decideLanguage(int serverLanguage) {
// return current server utf8mb4 collation
if (serverLanguage == 45 // utf8mb4_general_ci
|| serverLanguage == 46 // utf8mb4_bin
|| (serverLanguage >= 224 && serverLanguage <= 247)) {
return (byte) serverLanguage;
}
if (getMajorServerVersion() == 5 && getMinorServerVersion() <= 1) {
// 5.1 version doesn't know 4 bytes utf8
return (byte) 33; // utf8_general_ci
}
// if server language is utf8mb3, use utf8mb4 equivalent collation
if (serverLanguage == 33) {
// utf8mb4_general_ci
return 45;
}
if (serverLanguage == 83) {
// utf8mb4_bin
return 46;
}
if (serverLanguage >= 192 && serverLanguage <= 215) {
// equivalent utf8mb4 collation
return (byte) (serverLanguage + 32);
}
return (byte) 224; // UTF8MB4_UNICODE_CI;
}
```
On the other hand, [the implementation
of](https://github.com/mysql/mysql-connector-j/blob/cf2917ea44ae2e43a4514a33771035aa99de73bf/src/main/core-api/java/com/mysql/cj/CharsetMapping.java#L395)
`MySQL connector/j 9.1.0` is a lookup table like this. It uses the collation
`utf8mb4_0900_ai_ci` which is the same as `collation_server` and
`collation_database` with the test setup.
```java
// 216..222
collation[223] = new Collation(223, new String[] {
"utf8mb3_general_mysql500_ci", "utf8_general_mysql500_ci" }, 0,
MYSQL_CHARSET_NAME_utf8mb3);
collation[224] = new Collation(224, "utf8mb4_unicode_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[225] = new Collation(225, "utf8mb4_icelandic_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[226] = new Collation(226, "utf8mb4_latvian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[227] = new Collation(227, "utf8mb4_romanian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[228] = new Collation(228, "utf8mb4_slovenian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[229] = new Collation(229, "utf8mb4_polish_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[230] = new Collation(230, "utf8mb4_estonian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[231] = new Collation(231, "utf8mb4_spanish_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[232] = new Collation(232, "utf8mb4_swedish_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[233] = new Collation(233, "utf8mb4_turkish_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[234] = new Collation(234, "utf8mb4_czech_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[235] = new Collation(235, "utf8mb4_danish_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[236] = new Collation(236, "utf8mb4_lithuanian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[237] = new Collation(237, "utf8mb4_slovak_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[238] = new Collation(238, "utf8mb4_spanish2_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[239] = new Collation(239, "utf8mb4_roman_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[240] = new Collation(240, "utf8mb4_persian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[241] = new Collation(241, "utf8mb4_esperanto_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[242] = new Collation(242, "utf8mb4_hungarian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[243] = new Collation(243, "utf8mb4_sinhala_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[244] = new Collation(244, "utf8mb4_german2_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[245] = new Collation(245, "utf8mb4_croatian_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[246] = new Collation(246, "utf8mb4_unicode_520_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[247] = new Collation(247, "utf8mb4_vietnamese_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[248] = new Collation(248, "gb18030_chinese_ci", 1,
MYSQL_CHARSET_NAME_gb18030);
collation[249] = new Collation(249, "gb18030_bin", 0,
MYSQL_CHARSET_NAME_gb18030);
collation[250] = new Collation(250, "gb18030_unicode_520_ci", 0,
MYSQL_CHARSET_NAME_gb18030);
// 251..254
collation[255] = new Collation(255, "utf8mb4_0900_ai_ci", 1,
MYSQL_CHARSET_NAME_utf8mb4);
collation[256] = new Collation(256, "utf8mb4_de_pb_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[257] = new Collation(257, "utf8mb4_is_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[258] = new Collation(258, "utf8mb4_lv_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[259] = new Collation(259, "utf8mb4_ro_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[260] = new Collation(260, "utf8mb4_sl_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[261] = new Collation(261, "utf8mb4_pl_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[262] = new Collation(262, "utf8mb4_et_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[263] = new Collation(263, "utf8mb4_es_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[264] = new Collation(264, "utf8mb4_sv_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[265] = new Collation(265, "utf8mb4_tr_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[266] = new Collation(266, "utf8mb4_cs_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[267] = new Collation(267, "utf8mb4_da_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[268] = new Collation(268, "utf8mb4_lt_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[269] = new Collation(269, "utf8mb4_sk_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[270] = new Collation(270, "utf8mb4_es_trad_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
collation[271] = new Collation(271, "utf8mb4_la_0900_ai_ci", 0,
MYSQL_CHARSET_NAME_utf8mb4);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]