yihua commented on code in PR #12798. URL: https://github.com/apache/hudi/pull/12798#discussion_r1972655426
########## hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestTableColumnTypeMismatch.scala: ########## @@ -0,0 +1,919 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.{DataSourceWriteOptions, ScalaAssertionSupport} + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.hudi.ErrorMessageChecker.isIncompatibleDataException +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase + +class TestTableColumnTypeMismatch extends HoodieSparkSqlTestBase with ScalaAssertionSupport { + + test("Test Spark successful implicit type casting behaviors") { + withRecordType()(withTempDir { tmp => + // Define test cases for successful implicit casting + case class TypeCastTestCase( + sourceType: String, + targetType: String, + testValue: String, + expectedValue: Any, + description: String + ) + + val successfulTestCases = Seq( + // Numeric widening conversions (always safe) + TypeCastTestCase("tinyint", "smallint", "127", 127, "tinyint to smallint widening"), + TypeCastTestCase("tinyint", "int", "127", 127, "tinyint to int widening"), + TypeCastTestCase("tinyint", "bigint", "127", 127L, "tinyint to bigint 
widening"), + TypeCastTestCase("tinyint", "float", "127", 127.0f, "tinyint to float widening"), + TypeCastTestCase("tinyint", "double", "127", 127.0d, "tinyint to double widening"), + TypeCastTestCase("tinyint", "decimal(10,1)", "127", java.math.BigDecimal.valueOf(127.0), "tinyint to decimal widening"), + + TypeCastTestCase("smallint", "int", "32767", 32767, "smallint to int widening"), + TypeCastTestCase("smallint", "bigint", "32767", 32767L, "smallint to bigint widening"), + TypeCastTestCase("smallint", "float", "32767", 32767.0f, "smallint to float widening"), + TypeCastTestCase("smallint", "double", "32767", 32767.0d, "smallint to double widening"), + TypeCastTestCase("smallint", "decimal(10,1)", "32767", java.math.BigDecimal.valueOf(32767.0), "smallint to decimal widening"), + + TypeCastTestCase("int", "bigint", "2147483647", 2147483647L, "int to bigint widening"), + TypeCastTestCase("int", "float", "2147483647", 2147483647.0f, "int to float widening"), + TypeCastTestCase("int", "double", "2147483647", 2147483647.0d, "int to double widening"), + TypeCastTestCase("int", "decimal(10,1)", "22", java.math.BigDecimal.valueOf(22.0), "int to decimal widening"), + + // double value would have some epsilon error which is expected. 
+ TypeCastTestCase("float", "double", "3.14", 3.140000104904175d, "float to double widening"), + TypeCastTestCase("float", "decimal(10,2)", "3.14", java.math.BigDecimal.valueOf(3.14).setScale(2, java.math.RoundingMode.HALF_UP), "float to decimal"), + + // Numeric narrowing conversions (potential data loss) + TypeCastTestCase("double", "int", "123.45", 123, "double to int - truncates decimal"), + TypeCastTestCase("decimal(10,2)", "int", "123.45", 123, "decimal to int - truncates decimal"), + + // Boolean conversions + TypeCastTestCase("boolean", "string", "true", "true", "boolean to string"), + + // Timestamp/Date conversions + TypeCastTestCase("timestamp", "string", "timestamp'2023-01-01 12:00:00'", "2023-01-01 12:00:00", "timestamp to string"), + TypeCastTestCase("timestamp", "date", "timestamp'2023-01-01 12:00:00'", java.sql.Date.valueOf("2023-01-01"), "timestamp to date"), + TypeCastTestCase("date", "string", "date'2023-01-01'", "2023-01-01", "date to string"), + TypeCastTestCase("date", "timestamp", "date'2023-01-01'", java.sql.Timestamp.valueOf("2023-01-01 00:00:00"), "date to timestamp") + ).filter(_.expectedValue != null) // Ensure we only test successful cases + + val tableName = generateTableName + + // Create columns definition dynamically + val columnsDefinition = successfulTestCases.zipWithIndex.map { case (test, idx) => + s"col_${idx} ${test.targetType}" + }.mkString(",\n ") + + // Create single table with all target type columns + spark.sql( + s""" + |create table $tableName ( + | id int, + | $columnsDefinition, + | ts long + |) using hudi + |location '${tmp.getCanonicalPath}/$tableName' + |tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + |) + """.stripMargin) + + // Generate insert values + val insertValues = successfulTestCases.zipWithIndex.map { case (test, idx) => + s"cast(${test.testValue} as ${test.sourceType}) as col_${idx}" + }.mkString(",\n ") + + // Insert all test values in one go + spark.sql( + s""" + |insert into 
$tableName + |select + | 1 as id, + | $insertValues, + | 1000 as ts + """.stripMargin) + + // Verify each column value + successfulTestCases.zipWithIndex.foreach { case (test, idx) => + val result = spark.sql(s"select col_${idx} from $tableName where id = 1").collect()(0)(0) + assert(result == test.expectedValue, + s"${test.description}: Expected ${test.expectedValue} but got $result") + } + }) + } + + test("Test Spark disallowed implicit type casting behaviors") { + // Capturing the current behavior of Spark's implicit type casting. + withRecordType()(withTempDir { tmp => + // Define test cases for implicit casting + case class TypeCastTestCase( + sourceType: String, + targetType: String, + testValue: String, // SQL literal expression + expectedValue: Any, + description: String = "" + ) + + val testCases = Seq( + TypeCastTestCase("int", "decimal(10,1)", "2147483647", java.math.BigDecimal.valueOf(2147483647.0), "int to decimal widening overflow"), + + // String conversions + TypeCastTestCase("string", "int", "'123'", 123, "string to int - invalid numeric string"), + TypeCastTestCase("string", "double", "'12.34'", 12.34d, "string to double - invalid numeric string"), + TypeCastTestCase("string", "double", "'abc'", null, "string to double - invalid numeric string"), + TypeCastTestCase("string", "boolean", "'abc'", null, "string to boolean - invalid boolean string"), + TypeCastTestCase("string", "timestamp", "'2023-01-01'", java.sql.Timestamp.valueOf("2023-01-01 00:00:00"), "string to timestamp - invalid date string"), + TypeCastTestCase("string", "date", "'2023-01-01'", java.sql.Date.valueOf("2023-01-01"), "string to date - invalid date string"), + + // Numeric narrowing conversions (potential data loss) + TypeCastTestCase("double", "int", s"${Int.MaxValue.toDouble + 1}", null, "double to int - overflow"), + TypeCastTestCase("bigint", "int", "2147483648", null, "bigint to int - overflow"), + + // Boolean conversions + TypeCastTestCase("boolean", "int", "true", 1, 
"boolean to int") + ) + + testCases.foreach { testCase => + val tableName = generateTableName + + // Create table with target type + spark.sql( + s""" + |create table $tableName ( + | id int, + | value ${testCase.targetType}, + | ts long + |) using hudi + |location '${tmp.getCanonicalPath}/$tableName' + |tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + |) + """.stripMargin) + + // Test failed conversion + val exception = intercept[Exception] { + spark.sql( + s""" + |insert into $tableName + |select 1 as id, cast(${testCase.testValue} as ${testCase.sourceType}) as value, 1000 as ts + """.stripMargin) + } + + val exceptionMsg = exception.getMessage + val exceptionCauseMsg = Option(exception.getCause).map(_.getMessage).getOrElse("") + assert(isIncompatibleDataException(exception), + s"${testCase.description}: Expected casting related error but got different exception: " + + s"Message from the exception ${exceptionMsg}, message from the exception cause ${exceptionCauseMsg}") + } + }) + } + + test("Test All Valid Type Casting For Merge Into and Insert") { + // For all valid type casting pairs, test merge into and insert operations. 
+ // Define the column types for testing, based on successful casting cases + case class ColumnTypePair( + sourceType: String, + targetType: String, + testValue: String, + expectedValue: Any, + columnName: String + ) + + // Define valid type casting pairs based on the previous test cases + val validTypePairs = Seq( + // Numeric widening pairs + ColumnTypePair("tinyint", "smallint", "127", 127, "tiny_to_small"), + ColumnTypePair("tinyint", "int", "127", 127, "tiny_to_int"), + ColumnTypePair("tinyint", "bigint", "127", 127L, "tiny_to_big"), + ColumnTypePair("tinyint", "float", "127", 127.0f, "tiny_to_float"), + ColumnTypePair("tinyint", "double", "127", 127.0d, "tiny_to_double"), + ColumnTypePair("tinyint", "decimal(10,1)", "127", java.math.BigDecimal.valueOf(127.0), "tiny_to_decimal"), + + ColumnTypePair("smallint", "int", "32767", 32767, "small_to_int"), + ColumnTypePair("smallint", "bigint", "32767", 32767L, "small_to_big"), + ColumnTypePair("smallint", "float", "32767", 32767.0f, "small_to_float"), + ColumnTypePair("smallint", "double", "32767", 32767.0d, "small_to_double"), + ColumnTypePair("smallint", "decimal(10,1)", "32767", java.math.BigDecimal.valueOf(32767.0), "small_to_decimal"), + + ColumnTypePair("int", "bigint", "2147483647", 2147483647L, "int_to_big"), + ColumnTypePair("int", "float", "2147483647", 2147483647.0f, "int_to_float"), + ColumnTypePair("int", "double", "2147483647", 2147483647.0d, "int_to_double"), + ColumnTypePair("int", "decimal(10,1)", "22", java.math.BigDecimal.valueOf(22.0), "int_to_decimal"), + + ColumnTypePair("float", "double", "3.14", 3.140000104904175d, "float_to_double"), + ColumnTypePair("float", "decimal(10,2)", "3.14", java.math.BigDecimal.valueOf(3.14).setScale(2, java.math.RoundingMode.HALF_UP), "float_to_decimal"), + + // Timestamp/Date conversions + ColumnTypePair("timestamp", "string", "timestamp'2023-01-01 12:00:00'", "2023-01-01 12:00:00", "ts_to_string"), + ColumnTypePair("timestamp", "date", "timestamp'2023-01-01 
12:00:00'", java.sql.Date.valueOf("2023-01-01"), "ts_to_date"), + ColumnTypePair("date", "string", "date'2023-01-01'", "2023-01-01", "date_to_string"), + ColumnTypePair("date", "timestamp", "date'2023-01-01'", java.sql.Timestamp.valueOf("2023-01-01 00:00:00"), "date_to_ts"), + + // Boolean conversions + ColumnTypePair("boolean", "string", "true", "true", "bool_to_string") + ) + + Seq("cow", "mor").foreach { tableType => + withRecordType()(withTempDir { tmp => Review Comment: I removed the permutation on the record types as they are irrelevant to the tests here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
