attilapiros commented on code in PR #53458:
URL: https://github.com/apache/spark/pull/53458#discussion_r2756338704


##########
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastWithAnsiOffSuite.scala:
##########
@@ -930,4 +931,139 @@ class CastWithAnsiOffSuite extends CastSuiteBase {
     checkEvaluation(cast(largeTime1, ShortType), null)
     checkEvaluation(cast(largeTime1, ByteType), null)
   }
+
+  test("LEGACY mode: cast invalid UTF-8 binary to string should return null") {
+    withSQLConf(
+      SQLConf.VALIDATE_BINARY_TO_STRING_CAST.key -> "true",
+      SQLConf.ANSI_ENABLED.key -> "false") {
+      // Create Cast expressions inside withSQLConf so they pick up the 
correct config
+      // In LEGACY mode with validation enabled, invalid UTF-8 returns null
+      checkEvaluation(cast(invalidUtf8Literal, StringType), null)
+
+      // Valid UTF-8 should work
+      checkEvaluation(cast(validUtf8Literal, StringType), 
UTF8String.fromString("Hello"))
+
+      // Empty binary should work
+      checkEvaluation(cast(emptyBinaryLiteral, StringType), 
UTF8String.fromString(""))
+    }
+  }
+
+  test("LEGACY mode: cast invalid UTF-8 with validation disabled (old 
behavior)") {
+    withSQLConf(SQLConf.VALIDATE_BINARY_TO_STRING_CAST.key -> "false") {
+      // With validation disabled, invalid UTF-8 passes through (old behavior)
+      val result = cast(invalidUtf8Literal, StringType).eval()
+      assert(result != null, "Should not return null when validation is 
disabled")
+      assert(!result.asInstanceOf[UTF8String].isValid(),
+        "Result should contain invalid UTF-8")
+
+      // Valid UTF-8 should still work
+      checkEvaluation(cast(validUtf8Literal, StringType), 
UTF8String.fromString("Hello"))
+
+      // Empty binary should work
+      checkEvaluation(cast(emptyBinaryLiteral, StringType), 
UTF8String.fromString(""))
+    }
+  }
+
+  test("LEGACY mode: cast array with invalid UTF-8 binary to array of string") 
{
+    // Array with mix of valid and invalid UTF-8
+    val arrayLiteral = Literal.create(
+      Seq(validUtf8Bytes, invalidUtf8Bytes, emptyBinaryBytes),
+      ArrayType(BinaryType, containsNull = false))
+
+    val result = cast(arrayLiteral, ArrayType(StringType, containsNull = 
true)).eval()
+    val resultArray = result.asInstanceOf[ArrayData]
+
+    // Valid UTF-8 should convert
+    assert(resultArray.getUTF8String(0) == UTF8String.fromString("Hello"))
+    // Invalid UTF-8 should be NULL
+    assert(resultArray.isNullAt(1))

Review Comment:
   This is very interesting, as I would expect an exception here. The test 
title says LEGACY mode, but this must actually be ANSI mode, since ANSI_ENABLED 
is true by default.
   
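   Even in spark-shell the behavior is inconsistent: casting an array that 
contains invalid UTF-8 succeeds, while casting the same binary value directly 
throws: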
   
   ```
   scala> spark.conf.get("spark.sql.ansi.enabled")
   val res0: String = true
   
   scala> spark.conf.get("spark.sql.castBinaryToString.validateUtf8")
   val res1: String = true
   
   scala> spark.sql("SELECT CAST(ARRAY(X'80') AS STRING)").show()
   +----------------------------+
   |CAST(array(X'80') AS STRING)|
   +----------------------------+
   |                         [�]|
   +----------------------------+
   
   
   scala> spark.sql("SELECT CAST(X'80' AS STRING)").show()
   org.apache.spark.SparkRuntimeException: [CAST_INVALID_INPUT] The value \x80 
of the type "BINARY" cannot be cast to "STRING" because it is malformed. 
Correct the value as per the syntax, or change its target type. Use `try_cast` 
to tolerate malformed input and return NULL instead. SQLSTATE: 22018
   == SQL (line 1, position 8) ==
   SELECT CAST(X'80' AS STRING)
          ^^^^^^^^^^^^^^^^^^^^^
   
     at 
org.apache.spark.sql.errors.QueryExecutionErrors$.invalidUtf8InBinaryCastError(QueryExecutionErrors.scala:246)
     at 
org.apache.spark.sql.catalyst.expressions.Cast.$anonfun$castBinaryToString$2(Cast.scala:712)
     at org.apache.spark.sql.catalyst.expressions.Cast.buildCast(Cast.scala:675)
     at 
org.apache.spark.sql.catalyst.expressions.Cast.$anonfun$castBinaryToString$1(Cast.scala:702)
     at 
org.apache.spark.sql.catalyst.expressions.Cast.nullSafeEval(Cast.scala:1368)
     at 
org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:601)
   ....
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to