Re: [PR] fix: requested character too large for encoding in chr function [datafusion-comet]

via GitHub Wed, 12 Jun 2024 23:29:19 -0700


vaibhawvipul commented on code in PR #552:
URL: https://github.com/apache/datafusion-comet/pull/552#discussion_r1637622187



##########
spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala:
##########
@@ -1053,6 +1053,32 @@ class CometExpressionSuite extends CometTestBase with 
AdaptiveSparkPlanHelper {
     }
   }
 
+  test("Chr with negative and large value") {
+    Seq(false, true).foreach { dictionary =>
+      withSQLConf(
+        "parquet.enable.dictionary" -> dictionary.toString,
+        CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key -> "true") {
+        val table = "test0"
+        withTable(table) {
+          sql(s"create table $table(c9 int, c4 int) using parquet")
+          sql(
+            s"insert into $table values(0, 0), (61231, -61231), (-1700, 1700), 
(0, -4000), (-40, 40)")
+          val query = s"SELECT chr(c9), chr(c4) FROM $table"
+          checkSparkAnswerAndOperator(query)
+        }
+      }
+    }
+
+    withParquetTable((0 until 5).map(i => (i % 5, i % 3)), "tbl") {
+      withSQLConf(
+        "spark.sql.optimizer.excludedRules" -> 
"org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
+        for (n <- Seq("0", "-0", "0.5", "-0.5", "555", "-555", "null")) {
+          checkSparkAnswerAndOperator(s"select chr(cast(${n} as int)) FROM 
tbl")

Review Comment:
   @viirya here is the test which covers all the cases.



##########
core/src/execution/datafusion/expressions/scalar_funcs/chr.rs:
##########
@@ -44,10 +44,15 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
         .iter()
         .map(|integer: Option<i64>| {
             integer
-                .map(|integer| match core::char::from_u32(integer as u32) {
-                    Some(integer) => Ok(integer.to_string()),
-                    None => {
-                        exec_err!("requested character too large for 
encoding.")
+                .map(|integer| {
+                    if integer < 0 {
+                        return Ok("".to_string()); // Return empty string for 
negative integers
+                    }
+                    match core::char::from_u32((integer % 256) as u32) {

Review Comment:
   hey @viirya, thank you for the review. 
   
   char::from_u32 internally uses - 
   
   ```
   const fn char_try_from_u32(i: u32) -> Result<char, CharTryFromError> {
       // This is an optimized version of the check
       // (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF),
       // which can also be written as
       // i >= 0x110000 || (i >= 0xD800 && i < 0xE000).
       //
       // The XOR with 0xD800 permutes the ranges such that 0xD800..0xE000 is
       // mapped to 0x0000..0x0800, while keeping all the high bits outside 
0xFFFF the same.
       // In particular, numbers >= 0x110000 stay in this range.
       //
       // Subtracting 0x800 causes 0x0000..0x0800 to wrap, meaning that a single
       // unsigned comparison against 0x110000 - 0x800 will detect both the 
wrapped
       // surrogate range as well as the numbers originally larger than 
0x110000.
       //
       if (i ^ 0xD800).wrapping_sub(0x800) >= 0x110000 - 0x800 {
           Err(CharTryFromError(()))
       } else {
           // SAFETY: checked that it's a legal unicode value
           Ok(unsafe { transmute(i) })
       }
   }
   ```
   
   This returns unicode `'\0'` which is equivalent to Character. MIN_VALUE. 
   
   Hence the implementation has same behaviour, as far as i understand.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] fix: requested character too large for encoding in chr function [datafusion-comet]

Reply via email to