This is an automated email from the ASF dual-hosted git repository.

changchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 02c776f50d [GLUTEN-11012][CH] Support Thai/Khmer digit dates in CH (#11476)
02c776f50d is described below

commit 02c776f50d77ec965152b2033388bec5f27dbdd3
Author: zhanglistar <[email protected]>
AuthorDate: Mon Feb 2 20:21:41 2026 +0800

    [GLUTEN-11012][CH] Support Thai/Khmer digit dates in CH (#11476)
    
    * Support Thai/Khmer digit dates in CH
    
    Scan UTF-8 strings for local digits before conversion and add regression
    queries for Thai and Khmer numeral date parsing in the function suite.
    
    * [CH] Document local digit date fixtures
    
    Add comments describing the base64-encoded local digit date fixtures used in local digit date tests.
    
    * [CH] Speed up local digit conversion
    
    Use SIMD-based ASCII detection, fast-path common UTF-8 digit ranges, and avoid double scans when converting local digits.
    
    * [CH] Fix UTF-8 fallback in local digit conversion
    
    Preserve original UTF-8 bytes when no local digit is detected in multi-byte sequences, and downgrade logging to debug.
    
    * [CH] Fix Devanagari/Bengali digit mapping
    
    Map UTF-8 byte ranges to correct digit values for Devanagari and Bengali local digits.
    
    * Fix scala code format in CH CI.
---
 .../org/apache/spark/sql/delta/DeltaAdapter.scala  |   1 +
 .../sql/execution/datasources/DeltaV1Writes.scala  |   1 +
 .../execution/GlutenFunctionValidateSuite.scala    |  27 ++-
 .../Functions/LocalDigitsToAsciiDigitForDate.cpp   | 229 ++++++++++++++++-----
 4 files changed, 203 insertions(+), 55 deletions(-)

diff --git a/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala b/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala
index 58d59aa9de..f414ab8f28 100644
--- a/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala
+++ b/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 package org.apache.spark.sql.delta
+
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.delta.stats.DeltaScan
 
diff --git a/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala b/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala
index 8ae99cc0d5..de9b760c09 100644
--- a/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala
+++ b/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 package org.apache.spark.sql.execution.datasources
+
 import org.apache.gluten.backendsapi.BackendsApiManager
 
 import org.apache.spark.sql.SparkSession
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index e446af8f29..88abe3ae2e 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -1408,8 +1408,11 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
             |(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='),
             |(9, null),
             |(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='),
-            |(11, 'MjAyNS0xMS0xMg==')
+            |(11, 'MjAyNS0xMS0xMg=='),
+            |(12, '4LmS4LmQ4LmS4LmVLeC5keC5kS3guZHguZM=')
             |""".stripMargin)
+      // base64 inputs decode to local digit dates:
+      // 1-3 Arabic-Indic, 5 Persian, 7 Devanagari, 8 Khmer, 10 Bengali, 11 
ASCII, 12 Thai
       var query_sql = """
                         |select
                         |from_unixtime(unix_timestamp(cast(unbase64(d) as 
string), 'yyyy-MM-dd')),
@@ -1433,6 +1436,28 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
                     |""".stripMargin
       compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
 
+      query_sql = """
+                    |select from_unixtime(
+                    | unix_timestamp(
+                    |   regexp_replace(
+                    |     
cast(unbase64('4LmS4LmQ4LmS4LmVLeC5keC5kS3guZHguZM=') as string),
+                    |     '-0', '-'),
+                    |   'yyyy-MM-dd'),
+                    | 'yyyy-MM-dd')
+                    |""".stripMargin
+      compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
+
+      query_sql = """
+                    |select from_unixtime(
+                    | unix_timestamp(
+                    |   regexp_replace(
+                    |     
cast(unbase64('4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M=') as string),
+                    |     '-0', '-'),
+                    |   'yyyy-MM-dd'),
+                    | 'yyyy-MM-dd')
+                    |""".stripMargin
+      compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
+
       sql("drop table tb_local_date")
     }
   }
diff --git a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
index 7f19975c0a..1e50e02bff 100644
--- a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
+++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
@@ -25,6 +25,31 @@
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
 #include <Functions/IFunction.h>
+#include <simdjson/implementation_detection.h>
+#if SIMDJSON_IMPLEMENTATION_ICELAKE && defined(__AVX512F__) && 
defined(__AVX512BW__)
+#include <simdjson/icelake/simd.h>
+namespace simdjson_impl = simdjson::icelake::simd;
+#elif SIMDJSON_IMPLEMENTATION_HASWELL && defined(__AVX2__)
+#include <simdjson/haswell/simd.h>
+namespace simdjson_impl = simdjson::haswell::simd;
+#elif SIMDJSON_IMPLEMENTATION_WESTMERE && defined(__SSE4_2__)
+#include <simdjson/westmere/simd.h>
+namespace simdjson_impl = simdjson::westmere::simd;
+#elif SIMDJSON_IMPLEMENTATION_ARM64
+#include <simdjson/arm64/simd.h>
+namespace simdjson_impl = simdjson::arm64::simd;
+#elif SIMDJSON_IMPLEMENTATION_PPC64
+#include <simdjson/ppc64/simd.h>
+namespace simdjson_impl = simdjson::ppc64::simd;
+#elif SIMDJSON_IMPLEMENTATION_LSX
+#include <simdjson/lsx/simd.h>
+namespace simdjson_impl = simdjson::lsx::simd;
+#elif SIMDJSON_IMPLEMENTATION_LASX
+#include <simdjson/lasx/simd.h>
+namespace simdjson_impl = simdjson::lasx::simd;
+#else
+#define SIMDJSON_NO_SIMD 1
+#endif
 #include <boost/iostreams/detail/select.hpp>
 #include <Common/Exception.h>
 #include <Common/logger_useful.h>
@@ -97,7 +122,9 @@ public:
                     getName(),
                     data_col->getName());
             auto date_str = col_str->getDataAt(0);
-            auto new_str = convertLocalDigit(date_str);
+            std::string new_str;
+            if (!convertLocalDigitIfNeeded(date_str, new_str))
+                return arguments[0].column;
             auto new_data_col = data_col->cloneEmpty();
             new_data_col->insertData(new_str.c_str(), new_str.size());
             return DB::ColumnConst::create(std::move(new_data_col), 
input_rows_count);
@@ -120,62 +147,43 @@ public:
                 getName(),
                 data_col->getName());
 
-        auto nested_data_col = DB::removeNullable(arguments[0].column);
-        bool has_local_digit = false;
-        size_t row_index = 0;
-        for (row_index = 0; row_index < input_rows_count; ++row_index)
+        std::string converted;
+        DB::MutableColumnPtr res_col;
+        for (size_t row_index = 0; row_index < input_rows_count; ++row_index)
         {
             if (null_map && (*null_map)[row_index])
             {
+                if (res_col)
+                    res_col->insertDefault();
                 continue;
             }
             auto str = col_str->getDataAt(row_index);
-            if (hasLocalDigit(str))
+            if (convertLocalDigitIfNeeded(str, converted))
             {
-                has_local_digit = true;
-                break;
+                if (!res_col)
+                {
+                    res_col = data_col->cloneEmpty();
+                    if (row_index)
+                        res_col->insertManyFrom(*data_col, 0, row_index);
+                }
+                LOG_DEBUG(
+                    getLogger("LocalDigitsToAsciiDigitForDateFunction"),
+                    "Converted local digit string {} to ascii digit string: 
{}",
+                    col_str->getDataAt(row_index).toString(),
+                    converted);
+                res_col->insertData(converted.c_str(), converted.size());
             }
-        }
-
-        if (!has_local_digit)
-        {
-            // No local language digits found, return the original column
-            return arguments[0].column;
-        }
-
-        auto res_col = data_col->cloneEmpty();
-        if (row_index)
-        {
-            res_col->insertManyFrom(*data_col, 0, row_index);
-        }
-        for (; row_index < input_rows_count; ++row_index)
-        {
-            if (null_map && (*null_map)[row_index])
+            else if (res_col)
             {
-                res_col->insertDefault();
-                continue;
+                res_col->insertFrom(*data_col, row_index);
             }
-            auto str = convertLocalDigit(col_str->getDataAt(row_index));
-            LOG_ERROR(getLogger("LocalDigitsToAsciiDigitForDateFunction"), 
"Converted local digit string {} to ascii digit string: {}", 
col_str->getDataAt(row_index).toString(), str);
-            res_col->insertData(str.c_str(), str.size());
         }
+        if (!res_col)
+            return arguments[0].column;
         return res_col;
     }
 
 private:
-    bool hasLocalDigit(StringRef str) const
-    {
-        if (!str.size)
-            return false;
-        // In most cases, the first byte is a digit.
-        char c = reinterpret_cast<char>(str.data[0]);
-        if ('0' <= c && c <= '9')
-        {
-            return false;
-        }
-        return true;
-    }
-
     char toAsciiDigit(char32_t c) const {
         // In Thai and Persian, dates typically do not use the Gregorian 
calendar.
         // This may cause failures in unix_timestamp parsing.
@@ -195,41 +203,154 @@ private:
             return 0;
     }
 
-    String convertLocalDigit(const StringRef & str) const
+    bool hasNonAsciiSimd(const char * data, size_t size) const
+    {
+#if SIMDJSON_NO_SIMD
+        const unsigned char * bytes = reinterpret_cast<const unsigned char 
*>(data);
+        for (size_t i = 0; i < size; ++i)
+        {
+            if (bytes[i] & 0x80)
+                return true;
+        }
+        return false;
+#else
+        using simd8_u8 = simdjson_impl::simd8<uint8_t>;
+        constexpr size_t kBlockSize = simd8_u8::SIZE;
+        size_t i = 0;
+        for (; i + kBlockSize <= size; i += kBlockSize)
+        {
+            if (!simd8_u8::load(reinterpret_cast<const uint8_t *>(data + 
i)).is_ascii())
+                return true;
+        }
+        for (; i < size; ++i)
+        {
+            if (static_cast<unsigned char>(data[i]) & 0x80)
+                return true;
+        }
+        return false;
+#endif
+    }
+
+    bool convertLocalDigitIfNeeded(StringRef str, std::string & result) const
     {
-        std::string result;
+        if (!str.size)
+            return false;
+        if (!hasNonAsciiSimd(str.data, str.size))
+            return false;
+        result.clear();
         result.reserve(str.size);
+        bool has_local_digit = false;
         for (size_t i = 0; i < str.size;)
         {
             unsigned char c = str.data[i];
             char32_t cp = 0;
             if ((c & 0x80) == 0) // 1-byte
             {
-                cp = c;
+                result.push_back(c);
                 i += 1;
+                continue;
             }
             else if ((c & 0xE0) == 0xC0) // 2-byte
             {
-                cp = ((c & 0x1F) << 6) | (str.data[i + 1] & 0x3F);
+                unsigned char b1 = str.data[i + 1];
+                if (c == 0xD9 && b1 >= 0xA0 && b1 <= 0xA9) // Arabic-Indic
+                {
+                    result.push_back(static_cast<char>('0' + (b1 - 0xA0)));
+                    has_local_digit = true;
+                    i += 2;
+                    continue;
+                }
+                if (c == 0xDB && b1 >= 0xB0 && b1 <= 0xB9) // Eastern 
Arabic-Indic (Persian)
+                {
+                    result.push_back(static_cast<char>('0' + (b1 - 0xB0)));
+                    has_local_digit = true;
+                    i += 2;
+                    continue;
+                }
+                cp = ((c & 0x1F) << 6) | (b1 & 0x3F);
+                auto local_digit = toAsciiDigit(cp);
+                if (local_digit)
+                {
+                    result.push_back(local_digit);
+                    has_local_digit = true;
+                }
+                else
+                {
+                    result.push_back(static_cast<char>(c));
+                    result.push_back(static_cast<char>(b1));
+                }
                 i += 2;
+                continue;
             }
             else if ((c & 0xF0) == 0xE0) // 3-byte
             {
-                cp = ((c & 0x0F) << 12) | ((str.data[i + 1] & 0x3F) << 6) | 
(str.data[i + 2] & 0x3F);
+                unsigned char b1 = str.data[i + 1];
+                unsigned char b2 = str.data[i + 2];
+                if (c == 0xE0)
+                {
+                    if ((b1 == 0xA5 && b2 >= 0xA6 && b2 <= 0xAF) || // 
Devanagari
+                        (b1 == 0xA7 && b2 >= 0xA6 && b2 <= 0xAF))   // Bengali
+                    {
+                        result.push_back(static_cast<char>('0' + (b2 - 0xA6)));
+                        has_local_digit = true;
+                        i += 3;
+                        continue;
+                    }
+                    if (b1 == 0xB9 && b2 >= 0x90 && b2 <= 0x99) // Thai
+                    {
+                        result.push_back(static_cast<char>('0' + (b2 - 0x90)));
+                        has_local_digit = true;
+                        i += 3;
+                        continue;
+                    }
+                }
+                else if (c == 0xE1 && b1 == 0x9F && b2 >= 0xA0 && b2 <= 0xA9) 
// Khmer
+                {
+                    result.push_back(static_cast<char>('0' + (b2 - 0xA0)));
+                    has_local_digit = true;
+                    i += 3;
+                    continue;
+                }
+                cp = ((c & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+                auto local_digit = toAsciiDigit(cp);
+                if (local_digit)
+                {
+                    result.push_back(local_digit);
+                    has_local_digit = true;
+                }
+                else
+                {
+                    result.push_back(static_cast<char>(c));
+                    result.push_back(static_cast<char>(b1));
+                    result.push_back(static_cast<char>(b2));
+                }
                 i += 3;
+                continue;
             }
             else if ((c & 0xF8) == 0xF0) // 4-byte
             {
-                cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | 
((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F);
+                unsigned char b1 = str.data[i + 1];
+                unsigned char b2 = str.data[i + 2];
+                unsigned char b3 = str.data[i + 3];
+                cp = ((c & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) 
<< 6) | (b3 & 0x3F);
+                auto local_digit = toAsciiDigit(cp);
+                if (local_digit)
+                {
+                    result.push_back(local_digit);
+                    has_local_digit = true;
+                }
+                else
+                {
+                    result.push_back(static_cast<char>(c));
+                    result.push_back(static_cast<char>(b1));
+                    result.push_back(static_cast<char>(b2));
+                    result.push_back(static_cast<char>(b3));
+                }
                 i += 4;
+                continue;
             }
-            auto local_digit = toAsciiDigit(cp);
-            if (local_digit)
-                result.push_back(local_digit);
-            else
-                result.push_back(cp);
         }
-        return result;
+        return has_local_digit;
     }
 };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to