This is an automated email from the ASF dual-hosted git repository.

lgbo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a6d9b9c16 [GLUTEN-11012][CH]Support more local digits date (#11073)
3a6d9b9c16 is described below

commit 3a6d9b9c16b0cca924806235d987be7300fe94ae
Author: lgbo <[email protected]>
AuthorDate: Fri Dec 5 11:38:39 2025 +0800

    [GLUTEN-11012][CH]Support more local digits date (#11073)
    
    * support more local digit date
    
    * fix a bug about empty string
---
 .../execution/GlutenFunctionValidateSuite.scala    | 32 ++++++---
 ...Date.cpp => LocalDigitsToAsciiDigitForDate.cpp} | 81 +++++++++++++++-------
 .../Parser/scalar_function_parser/getTimestamp.h   |  6 +-
 3 files changed, 82 insertions(+), 37 deletions(-)

diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index f0fd18515b..e446af8f29 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -1391,29 +1391,40 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
     compareResultsAgainstVanillaSpark(sql, true, { _ => })
   }
 
-  test("arabic_indic digit date") {
+  test("local digit date") {
     withSQLConf(
       SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
         (ConstantFolding.ruleName + "," + NullPropagation.ruleName),
       ("spark.sql.legacy.timeParserPolicy", "LEGACY")) {
-      sql("create table tb_arabic_date(d string) using parquet")
+      sql("create table tb_local_date(i bigint, d string) using parquet")
       sql("""
-            |insert into tb_arabic_date values
-            |'2aLZoNmi2aQt2aDZpi3ZoNmh',
-            |'2aLZoNmi2aQt2aHZoi3Zo9mh',
-            |'2aLZoNmi2aQt2aHZoi3Zo9mh'
+            |insert into tb_local_date values
+            |(1, '2aLZoNmi2aQt2aDZpi3ZoNmh'),
+            |(2, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
+            |(3, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
+            |(5, '27LbsNuy27Ut27HbsS3bsduz'),
+            |(6, ''),
+            |(7, '4KWo4KWm4KWo4KWrLeClp+Clpy3gpafgpak='),
+            |(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='),
+            |(9, null),
+            |(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='),
+            |(11, 'MjAyNS0xMS0xMg==')
             |""".stripMargin)
       var query_sql = """
                         |select
-                        |from_unixtime(unix_timestamp(cast(unbase64(d) as 
string), 'yyyy-MM-dd'))
-                        |from tb_arabic_date
+                        |from_unixtime(unix_timestamp(cast(unbase64(d) as 
string), 'yyyy-MM-dd')),
+                        |cast(unbase64(d) as string) from (
+                        |select d, i
+                        |from tb_local_date
+                        |order by i)
                         |""".stripMargin
       compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
 
       query_sql = """
                     |select from_unixtime(
                     | unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') 
as string),
-                    | 'yyyy-MM-dd'))
+                    | 'yyyy-MM-dd')),
+                    | cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string)
                     |""".stripMargin
       compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
 
@@ -1422,7 +1433,8 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
                     |""".stripMargin
       compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
 
-      sql("drop table tb_arabic_date")
+      sql("drop table tb_local_date")
     }
   }
+
 }
diff --git a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp 
b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
similarity index 69%
rename from cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
rename to cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
index 94332c3a5f..7f19975c0a 100644
--- a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
+++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
@@ -16,17 +16,19 @@
  */
 
 
-#include <Columns/IColumn.h>
-#include <Columns/ColumnNullable.h>
 #include <Columns/ColumnConst.h>
+#include <Columns/ColumnNullable.h>
 #include <Columns/ColumnString.h>
+#include <Columns/IColumn.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <DataTypes/IDataType.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
 #include <Functions/IFunction.h>
+#include <boost/iostreams/detail/select.hpp>
 #include <Common/Exception.h>
 #include <Common/logger_useful.h>
+#include "base/types.h"
 
 namespace DB
 {
@@ -40,12 +42,12 @@ namespace local_engine
 {
 // Since spark 3.3, unix_timestamp support arabic number input, e.g., 
"٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
 // We implement a function to translate arabic indic digits to ascii digits 
here.
-class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
+class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction
 {
 public:
-    static constexpr auto name = "arabic_indic_to_ascii_digit_for_date";
+    static constexpr auto name = "local_digit_to_ascii_digit_for_date";
 
-    static DB::FunctionPtr create(DB::ContextPtr) { return 
std::make_shared<ArabicIndicToAsciiDigitForDateFunction>(); }
+    static DB::FunctionPtr create(DB::ContextPtr) { return 
std::make_shared<LocalDigitsToAsciiDigitForDateFunction>(); }
 
     String getName() const override { return name; }
 
@@ -56,7 +58,11 @@ public:
     {
         auto nested_type = DB::removeNullable(arguments[0]);
         if (!DB::WhichDataType(nested_type).isString())
-            throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, 
"Argument for function {} must be String, but got {}", getName(), 
arguments[0]->getName());
+            throw DB::Exception(
+                DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Argument for function {} must be String, but got {}",
+                getName(),
+                arguments[0]->getName());
         return arguments[0];
     }
 
@@ -85,9 +91,13 @@ public:
                 col_str = 
DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
             }
             if (!col_str)
-                throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, 
"Argument for function {} must be String, but got {}", getName(), 
data_col->getName());
+                throw DB::Exception(
+                    DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "Argument for function {} must be String, but got {}",
+                    getName(),
+                    data_col->getName());
             auto date_str = col_str->getDataAt(0);
-            auto new_str = convertArabicIndicDigit(date_str);
+            auto new_str = convertLocalDigit(date_str);
             auto new_data_col = data_col->cloneEmpty();
             new_data_col->insertData(new_str.c_str(), new_str.size());
             return DB::ColumnConst::create(std::move(new_data_col), 
input_rows_count);
@@ -104,10 +114,14 @@ public:
             col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
         }
         if (!col_str)
-            throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, 
"Argument for function {} must be String, but got {}", getName(), 
data_col->getName());
+            throw DB::Exception(
+                DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Argument for function {} must be String, but got {}",
+                getName(),
+                data_col->getName());
 
         auto nested_data_col = DB::removeNullable(arguments[0].column);
-        bool has_arabic_indic_digit = false;
+        bool has_local_digit = false;
         size_t row_index = 0;
         for (row_index = 0; row_index < input_rows_count; ++row_index)
         {
@@ -116,16 +130,16 @@ public:
                 continue;
             }
             auto str = col_str->getDataAt(row_index);
-            if (hasArabicIndicDigit(str))
+            if (hasLocalDigit(str))
             {
-                has_arabic_indic_digit = true;
+                has_local_digit = true;
                 break;
             }
         }
 
-        if (!has_arabic_indic_digit)
+        if (!has_local_digit)
         {
-            // No Arabic indic digits found, return the original column
+            // No local language digits found, return the original column
             return arguments[0].column;
         }
 
@@ -141,15 +155,18 @@ public:
                 res_col->insertDefault();
                 continue;
             }
-            auto str = convertArabicIndicDigit(col_str->getDataAt(row_index));
+            auto str = convertLocalDigit(col_str->getDataAt(row_index));
+            LOG_ERROR(getLogger("LocalDigitsToAsciiDigitForDateFunction"), 
"Converted local digit string {} to ascii digit string: {}", 
col_str->getDataAt(row_index).toString(), str);
             res_col->insertData(str.c_str(), str.size());
         }
         return res_col;
     }
 
 private:
-    bool hasArabicIndicDigit(StringRef str) const
+    bool hasLocalDigit(StringRef str) const
     {
+        if (!str.size)
+            return false;
         // In most cases, the first byte is a digit.
         char c = reinterpret_cast<char>(str.data[0]);
         if ('0' <= c && c <= '9')
@@ -159,11 +176,26 @@ private:
         return true;
     }
 
+    char toAsciiDigit(char32_t c) const {
+        // In Thai and Persian, dates typically do not use the Gregorian 
calendar.
+        // This may cause failures in unix_timestamp parsing.
+        if (c >= 0x0660 && c <= 0x0669)
+            return static_cast<char>(c - 0x0660 + '0');
+        else if (c >= 0x06F0 && c <= 0x06F9)
+            return static_cast<char>(c - 0x06F0 + '0');
+        else if (c >= 0x0966 && c <= 0x096F)
+            return static_cast<char>(c - 0x0966 + '0');
+        else if (c >= 0x0E50 && c <= 0x0E59)
+            return static_cast<char>(c - 0x0E50 + '0');
+        else if (c >= 0x17E0 && c <= 0x17E9)
+            return static_cast<char>(c - 0x17E0 + '0');
+        else if (c >= 0x09E6 && c <= 0x09EF)
+            return static_cast<char>(c - 0x09E6 + '0');
+        else
+            return 0;
+    }
 
-    bool isArabicIndicDigit(char32_t c) const { return c >= 0x0660 && c <= 
0x0669; }
-    char toAsciiDigit(char32_t c) const { return static_cast<char>(c - 0x0660 
+ '0'); }
-
-    String convertArabicIndicDigit(const StringRef & str) const
+    String convertLocalDigit(const StringRef & str) const
     {
         std::string result;
         result.reserve(str.size);
@@ -191,8 +223,9 @@ private:
                 cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | 
((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F);
                 i += 4;
             }
-            if (isArabicIndicDigit(cp))
-                result.push_back(toAsciiDigit(cp));
+            auto local_digit = toAsciiDigit(cp);
+            if (local_digit)
+                result.push_back(local_digit);
             else
                 result.push_back(cp);
         }
@@ -201,8 +234,8 @@ private:
 };
 
 using namespace DB;
-REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate)
+REGISTER_FUNCTION(LocalDigitToAsciiDigitForDate)
 {
-    factory.registerFunction<ArabicIndicToAsciiDigitForDateFunction>();
+    factory.registerFunction<LocalDigitsToAsciiDigitForDateFunction>();
 }
 }
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h 
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
index 53e6a0e6e6..f5c3ac0d71 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -61,7 +61,7 @@ public:
         auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
         if (parsed_args.size() != 2)
             throw 
DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} 
requires exactly two arguments", getName());
-        const auto * expr_arg = convertArabicIndicDigit(actions_dag, 
parsed_args[0]);
+        const auto * expr_arg = convertLocalDigit(actions_dag, parsed_args[0]);
         const auto * fmt_arg = parsed_args[1];
         
         const auto & args = substrait_func.arguments();
@@ -130,9 +130,9 @@ private:
         }
     }
 
-    const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG & 
actions_dag, const DB::ActionsDAG::Node * node) const
+    const DB::ActionsDAG::Node * convertLocalDigit(DB::ActionsDAG & 
actions_dag, const DB::ActionsDAG::Node * node) const
     {
-        const auto * func_node = toFunctionNode(actions_dag, 
"arabic_indic_to_ascii_digit_for_date", {node});
+        const auto * func_node = toFunctionNode(actions_dag, 
"local_digit_to_ascii_digit_for_date", {node});
         return func_node;
     }
 };


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to