This is an automated email from the ASF dual-hosted git repository.
lgbo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 3a6d9b9c16 [GLUTEN-11012][CH]Support more local digits date (#11073)
3a6d9b9c16 is described below
commit 3a6d9b9c16b0cca924806235d987be7300fe94ae
Author: lgbo <[email protected]>
AuthorDate: Fri Dec 5 11:38:39 2025 +0800
[GLUTEN-11012][CH]Support more local digits date (#11073)
* support more local digit date
* fix a bug about empty string
---
.../execution/GlutenFunctionValidateSuite.scala | 32 ++++++---
...Date.cpp => LocalDigitsToAsciiDigitForDate.cpp} | 81 +++++++++++++++-------
.../Parser/scalar_function_parser/getTimestamp.h | 6 +-
3 files changed, 82 insertions(+), 37 deletions(-)
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index f0fd18515b..e446af8f29 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -1391,29 +1391,40 @@ class GlutenFunctionValidateSuite extends
GlutenClickHouseWholeStageTransformerS
compareResultsAgainstVanillaSpark(sql, true, { _ => })
}
- test("arabic_indic digit date") {
+ test("local digit date") {
withSQLConf(
SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
(ConstantFolding.ruleName + "," + NullPropagation.ruleName),
("spark.sql.legacy.timeParserPolicy", "LEGACY")) {
- sql("create table tb_arabic_date(d string) using parquet")
+ sql("create table tb_local_date(i bigint, d string) using parquet")
sql("""
- |insert into tb_arabic_date values
- |'2aLZoNmi2aQt2aDZpi3ZoNmh',
- |'2aLZoNmi2aQt2aHZoi3Zo9mh',
- |'2aLZoNmi2aQt2aHZoi3Zo9mh'
+ |insert into tb_local_date values
+ |(1, '2aLZoNmi2aQt2aDZpi3ZoNmh'),
+ |(2, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
+ |(3, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
+ |(5, '27LbsNuy27Ut27HbsS3bsduz'),
+ |(6, ''),
+ |(7, '4KWo4KWm4KWo4KWrLeClp+Clpy3gpafgpak='),
+ |(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='),
+ |(9, null),
+ |(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='),
+ |(11, 'MjAyNS0xMS0xMg==')
|""".stripMargin)
var query_sql = """
|select
- |from_unixtime(unix_timestamp(cast(unbase64(d) as
string), 'yyyy-MM-dd'))
- |from tb_arabic_date
+ |from_unixtime(unix_timestamp(cast(unbase64(d) as
string), 'yyyy-MM-dd')),
+ |cast(unbase64(d) as string) from (
+ |select d, i
+ |from tb_local_date
+ |order by i)
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
query_sql = """
|select from_unixtime(
| unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh')
as string),
- | 'yyyy-MM-dd'))
+ | 'yyyy-MM-dd')),
+ | cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string)
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
@@ -1422,7 +1433,8 @@ class GlutenFunctionValidateSuite extends
GlutenClickHouseWholeStageTransformerS
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
- sql("drop table tb_arabic_date")
+ sql("drop table tb_local_date")
}
}
+
}
diff --git a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
similarity index 69%
rename from cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
rename to cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
index 94332c3a5f..7f19975c0a 100644
--- a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
+++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp
@@ -16,17 +16,19 @@
*/
-#include <Columns/IColumn.h>
-#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
+#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
+#include <Columns/IColumn.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/IDataType.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
+#include <boost/iostreams/detail/select.hpp>
#include <Common/Exception.h>
#include <Common/logger_useful.h>
+#include "base/types.h"
namespace DB
{
@@ -40,12 +42,12 @@ namespace local_engine
{
// Since spark 3.3, unix_timestamp support arabic number input, e.g., "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
// We implement a function to translate arabic indic digits to ascii digits here.
-class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
+class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction
{
public:
- static constexpr auto name = "arabic_indic_to_ascii_digit_for_date";
+ static constexpr auto name = "local_digit_to_ascii_digit_for_date";
- static DB::FunctionPtr create(DB::ContextPtr) { return
std::make_shared<ArabicIndicToAsciiDigitForDateFunction>(); }
+ static DB::FunctionPtr create(DB::ContextPtr) { return
std::make_shared<LocalDigitsToAsciiDigitForDateFunction>(); }
String getName() const override { return name; }
@@ -56,7 +58,11 @@ public:
{
auto nested_type = DB::removeNullable(arguments[0]);
if (!DB::WhichDataType(nested_type).isString())
- throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}", getName(),
arguments[0]->getName());
+ throw DB::Exception(
+ DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+ "Argument for function {} must be String, but got {}",
+ getName(),
+ arguments[0]->getName());
return arguments[0];
}
@@ -85,9 +91,13 @@ public:
col_str =
DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
}
if (!col_str)
- throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}", getName(),
data_col->getName());
+ throw DB::Exception(
+ DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+ "Argument for function {} must be String, but got {}",
+ getName(),
+ data_col->getName());
auto date_str = col_str->getDataAt(0);
- auto new_str = convertArabicIndicDigit(date_str);
+ auto new_str = convertLocalDigit(date_str);
auto new_data_col = data_col->cloneEmpty();
new_data_col->insertData(new_str.c_str(), new_str.size());
return DB::ColumnConst::create(std::move(new_data_col),
input_rows_count);
@@ -104,10 +114,14 @@ public:
col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
}
if (!col_str)
- throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}", getName(),
data_col->getName());
+ throw DB::Exception(
+ DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+ "Argument for function {} must be String, but got {}",
+ getName(),
+ data_col->getName());
auto nested_data_col = DB::removeNullable(arguments[0].column);
- bool has_arabic_indic_digit = false;
+ bool has_local_digit = false;
size_t row_index = 0;
for (row_index = 0; row_index < input_rows_count; ++row_index)
{
@@ -116,16 +130,16 @@ public:
continue;
}
auto str = col_str->getDataAt(row_index);
- if (hasArabicIndicDigit(str))
+ if (hasLocalDigit(str))
{
- has_arabic_indic_digit = true;
+ has_local_digit = true;
break;
}
}
- if (!has_arabic_indic_digit)
+ if (!has_local_digit)
{
- // No Arabic indic digits found, return the original column
+ // No local language digits found, return the original column
return arguments[0].column;
}
@@ -141,15 +155,18 @@ public:
res_col->insertDefault();
continue;
}
- auto str = convertArabicIndicDigit(col_str->getDataAt(row_index));
+ auto str = convertLocalDigit(col_str->getDataAt(row_index));
+ LOG_ERROR(getLogger("LocalDigitsToAsciiDigitForDateFunction"),
"Converted local digit string {} to ascii digit string: {}",
col_str->getDataAt(row_index).toString(), str);
res_col->insertData(str.c_str(), str.size());
}
return res_col;
}
private:
- bool hasArabicIndicDigit(StringRef str) const
+ bool hasLocalDigit(StringRef str) const
{
+ if (!str.size)
+ return false;
// In most cases, the first byte is a digit.
char c = reinterpret_cast<char>(str.data[0]);
if ('0' <= c && c <= '9')
@@ -159,11 +176,26 @@ private:
return true;
}
+ char toAsciiDigit(char32_t c) const {
+ // In Thai and Persian, dates typically do not use the Gregorian calendar.
+ // This may cause failures in unix_timestamp parsing.
+ if (c >= 0x0660 && c <= 0x0669)
+ return static_cast<char>(c - 0x0660 + '0');
+ else if (c >= 0x06F0 && c <= 0x06F9)
+ return static_cast<char>(c - 0x06F0 + '0');
+ else if (c >= 0x0966 && c <= 0x096F)
+ return static_cast<char>(c - 0x0966 + '0');
+ else if (c >= 0x0E50 && c <= 0x0E59)
+ return static_cast<char>(c - 0x0E50 + '0');
+ else if (c >= 0x17E0 && c <= 0x17E9)
+ return static_cast<char>(c - 0x17E0 + '0');
+ else if (c >= 0x09E6 && c <= 0x09EF)
+ return static_cast<char>(c - 0x09E6 + '0');
+ else
+ return 0;
+ }
- bool isArabicIndicDigit(char32_t c) const { return c >= 0x0660 && c <=
0x0669; }
- char toAsciiDigit(char32_t c) const { return static_cast<char>(c - 0x0660
+ '0'); }
-
- String convertArabicIndicDigit(const StringRef & str) const
+ String convertLocalDigit(const StringRef & str) const
{
std::string result;
result.reserve(str.size);
@@ -191,8 +223,9 @@ private:
cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) |
((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F);
i += 4;
}
- if (isArabicIndicDigit(cp))
- result.push_back(toAsciiDigit(cp));
+ auto local_digit = toAsciiDigit(cp);
+ if (local_digit)
+ result.push_back(local_digit);
else
result.push_back(cp);
}
@@ -201,8 +234,8 @@ private:
};
using namespace DB;
-REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate)
+REGISTER_FUNCTION(LocalDigitToAsciiDigitForDate)
{
- factory.registerFunction<ArabicIndicToAsciiDigitForDateFunction>();
+ factory.registerFunction<LocalDigitsToAsciiDigitForDateFunction>();
}
}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
index 53e6a0e6e6..f5c3ac0d71 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -61,7 +61,7 @@ public:
auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
if (parsed_args.size() != 2)
throw
DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}
requires exactly two arguments", getName());
- const auto * expr_arg = convertArabicIndicDigit(actions_dag,
parsed_args[0]);
+ const auto * expr_arg = convertLocalDigit(actions_dag, parsed_args[0]);
const auto * fmt_arg = parsed_args[1];
const auto & args = substrait_func.arguments();
@@ -130,9 +130,9 @@ private:
}
}
- const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG &
actions_dag, const DB::ActionsDAG::Node * node) const
+ const DB::ActionsDAG::Node * convertLocalDigit(DB::ActionsDAG &
actions_dag, const DB::ActionsDAG::Node * node) const
{
- const auto * func_node = toFunctionNode(actions_dag,
"arabic_indic_to_ascii_digit_for_date", {node});
+ const auto * func_node = toFunctionNode(actions_dag,
"local_digit_to_ascii_digit_for_date", {node});
return func_node;
}
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]