This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b4673069223 [chore](column string) add is_ascii  function in ColumnStr 
(#51283)
b4673069223 is described below

commit b4673069223f3d3a4d08071b1796ff1360ba27a0
Author: Mryange <[email protected]>
AuthorDate: Tue Jun 3 14:57:35 2025 +0800

    [chore](column string) add is_ascii  function in ColumnStr (#51283)
---
 be/src/util/simd/vstring_function.h        |  1 +
 be/src/vec/columns/column_string.cpp       |  7 ++++++-
 be/src/vec/columns/column_string.h         |  2 ++
 be/src/vec/functions/function_string.h     | 17 +++--------------
 be/test/vec/columns/column_string_test.cpp | 29 +++++++++++++++++++++++++++++
 5 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/be/src/util/simd/vstring_function.h 
b/be/src/util/simd/vstring_function.h
index c554c6b0e9a..b883e427838 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -203,6 +203,7 @@ public:
     }
 
     // Gcc will do auto simd in this function
+    // Returns true when the input is empty.
     static bool is_ascii(const StringRef& str) {
 #ifdef __AVX2__
         return validate_ascii_fast_avx(str.data, str.size);
diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 591cd7143d2..4ac88f9187f 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -27,13 +27,13 @@
 #include "runtime/primitive_type.h"
 #include "util/memcpy_inlined.h"
 #include "util/simd/bits.h"
+#include "util/simd/vstring_function.h"
 #include "vec/columns/columns_common.h"
 #include "vec/common/arena.h"
 #include "vec/common/assert_cast.h"
 #include "vec/common/memcmp_small.h"
 #include "vec/common/unaligned.h"
 #include "vec/core/sort_block.h"
-
 namespace doris::vectorized {
 #include "common/compile_check_begin.h"
 
@@ -777,6 +777,11 @@ void ColumnStr<T>::insert(const Field& x) {
     sanity_check_simple();
 }
 
+template <typename T>
+bool ColumnStr<T>::is_ascii() const { // one check over the whole chars buffer; empty => true
+    return simd::VStringFunctions::is_ascii({chars.data(), chars.size()});
+}
+
 template class ColumnStr<uint32_t>;
 template class ColumnStr<uint64_t>;
 } // namespace doris::vectorized
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 49b3497cb66..52cd4c95ff5 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -502,6 +502,8 @@ public:
         return typeid(rhs) == typeid(ColumnStr<T>);
     }
 
+    bool is_ascii() const;
+
     Chars& get_chars() { return chars; }
     const Chars& get_chars() const { return chars; }
 
diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 7d72d43b3d8..7d6b2fcf552 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -3767,9 +3767,7 @@ struct SubReplaceImpl {
 
         std::visit(
                 [&](auto origin_str_const, auto new_str_const, auto 
start_const, auto len_const) {
-                    if (simd::VStringFunctions::is_ascii(
-                                StringRef {data_column->get_chars().data(),
-                                           data_column->get_chars().size()})) {
+                    if (data_column->is_ascii()) {
                         vector_ascii<origin_str_const, new_str_const, 
start_const, len_const>(
                                 data_column, mask_column, 
start_column->get_data(),
                                 length_column->get_data(), 
args_null_map->get_data(), result_column,
@@ -4355,11 +4353,7 @@ public:
         ColumnString::MutablePtr col_res = ColumnString::create();
 
         // if all input string is ascii, we can use ascii function to handle it
-        const bool is_all_ascii =
-                simd::VStringFunctions::is_ascii(StringRef 
{col_origin->get_chars().data(),
-                                                            
col_origin->get_chars().size()}) &&
-                simd::VStringFunctions::is_ascii(
-                        StringRef {col_insert->get_chars().data(), 
col_insert->get_chars().size()});
+        const bool is_all_ascii = col_origin->is_ascii() && 
col_insert->is_ascii();
         std::visit(
                 [&](auto origin_const, auto pos_const, auto len_const, auto 
insert_const) {
                     if (is_all_ascii) {
@@ -4627,12 +4621,7 @@ public:
         const auto* col_from = assert_cast<const 
ColumnString*>(argument_columns[1].get());
         const auto* col_to = assert_cast<const 
ColumnString*>(argument_columns[2].get());
 
-        bool is_ascii = simd::VStringFunctions::is_ascii(
-                                {col_source->get_chars().data(), 
col_source->get_chars().size()}) &&
-                        simd::VStringFunctions::is_ascii(
-                                {col_from->get_chars().data(), 
col_from->get_chars().size()}) &&
-                        simd::VStringFunctions::is_ascii(
-                                {col_to->get_chars().data(), 
col_to->get_chars().size()});
+        bool is_ascii = col_source->is_ascii() && col_from->is_ascii() && 
col_to->is_ascii();
         auto impl_vectors = impl_vectors_utf8<false>;
         if (col_const[1] && col_const[2] && is_ascii) {
             impl_vectors = impl_vectors_ascii<true>;
diff --git a/be/test/vec/columns/column_string_test.cpp 
b/be/test/vec/columns/column_string_test.cpp
index 9423e76bbdb..196eafa7c32 100644
--- a/be/test/vec/columns/column_string_test.cpp
+++ b/be/test/vec/columns/column_string_test.cpp
@@ -1078,4 +1078,33 @@ TEST_F(ColumnStringTest, ScalaTypeStringTest2erase) {
     }
 }
 
+TEST_F(ColumnStringTest, is_ascii) {
+    // Builds a column holding one row per element of `rows`, so every case really fills it.
+    auto build = [](const std::vector<StringRef>& rows) {
+        auto column = ColumnString::create();
+        for (const auto& row : rows) {
+            column->insert_data(row.data, row.size);
+        }
+        return column;
+    };
+    // Every row is plain ASCII.
+    {
+        auto column = build({StringRef("asd"), StringRef("1234567"), StringRef("3"),
+                             StringRef("4"), StringRef("5")});
+        EXPECT_TRUE(column->is_ascii());
+    }
+    // One multi-byte UTF-8 row is enough to make the whole column non-ASCII.
+    {
+        auto column = build({StringRef("asd"), StringRef("1234567"), StringRef("3"),
+                             StringRef("4"), StringRef("5"), StringRef("你好世界")});
+        EXPECT_FALSE(column->is_ascii());
+    }
+    // Six rows that are all empty: rows are inserted, but they contribute no characters.
+    {
+        auto column = build({StringRef(""), StringRef(""), StringRef(""),
+                             StringRef(""), StringRef(""), StringRef("")});
+        EXPECT_TRUE(column->is_ascii());
+    }
+}
+
 } // namespace doris::vectorized
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to