This is an automated email from the ASF dual-hosted git repository.
gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b4673069223 [chore](column string) add is_ascii function in ColumnStr (#51283)
b4673069223 is described below
commit b4673069223f3d3a4d08071b1796ff1360ba27a0
Author: Mryange <[email protected]>
AuthorDate: Tue Jun 3 14:57:35 2025 +0800
[chore](column string) add is_ascii function in ColumnStr (#51283)
---
be/src/util/simd/vstring_function.h | 1 +
be/src/vec/columns/column_string.cpp | 7 ++++++-
be/src/vec/columns/column_string.h | 2 ++
be/src/vec/functions/function_string.h | 17 +++--------------
be/test/vec/columns/column_string_test.cpp | 29 +++++++++++++++++++++++++++++
5 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h
index c554c6b0e9a..b883e427838 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -203,6 +203,7 @@ public:
}
// Gcc will do auto simd in this function
+ // if input empty, return true
static bool is_ascii(const StringRef& str) {
#ifdef __AVX2__
return validate_ascii_fast_avx(str.data, str.size);
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index 591cd7143d2..4ac88f9187f 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -27,13 +27,13 @@
#include "runtime/primitive_type.h"
#include "util/memcpy_inlined.h"
#include "util/simd/bits.h"
+#include "util/simd/vstring_function.h"
#include "vec/columns/columns_common.h"
#include "vec/common/arena.h"
#include "vec/common/assert_cast.h"
#include "vec/common/memcmp_small.h"
#include "vec/common/unaligned.h"
#include "vec/core/sort_block.h"
-
namespace doris::vectorized {
#include "common/compile_check_begin.h"
@@ -777,6 +777,11 @@ void ColumnStr<T>::insert(const Field& x) {
sanity_check_simple();
}
+template <typename T>
+bool ColumnStr<T>::is_ascii() const {
+ return simd::VStringFunctions::is_ascii(StringRef(chars.data(),
chars.size()));
+}
+
template class ColumnStr<uint32_t>;
template class ColumnStr<uint64_t>;
} // namespace doris::vectorized
diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index 49b3497cb66..52cd4c95ff5 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -502,6 +502,8 @@ public:
return typeid(rhs) == typeid(ColumnStr<T>);
}
+ bool is_ascii() const;
+
Chars& get_chars() { return chars; }
const Chars& get_chars() const { return chars; }
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 7d72d43b3d8..7d6b2fcf552 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -3767,9 +3767,7 @@ struct SubReplaceImpl {
std::visit(
[&](auto origin_str_const, auto new_str_const, auto
start_const, auto len_const) {
- if (simd::VStringFunctions::is_ascii(
- StringRef {data_column->get_chars().data(),
- data_column->get_chars().size()})) {
+ if (data_column->is_ascii()) {
vector_ascii<origin_str_const, new_str_const,
start_const, len_const>(
data_column, mask_column,
start_column->get_data(),
length_column->get_data(),
args_null_map->get_data(), result_column,
@@ -4355,11 +4353,7 @@ public:
ColumnString::MutablePtr col_res = ColumnString::create();
// if all input string is ascii, we can use ascii function to handle it
- const bool is_all_ascii =
- simd::VStringFunctions::is_ascii(StringRef
{col_origin->get_chars().data(),
-
col_origin->get_chars().size()}) &&
- simd::VStringFunctions::is_ascii(
- StringRef {col_insert->get_chars().data(),
col_insert->get_chars().size()});
+ const bool is_all_ascii = col_origin->is_ascii() &&
col_insert->is_ascii();
std::visit(
[&](auto origin_const, auto pos_const, auto len_const, auto
insert_const) {
if (is_all_ascii) {
@@ -4627,12 +4621,7 @@ public:
const auto* col_from = assert_cast<const
ColumnString*>(argument_columns[1].get());
const auto* col_to = assert_cast<const
ColumnString*>(argument_columns[2].get());
- bool is_ascii = simd::VStringFunctions::is_ascii(
- {col_source->get_chars().data(),
col_source->get_chars().size()}) &&
- simd::VStringFunctions::is_ascii(
- {col_from->get_chars().data(),
col_from->get_chars().size()}) &&
- simd::VStringFunctions::is_ascii(
- {col_to->get_chars().data(),
col_to->get_chars().size()});
+ bool is_ascii = col_source->is_ascii() && col_from->is_ascii() &&
col_to->is_ascii();
auto impl_vectors = impl_vectors_utf8<false>;
if (col_const[1] && col_const[2] && is_ascii) {
impl_vectors = impl_vectors_ascii<true>;
diff --git a/be/test/vec/columns/column_string_test.cpp b/be/test/vec/columns/column_string_test.cpp
index 9423e76bbdb..196eafa7c32 100644
--- a/be/test/vec/columns/column_string_test.cpp
+++ b/be/test/vec/columns/column_string_test.cpp
@@ -1078,4 +1078,33 @@ TEST_F(ColumnStringTest, ScalaTypeStringTest2erase) {
}
}
+TEST_F(ColumnStringTest, is_ascii) {
+ {
+ auto column = ColumnString::create();
+ std::vector<StringRef> data = {StringRef("asd"), StringRef("1234567"),
StringRef("3"),
+ StringRef("4"), StringRef("5")};
+ for (auto d : data) {
+ column->insert_data(d.data, d.size);
+ }
+ EXPECT_TRUE(column->is_ascii());
+ }
+
+ {
+ auto column = ColumnString::create();
+ std::vector<StringRef> data = {StringRef("asd"), StringRef("1234567"),
+ StringRef("3"), StringRef("4"),
+ StringRef("5"), StringRef("你好世界")};
+ for (auto d : data) {
+ column->insert_data(d.data, d.size);
+ }
+ EXPECT_FALSE(column->is_ascii());
+ }
+ {
+ auto column = ColumnString::create();
+ std::vector<StringRef> data = {StringRef(""), StringRef(""),
StringRef(""),
+ StringRef(""), StringRef(""),
StringRef("")};
+ EXPECT_TRUE(column->is_ascii());
+ }
+}
+
} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]