This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new f5b8f317208 branch-4.0: [fix](load) fix multi byte char load #56353 (#56517)
f5b8f317208 is described below

commit f5b8f3172084776f4718077c518595ee399812fc
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Sep 27 21:47:28 2025 +0800

    branch-4.0: [fix](load) fix multi byte char load #56353 (#56517)
    
    Cherry-picked from #56353
    
    Co-authored-by: TengJianPing <[email protected]>
---
 be/src/vec/sink/vtablet_block_convertor.cpp        | 70 +++++++++++--------
 .../test_insert_strict_mode_and_filter_ratio.out   | 12 ++++
 ...test_insert_strict_mode_and_filter_ratio.groovy | 78 ++++++++++++++++++++++
 3 files changed, 131 insertions(+), 29 deletions(-)

diff --git a/be/src/vec/sink/vtablet_block_convertor.cpp b/be/src/vec/sink/vtablet_block_convertor.cpp
index c88de1a5dd7..521febba99a 100644
--- a/be/src/vec/sink/vtablet_block_convertor.cpp
+++ b/be/src/vec/sink/vtablet_block_convertor.cpp
@@ -235,35 +235,12 @@ Status OlapTableBlockConvertor::_internal_validate_column(
         }
 
         if (invalid_count) {
-            if (state->enable_insert_strict()) {
-                for (size_t j = 0; j < row_count; ++j) {
-                    auto row = rows ? (*rows)[j] : j;
-                    if (need_to_validate(j, row)) {
-                        auto str_val = column_string->get_data_at(j);
-                        bool invalid = str_val.size > limit;
-                        if (invalid) {
-                            if (str_val.size > len) {
-                                fmt::format_to(error_msg, "{}",
-                                               "the length of input is too long than schema. ");
-                                fmt::format_to(error_msg, "first 32 bytes of input str: [{}] ",
-                                               str_val.to_prefix(32));
-                                fmt::format_to(error_msg, "schema length: {}; ", len);
-                                fmt::format_to(error_msg, "actual length: {}; ", str_val.size);
-                            } else if (str_val.size > limit) {
-                                fmt::format_to(
-                                        error_msg, "{}",
-                                        "the length of input string is too long than vec schema. ");
-                                fmt::format_to(error_msg, "first 32 bytes of input str: [{}] ",
-                                               str_val.to_prefix(32));
-                                fmt::format_to(error_msg, "schema length: {}; ", len);
-                                fmt::format_to(error_msg, "limit length: {}; ", limit);
-                                fmt::format_to(error_msg, "actual length: {}; ", str_val.size);
-                            }
-                            RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
-                        }
-                    }
-                }
-            } else if (type_str) {
+            // For string column, if in non-strict load mode(for both insert stmt and stream load),
+            // truncate the string to schema len.
+            // After truncation, still need to check if byte len of each row exceed the schema len,
+            // because currently the schema len is defined in bytes, and substring works by unit of chars.
+            // This is a workaround for now, need to improve it after better support of multi-byte chars.
+            if (type_str && !state->enable_insert_strict()) {
                 ColumnsWithTypeAndName argument_template;
                 auto pos_type = DataTypeFactory::instance().create_data_type(
                         FieldType::OLAP_FIELD_TYPE_INT, 0, 0);
@@ -287,6 +264,41 @@ Status OlapTableBlockConvertor::_internal_validate_column(
                 RETURN_IF_ERROR(func->execute(nullptr, tmp_block, {0, 1, 2}, 3, row_count));
                 block->get_by_position(slot_index).column =
                         std::move(tmp_block.get_by_position(3).column);
+                const auto* tmp_column_ptr =
+                        vectorized::check_and_get_column<vectorized::ColumnNullable>(
+                                *block->get_by_position(slot_index).column);
+                const auto& tmp_real_column_ptr =
+                        tmp_column_ptr == nullptr ? block->get_by_position(slot_index).column
+                                                  : (tmp_column_ptr->get_nested_column_ptr());
+                column_string =
+                        assert_cast<const vectorized::ColumnString*>(tmp_real_column_ptr.get());
+            }
+            for (size_t j = 0; j < row_count; ++j) {
+                auto row = rows ? (*rows)[j] : j;
+                if (need_to_validate(j, row)) {
+                    auto str_val = column_string->get_data_at(j);
+                    bool invalid = str_val.size > limit;
+                    if (invalid) {
+                        if (str_val.size > len) {
+                            fmt::format_to(error_msg, "{}",
+                                           "the length of input is too long than schema. ");
+                            fmt::format_to(error_msg, "first 32 bytes of input str: [{}] ",
+                                           str_val.to_prefix(32));
+                            fmt::format_to(error_msg, "schema length: {}; ", len);
+                            fmt::format_to(error_msg, "actual length: {}; ", str_val.size);
+                        } else if (str_val.size > limit) {
+                            fmt::format_to(
+                                    error_msg, "{}",
+                                    "the length of input string is too long than vec schema. ");
+                            fmt::format_to(error_msg, "first 32 bytes of input str: [{}] ",
+                                           str_val.to_prefix(32));
+                            fmt::format_to(error_msg, "schema length: {}; ", len);
+                            fmt::format_to(error_msg, "limit length: {}; ", limit);
+                            fmt::format_to(error_msg, "actual length: {}; ", str_val.size);
+                        }
+                        RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
+                    }
+                }
             }
         }
         return Status::OK();
diff --git a/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out b/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
index 1f431345896..7872fa4bef8 100644
--- a/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
+++ b/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
@@ -78,3 +78,15 @@
 
 -- !sql_string_exceed_len_strict1 --
 
+-- !sql_mb_string_exceed_len_non_strict0 --
+
+-- !sql_mb_string_exceed_len_non_strict1 --
+1      a
+2      b
+3      c
+4      d
+5      e
+6      f
+
+-- !sql_mb_string_exceed_len_strict0 --
+
diff --git a/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy b/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
index 893dfc5dec0..5fc9a00d086 100644
--- a/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
+++ b/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
@@ -406,4 +406,82 @@ suite("test_insert_strict_mode_and_filter_ratio","p0") {
         exception """url"""
     }
     qt_sql_string_exceed_len_strict1 "select * from test_insert_strict_mode_and_filter_ratio order by 1"
+
+    // TODO: change the following test case when BE support mbstring length check
+    // 6 test Chinese char
+    // 6.1 string exceed schema length, enable_insert_strict=false, insert_max_filter_ratio=0.3, load fail
+    sql """
+        drop table if exists test_insert_strict_mode_and_filter_ratio;
+    """
+    sql """
+        create table test_insert_strict_mode_and_filter_ratio (
+          id int,
+          name char(1)
+        ) properties ('replication_num' = '1');
+    """
+    sql "set enable_insert_strict=false"
+    sql "set enable_strict_cast=true"
+    sql "set insert_max_filter_ratio=0.3"
+    test {
+        sql """
+        insert into test_insert_strict_mode_and_filter_ratio  values
+            (1, "a"),
+            (2, "b"),
+            (3, "c"),
+            (4, "d"),
+            (5, "e"),
+            (6, "f"),
+            (7, "宅z"),
+            (8, "兹z"),
+            (9, "中z"),
+            (10, "国g");
+        """
+        exception """Insert has too many filtered data"""
+    }
+    qt_sql_mb_string_exceed_len_non_strict0 "select * from test_insert_strict_mode_and_filter_ratio order by 1"
+
+    // 6.2 string exceed schema length, enable_insert_strict=false, insert_max_filter_ratio=0.4, load success
+    sql """
+        truncate table test_insert_strict_mode_and_filter_ratio;
+    """
+    sql "set insert_max_filter_ratio=0.4"
+    sql """
+    insert into test_insert_strict_mode_and_filter_ratio  values
+        (1, "a"),
+        (2, "b"),
+        (3, "c"),
+        (4, "d"),
+        (5, "e"),
+        (6, "f"),
+        (7, "宅z"),
+        (8, "兹z"),
+        (9, "中z"),
+        (10, "国g");
+    """
+    qt_sql_mb_string_exceed_len_non_strict1 "select * from test_insert_strict_mode_and_filter_ratio order by 1"
+
+    // 6.3 string exceed schema length, enable_insert_strict=true, insert_max_filter_ratio=1, load fail
+    sql """
+        truncate table test_insert_strict_mode_and_filter_ratio;
+    """
+    sql "set enable_insert_strict=true"
+    sql "set enable_strict_cast=false"
+    sql "set insert_max_filter_ratio=1"
+    test {
+        sql """
+        insert into test_insert_strict_mode_and_filter_ratio  values
+            (1, "a"),
+            (2, "b"),
+            (3, "c"),
+            (4, "d"),
+            (5, "e"),
+            (6, "f"),
+            (7, "宅z"),
+            (8, "兹z"),
+            (9, "中z"),
+            (10, "国g");
+        """
+        exception """Insert has filtered data in strict mode"""
+    }
+    qt_sql_mb_string_exceed_len_strict0 "select * from test_insert_strict_mode_and_filter_ratio order by 1"
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to