github-actions[bot] commented on code in PR #64651:
URL: https://github.com/apache/doris/pull/64651#discussion_r3448557010


##########
be/src/exec/sink/vtablet_block_convertor.cpp:
##########
@@ -298,33 +299,81 @@ Status 
OlapTableBlockConvertor::_internal_validate_column(RuntimeState* state, B
                 null_map = tmp_column_ptr == nullptr ? nullptr
                                                      : 
tmp_column_ptr->get_null_map_data().data();
             }
+            bool need_manual_truncation = false;
             for (size_t j = 0; j < row_count; ++j) {
                 auto row = rows ? (*rows)[j] : j;
                 if (need_to_validate(j, row, filter_map, null_map)) {
                     auto str_val = column_string->get_data_at(j);
                     bool invalid = str_val.size > limit;
                     if (invalid) {
-                        if (str_val.size > len) {
-                            fmt::format_to(error_msg, "{}",
-                                           "the length of input is too long 
than schema. ");
-                            fmt::format_to(error_msg, "first 32 bytes of input 
str: [{}] ",
-                                           str_val.to_prefix(32));
-                            fmt::format_to(error_msg, "schema length: {}; ", 
len);
-                            fmt::format_to(error_msg, "actual length: {}; ", 
str_val.size);
-                        } else if (str_val.size > limit) {
-                            fmt::format_to(
-                                    error_msg, "{}",
-                                    "the length of input string is too long 
than vec schema. ");
-                            fmt::format_to(error_msg, "first 32 bytes of input 
str: [{}] ",
-                                           str_val.to_prefix(32));
-                            fmt::format_to(error_msg, "schema length: {}; ", 
len);
-                            fmt::format_to(error_msg, "limit length: {}; ", 
limit);
-                            fmt::format_to(error_msg, "actual length: {}; ", 
str_val.size);
+                        if (!state->enable_insert_strict() && type_str) {
+                            // In non-strict mode, the string will be 
truncated at a valid
+                            // UTF-8 character boundary within the byte limit 
(see below).
+                            need_manual_truncation = true;
+                        } else {
+                            if (str_val.size > len) {
+                                fmt::format_to(error_msg, "{}",
+                                               "the length of input is too 
long than schema. ");
+                                fmt::format_to(error_msg, "first 32 bytes of 
input str: [{}] ",
+                                               str_val.to_prefix(32));
+                                fmt::format_to(error_msg, "schema length: {}; 
", len);
+                                fmt::format_to(error_msg, "actual length: {}; 
", str_val.size);
+                            } else if (str_val.size > limit) {
+                                fmt::format_to(
+                                        error_msg, "{}",
+                                        "the length of input string is too 
long than vec schema. ");
+                                fmt::format_to(error_msg, "first 32 bytes of 
input str: [{}] ",
+                                               str_val.to_prefix(32));
+                                fmt::format_to(error_msg, "schema length: {}; 
", len);
+                                fmt::format_to(error_msg, "limit length: {}; 
", limit);
+                                fmt::format_to(error_msg, "actual length: {}; 
", str_val.size);
+                            }
+                            
RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
                         }
-                        RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
                     }
                 }
             }

Review Comment:
   This changes user-visible load semantics, but the PR adds neither a 
regression test nor a unit test. The reported failure is specific and could 
easily regress: a non-strict load/insert into `VARCHAR(32)` with 
`${jnd${upper:ı}:ldap://test.comxxxxxx}` or `中123456789012345678901234567890` 
should succeed and store a value whose byte length does not exceed 32, while 
strict mode should still reject the row. Please add a regression test, with 
its generated `.out` file, for this converter path before merging.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to