SUBSTITUDE commented on code in PR #64651:
URL: https://github.com/apache/doris/pull/64651#discussion_r3448768921
##########
be/src/exec/sink/vtablet_block_convertor.cpp:
##########
@@ -298,33 +299,81 @@ Status
OlapTableBlockConvertor::_internal_validate_column(RuntimeState* state, B
null_map = tmp_column_ptr == nullptr ? nullptr
:
tmp_column_ptr->get_null_map_data().data();
}
+ bool need_manual_truncation = false;
for (size_t j = 0; j < row_count; ++j) {
auto row = rows ? (*rows)[j] : j;
if (need_to_validate(j, row, filter_map, null_map)) {
auto str_val = column_string->get_data_at(j);
bool invalid = str_val.size > limit;
if (invalid) {
- if (str_val.size > len) {
- fmt::format_to(error_msg, "{}",
- "the length of input is too long
than schema. ");
- fmt::format_to(error_msg, "first 32 bytes of input
str: [{}] ",
- str_val.to_prefix(32));
- fmt::format_to(error_msg, "schema length: {}; ",
len);
- fmt::format_to(error_msg, "actual length: {}; ",
str_val.size);
- } else if (str_val.size > limit) {
- fmt::format_to(
- error_msg, "{}",
- "the length of input string is too long
than vec schema. ");
- fmt::format_to(error_msg, "first 32 bytes of input
str: [{}] ",
- str_val.to_prefix(32));
- fmt::format_to(error_msg, "schema length: {}; ",
len);
- fmt::format_to(error_msg, "limit length: {}; ",
limit);
- fmt::format_to(error_msg, "actual length: {}; ",
str_val.size);
+ if (!state->enable_insert_strict() && type_str) {
+ // In non-strict mode, the string will be
truncated at a valid
+ // UTF-8 character boundary within the byte limit
(see below).
+ need_manual_truncation = true;
+ } else {
+ if (str_val.size > len) {
+ fmt::format_to(error_msg, "{}",
+ "the length of input is too
long than schema. ");
+ fmt::format_to(error_msg, "first 32 bytes of
input str: [{}] ",
+ str_val.to_prefix(32));
+ fmt::format_to(error_msg, "schema length: {};
", len);
+ fmt::format_to(error_msg, "actual length: {};
", str_val.size);
+ } else if (str_val.size > limit) {
+ fmt::format_to(
+ error_msg, "{}",
+ "the length of input string is too
long than vec schema. ");
+ fmt::format_to(error_msg, "first 32 bytes of
input str: [{}] ",
+ str_val.to_prefix(32));
+ fmt::format_to(error_msg, "schema length: {};
", len);
+ fmt::format_to(error_msg, "limit length: {};
", limit);
+ fmt::format_to(error_msg, "actual length: {};
", str_val.size);
+ }
+
RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
}
- RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
}
}
}
Review Comment:
Thanks for the review! I've added comprehensive regression tests covering
all the requested scenarios:
Test file:
regression-test/suites/load_p0/stream_load/test_utf8_varchar_truncation.groovy
Test 1 — Non-strict mode (proves the reported bug examples now succeed):
- 10 rows mixing ASCII, Chinese (3-byte UTF-8), and Turkish U+0131 (2-byte
  UTF-8) characters
- All 10 rows load successfully, with 0 rows filtered
- Each truncated value is verified with an exact assertion:
  - "中1234567890" (13B) → "中1234567" (10B) — direct reproduction of #64334
  - "abcıdefghij" (12B) → "abcıdefgh" (10B) — direct reproduction of #64334
  - "正常数据" (12B) → "正常数" (9B) — walk-back across a continuation byte
Test 2 — Strict mode (proves strict mode still rejects over-limit rows):
- The 4 rows within the limit are loaded; the 6 over-limit rows are
  correctly filtered
- An ErrorURL is present in the response, confirming proper rejection
The test data covers all of these edge cases:
- A multi-byte character ending exactly at the truncation boundary (no
  walk-back)
- A multi-byte character crossing the boundary (walk-back required)
- Multiple consecutive multi-byte characters
- Strings exactly at the byte limit
Please re-review when you have a chance. Thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]