github-actions[bot] commented on code in PR #60892:
URL: https://github.com/apache/doris/pull/60892#discussion_r2931632741
##########
be/src/vec/functions/function_string.h:
##########
@@ -2163,44 +2158,143 @@ class FunctionSplitByString : public IFunction {
}
}
- void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars&
column_string_chars,
- ColumnString::Offsets& column_string_offsets,
- ColumnArray::Offset64& string_pos,
- ColumnArray::Offset64& dest_pos) const {
+ static void split_empty_delimiter(const StringRef& str_ref,
+ ColumnString::Chars& column_string_chars,
+ ColumnString::Offsets&
column_string_offsets,
+ ColumnArray::Offset64& string_pos,
+ ColumnArray::Offset64& dest_pos, Int32
limit_value) {
const size_t old_size = column_string_chars.size();
const size_t new_size = old_size + str_ref.size;
column_string_chars.resize(new_size);
memcpy(column_string_chars.data() + old_size, str_ref.data,
str_ref.size);
- if (simd::VStringFunctions::is_ascii(str_ref)) {
- const auto size = str_ref.size;
-
- const auto nested_old_size = column_string_offsets.size();
- const auto nested_new_size = nested_old_size + size;
- column_string_offsets.resize(nested_new_size);
- std::iota(column_string_offsets.data() + nested_old_size,
- column_string_offsets.data() + nested_new_size,
string_pos + 1);
-
- string_pos += size;
- dest_pos += size;
- // The above code is equivalent to the code in the following
comment.
- // for (size_t i = 0; i < str_ref.size; i++) {
- // string_pos++;
- // column_string_offsets.push_back(string_pos);
- // (*dest_nested_null_map).push_back(false);
- // dest_pos++;
- // }
+
+ if (limit_value > 0) {
+ // With limit: split character by character up to limit-1, then
remainder
+ Int32 split_count = 0;
+ size_t i = 0;
+ if (simd::VStringFunctions::is_ascii(str_ref)) {
+ for (; i < str_ref.size; i++) {
+ if (split_count == limit_value - 1) {
+ // remainder
+ string_pos += str_ref.size - i;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ return;
+ }
+ string_pos++;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ split_count++;
+ }
+ } else {
+ for (size_t utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
+ utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
+ if (split_count == limit_value - 1) {
+ // remainder
+ string_pos += str_ref.size - i;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ return;
+ }
+ string_pos += utf8_char_len;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ split_count++;
+ }
+ }
} else {
- for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
- utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
+ // No limit: original behavior
+ if (simd::VStringFunctions::is_ascii(str_ref)) {
+ const auto size = str_ref.size;
+
+ const auto nested_old_size = column_string_offsets.size();
+ const auto nested_new_size = nested_old_size + size;
+ column_string_offsets.resize(nested_new_size);
+ std::iota(column_string_offsets.data() + nested_old_size,
+ column_string_offsets.data() + nested_new_size,
string_pos + 1);
+
+ string_pos += size;
+ dest_pos += size;
+ } else {
+ for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
+ utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
- string_pos += utf8_char_len;
- column_string_offsets.push_back(string_pos);
- dest_pos++;
+ string_pos += utf8_char_len;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ }
}
}
}
};
+struct SplitByStringTwoArgImpl {
+ static DataTypes get_variadic_argument_types() {
+ return {std::make_shared<DataTypeString>(),
std::make_shared<DataTypeString>()};
+ }
+
+ static Status execute_impl(FunctionContext* /*context*/, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t input_rows_count) {
+ DCHECK_EQ(arguments.size(), 2);
+ return SplitByStringExecutor::execute_core(block, arguments, result,
input_rows_count, -1);
+ }
+};
+
+struct SplitByStringThreeArgImpl {
+ static DataTypes get_variadic_argument_types() {
+ return {std::make_shared<DataTypeString>(),
std::make_shared<DataTypeString>(),
+ std::make_shared<DataTypeInt32>()};
+ }
+
+ static Status execute_impl(FunctionContext* /*context*/, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t input_rows_count) {
+ DCHECK_EQ(arguments.size(), 3);
+ const auto& [limit_column, limit_is_const] =
+ unpack_if_const(block.get_by_position(arguments[2]).column);
+ DCHECK(limit_is_const) << "limit argument of split_by_string must be a
constant";
+ auto limit_value = assert_cast<const
ColumnInt32&>(*limit_column).get_element(0);
Review Comment:
**Issue [Medium]**: `DCHECK(limit_is_const)` disappears in RELEASE builds.
If this invariant is ever violated in production, the code would silently read
`get_element(0)` from a non-const column, using the first row's value as the
limit for all rows — producing **silently wrong results** rather than crashing.
While the FE enforces this constraint via `checkLegalityAfterRewrite()`,
defensive checks in BE should not rely solely on FE validation. Per project
standards, `DCHECK` should not be used for invariants that, when violated,
would cause silent data corruption.
Suggestion: Replace with `DORIS_CHECK` or add a proper error return:
```cpp
if (!limit_is_const) {
return Status::InvalidArgument("limit argument of split_by_string must
be a constant");
}
```
Note: The rest of the file uses `DCHECK` consistently (file-wide technical
debt), but this particular invariant violation would cause silent wrong results
rather than just suboptimal behavior, making it higher priority to protect.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]