floation-cutie commented on code in PR #60892:
URL: https://github.com/apache/doris/pull/60892#discussion_r2936288925


##########
be/src/exprs/function/function_string.h:
##########
@@ -2163,44 +2158,143 @@ class FunctionSplitByString : public IFunction {
         }
     }
 
-    void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& 
column_string_chars,
-                               ColumnString::Offsets& column_string_offsets,
-                               ColumnArray::Offset64& string_pos,
-                               ColumnArray::Offset64& dest_pos) const {
+    static void split_empty_delimiter(const StringRef& str_ref,
+                                      ColumnString::Chars& column_string_chars,
+                                      ColumnString::Offsets& 
column_string_offsets,
+                                      ColumnArray::Offset64& string_pos,
+                                      ColumnArray::Offset64& dest_pos, Int32 
limit_value) {
         const size_t old_size = column_string_chars.size();
         const size_t new_size = old_size + str_ref.size;
         column_string_chars.resize(new_size);
         memcpy(column_string_chars.data() + old_size, str_ref.data, 
str_ref.size);
-        if (simd::VStringFunctions::is_ascii(str_ref)) {
-            const auto size = str_ref.size;
-
-            const auto nested_old_size = column_string_offsets.size();
-            const auto nested_new_size = nested_old_size + size;
-            column_string_offsets.resize(nested_new_size);
-            std::iota(column_string_offsets.data() + nested_old_size,
-                      column_string_offsets.data() + nested_new_size, 
string_pos + 1);
-
-            string_pos += size;
-            dest_pos += size;
-            // The above code is equivalent to the code in the following 
comment.
-            // for (size_t i = 0; i < str_ref.size; i++) {
-            //     string_pos++;
-            //     column_string_offsets.push_back(string_pos);
-            //     (*dest_nested_null_map).push_back(false);
-            //     dest_pos++;
-            // }
+
+        if (limit_value > 0) {
+            // With limit: split character by character up to limit-1, then 
remainder
+            Int32 split_count = 0;
+            size_t i = 0;
+            if (simd::VStringFunctions::is_ascii(str_ref)) {
+                for (; i < str_ref.size; i++) {
+                    if (split_count == limit_value - 1) {
+                        // remainder
+                        string_pos += str_ref.size - i;
+                        column_string_offsets.push_back(string_pos);
+                        dest_pos++;
+                        return;
+                    }
+                    string_pos++;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                    split_count++;
+                }
+            } else {
+                for (size_t utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
+                    utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
+                    if (split_count == limit_value - 1) {
+                        // remainder
+                        string_pos += str_ref.size - i;
+                        column_string_offsets.push_back(string_pos);
+                        dest_pos++;
+                        return;
+                    }
+                    string_pos += utf8_char_len;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                    split_count++;
+                }
+            }
         } else {
-            for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
-                utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
+            // No limit: original behavior
+            if (simd::VStringFunctions::is_ascii(str_ref)) {
+                const auto size = str_ref.size;
+
+                const auto nested_old_size = column_string_offsets.size();
+                const auto nested_new_size = nested_old_size + size;
+                column_string_offsets.resize(nested_new_size);
+                std::iota(column_string_offsets.data() + nested_old_size,
+                          column_string_offsets.data() + nested_new_size, 
string_pos + 1);
+
+                string_pos += size;
+                dest_pos += size;
+            } else {
+                for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
+                    utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
 
-                string_pos += utf8_char_len;
-                column_string_offsets.push_back(string_pos);
-                dest_pos++;
+                    string_pos += utf8_char_len;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                }
             }
         }
     }
 };
 
+struct SplitByStringTwoArgImpl {
+    static DataTypes get_variadic_argument_types() {
+        return {std::make_shared<DataTypeString>(), 
std::make_shared<DataTypeString>()};
+    }
+
+    static Status execute_impl(FunctionContext* /*context*/, Block& block,
+                               const ColumnNumbers& arguments, uint32_t result,
+                               size_t input_rows_count) {
+        DCHECK_EQ(arguments.size(), 2);
+        return SplitByStringExecutor::execute_core(block, arguments, result, 
input_rows_count, -1);
+    }
+};
+
+struct SplitByStringThreeArgImpl {
+    static DataTypes get_variadic_argument_types() {
+        return {std::make_shared<DataTypeString>(), 
std::make_shared<DataTypeString>(),
+                std::make_shared<DataTypeInt32>()};
+    }
+
+    static Status execute_impl(FunctionContext* /*context*/, Block& block,
+                               const ColumnNumbers& arguments, uint32_t result,
+                               size_t input_rows_count) {
+        DCHECK_EQ(arguments.size(), 3);
+        const auto& [limit_column, limit_is_const] =
+                unpack_if_const(block.get_by_position(arguments[2]).column);
+        DCHECK(limit_is_const) << "limit argument of split_by_string must be a 
constant";
+        auto limit_value = assert_cast<const 
ColumnInt32&>(*limit_column).get_element(0);

Review Comment:
   > The `DCHECK(limit_is_const)` here is incorrect. when 
`use_default_implementation_for_constants()` returns true (the default) and all 
arguments are constant, unwrap every ColumnConst into its underlying data 
column before calling execute_impl. In that case limit_is_const will be false, 
causing the DCHECK to failed, so just remove it.
   
   Having removed the DCHECK statement



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to