This is an automated email from the ASF dual-hosted git repository.

zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a2daa074d3f [fix](be) Report string column overflow on checked append paths (#63183)
a2daa074d3f is described below

commit a2daa074d3fca7b84be5edc454230734eeabad9e
Author: zclllyybb <[email protected]>
AuthorDate: Mon May 18 10:14:42 2026 +0800

    [fix](be) Report string column overflow on checked append paths (#63183)
    
    String column overflow can be triggered when ordinary vectorized string
    output grows past the 32-bit `ColumnString` offset limit. The previous
    widening-to-`ColumnString64` approach is unsafe for ordinary
    operator/function outputs because many downstream STRING paths still
    assume `ColumnString` and are not prepared for `DataTypeString` backed
    by `ColumnString64`.
    
    This PR changes the boundary:
    
    - Checked `ColumnString` append paths stay on 32-bit `ColumnString` instead
    of widening to `ColumnString64`. When an append would exceed the effective
    string byte limit, they throw `STRING_OVERFLOW_IN_VEC_ENGINE` with
    size/limit details.
    - `string_overflow_size` keeps the production default `4294967295`
    (2^32 - 1 bytes), so the normal limit remains just under 4 GiB. Tests lower
    it dynamically to exercise the overflow branch without allocating multi-GB
    columns.
    - Existing owned accumulator paths that deliberately call
    `insert_range_from_ignore_overflow()` and later
    `convert_column_if_overflow()` are left intact, because those are the
    safe places to replace owned mutable columns.
---
 be/src/core/column/column_string.h                 | 12 ++-
 be/test/core/block/block_test.cpp                  | 50 ++++++++++++
 be/test/core/column/column_string_test.cpp         | 26 +++++-
 ...is_25531_string_overflow_fault_injection.groovy | 95 ++++++++++++++++++++++
 4 files changed, 179 insertions(+), 4 deletions(-)

diff --git a/be/src/core/column/column_string.h 
b/be/src/core/column/column_string.h
index 4bf6e3ad95d..db0301dc059 100644
--- a/be/src/core/column/column_string.h
+++ b/be/src/core/column/column_string.h
@@ -31,6 +31,7 @@
 
 #include "common/cast_set.h"
 #include "common/compiler_util.h" // IWYU pragma: keep
+#include "common/config.h"
 #include "common/exception.h"
 #include "common/status.h"
 #include "core/assert_cast.h"
@@ -65,11 +66,16 @@ public:
 
     void static check_chars_length(size_t total_length, size_t element_number, 
size_t rows = 0) {
         if constexpr (std::is_same_v<T, UInt32>) {
-            if (UNLIKELY(total_length > MAX_STRING_SIZE)) {
+            size_t max_string_size = MAX_STRING_SIZE;
+            if (config::string_overflow_size > 0 &&
+                config::string_overflow_size < 
static_cast<int64_t>(MAX_STRING_SIZE)) {
+                max_string_size = 
static_cast<size_t>(config::string_overflow_size);
+            }
+            if (UNLIKELY(total_length > max_string_size)) {
                 throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE,
                                 "string column length is too large: 
total_length={}, "
-                                "element_number={}, rows={}",
-                                total_length, element_number, rows);
+                                "limit={}, element_number={}, rows={}",
+                                total_length, max_string_size, element_number, 
rows);
             }
         }
     }
diff --git a/be/test/core/block/block_test.cpp 
b/be/test/core/block/block_test.cpp
index 1bb930bb15a..fa10c13f887 100644
--- a/be/test/core/block/block_test.cpp
+++ b/be/test/core/block/block_test.cpp
@@ -29,6 +29,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <initializer_list>
 #include <memory>
 #include <string>
 #include <vector>
@@ -64,6 +65,7 @@
 #include "runtime/descriptor_helper.h"
 #include "runtime/descriptors.h"
 #include "testutil/column_helper.h"
+#include "util/defer_op.h"
 
 namespace doris {
 
@@ -127,6 +129,17 @@ static void fill_block_with_array_string(Block& block) {
     block.insert(test_array_string);
 }
 
+static Block create_string_block(std::initializer_list<std::string> values) {
+    auto column = ColumnString::create();
+    for (const auto& value : values) {
+        column->insert_data(value.data(), value.size());
+    }
+
+    Block block;
+    block.insert({std::move(column), std::make_shared<DataTypeString>(), "s"});
+    return block;
+}
+
 void serialize_and_deserialize_test(segment_v2::CompressionTypePB 
compression_type) {
     // int
     {
@@ -953,6 +966,43 @@ TEST(BlockTest, clear_blocks) {
     }
 }
 
+TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) {
+    auto input_block = create_string_block({"abcde", "fghij"});
+    auto output_block = create_string_block({});
+
+    auto string_overflow_size = config::string_overflow_size;
+    config::string_overflow_size = 9;
+    Defer defer([string_overflow_size]() { config::string_overflow_size = 
string_overflow_size; });
+
+    MutableBlock mutable_block(&output_block);
+    auto status = mutable_block.merge(input_block);
+    ASSERT_FALSE(status.ok());
+    EXPECT_NE(status.to_string().find("string column length is too large"), 
std::string::npos)
+            << status.to_string();
+
+    ASSERT_EQ(output_block.rows(), 0);
+    ASSERT_FALSE(output_block.get_by_position(0).column->is_column_string64());
+}
+
+TEST(BlockTest, merge_ignore_overflow_keeps_owned_accumulation_convertible) {
+    auto input_block = create_string_block({"abcde", "fghij"});
+    auto output_block = create_string_block({});
+
+    auto string_overflow_size = config::string_overflow_size;
+    config::string_overflow_size = 9;
+    Defer defer([string_overflow_size]() { config::string_overflow_size = 
string_overflow_size; });
+
+    MutableBlock mutable_block(&output_block);
+    auto status = mutable_block.merge_ignore_overflow(input_block);
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    auto converted_column = 
mutable_block.get_column_by_position(0)->convert_column_if_overflow();
+    ASSERT_TRUE(converted_column->is_column_string64());
+    ASSERT_EQ(converted_column->size(), 2);
+    EXPECT_EQ(converted_column->get_data_at(0).to_string(), "abcde");
+    EXPECT_EQ(converted_column->get_data_at(1).to_string(), "fghij");
+}
+
 TEST(BlockTest, replace_by_position) {
     auto block = ColumnHelper::create_block<DataTypeInt32>({1, 2, 3});
     block.insert(0, 
ColumnHelper::create_column_with_name<DataTypeString>({"a", "b", "c"}));
diff --git a/be/test/core/column/column_string_test.cpp 
b/be/test/core/column/column_string_test.cpp
index 399188f046f..fe5927ba009 100644
--- a/be/test/core/column/column_string_test.cpp
+++ b/be/test/core/column/column_string_test.cpp
@@ -33,6 +33,7 @@
 #include "core/string_ref.h"
 #include "core/types.h"
 #include "exprs/function/function_string_concat.h"
+#include "util/defer_op.h"
 
 using namespace doris;
 namespace doris {
@@ -854,6 +855,29 @@ TEST_F(ColumnStringTest, 
insert_range_from_ignore_overflow) {
     
column_string_common_test(assert_column_vector_insert_range_from_ignore_overflow_callback,
                               false);
 }
+
+TEST_F(ColumnStringTest, checked_insert_reports_configured_string_overflow) {
+    auto source = ColumnString::create();
+    source->insert_data("abcde", 5);
+    source->insert_data("fghij", 5);
+
+    auto string_overflow_size = config::string_overflow_size;
+    config::string_overflow_size = 9;
+    Defer defer([string_overflow_size]() { config::string_overflow_size = 
string_overflow_size; });
+
+    auto checked_target = ColumnString::create();
+    EXPECT_THROW(checked_target->insert_range_from(*source, 0, 
source->size()), Exception);
+
+    auto accumulator_target = ColumnString::create();
+    EXPECT_NO_THROW(
+            accumulator_target->insert_range_from_ignore_overflow(*source, 0, 
source->size()));
+    auto converted = accumulator_target->convert_column_if_overflow();
+    ASSERT_TRUE(converted->is_column_string64());
+    ASSERT_EQ(converted->size(), source->size());
+    EXPECT_EQ(converted->get_data_at(0).to_string(), "abcde");
+    EXPECT_EQ(converted->get_data_at(1).to_string(), "fghij");
+}
+
 TEST_F(ColumnStringTest, insert_indices_from) {
     auto test_func = [](auto& target_column, const auto& source_column) {
         // Test case 1: Empty source column
@@ -1510,4 +1534,4 @@ TEST_F(ColumnStringTest, is_valid_utf8) {
     }
 }
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git 
a/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy
 
b/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy
new file mode 100644
index 00000000000..a6b5624e719
--- /dev/null
+++ 
b/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_doris_25531_string_overflow_fault_injection", "nonConcurrent") {
+    def forcedOverflowSize = "31"
+
+    def normalizeRows = { rows ->
+        rows.collect { row ->
+            row.collect { value -> value == null ? null : value.toString() }
+        }
+    }
+
+    def backendIdToBackendIP = [:]
+    def backendIdToBackendHttpPort = [:]
+    getBackendIpHttpPort(backendIdToBackendIP, backendIdToBackendHttpPort)
+
+    def originalStringOverflowSize = "4294967295"
+    if (!backendIdToBackendIP.isEmpty()) {
+        def backendId = backendIdToBackendIP.keySet()[0]
+        def (code, out, err) = show_be_config(
+                backendIdToBackendIP.get(backendId), 
backendIdToBackendHttpPort.get(backendId))
+        logger.info("show BE config: code=${code}, out=${out}, err=${err}")
+        assertEquals(0, code)
+        def configList = parseJson(out.trim())
+        for (Object entry in (List) configList) {
+            def values = (List<String>) entry
+            if (values[0] == "string_overflow_size") {
+                originalStringOverflowSize = values[2]
+                break
+            }
+        }
+    }
+
+    def expectStringOverflow = { query ->
+        test {
+            sql query
+            check { result, exception, startTime, endTime ->
+                assert exception != null: "Expected query to fail with string 
overflow"
+                def details = exception.toString()
+                logger.info("Expected string overflow exception: 
${details}".toString())
+                assert details.contains("string column length is too large")
+            }
+        }
+    }
+
+    def overflowQuery = """
+        SELECT repeat(v, 20)
+        FROM test_doris_25531_string_overflow_error
+        ORDER BY 1
+    """
+
+    sql """ DROP TABLE IF EXISTS test_doris_25531_string_overflow_error """
+    sql """
+        CREATE TABLE test_doris_25531_string_overflow_error (
+            k INT,
+            v STRING
+        )
+        DUPLICATE KEY(k)
+        DISTRIBUTED BY HASH(k) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """
+        INSERT INTO test_doris_25531_string_overflow_error VALUES
+            (1, 'a'),
+            (2, 'b')
+    """
+
+    try {
+        update_all_be_config("string_overflow_size", forcedOverflowSize)
+
+        assertEquals([["a"], ["b"]],
+                normalizeRows(sql("SELECT v FROM 
test_doris_25531_string_overflow_error ORDER BY k")))
+        expectStringOverflow(overflowQuery)
+    } finally {
+        update_all_be_config("string_overflow_size", 
originalStringOverflowSize)
+    }
+
+    assertEquals([["aaaaaaaaaaaaaaaaaaaa"], ["bbbbbbbbbbbbbbbbbbbb"]], 
normalizeRows(sql(overflowQuery)))
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to