This is an automated email from the ASF dual-hosted git repository.
zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a2daa074d3f [fix](be) Report string column overflow on checked append paths (#63183)
a2daa074d3f is described below
commit a2daa074d3fca7b84be5edc454230734eeabad9e
Author: zclllyybb <[email protected]>
AuthorDate: Mon May 18 10:14:42 2026 +0800
[fix](be) Report string column overflow on checked append paths (#63183)
String column overflow can be triggered when ordinary vectorized string
output grows past the 32-bit `ColumnString` offset limit. The previous
widening-to-`ColumnString64` approach is unsafe for ordinary
operator/function outputs because many downstream STRING paths still
assume `ColumnString` and are not prepared for `DataTypeString` backed
by `ColumnString64`.
This PR changes the boundary:
- Checked `ColumnString` append paths keep 32-bit `ColumnString`. When
the effective string byte limit would be exceeded, they throw
`STRING_OVERFLOW_IN_VEC_ENGINE` with size/limit details.
- `string_overflow_size` keeps the production default `4294967295`, so
the normal limit remains 4GB. Tests lower it dynamically to exercise the
overflow branch without allocating multi-GB columns.
- Existing owned accumulator paths that deliberately call
`insert_range_from_ignore_overflow()` and later
`convert_column_if_overflow()` are left intact, because those are the
safe places to replace owned mutable columns.
---
be/src/core/column/column_string.h | 12 ++-
be/test/core/block/block_test.cpp | 50 ++++++++++++
be/test/core/column/column_string_test.cpp | 26 +++++-
...is_25531_string_overflow_fault_injection.groovy | 95 ++++++++++++++++++++++
4 files changed, 179 insertions(+), 4 deletions(-)
diff --git a/be/src/core/column/column_string.h b/be/src/core/column/column_string.h
index 4bf6e3ad95d..db0301dc059 100644
--- a/be/src/core/column/column_string.h
+++ b/be/src/core/column/column_string.h
@@ -31,6 +31,7 @@
#include "common/cast_set.h"
#include "common/compiler_util.h" // IWYU pragma: keep
+#include "common/config.h"
#include "common/exception.h"
#include "common/status.h"
#include "core/assert_cast.h"
@@ -65,11 +66,16 @@ public:
void static check_chars_length(size_t total_length, size_t element_number,
size_t rows = 0) {
if constexpr (std::is_same_v<T, UInt32>) {
- if (UNLIKELY(total_length > MAX_STRING_SIZE)) {
+ size_t max_string_size = MAX_STRING_SIZE;
+ if (config::string_overflow_size > 0 &&
+ config::string_overflow_size <
static_cast<int64_t>(MAX_STRING_SIZE)) {
+ max_string_size =
static_cast<size_t>(config::string_overflow_size);
+ }
+ if (UNLIKELY(total_length > max_string_size)) {
throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE,
"string column length is too large:
total_length={}, "
- "element_number={}, rows={}",
- total_length, element_number, rows);
+ "limit={}, element_number={}, rows={}",
+ total_length, max_string_size, element_number,
rows);
}
}
}
diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp
index 1bb930bb15a..fa10c13f887 100644
--- a/be/test/core/block/block_test.cpp
+++ b/be/test/core/block/block_test.cpp
@@ -29,6 +29,7 @@
#include <cmath>
#include <cstddef>
#include <cstdint>
+#include <initializer_list>
#include <memory>
#include <string>
#include <vector>
@@ -64,6 +65,7 @@
#include "runtime/descriptor_helper.h"
#include "runtime/descriptors.h"
#include "testutil/column_helper.h"
+#include "util/defer_op.h"
namespace doris {
@@ -127,6 +129,17 @@ static void fill_block_with_array_string(Block& block) {
block.insert(test_array_string);
}
+static Block create_string_block(std::initializer_list<std::string> values) {
+ auto column = ColumnString::create();
+ for (const auto& value : values) {
+ column->insert_data(value.data(), value.size());
+ }
+
+ Block block;
+ block.insert({std::move(column), std::make_shared<DataTypeString>(), "s"});
+ return block;
+}
+
void serialize_and_deserialize_test(segment_v2::CompressionTypePB
compression_type) {
// int
{
@@ -953,6 +966,43 @@ TEST(BlockTest, clear_blocks) {
}
}
+TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) {
+ auto input_block = create_string_block({"abcde", "fghij"});
+ auto output_block = create_string_block({});
+
+ auto string_overflow_size = config::string_overflow_size;
+ config::string_overflow_size = 9;
+ Defer defer([string_overflow_size]() { config::string_overflow_size =
string_overflow_size; });
+
+ MutableBlock mutable_block(&output_block);
+ auto status = mutable_block.merge(input_block);
+ ASSERT_FALSE(status.ok());
+ EXPECT_NE(status.to_string().find("string column length is too large"),
std::string::npos)
+ << status.to_string();
+
+ ASSERT_EQ(output_block.rows(), 0);
+ ASSERT_FALSE(output_block.get_by_position(0).column->is_column_string64());
+}
+
+TEST(BlockTest, merge_ignore_overflow_keeps_owned_accumulation_convertible) {
+ auto input_block = create_string_block({"abcde", "fghij"});
+ auto output_block = create_string_block({});
+
+ auto string_overflow_size = config::string_overflow_size;
+ config::string_overflow_size = 9;
+ Defer defer([string_overflow_size]() { config::string_overflow_size =
string_overflow_size; });
+
+ MutableBlock mutable_block(&output_block);
+ auto status = mutable_block.merge_ignore_overflow(input_block);
+ ASSERT_TRUE(status.ok()) << status.to_string();
+
+ auto converted_column =
mutable_block.get_column_by_position(0)->convert_column_if_overflow();
+ ASSERT_TRUE(converted_column->is_column_string64());
+ ASSERT_EQ(converted_column->size(), 2);
+ EXPECT_EQ(converted_column->get_data_at(0).to_string(), "abcde");
+ EXPECT_EQ(converted_column->get_data_at(1).to_string(), "fghij");
+}
+
TEST(BlockTest, replace_by_position) {
auto block = ColumnHelper::create_block<DataTypeInt32>({1, 2, 3});
block.insert(0,
ColumnHelper::create_column_with_name<DataTypeString>({"a", "b", "c"}));
diff --git a/be/test/core/column/column_string_test.cpp b/be/test/core/column/column_string_test.cpp
index 399188f046f..fe5927ba009 100644
--- a/be/test/core/column/column_string_test.cpp
+++ b/be/test/core/column/column_string_test.cpp
@@ -33,6 +33,7 @@
#include "core/string_ref.h"
#include "core/types.h"
#include "exprs/function/function_string_concat.h"
+#include "util/defer_op.h"
using namespace doris;
namespace doris {
@@ -854,6 +855,29 @@ TEST_F(ColumnStringTest,
insert_range_from_ignore_overflow) {
column_string_common_test(assert_column_vector_insert_range_from_ignore_overflow_callback,
false);
}
+
+TEST_F(ColumnStringTest, checked_insert_reports_configured_string_overflow) {
+ auto source = ColumnString::create();
+ source->insert_data("abcde", 5);
+ source->insert_data("fghij", 5);
+
+ auto string_overflow_size = config::string_overflow_size;
+ config::string_overflow_size = 9;
+ Defer defer([string_overflow_size]() { config::string_overflow_size =
string_overflow_size; });
+
+ auto checked_target = ColumnString::create();
+ EXPECT_THROW(checked_target->insert_range_from(*source, 0,
source->size()), Exception);
+
+ auto accumulator_target = ColumnString::create();
+ EXPECT_NO_THROW(
+ accumulator_target->insert_range_from_ignore_overflow(*source, 0,
source->size()));
+ auto converted = accumulator_target->convert_column_if_overflow();
+ ASSERT_TRUE(converted->is_column_string64());
+ ASSERT_EQ(converted->size(), source->size());
+ EXPECT_EQ(converted->get_data_at(0).to_string(), "abcde");
+ EXPECT_EQ(converted->get_data_at(1).to_string(), "fghij");
+}
+
TEST_F(ColumnStringTest, insert_indices_from) {
auto test_func = [](auto& target_column, const auto& source_column) {
// Test case 1: Empty source column
@@ -1510,4 +1534,4 @@ TEST_F(ColumnStringTest, is_valid_utf8) {
}
}
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy
new file mode 100644
index 00000000000..a6b5624e719
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_doris_25531_string_overflow_fault_injection.groovy
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_doris_25531_string_overflow_fault_injection", "nonConcurrent") {
+ def forcedOverflowSize = "31"
+
+ def normalizeRows = { rows ->
+ rows.collect { row ->
+ row.collect { value -> value == null ? null : value.toString() }
+ }
+ }
+
+ def backendIdToBackendIP = [:]
+ def backendIdToBackendHttpPort = [:]
+ getBackendIpHttpPort(backendIdToBackendIP, backendIdToBackendHttpPort)
+
+ def originalStringOverflowSize = "4294967295"
+ if (!backendIdToBackendIP.isEmpty()) {
+ def backendId = backendIdToBackendIP.keySet()[0]
+ def (code, out, err) = show_be_config(
+ backendIdToBackendIP.get(backendId),
backendIdToBackendHttpPort.get(backendId))
+ logger.info("show BE config: code=${code}, out=${out}, err=${err}")
+ assertEquals(0, code)
+ def configList = parseJson(out.trim())
+ for (Object entry in (List) configList) {
+ def values = (List<String>) entry
+ if (values[0] == "string_overflow_size") {
+ originalStringOverflowSize = values[2]
+ break
+ }
+ }
+ }
+
+ def expectStringOverflow = { query ->
+ test {
+ sql query
+ check { result, exception, startTime, endTime ->
+ assert exception != null: "Expected query to fail with string
overflow"
+ def details = exception.toString()
+ logger.info("Expected string overflow exception:
${details}".toString())
+ assert details.contains("string column length is too large")
+ }
+ }
+ }
+
+ def overflowQuery = """
+ SELECT repeat(v, 20)
+ FROM test_doris_25531_string_overflow_error
+ ORDER BY 1
+ """
+
+ sql """ DROP TABLE IF EXISTS test_doris_25531_string_overflow_error """
+ sql """
+ CREATE TABLE test_doris_25531_string_overflow_error (
+ k INT,
+ v STRING
+ )
+ DUPLICATE KEY(k)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """
+ INSERT INTO test_doris_25531_string_overflow_error VALUES
+ (1, 'a'),
+ (2, 'b')
+ """
+
+ try {
+ update_all_be_config("string_overflow_size", forcedOverflowSize)
+
+ assertEquals([["a"], ["b"]],
+ normalizeRows(sql("SELECT v FROM
test_doris_25531_string_overflow_error ORDER BY k")))
+ expectStringOverflow(overflowQuery)
+ } finally {
+ update_all_be_config("string_overflow_size",
originalStringOverflowSize)
+ }
+
+ assertEquals([["aaaaaaaaaaaaaaaaaaaa"], ["bbbbbbbbbbbbbbbbbbbb"]],
normalizeRows(sql(overflowQuery)))
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]