This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new e6ba4aec29d branch-4.0: [refactor](load) replace table data dump with single-line JSON format #56073 (#56417)
e6ba4aec29d is described below
commit e6ba4aec29d09420cb8208c2739fac7a59fb1fdf
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Sep 27 09:52:08 2025 +0800
branch-4.0: [refactor](load) replace table data dump with single-line JSON format #56073 (#56417)
Cherry-picked from #56073
Co-authored-by: Kaijie Chen <[email protected]>
---
.../exec/group_commit_block_sink_operator.cpp | 6 +-
be/src/vec/core/block.cpp | 79 ++++++++++++++++++++++
be/src/vec/core/block.h | 4 ++
be/src/vec/sink/vtablet_finder.cpp | 4 +-
be/test/vec/core/block_test.cpp | 19 ++++++
5 files changed, 107 insertions(+), 5 deletions(-)
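For context: dump_data() renders the offending row as a multi-line ASCII table, which splits the "no partition for this tuple" warning across several log lines, while dump_data_json() keeps the whole row on one line. A minimal sketch of the new call-site behavior, assuming block points to a populated vectorized::Block with columns id(Int32) and name(String) (setup omitted; exact value formatting depends on each type's to_string(), and values are written verbatim between double quotes):

    // Hypothetical call site mirroring the changed sinks below.
    LOG(WARNING) << "no partition for this tuple. tuple="
                 << block->dump_data_json(row_index, 1);
    // Possible single-line output (illustrative only):
    // [{"id(Int32)":"7","name(String)":"doris"}]
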
diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
index afce2bf80b7..db5de4da5e0 100644
--- a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
+++ b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
@@ -352,7 +352,7 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc
if (local_state._partitions[row_index] == nullptr) [[unlikely]] {
local_state._filter_bitmap.Set(row_index, true);
LOG(WARNING) << "no partition for this tuple. tuple="
- << block->dump_data(row_index, 1);
+ << block->dump_data_json(row_index, 1);
local_state._has_filtered_rows = true;
state->update_num_rows_load_filtered(1);
state->update_num_rows_load_total(-1);
@@ -362,8 +362,8 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc
[]() -> std::string { return ""; },
[&]() -> std::string {
fmt::memory_buffer buf;
- fmt::format_to(buf, "no partition for this tuple. tuple=\n{}",
- block->dump_data(row_index, 1));
+ fmt::format_to(buf, "no partition for this tuple. tuple={}",
+ block->dump_data_json(row_index, 1));
return fmt::to_string(buf);
}));
}
diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp
index 2e7b62ca657..db8314ca3c8 100644
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@@ -527,6 +527,56 @@ std::string Block::dump_types() const {
return out;
}
+std::string Block::dump_data_json(size_t begin, size_t row_limit, bool allow_null_mismatch) const {
+ std::stringstream ss;
+
+ std::vector<std::string> headers;
+ headers.reserve(columns());
+ for (const auto& it : data) {
+ // Header is "name(type)", e.g. "id(Int32)", built with the {fmt} library.
+ headers.push_back(fmt::format("{}({})", it.name, it.type->get_name()));
+ }
+
+ size_t start_row = std::min(begin, rows());
+ size_t end_row = std::min(rows(), begin + row_limit);
+
+ ss << "[";
+ for (size_t row_num = start_row; row_num < end_row; ++row_num) {
+ if (row_num > start_row) {
+ ss << ",";
+ }
+ ss << "{";
+ for (size_t i = 0; i < columns(); ++i) {
+ if (i > 0) {
+ ss << ",";
+ }
+ ss << "\"" << headers[i] << "\":";
+ std::string s;
+
+ // The value-extraction logic below mirrors dump_data() so that the two
+ // dump formats stay consistent, especially for nullability mismatches.
+ if (data[i].column && data[i].type->is_nullable() && !data[i].column->is_concrete_nullable()) {
+ // Nullable type backed by a non-concrete-nullable column; as in dump_data(),
+ // this mismatch is only tolerated when allow_null_mismatch is true.
+ assert(allow_null_mismatch);
+ s = assert_cast<const DataTypeNullable*>(data[i].type.get())->get_nested_type()->to_string(*data[i].column, row_num);
+ } else {
+ // Standard path: to_string() handles all cases, including null values
+ // (e.g. by returning "NULL").
+ s = data[i].to_string(row_num);
+ }
+ ss << "\"" << s << "\"";
+ }
+ ss << "}";
+ }
+ ss << "]";
+ return ss.str();
+}
+
std::string Block::dump_data(size_t begin, size_t row_limit, bool allow_null_mismatch) const {
std::vector<std::string> headers;
std::vector<int> headers_size;
@@ -1133,6 +1183,35 @@ Block MutableBlock::to_block(int start_column, int end_column) {
return {columns_with_schema};
}
+std::string MutableBlock::dump_data_json(size_t row_limit) const {
+ std::stringstream ss;
+ std::vector<std::string> headers;
+
+ headers.reserve(columns());
+ for (size_t i = 0; i < columns(); ++i) {
+ headers.push_back(_data_types[i]->get_name());
+ }
+ size_t num_rows_to_dump = std::min(rows(), row_limit);
+ ss << "[";
+ for (size_t row_num = 0; row_num < num_rows_to_dump; ++row_num) {
+ if (row_num > 0) {
+ ss << ",";
+ }
+ ss << "{";
+ for (size_t i = 0; i < columns(); ++i) {
+ if (i > 0) {
+ ss << ",";
+ }
+ ss << "\"" << headers[i] << "\":";
+ std::string s = _data_types[i]->to_string(*_columns[i].get(), row_num);
+ ss << "\"" << s << "\"";
+ }
+ ss << "}";
+ }
+ ss << "]";
+ return ss.str();
+}
+
std::string MutableBlock::dump_data(size_t row_limit) const {
std::vector<std::string> headers;
std::vector<int> headers_size;
diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h
index ab6caeb5097..609ad0af6a6 100644
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@@ -264,6 +264,9 @@ public:
std::string dump_data(size_t begin = 0, size_t row_limit = 100,
bool allow_null_mismatch = false) const;
+ std::string dump_data_json(size_t begin = 0, size_t row_limit = 100,
+ bool allow_null_mismatch = false) const;
+
/** Get one line data from block, only use in load data */
std::string dump_one_line(size_t row, int column_end) const;
@@ -617,6 +620,7 @@ public:
void erase(const String& name);
std::string dump_data(size_t row_limit = 100) const;
+ std::string dump_data_json(size_t row_limit = 100) const;
void clear() {
_columns.clear();
diff --git a/be/src/vec/sink/vtablet_finder.cpp b/be/src/vec/sink/vtablet_finder.cpp
index b0c54e295a4..834d1f31d74 100644
--- a/be/src/vec/sink/vtablet_finder.cpp
+++ b/be/src/vec/sink/vtablet_finder.cpp
@@ -58,8 +58,8 @@ Status OlapTabletFinder::find_tablets(RuntimeState* state, Block* block, int row
[]() -> std::string { return ""; },
[&]() -> std::string {
fmt::memory_buffer buf;
- fmt::format_to(buf, "no partition for this tuple. tuple=\n{}",
- block->dump_data(row_index, 1));
+ fmt::format_to(buf, "no partition for this tuple. tuple={}",
+ block->dump_data_json(row_index, 1));
return fmt::to_string(buf);
}));
continue;
diff --git a/be/test/vec/core/block_test.cpp b/be/test/vec/core/block_test.cpp
index 6a0efe76a17..eebc25d9ccc 100644
--- a/be/test/vec/core/block_test.cpp
+++ b/be/test/vec/core/block_test.cpp
@@ -308,6 +308,7 @@ void serialize_and_deserialize_test(segment_v2::CompressionTypePB compression_ty
block_to_pb(block2, &pblock2, compression_type);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(s1, s2);
+ EXPECT_GT(block.dump_data_json().size(), 1);
}
}
@@ -332,6 +333,7 @@ void serialize_and_deserialize_test_one() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -357,6 +359,7 @@ void serialize_and_deserialize_test_int() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -380,6 +383,7 @@ void serialize_and_deserialize_test_int() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -404,6 +408,7 @@ void serialize_and_deserialize_test_long() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -427,6 +432,7 @@ void serialize_and_deserialize_test_long() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -451,6 +457,7 @@ void serialize_and_deserialize_test_string() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -475,6 +482,7 @@ void serialize_and_deserialize_test_string() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -502,6 +510,7 @@ void serialize_and_deserialize_test_nullable() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -526,6 +535,7 @@ void serialize_and_deserialize_test_nullable() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -571,6 +581,7 @@ void serialize_and_deserialize_test_nullable() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -596,6 +607,7 @@ void serialize_and_deserialize_test_decimal() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
@@ -619,6 +631,7 @@ void serialize_and_deserialize_test_decimal() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(block.dump_data(), block2.dump_data());
+ EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
EXPECT_EQ(s1, s2);
}
}
@@ -678,6 +691,7 @@ void serialize_and_deserialize_test_bitmap() {
static_cast<void>(block2.deserialize(pblock));
std::string bb2 = block2.dump_data(0, 1024);
EXPECT_EQ(bb1, bb2);
+ EXPECT_EQ(block.dump_data_json(0, 1024), block2.dump_data_json(0, 1024));
PBlock pblock2;
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
std::string s2 = pblock2.DebugString();
@@ -700,6 +714,7 @@ void serialize_and_deserialize_test_array() {
block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
std::string s2 = pblock2.DebugString();
EXPECT_EQ(s1, s2);
+ EXPECT_GT(block.dump_data_json().size(), 1);
}
}
@@ -790,6 +805,7 @@ TEST(BlockTest, dump_data) {
vectorized::Block block({test_int, test_string, test_decimal, test_nullable_int32, test_date,
test_datetime, test_date_v2});
EXPECT_GT(block.dump_data().size(), 1);
+ EXPECT_GT(block.dump_data_json().size(), 1);
// test dump array int and array string
vectorized::Block block1;
@@ -797,6 +813,7 @@ TEST(BlockTest, dump_data) {
fill_block_with_array_string(block1);
// Note: here we should set 'row_num' in dump_data
EXPECT_GT(block1.dump_data(10).size(), 1);
+ EXPECT_GT(block1.dump_data_json(10).size(), 1);
vectorized::IColumn::Filter filter;
int size = block1.rows() / 2;
@@ -1362,6 +1379,8 @@ TEST(BlockTest, others) {
vectorized::MutableBlock mutable_block(&block);
auto dumped = mutable_block.dump_data();
ASSERT_GT(dumped.size(), 0) << "Dumped data size: " << dumped.size();
+ auto dumped_json = mutable_block.dump_data_json();
+ ASSERT_GT(dumped_json.size(), 0) << "Dumped data json size: " << dumped_json.size();
mutable_block.clear_column_data();
ASSERT_EQ(mutable_block.get_column_by_position(0)->size(), 0);
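For reference, the MutableBlock overload added in block.h keys each field by the type name only (MutableBlock::dump_data_json() builds its headers from _data_types[i]->get_name(), with no column name). A rough usage sketch, assuming block is any populated vectorized::Block (setup omitted; the example output is illustrative, not asserted by the tests above):

    // Wrap an existing Block and dump up to 10 rows as single-line JSON.
    vectorized::MutableBlock mutable_block(&block);
    std::string json = mutable_block.dump_data_json(/*row_limit=*/10);
    // e.g. [{"Int32":"1","String":"a"},{"Int32":"2","String":"b"}]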
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]