This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new e6ba4aec29d branch-4.0: [refactor](load) replace table data dump with single-line JSON format #56073 (#56417)
e6ba4aec29d is described below

commit e6ba4aec29d09420cb8208c2739fac7a59fb1fdf
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Sep 27 09:52:08 2025 +0800

    branch-4.0: [refactor](load) replace table data dump with single-line JSON format #56073 (#56417)
    
    Cherry-picked from #56073
    
    Co-authored-by: Kaijie Chen <[email protected]>
---
 .../exec/group_commit_block_sink_operator.cpp      |  6 +-
 be/src/vec/core/block.cpp                          | 79 ++++++++++++++++++++++
 be/src/vec/core/block.h                            |  4 ++
 be/src/vec/sink/vtablet_finder.cpp                 |  4 +-
 be/test/vec/core/block_test.cpp                    | 19 ++++++
 5 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
index afce2bf80b7..db5de4da5e0 100644
--- a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
+++ b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp
@@ -352,7 +352,7 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc
             if (local_state._partitions[row_index] == nullptr) [[unlikely]] {
                 local_state._filter_bitmap.Set(row_index, true);
                 LOG(WARNING) << "no partition for this tuple. tuple="
-                             << block->dump_data(row_index, 1);
+                             << block->dump_data_json(row_index, 1);
                 local_state._has_filtered_rows = true;
                 state->update_num_rows_load_filtered(1);
                 state->update_num_rows_load_total(-1);
@@ -362,8 +362,8 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc
                         []() -> std::string { return ""; },
                         [&]() -> std::string {
                             fmt::memory_buffer buf;
-                            fmt::format_to(buf, "no partition for this tuple. tuple=\n{}",
-                                           block->dump_data(row_index, 1));
+                            fmt::format_to(buf, "no partition for this tuple. tuple={}",
+                                           block->dump_data_json(row_index, 1));
                             return fmt::to_string(buf);
                         }));
             }
diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp
index 2e7b62ca657..db8314ca3c8 100644
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@@ -527,6 +527,56 @@ std::string Block::dump_types() const {
     return out;
 }
 
+std::string Block::dump_data_json(size_t begin, size_t row_limit, bool allow_null_mismatch) const {
+    std::stringstream ss;
+
+    std::vector<std::string> headers;
+    headers.reserve(columns());
+    for (const auto& it : data) {
+        // Header is "name(type)", e.g. "k1(Int32)"; fmt::format comes from
+        // the {fmt} library already used throughout the BE.
+        headers.push_back(fmt::format("{}({})", it.name, it.type->get_name()));
+    }
+
+    size_t start_row = std::min(begin, rows());
+    size_t end_row = std::min(rows(), begin + row_limit);
+
+    ss << "[";
+    for (size_t row_num = start_row; row_num < end_row; ++row_num) {
+        if (row_num > start_row) {
+            ss << ",";
+        }
+        ss << "{";
+        for (size_t i = 0; i < columns(); ++i) {
+            if (i > 0) {
+                ss << ",";
+            }
+            ss << "\"" << headers[i] << "\":";
+            std::string s;
+
+            // Use the same value-extraction logic as dump_data so both dumps
+            // stay consistent, including the nullability-mismatch handling.
+            if (data[i].column && data[i].type->is_nullable() &&
+                !data[i].column->is_concrete_nullable()) {
+                // Nullable type backed by a non-nullable column: only allowed
+                // when the caller opts in via allow_null_mismatch.
+                assert(allow_null_mismatch);
+                s = assert_cast<const DataTypeNullable*>(data[i].type.get())
+                            ->get_nested_type()
+                            ->to_string(*data[i].column, row_num);
+            } else {
+                // Standard path: the type's to_string handles all cases,
+                // including null values.
+                s = data[i].to_string(row_num);
+            }
+            ss << "\"" << s << "\"";
+        }
+        ss << "}";
+    }
+    ss << "]";
+    return ss.str();
+}
+
 std::string Block::dump_data(size_t begin, size_t row_limit, bool allow_null_mismatch) const {
     std::vector<std::string> headers;
     std::vector<int> headers_size;
@@ -1133,6 +1183,35 @@ Block MutableBlock::to_block(int start_column, int end_column) {
     return {columns_with_schema};
 }
 
+std::string MutableBlock::dump_data_json(size_t row_limit) const {
+    std::stringstream ss;
+    std::vector<std::string> headers;
+
+    headers.reserve(columns());
+    for (size_t i = 0; i < columns(); ++i) {
+        headers.push_back(_data_types[i]->get_name());
+    }
+    size_t num_rows_to_dump = std::min(rows(), row_limit);
+    ss << "[";
+    for (size_t row_num = 0; row_num < num_rows_to_dump; ++row_num) {
+        if (row_num > 0) {
+            ss << ",";
+        }
+        ss << "{";
+        for (size_t i = 0; i < columns(); ++i) {
+            if (i > 0) {
+                ss << ",";
+            }
+            ss << "\"" << headers[i] << "\":";
+            std::string s = _data_types[i]->to_string(*_columns[i].get(), row_num);
+            ss << "\"" << s << "\"";
+        }
+        ss << "}";
+    }
+    ss << "]";
+    return ss.str();
+}
+
 std::string MutableBlock::dump_data(size_t row_limit) const {
     std::vector<std::string> headers;
     std::vector<int> headers_size;
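
For reference, a minimal standalone C++ sketch of the single-line JSON shape that
Block::dump_data_json above produces, assuming a hypothetical two-column block;
the "k1(Int32)"/"k2(String)" headers and the row values are made up for
illustration and are not taken from the patch:

    // Standalone sketch (not Doris code): mirrors the stringstream-based
    // formatting added above, producing one JSON array of row objects.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical "name(type)" headers, as built in Block::dump_data_json.
        std::vector<std::string> headers = {"k1(Int32)", "k2(String)"};
        std::vector<std::vector<std::string>> rows = {{"1", "doris"}, {"2", "apache"}};

        std::stringstream ss;
        ss << "[";
        for (size_t row = 0; row < rows.size(); ++row) {
            if (row > 0) ss << ",";
            ss << "{";
            for (size_t col = 0; col < headers.size(); ++col) {
                if (col > 0) ss << ",";
                // Both keys and values are emitted as quoted strings.
                ss << "\"" << headers[col] << "\":\"" << rows[row][col] << "\"";
            }
            ss << "}";
        }
        ss << "]";
        // Prints one line:
        // [{"k1(Int32)":"1","k2(String)":"doris"},{"k1(Int32)":"2","k2(String)":"apache"}]
        std::cout << ss.str() << std::endl;
        return 0;
    }
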
diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h
index ab6caeb5097..609ad0af6a6 100644
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@@ -264,6 +264,9 @@ public:
     std::string dump_data(size_t begin = 0, size_t row_limit = 100,
                           bool allow_null_mismatch = false) const;
 
+    std::string dump_data_json(size_t begin = 0, size_t row_limit = 100,
+                               bool allow_null_mismatch = false) const;
+
     /** Get one line data from block, only use in load data */
     std::string dump_one_line(size_t row, int column_end) const;
 
@@ -617,6 +620,7 @@ public:
     void erase(const String& name);
 
     std::string dump_data(size_t row_limit = 100) const;
+    std::string dump_data_json(size_t row_limit = 100) const;
 
     void clear() {
         _columns.clear();
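
For orientation, a hedged usage sketch of the declarations above, under the
assumption that a Block has already been built elsewhere; make_test_block() is
an invented placeholder, not a real Doris helper:

    // Hypothetical call-site sketch; make_test_block() is a placeholder.
    doris::vectorized::Block block = make_test_block();

    // Declared defaults: begin = 0, row_limit = 100, allow_null_mismatch = false.
    std::string all_rows_json = block.dump_data_json();

    // Single-row dump, matching the error-logging call sites in this patch.
    std::string one_row_json = block.dump_data_json(/*begin=*/0, /*row_limit=*/1);
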
diff --git a/be/src/vec/sink/vtablet_finder.cpp b/be/src/vec/sink/vtablet_finder.cpp
index b0c54e295a4..834d1f31d74 100644
--- a/be/src/vec/sink/vtablet_finder.cpp
+++ b/be/src/vec/sink/vtablet_finder.cpp
@@ -58,8 +58,8 @@ Status OlapTabletFinder::find_tablets(RuntimeState* state, Block* block, int row
                     []() -> std::string { return ""; },
                     [&]() -> std::string {
                         fmt::memory_buffer buf;
-                        fmt::format_to(buf, "no partition for this tuple. tuple=\n{}",
-                                       block->dump_data(row_index, 1));
+                        fmt::format_to(buf, "no partition for this tuple. tuple={}",
+                                       block->dump_data_json(row_index, 1));
                         return fmt::to_string(buf);
                     }));
             continue;
diff --git a/be/test/vec/core/block_test.cpp b/be/test/vec/core/block_test.cpp
index 6a0efe76a17..eebc25d9ccc 100644
--- a/be/test/vec/core/block_test.cpp
+++ b/be/test/vec/core/block_test.cpp
@@ -308,6 +308,7 @@ void serialize_and_deserialize_test(segment_v2::CompressionTypePB compression_ty
         block_to_pb(block2, &pblock2, compression_type);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(s1, s2);
+        EXPECT_GT(block.dump_data_json().size(), 1);
     }
 }
 
@@ -332,6 +333,7 @@ void serialize_and_deserialize_test_one() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -357,6 +359,7 @@ void serialize_and_deserialize_test_int() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -380,6 +383,7 @@ void serialize_and_deserialize_test_int() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -404,6 +408,7 @@ void serialize_and_deserialize_test_long() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -427,6 +432,7 @@ void serialize_and_deserialize_test_long() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -451,6 +457,7 @@ void serialize_and_deserialize_test_string() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -475,6 +482,7 @@ void serialize_and_deserialize_test_string() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -502,6 +510,7 @@ void serialize_and_deserialize_test_nullable() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -526,6 +535,7 @@ void serialize_and_deserialize_test_nullable() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -571,6 +581,7 @@ void serialize_and_deserialize_test_nullable() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -596,6 +607,7 @@ void serialize_and_deserialize_test_decimal() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 
@@ -619,6 +631,7 @@ void serialize_and_deserialize_test_decimal() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(block.dump_data(), block2.dump_data());
+        EXPECT_EQ(block.dump_data_json(), block2.dump_data_json());
         EXPECT_EQ(s1, s2);
     }
 }
@@ -678,6 +691,7 @@ void serialize_and_deserialize_test_bitmap() {
         static_cast<void>(block2.deserialize(pblock));
         std::string bb2 = block2.dump_data(0, 1024);
         EXPECT_EQ(bb1, bb2);
+        EXPECT_EQ(block.dump_data_json(0, 1024), block2.dump_data_json(0, 1024));
         PBlock pblock2;
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::LZ4);
         std::string s2 = pblock2.DebugString();
@@ -700,6 +714,7 @@ void serialize_and_deserialize_test_array() {
         block_to_pb(block2, &pblock2, segment_v2::CompressionTypePB::SNAPPY);
         std::string s2 = pblock2.DebugString();
         EXPECT_EQ(s1, s2);
+        EXPECT_GT(block.dump_data_json().size(), 1);
     }
 }
 
@@ -790,6 +805,7 @@ TEST(BlockTest, dump_data) {
     vectorized::Block block({test_int, test_string, test_decimal, test_nullable_int32, test_date,
                              test_datetime, test_date_v2});
     EXPECT_GT(block.dump_data().size(), 1);
+    EXPECT_GT(block.dump_data_json().size(), 1);
 
     // test dump array int and array string
     vectorized::Block block1;
@@ -797,6 +813,7 @@ TEST(BlockTest, dump_data) {
     fill_block_with_array_string(block1);
     // Note: here we should set 'row_num' in dump_data
     EXPECT_GT(block1.dump_data(10).size(), 1);
+    EXPECT_GT(block1.dump_data_json(10).size(), 1);
 
     vectorized::IColumn::Filter filter;
     int size = block1.rows() / 2;
@@ -1362,6 +1379,8 @@ TEST(BlockTest, others) {
     vectorized::MutableBlock mutable_block(&block);
     auto dumped = mutable_block.dump_data();
     ASSERT_GT(dumped.size(), 0) << "Dumped data size: " << dumped.size();
+    auto dumped_json = mutable_block.dump_data_json();
+    ASSERT_GT(dumped_json.size(), 0) << "Dumped data json size: " << dumped_json.size();
 
     mutable_block.clear_column_data();
     ASSERT_EQ(mutable_block.get_column_by_position(0)->size(), 0);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
