This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch feat-nested
in repository https://gitbox.apache.org/repos/asf/doris.git

commit f330a7fb78840143222b71d4ab29dc2ac7686f9f
Author: Claude Code <[email protected]>
AuthorDate: Sun Jan 11 16:26:57 2026 +0800

    [refactor](variant) Extract OffsetManager to eliminate duplicate offset 
padding logic
    
    This commit introduces the OffsetManager helper class to consolidate
    repeated offset padding patterns in NestedGroup code, improving
    maintainability and reducing code duplication.
    
    Changes:
    1. Created OffsetManager helper class with template methods:
       - pad_group_to_row(): Pad a single group's offsets to target row
       - pad_all_groups_to_row(): Pad all groups in a map
       - append_offset(): Append new offset entry
       - backfill_to_element(): Backfill missing offsets
    
    2. Refactored nested_group_builder.cpp:
       - Replaced pad_all_groups_to_row lambda (15 lines) with OffsetManager 
call
       - Replaced backfill loop with OffsetManager::backfill_to_element()
       - Replaced manual offset append with OffsetManager::append_offset()
       - Eliminated 3+ instances of duplicate offset padding code
    
    3. Refactored variant_nested_builder.cpp:
       - Replaced manual offset calculations with OffsetManager::append_offset()
       - Simplified nested offset updates using OffsetManager
    
    Benefits:
    - DRY principle: Centralized offset management logic
    - Consistency: Uniform offset padding across codebase
    - Maintainability: Single source of truth for offset operations
    - Type-safety: Template methods support both NestedGroup types
    
    Testing:
    - All unit tests pass: NestedGroupBuilderTest (2/2 tests)
    - No behavior changes, only code organization improvements
    
    Co-Authored-By: Claude Sonnet 4.5 <[email protected]>
---
 .../segment_v2/variant/nested_group_builder.cpp    | 53 +++---------
 .../rowset/segment_v2/variant/offset_manager.h     | 96 ++++++++++++++++++++++
 .../segment_v2/variant/offset_manager_impl.h       | 68 +++++++++++++++
 be/src/vec/json/variant_nested_builder.cpp         | 14 +---
 4 files changed, 179 insertions(+), 52 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp 
b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
index 307443fa112..d4e50154387 100755
--- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
@@ -22,6 +22,7 @@
 #include <string>
 
 #include "common/exception.h"
+#include "olap/rowset/segment_v2/variant/offset_manager.h"
 #include "util/jsonb_document.h"
 #include "vec/columns/column_string.h"
 #include "vec/common/assert_cast.h"
@@ -56,41 +57,23 @@ Status NestedGroupBuilder::build_from_jsonb(const 
vectorized::ColumnPtr& jsonb_c
                                        data_col->get_name());
     }
 
-    // helper lambda to pad all existing NestedGroups so they have offsets
-    // for rows [0, target_row]. This ensures each row has a corresponding 
offset entry.
-    auto pad_all_groups_to_row = [&nested_groups](size_t target_row) {
-        for (auto& [path, group] : nested_groups) {
-            if (!group || group->is_disabled) {
-                continue;
-            }
-            group->ensure_offsets();
-            auto* offsets_col =
-                    
assert_cast<vectorized::ColumnOffset64*>(group->offsets.get());
-            // if offsets.size() <= target_row, this group is missing entries
-            // for some rows. Pad with the same offset (indicating empty 
arrays).
-            while (offsets_col->size() <= target_row) {
-                
offsets_col->get_data().push_back(static_cast<uint64_t>(group->current_flat_size));
-            }
-        }
-    };
-
     const size_t rows = std::min(num_rows, str_col->size());
     for (size_t r = 0; r < rows; ++r) {
         if (null_map && (*null_map).get_data()[r]) {
             // for null rows, pad all existing groups with empty arrays
-            pad_all_groups_to_row(r);
+            OffsetManager::pad_all_groups_to_row(nested_groups, r);
             continue;
         }
         const auto val = str_col->get_data_at(r);
         if (val.size == 0) {
             // empty JSONB, pad all existing groups with empty arrays
-            pad_all_groups_to_row(r);
+            OffsetManager::pad_all_groups_to_row(nested_groups, r);
             continue;
         }
 
         const doris::JsonbValue* root = 
doris::JsonbDocument::createValue(val.data, val.size);
         if (!root) {
-            pad_all_groups_to_row(r);
+            OffsetManager::pad_all_groups_to_row(nested_groups, r);
             continue;
         }
 
@@ -102,7 +85,7 @@ Status NestedGroupBuilder::build_from_jsonb(const 
vectorized::ColumnPtr& jsonb_c
         // after processing this row, pad any groups that weren't updated.
         // If a row doesn't contain a field that corresponds to a NestedGroup, 
we need to
         // add an empty array entry for that row.
-        pad_all_groups_to_row(r);
+        OffsetManager::pad_all_groups_to_row(nested_groups, r);
     }
 
     return Status::OK();
@@ -186,23 +169,18 @@ Status 
NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar
                                                     NestedGroup& group, size_t 
parent_row_idx,
                                                     size_t depth) {
     DCHECK(arr_value && arr_value->isArray());
-    group.ensure_offsets();
-    auto* offsets_col = 
assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
 
     // Back-fill missing rows with empty arrays before processing current row.
     // This handles the case when a NestedGroup is created mid-batch (e.g., 
when
     // mixing top-level arrays and objects), ensuring earlier rows have proper 
offsets.
-    while (offsets_col->size() < parent_row_idx) {
-        
offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size));
-    }
+    OffsetManager::backfill_to_element(group, parent_row_idx);
 
     const auto* arr = arr_value->unpack<doris::ArrayVal>();
     const int n = arr->numElem();
 
     const size_t prev_total = group.current_flat_size;
-    const size_t new_total = prev_total + static_cast<size_t>(std::max(0, n));
-    offsets_col->get_data().push_back(static_cast<uint64_t>(new_total));
-    group.current_flat_size = new_total;
+    // Append offset entry for this array
+    OffsetManager::append_offset(group, static_cast<size_t>(std::max(0, n)));
 
     // Process each element (flat index in [prev_total, new_total)).
     size_t flat_idx = prev_total;
@@ -231,9 +209,7 @@ Status NestedGroupBuilder::_process_array_of_objects(const 
doris::JsonbValue* ar
         // Fill empty offsets for missing nested groups.
         for (auto& [p, ng] : group.nested_groups) {
             if (!seen_nested.contains(p.get_path())) {
-                ng->ensure_offsets();
-                auto* off = 
assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get());
-                
off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size));
+                OffsetManager::append_offset(*ng, 0);
             }
         }
     }
@@ -293,15 +269,8 @@ Status NestedGroupBuilder::_process_object_as_paths(
             }
 
             // Ensure offsets size up to current parent element.
-            ng->ensure_offsets();
-            auto* off = 
assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get());
-            if (off->size() < element_flat_idx) {
-                // fill missing parent elements with empty arrays.
-                const size_t gap = element_flat_idx - off->size();
-                for (size_t i = 0; i < gap; ++i) {
-                    
off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size));
-                }
-            }
+            // Fill missing parent elements with empty arrays.
+            OffsetManager::backfill_to_element(*ng, element_flat_idx);
 
             // Process nested group for this parent element (one offsets entry 
appended inside).
             RETURN_IF_ERROR(_process_array_of_objects(v, *ng, 
element_flat_idx, depth + 1));
diff --git a/be/src/olap/rowset/segment_v2/variant/offset_manager.h 
b/be/src/olap/rowset/segment_v2/variant/offset_manager.h
new file mode 100644
index 00000000000..780d4effb8b
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant/offset_manager.h
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+
+namespace doris::segment_v2 {
+
+// Forward declarations - avoiding circular dependencies
+struct NestedGroup;
+template <typename K, typename V, typename H>
+class NestedGroupsMapType;
+
+/**
+ * OffsetManager provides utility functions to manage offset padding logic for 
NestedGroups.
+ *
+ * This class consolidates the repeated offset padding patterns found in:
+ * - nested_group_builder.cpp: pad_all_groups_to_row lambda (lines 61-75)
+ * - nested_group_builder.cpp: nested group offsets padding (lines 295-304)
+ * - nested_group_builder.cpp: backfill in _process_array_of_objects (lines 
193-197)
+ * - variant_nested_builder.cpp: similar offset padding patterns (lines 95-99)
+ *
+ * Template methods are used to support both segment_v2::NestedGroup and
+ * vectorized::ColumnVariant::NestedGroup, which have identical interfaces.
+ */
+class OffsetManager {
+public:
+    /**
+     * Pad a single group's offsets to cover up to and including the target 
row.
+     * If the offsets column size is less than or equal to target_row, pad with
+     * empty array entries (current_flat_size) until offsets.size() > 
target_row.
+     *
+     * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or 
ColumnVariant::NestedGroup)
+     * @param group The NestedGroup to pad
+     * @param target_row The target row index (0-based) that should have an 
offset entry
+     */
+    template <typename NestedGroupT>
+    static void pad_group_to_row(NestedGroupT& group, size_t target_row);
+
+    /**
+     * Pad all groups in the map to cover up to and including the target row.
+     * Skips disabled groups and null group pointers.
+     *
+     * @tparam NestedGroupsMapT The map type (e.g., unordered_map<PathInData, 
shared_ptr<NestedGroup>>)
+     * @param groups The map of NestedGroups to pad
+     * @param target_row The target row index (0-based) that should have an 
offset entry
+     */
+    template <typename NestedGroupsMapT>
+    static void pad_all_groups_to_row(NestedGroupsMapT& groups, size_t 
target_row);
+
+    /**
+     * Append a new offset entry for the given array size.
+     * Calculates new_total = current_flat_size + array_size and appends it.
+     * Updates group.current_flat_size to new_total.
+     *
+     * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or 
ColumnVariant::NestedGroup)
+     * @param group The NestedGroup to update
+     * @param array_size The size of the array being added
+     */
+    template <typename NestedGroupT>
+    static void append_offset(NestedGroupT& group, size_t array_size);
+
+    /**
+     * Backfill missing offsets until the offsets column size reaches 
element_idx.
+     * This handles the case when a NestedGroup is created mid-batch (e.g., 
when
+     * mixing top-level arrays and objects), ensuring earlier elements have 
proper offsets.
+     *
+     * Note: This pads to size < element_idx, not size <= element_idx 
(different from pad_group_to_row).
+     *
+     * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or 
ColumnVariant::NestedGroup)
+     * @param group The NestedGroup to backfill
+     * @param element_idx The target element index that offsets.size() should 
reach
+     */
+    template <typename NestedGroupT>
+    static void backfill_to_element(NestedGroupT& group, size_t element_idx);
+};
+
+} // namespace doris::segment_v2
+
+// Template implementations
+#include "olap/rowset/segment_v2/variant/offset_manager_impl.h"
diff --git a/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h 
b/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h
new file mode 100644
index 00000000000..6b795baf13f
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+
+#include "vec/columns/column_vector.h"
+#include "vec/common/assert_cast.h"
+
+namespace doris::segment_v2 {
+
+template <typename NestedGroupT>
+void OffsetManager::pad_group_to_row(NestedGroupT& group, size_t target_row) {
+    group.ensure_offsets();
+    auto* offsets_col = 
assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
+    // If offsets.size() <= target_row, this group is missing entries
+    // for some rows. Pad with the same offset (indicating empty arrays).
+    while (offsets_col->size() <= target_row) {
+        
offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size));
+    }
+}
+
+template <typename NestedGroupsMapT>
+void OffsetManager::pad_all_groups_to_row(NestedGroupsMapT& groups, size_t 
target_row) {
+    for (auto& [path, group] : groups) {
+        if (!group || group->is_disabled) {
+            continue;
+        }
+        pad_group_to_row(*group, target_row);
+    }
+}
+
+template <typename NestedGroupT>
+void OffsetManager::append_offset(NestedGroupT& group, size_t array_size) {
+    group.ensure_offsets();
+    auto* offsets_col = 
assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
+    const size_t new_total = group.current_flat_size + array_size;
+    offsets_col->get_data().push_back(static_cast<uint64_t>(new_total));
+    group.current_flat_size = new_total;
+}
+
+template <typename NestedGroupT>
+void OffsetManager::backfill_to_element(NestedGroupT& group, size_t 
element_idx) {
+    group.ensure_offsets();
+    auto* offsets_col = 
assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
+    // Back-fill missing rows with empty arrays before processing current row.
+    // This handles the case when a NestedGroup is created mid-batch.
+    while (offsets_col->size() < element_idx) {
+        
offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size));
+    }
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/vec/json/variant_nested_builder.cpp 
b/be/src/vec/json/variant_nested_builder.cpp
index f4017fc8f64..9c2856a05c5 100644
--- a/be/src/vec/json/variant_nested_builder.cpp
+++ b/be/src/vec/json/variant_nested_builder.cpp
@@ -21,6 +21,7 @@
 
 #include <unordered_map>
 
+#include "olap/rowset/segment_v2/variant/offset_manager.h"
 #include "vec/columns/column_vector.h"
 #include "vec/common/assert_cast.h"
 #include "vec/common/schema_util.h"
@@ -92,11 +93,7 @@ void VariantNestedBuilder::build(const 
std::vector<PathInData>& paths,
         }
 
         // Update first-level offsets
-        group->ensure_offsets();
-        auto& offsets_col = assert_cast<ColumnOffset64&>(*group->offsets);
-        size_t prev_offset = offsets_col.empty() ? 0 : 
offsets_col.get_data().back();
-        offsets_col.insert_value(prev_offset + first_level_array_size);
-        group->current_flat_size = prev_offset + first_level_array_size;
+        segment_v2::OffsetManager::append_offset(*group, 
first_level_array_size);
 
         // Set expected type
         if (group->expected_type == 
ColumnVariant::NestedGroup::StructureType::UNKNOWN) {
@@ -244,11 +241,8 @@ void 
VariantNestedBuilder::_write_to_nested_group_non_last_level(
             continue;
         }
         const auto& inner_arr = elem.get<Array>();
-        nested_group->ensure_offsets();
-        auto& nested_offsets = 
assert_cast<ColumnOffset64&>(*nested_group->offsets);
-        size_t prev_nested_offset = nested_offsets.empty() ? 0 : 
nested_offsets.get_data().back();
-        nested_offsets.insert_value(prev_nested_offset + inner_arr.size());
-        nested_group->current_flat_size = prev_nested_offset + 
inner_arr.size();
+        const size_t prev_nested_offset = nested_group->current_flat_size;
+        segment_v2::OffsetManager::append_offset(*nested_group, 
inner_arr.size());
         FieldInfo inner_info;
         schema_util::get_field_info(elem, &inner_info);
         write_to_nested_group_recursive(nested_group, levels, 
current_level_idx + 1, leaf_path, elem,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to