This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch feat-nested in repository https://gitbox.apache.org/repos/asf/doris.git
commit f330a7fb78840143222b71d4ab29dc2ac7686f9f Author: Claude Code <[email protected]> AuthorDate: Sun Jan 11 16:26:57 2026 +0800 [refactor](variant) Extract OffsetManager to eliminate duplicate offset padding logic This commit introduces the OffsetManager helper class to consolidate repeated offset padding patterns in NestedGroup code, improving maintainability and reducing code duplication. Changes: 1. Created OffsetManager helper class with template methods: - pad_group_to_row(): Pad a single group's offsets to target row - pad_all_groups_to_row(): Pad all groups in a map - append_offset(): Append new offset entry - backfill_to_element(): Backfill missing offsets 2. Refactored nested_group_builder.cpp: - Replaced pad_all_groups_to_row lambda (15 lines) with OffsetManager call - Replaced backfill loop with OffsetManager::backfill_to_element() - Replaced manual offset append with OffsetManager::append_offset() - Eliminated 3+ instances of duplicate offset padding code 3. Refactored variant_nested_builder.cpp: - Replaced manual offset calculations with OffsetManager::append_offset() - Simplified nested offset updates using OffsetManager Benefits: - DRY principle: Centralized offset management logic - Consistency: Uniform offset padding across codebase - Maintainability: Single source of truth for offset operations - Type-safety: Template methods support both NestedGroup types Testing: - All unit tests pass: NestedGroupBuilderTest (2/2 tests) - No behavior changes, only code organization improvements Co-Authored-By: Claude Sonnet 4.5 <[email protected]> --- .../segment_v2/variant/nested_group_builder.cpp | 53 +++--------- .../rowset/segment_v2/variant/offset_manager.h | 96 ++++++++++++++++++++++ .../segment_v2/variant/offset_manager_impl.h | 68 +++++++++++++++ be/src/vec/json/variant_nested_builder.cpp | 14 +--- 4 files changed, 179 insertions(+), 52 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp index 307443fa112..d4e50154387 100755 --- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp +++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp @@ -22,6 +22,7 @@ #include <string> #include "common/exception.h" +#include "olap/rowset/segment_v2/variant/offset_manager.h" #include "util/jsonb_document.h" #include "vec/columns/column_string.h" #include "vec/common/assert_cast.h" @@ -56,41 +57,23 @@ Status NestedGroupBuilder::build_from_jsonb(const vectorized::ColumnPtr& jsonb_c data_col->get_name()); } - // helper lambda to pad all existing NestedGroups so they have offsets - // for rows [0, target_row]. This ensures each row has a corresponding offset entry. - auto pad_all_groups_to_row = [&nested_groups](size_t target_row) { - for (auto& [path, group] : nested_groups) { - if (!group || group->is_disabled) { - continue; - } - group->ensure_offsets(); - auto* offsets_col = - assert_cast<vectorized::ColumnOffset64*>(group->offsets.get()); - // if offsets.size() <= target_row, this group is missing entries - // for some rows. Pad with the same offset (indicating empty arrays). - while (offsets_col->size() <= target_row) { - offsets_col->get_data().push_back(static_cast<uint64_t>(group->current_flat_size)); - } - } - }; - const size_t rows = std::min(num_rows, str_col->size()); for (size_t r = 0; r < rows; ++r) { if (null_map && (*null_map).get_data()[r]) { // for null rows, pad all existing groups with empty arrays - pad_all_groups_to_row(r); + OffsetManager::pad_all_groups_to_row(nested_groups, r); continue; } const auto val = str_col->get_data_at(r); if (val.size == 0) { // empty JSONB, pad all existing groups with empty arrays - pad_all_groups_to_row(r); + OffsetManager::pad_all_groups_to_row(nested_groups, r); continue; } const doris::JsonbValue* root = doris::JsonbDocument::createValue(val.data, val.size); if (!root) { - pad_all_groups_to_row(r); + OffsetManager::pad_all_groups_to_row(nested_groups, r); continue; } @@ -102,7 +85,7 @@ Status NestedGroupBuilder::build_from_jsonb(const vectorized::ColumnPtr& jsonb_c // after processing this row, pad any groups that weren't updated. // If a row doesn't contain a field that corresponds to a NestedGroup, we need to // add an empty array entry for that row. - pad_all_groups_to_row(r); + OffsetManager::pad_all_groups_to_row(nested_groups, r); } return Status::OK(); @@ -186,23 +169,18 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar NestedGroup& group, size_t parent_row_idx, size_t depth) { DCHECK(arr_value && arr_value->isArray()); - group.ensure_offsets(); - auto* offsets_col = assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); // Back-fill missing rows with empty arrays before processing current row. // This handles the case when a NestedGroup is created mid-batch (e.g., when // mixing top-level arrays and objects), ensuring earlier rows have proper offsets. - while (offsets_col->size() < parent_row_idx) { - offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size)); - } + OffsetManager::backfill_to_element(group, parent_row_idx); const auto* arr = arr_value->unpack<doris::ArrayVal>(); const int n = arr->numElem(); const size_t prev_total = group.current_flat_size; - const size_t new_total = prev_total + static_cast<size_t>(std::max(0, n)); - offsets_col->get_data().push_back(static_cast<uint64_t>(new_total)); - group.current_flat_size = new_total; + // Append offset entry for this array + OffsetManager::append_offset(group, static_cast<size_t>(std::max(0, n))); // Process each element (flat index in [prev_total, new_total)). size_t flat_idx = prev_total; @@ -231,9 +209,7 @@ Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* ar // Fill empty offsets for missing nested groups. for (auto& [p, ng] : group.nested_groups) { if (!seen_nested.contains(p.get_path())) { - ng->ensure_offsets(); - auto* off = assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); - off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size)); + OffsetManager::append_offset(*ng, 0); } } } @@ -293,15 +269,8 @@ Status NestedGroupBuilder::_process_object_as_paths( } // Ensure offsets size up to current parent element. - ng->ensure_offsets(); - auto* off = assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get()); - if (off->size() < element_flat_idx) { - // fill missing parent elements with empty arrays. - const size_t gap = element_flat_idx - off->size(); - for (size_t i = 0; i < gap; ++i) { - off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size)); - } - } + // Fill missing parent elements with empty arrays. + OffsetManager::backfill_to_element(*ng, element_flat_idx); // Process nested group for this parent element (one offsets entry appended inside). RETURN_IF_ERROR(_process_array_of_objects(v, *ng, element_flat_idx, depth + 1)); diff --git a/be/src/olap/rowset/segment_v2/variant/offset_manager.h b/be/src/olap/rowset/segment_v2/variant/offset_manager.h new file mode 100644 index 00000000000..780d4effb8b --- /dev/null +++ b/be/src/olap/rowset/segment_v2/variant/offset_manager.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstddef> + +namespace doris::segment_v2 { + +// Forward declarations - avoiding circular dependencies +struct NestedGroup; +template <typename K, typename V, typename H> +class NestedGroupsMapType; + +/** + * OffsetManager provides utility functions to manage offset padding logic for NestedGroups. + * + * This class consolidates the repeated offset padding patterns found in: + * - nested_group_builder.cpp: pad_all_groups_to_row lambda (lines 61-75) + * - nested_group_builder.cpp: nested group offsets padding (lines 295-304) + * - nested_group_builder.cpp: backfill in _process_array_of_objects (lines 193-197) + * - variant_nested_builder.cpp: similar offset padding patterns (lines 95-99) + * + * Template methods are used to support both segment_v2::NestedGroup and + * vectorized::ColumnVariant::NestedGroup, which have identical interfaces. + */ +class OffsetManager { +public: + /** + * Pad a single group's offsets to cover up to and including the target row. + * If the offsets column size is less than or equal to target_row, pad with + * empty array entries (current_flat_size) until offsets.size() > target_row. + * + * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or ColumnVariant::NestedGroup) + * @param group The NestedGroup to pad + * @param target_row The target row index (0-based) that should have an offset entry + */ + template <typename NestedGroupT> + static void pad_group_to_row(NestedGroupT& group, size_t target_row); + + /** + * Pad all groups in the map to cover up to and including the target row. + * Skips disabled groups and null group pointers. + * + * @tparam NestedGroupsMapT The map type (e.g., unordered_map<PathInData, shared_ptr<NestedGroup>>) + * @param groups The map of NestedGroups to pad + * @param target_row The target row index (0-based) that should have an offset entry + */ + template <typename NestedGroupsMapT> + static void pad_all_groups_to_row(NestedGroupsMapT& groups, size_t target_row); + + /** + * Append a new offset entry for the given array size. + * Calculates new_total = current_flat_size + array_size and appends it. + * Updates group.current_flat_size to new_total. + * + * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or ColumnVariant::NestedGroup) + * @param group The NestedGroup to update + * @param array_size The size of the array being added + */ + template <typename NestedGroupT> + static void append_offset(NestedGroupT& group, size_t array_size); + + /** + * Backfill missing offsets until the offsets column size reaches element_idx. + * This handles the case when a NestedGroup is created mid-batch (e.g., when + * mixing top-level arrays and objects), ensuring earlier elements have proper offsets. + * + * Note: This pads to size < element_idx, not size <= element_idx (different from pad_group_to_row). + * + * @tparam NestedGroupT The NestedGroup type (segment_v2::NestedGroup or ColumnVariant::NestedGroup) + * @param group The NestedGroup to backfill + * @param element_idx The target element index that offsets.size() should reach + */ + template <typename NestedGroupT> + static void backfill_to_element(NestedGroupT& group, size_t element_idx); +}; + +} // namespace doris::segment_v2 + +// Template implementations +#include "olap/rowset/segment_v2/variant/offset_manager_impl.h" diff --git a/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h b/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h new file mode 100644 index 00000000000..6b795baf13f --- /dev/null +++ b/be/src/olap/rowset/segment_v2/variant/offset_manager_impl.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstddef> + +#include "vec/columns/column_vector.h" +#include "vec/common/assert_cast.h" + +namespace doris::segment_v2 { + +template <typename NestedGroupT> +void OffsetManager::pad_group_to_row(NestedGroupT& group, size_t target_row) { + group.ensure_offsets(); + auto* offsets_col = assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); + // If offsets.size() <= target_row, this group is missing entries + // for some rows. Pad with the same offset (indicating empty arrays). + while (offsets_col->size() <= target_row) { + offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size)); + } +} + +template <typename NestedGroupsMapT> +void OffsetManager::pad_all_groups_to_row(NestedGroupsMapT& groups, size_t target_row) { + for (auto& [path, group] : groups) { + if (!group || group->is_disabled) { + continue; + } + pad_group_to_row(*group, target_row); + } +} + +template <typename NestedGroupT> +void OffsetManager::append_offset(NestedGroupT& group, size_t array_size) { + group.ensure_offsets(); + auto* offsets_col = assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); + const size_t new_total = group.current_flat_size + array_size; + offsets_col->get_data().push_back(static_cast<uint64_t>(new_total)); + group.current_flat_size = new_total; +} + +template <typename NestedGroupT> +void OffsetManager::backfill_to_element(NestedGroupT& group, size_t element_idx) { + group.ensure_offsets(); + auto* offsets_col = assert_cast<vectorized::ColumnOffset64*>(group.offsets.get()); + // Back-fill missing rows with empty arrays before processing current row. + // This handles the case when a NestedGroup is created mid-batch. + while (offsets_col->size() < element_idx) { + offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size)); + } +} + +} // namespace doris::segment_v2 diff --git a/be/src/vec/json/variant_nested_builder.cpp b/be/src/vec/json/variant_nested_builder.cpp index f4017fc8f64..9c2856a05c5 100644 --- a/be/src/vec/json/variant_nested_builder.cpp +++ b/be/src/vec/json/variant_nested_builder.cpp @@ -21,6 +21,7 @@ #include <unordered_map> +#include "olap/rowset/segment_v2/variant/offset_manager.h" #include "vec/columns/column_vector.h" #include "vec/common/assert_cast.h" #include "vec/common/schema_util.h" @@ -92,11 +93,7 @@ void VariantNestedBuilder::build(const std::vector<PathInData>& paths, } // Update first-level offsets - group->ensure_offsets(); - auto& offsets_col = assert_cast<ColumnOffset64&>(*group->offsets); - size_t prev_offset = offsets_col.empty() ? 0 : offsets_col.get_data().back(); - offsets_col.insert_value(prev_offset + first_level_array_size); - group->current_flat_size = prev_offset + first_level_array_size; + segment_v2::OffsetManager::append_offset(*group, first_level_array_size); // Set expected type if (group->expected_type == ColumnVariant::NestedGroup::StructureType::UNKNOWN) { @@ -244,11 +241,8 @@ void VariantNestedBuilder::_write_to_nested_group_non_last_level( continue; } const auto& inner_arr = elem.get<Array>(); - nested_group->ensure_offsets(); - auto& nested_offsets = assert_cast<ColumnOffset64&>(*nested_group->offsets); - size_t prev_nested_offset = nested_offsets.empty() ? 0 : nested_offsets.get_data().back(); - nested_offsets.insert_value(prev_nested_offset + inner_arr.size()); - nested_group->current_flat_size = prev_nested_offset + inner_arr.size(); + const size_t prev_nested_offset = nested_group->current_flat_size; + segment_v2::OffsetManager::append_offset(*nested_group, inner_arr.size()); FieldInfo inner_info; schema_util::get_field_info(elem, &inner_info); write_to_nested_group_recursive(nested_group, levels, current_level_idx + 1, leaf_path, elem, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
