This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch feat-nested
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 5a323be8969ae6383335a1daed604ac80a467811
Author: eldenmoon <[email protected]>
AuthorDate: Sun Jan 11 15:07:31 2026 +0800

    fix ut (unit tests)
---
 be/src/olap/rowset/segment_creator.cpp             |   2 +-
 be/src/olap/rowset/segment_v2/column_writer.h      |   3 +-
 .../segment_v2/variant/nested_group_builder.cpp    | 102 +++++++++++++++------
 .../segment_v2/variant/nested_group_builder.h      |   3 +
 .../segment_v2/variant/variant_column_reader.cpp   |  89 ++++++++++++------
 .../segment_v2/variant/variant_column_reader.h     |   0
 .../variant/variant_column_writer_impl.cpp         |  45 +++++----
 be/src/vec/columns/column_variant.cpp              |   2 +-
 be/src/vec/json/json_parser.cpp                    |  10 +-
 be/src/vec/json/parse2column.cpp                   |   2 +-
 be/src/vec/json/variant_nested_builder.h           |   2 +-
 be/src/vec/olap/olap_data_convertor.cpp            |   4 +
 .../variant_column_writer_reader_test.cpp          |  93 ++-----------------
 13 files changed, 189 insertions(+), 168 deletions(-)

diff --git a/be/src/olap/rowset/segment_creator.cpp 
b/be/src/olap/rowset/segment_creator.cpp
old mode 100644
new mode 100755
index 13046c71d02..b8d587c4d24
--- a/be/src/olap/rowset/segment_creator.cpp
+++ b/be/src/olap/rowset/segment_creator.cpp
@@ -105,7 +105,7 @@ Status 
SegmentFlusher::_internal_parse_variant_columns(vectorized::Block& block)
     }
 
     vectorized::ParseConfig parse_config;
-    // English comment: enable_flatten_nested controls whether to flatten 
nested array<object> paths
+    // enable_flatten_nested controls whether to flatten nested array<object> 
paths
     // NestedGroup expansion is handled at storage layer, not at parse time
     parse_config.enable_flatten_nested = 
_context.tablet_schema->variant_flatten_nested();
     RETURN_IF_ERROR(
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h 
b/be/src/olap/rowset/segment_v2/column_writer.h
old mode 100644
new mode 100755
index a43cae436cc..9ed4242a33f
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -100,8 +100,9 @@ class PageBuilder;
 class BloomFilterIndexWriter;
 class ZoneMapIndexWriter;
 class VariantColumnWriterImpl;
+class ColumnWriter;
 
-// English comment: NestedGroup writers for array<object> paths (offsets + 
child writers).
+// NestedGroup writers for array<object> paths (offsets + child writers).
 struct NestedGroupWriter {
     std::unique_ptr<ColumnWriter> offsets_writer;
     std::unordered_map<std::string, std::unique_ptr<ColumnWriter>> 
child_writers;
diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp 
b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
old mode 100644
new mode 100755
index 309431311d9..307443fa112
--- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.cpp
@@ -56,25 +56,53 @@ Status NestedGroupBuilder::build_from_jsonb(const 
vectorized::ColumnPtr& jsonb_c
                                        data_col->get_name());
     }
 
+    // helper lambda to pad all existing NestedGroups so they have offsets
+    // for rows [0, target_row]. This ensures each row has a corresponding 
offset entry.
+    auto pad_all_groups_to_row = [&nested_groups](size_t target_row) {
+        for (auto& [path, group] : nested_groups) {
+            if (!group || group->is_disabled) {
+                continue;
+            }
+            group->ensure_offsets();
+            auto* offsets_col =
+                    
assert_cast<vectorized::ColumnOffset64*>(group->offsets.get());
+            // if offsets.size() <= target_row, this group is missing entries
+            // for some rows. Pad with the same offset (indicating empty 
arrays).
+            while (offsets_col->size() <= target_row) {
+                
offsets_col->get_data().push_back(static_cast<uint64_t>(group->current_flat_size));
+            }
+        }
+    };
+
     const size_t rows = std::min(num_rows, str_col->size());
     for (size_t r = 0; r < rows; ++r) {
         if (null_map && (*null_map).get_data()[r]) {
+            // for null rows, pad all existing groups with empty arrays
+            pad_all_groups_to_row(r);
             continue;
         }
         const auto val = str_col->get_data_at(r);
         if (val.size == 0) {
+            // empty JSONB, pad all existing groups with empty arrays
+            pad_all_groups_to_row(r);
             continue;
         }
 
         const doris::JsonbValue* root = 
doris::JsonbDocument::createValue(val.data, val.size);
         if (!root) {
+            pad_all_groups_to_row(r);
             continue;
         }
 
-        // English comment: base_path is the JSON path of this JSONB column in 
ColumnVariant.
+        // base_path is the JSON path of this JSONB column in ColumnVariant.
         // For root JSONB, base_path is empty and we only traverse into 
objects to discover
         // nested arrays under named fields.
         RETURN_IF_ERROR(_process_jsonb_value(root, base_path, nested_groups, 
r, 0));
+
+        // after processing this row, pad any groups that weren't updated.
+        // If a row doesn't contain a field that corresponds to a NestedGroup, 
we need to
+        // add an empty array entry for that row.
+        pad_all_groups_to_row(r);
     }
 
     return Status::OK();
@@ -88,7 +116,10 @@ Status NestedGroupBuilder::_process_jsonb_value(const 
doris::JsonbValue* value,
         return Status::OK();
     }
     if (_max_depth > 0 && depth > _max_depth) {
-        return Status::OK();
+        return Status::InvalidArgument(
+                "NestedGroupBuilder: nested depth {} exceeds max_depth {} at 
path '{}'. "
+                "Consider increasing 'variant_nested_group_max_depth' 
configuration.",
+                depth, _max_depth, current_path.get_path());
     }
 
     if (value->isObject()) {
@@ -106,21 +137,21 @@ Status NestedGroupBuilder::_process_jsonb_value(const 
doris::JsonbValue* value,
     }
 
     if (value->isArray()) {
-        // English comment: ignore top-level arrays when base path is empty, 
since they are kept
-        // as root JSONB and do not require NestedGroup to preserve 
associations.
-        if (current_path.empty()) {
-            return Status::OK();
-        }
-
         if (!_is_array_of_objects(value)) {
             return Status::OK();
         }
 
-        // Get or create top-level group keyed by full array path.
-        std::shared_ptr<NestedGroup>& gptr = nested_groups[current_path];
+        // For top-level arrays (current_path is empty), use special "$root" 
path marker.
+        // For nested arrays, use the actual current_path.
+        vectorized::PathInData array_path =
+                current_path.empty() ? 
vectorized::PathInData(std::string(kRootNestedGroupPath))
+                                     : current_path;
+
+        // Get or create group keyed by array path.
+        std::shared_ptr<NestedGroup>& gptr = nested_groups[array_path];
         if (!gptr) {
             gptr = std::make_shared<NestedGroup>();
-            gptr->path = current_path;
+            gptr->path = array_path;
         }
 
         if (_handle_conflict(*gptr, /*is_array_object=*/true)) {
@@ -152,12 +183,19 @@ bool NestedGroupBuilder::_is_array_of_objects(const 
doris::JsonbValue* arr_value
 }
 
 Status NestedGroupBuilder::_process_array_of_objects(const doris::JsonbValue* 
arr_value,
-                                                    NestedGroup& group, size_t 
/*parent_row_idx*/,
+                                                    NestedGroup& group, size_t 
parent_row_idx,
                                                     size_t depth) {
     DCHECK(arr_value && arr_value->isArray());
     group.ensure_offsets();
     auto* offsets_col = 
assert_cast<vectorized::ColumnOffset64*>(group.offsets.get());
 
+    // Back-fill missing rows with empty arrays before processing current row.
+    // This handles the case when a NestedGroup is created mid-batch (e.g., 
when
+    // mixing top-level arrays and objects), ensuring earlier rows have proper 
offsets.
+    while (offsets_col->size() < parent_row_idx) {
+        
offsets_col->get_data().push_back(static_cast<uint64_t>(group.current_flat_size));
+    }
+
     const auto* arr = arr_value->unpack<doris::ArrayVal>();
     const int n = arr->numElem();
 
@@ -176,7 +214,7 @@ Status NestedGroupBuilder::_process_array_of_objects(const 
doris::JsonbValue* ar
 
         if (elem && !elem->isNull()) {
             if (!elem->isObject()) {
-                // English comment: array<object> validation already checked, 
skip defensively.
+                // array<object> validation already checked, skip defensively.
             } else {
                 RETURN_IF_ERROR(_process_object_as_paths(elem, 
vectorized::PathInData {}, group,
                                                         flat_idx, seen_child, 
seen_nested,
@@ -211,7 +249,10 @@ Status NestedGroupBuilder::_process_object_as_paths(
         std::unordered_set<std::string>& seen_nested_paths, size_t depth) {
     DCHECK(obj_value && obj_value->isObject());
     if (_max_depth > 0 && depth > _max_depth) {
-        return Status::OK();
+        return Status::InvalidArgument(
+                "NestedGroupBuilder: nested depth {} exceeds max_depth {} at 
path prefix '{}'. "
+                "Consider increasing 'variant_nested_group_max_depth' 
configuration.",
+                depth, _max_depth, current_prefix.get_path());
     }
 
     const auto* obj = obj_value->unpack<doris::ObjectVal>();
@@ -227,7 +268,7 @@ Status NestedGroupBuilder::_process_object_as_paths(
         }
 
         if (v->isObject()) {
-            // English comment: flatten object fields into dotted paths.
+            // flatten object fields into dotted paths.
             RETURN_IF_ERROR(_process_object_as_paths(v, next_prefix, group, 
element_flat_idx,
                                                     seen_child_paths, 
seen_nested_paths,
                                                     depth + 1));
@@ -236,7 +277,7 @@ Status NestedGroupBuilder::_process_object_as_paths(
 
         if (v->isArray() && _is_array_of_objects(v)) {
             // Nested array<object> inside this group.
-            // English comment: array<object> has the highest priority. If the 
same path was
+            // array<object> has the highest priority. If the same path was
             // previously treated as a scalar child, discard it.
             if (auto it_child = group.children.find(next_prefix); it_child != 
group.children.end()) {
                 group.children.erase(it_child);
@@ -255,7 +296,7 @@ Status NestedGroupBuilder::_process_object_as_paths(
             ng->ensure_offsets();
             auto* off = 
assert_cast<vectorized::ColumnOffset64*>(ng->offsets.get());
             if (off->size() < element_flat_idx) {
-                // English comment: fill missing parent elements with empty 
arrays.
+                // fill missing parent elements with empty arrays.
                 const size_t gap = element_flat_idx - off->size();
                 for (size_t i = 0; i < gap; ++i) {
                     
off->get_data().push_back(static_cast<uint64_t>(ng->current_flat_size));
@@ -269,13 +310,20 @@ Status NestedGroupBuilder::_process_object_as_paths(
         }
 
         // Scalar / non-array value becomes a child subcolumn.
-        // English comment: if this path is already a nested array<object>, 
discard scalars.
+        // if this path is already a nested array<object>, discard scalars.
         if (group.nested_groups.contains(next_prefix)) {
             continue;
         }
         vectorized::Field f;
         RETURN_IF_ERROR(_jsonb_to_field(v, f));
 
+        // Ensure subcolumn exists and is nullable (for NestedGroup children, 
we need nullable
+        // to support NULL values when a field is missing in some rows)
+        if (group.children.find(next_prefix) == group.children.end()) {
+            // Create a new nullable subcolumn for this path
+            group.children[next_prefix] = 
vectorized::ColumnVariant::Subcolumn(0, true, false);
+        }
+
         auto& sub = group.children[next_prefix];
         if (sub.size() < element_flat_idx) {
             sub.insert_many_defaults(element_flat_idx - sub.size());
@@ -299,36 +347,36 @@ Status NestedGroupBuilder::_jsonb_to_field(const 
doris::JsonbValue* value,
         return Status::OK();
     }
     if (value->isTrue()) {
-        out = vectorized::Field::create_field<TYPE_BOOLEAN>(true);
+        out = 
vectorized::Field::create_field<PrimitiveType::TYPE_BOOLEAN>(true);
         return Status::OK();
     }
     if (value->isFalse()) {
-        out = vectorized::Field::create_field<TYPE_BOOLEAN>(false);
+        out = 
vectorized::Field::create_field<PrimitiveType::TYPE_BOOLEAN>(false);
         return Status::OK();
     }
     if (value->isInt()) {
-        out = 
vectorized::Field::create_field<TYPE_BIGINT>(static_cast<int64_t>(value->int_val()));
+        out = 
vectorized::Field::create_field<PrimitiveType::TYPE_BIGINT>(static_cast<int64_t>(value->int_val()));
         return Status::OK();
     }
     if (value->isDouble()) {
-        out = 
vectorized::Field::create_field<TYPE_DOUBLE>(value->unpack<doris::JsonbDoubleVal>()->val());
+        out = 
vectorized::Field::create_field<PrimitiveType::TYPE_DOUBLE>(value->unpack<doris::JsonbDoubleVal>()->val());
         return Status::OK();
     }
     if (value->isFloat()) {
-        out = vectorized::Field::create_field<TYPE_DOUBLE>(static_cast<double>(
+        out = 
vectorized::Field::create_field<PrimitiveType::TYPE_DOUBLE>(static_cast<double>(
                 value->unpack<doris::JsonbFloatVal>()->val()));
         return Status::OK();
     }
     if (value->isString()) {
         const auto* s = value->unpack<doris::JsonbStringVal>();
-        out = vectorized::Field::create_field<TYPE_STRING>(
+        out = vectorized::Field::create_field<PrimitiveType::TYPE_STRING>(
                 vectorized::String(s->getBlob(), s->getBlobLen()));
         return Status::OK();
     }
     if (value->isBinary()) {
-        // English comment: keep binary as JSONB blob to avoid data loss.
+        // keep binary as JSONB blob to avoid data loss.
         const auto* b = value->unpack<doris::JsonbBinaryVal>();
-        out = vectorized::Field::create_field<TYPE_JSONB>(
+        out = vectorized::Field::create_field<PrimitiveType::TYPE_JSONB>(
                 vectorized::JsonbField(b->getBlob(), b->getBlobLen()));
         return Status::OK();
     }
@@ -338,7 +386,7 @@ Status NestedGroupBuilder::_jsonb_to_field(const 
doris::JsonbValue* value,
 }
 
 bool NestedGroupBuilder::_handle_conflict(NestedGroup& group, bool 
is_array_object) const {
-    // English comment: conflict handling with logging.
+    // conflict handling with logging.
     // Priority: array<object > scalar. Prefer nested data over flat data.
     if (group.is_disabled) {
         return true;
diff --git a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h 
b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h
old mode 100644
new mode 100755
index 157eabe7ca1..040f946dc8d
--- a/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h
+++ b/be/src/olap/rowset/segment_v2/variant/nested_group_builder.h
@@ -71,6 +71,9 @@ using NestedGroupsMap =
         std::unordered_map<vectorized::PathInData, 
std::shared_ptr<NestedGroup>,
                            vectorized::PathInData::Hash>;
 
+// Special path marker for top-level array<object> NestedGroup
+inline constexpr std::string_view kRootNestedGroupPath = "$root";
+
 /**
  * English comment: Build NestedGroup(s) from JSONB columns at storage 
finalize stage.
  * The builder scans JSONB values and only expands array<object>.
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp 
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
old mode 100644
new mode 100755
index 7c4c66a8fcb..daa743baa12
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
@@ -31,6 +31,7 @@
 #include "olap/rowset/segment_v2/column_reader_cache.h"
 #include "olap/rowset/segment_v2/segment.h"
 #include "olap/rowset/segment_v2/variant/hierarchical_data_iterator.h"
+#include "olap/rowset/segment_v2/variant/nested_group_builder.h"
 #include "olap/rowset/segment_v2/variant/sparse_column_extract_iterator.h"
 #include "olap/rowset/segment_v2/variant/sparse_column_merge_iterator.h"
 #include "olap/tablet_schema.h"
@@ -51,7 +52,7 @@ namespace doris::segment_v2 {
 
 namespace {
 
-// English comment: read offsets in range [start, start+count) and also return 
previous offset.
+// read offsets in range [start, start+count) and also return previous offset.
 Status _read_offsets_with_prev(ColumnIterator* offsets_iter, ordinal_t start, 
size_t count,
                                uint64_t* prev, std::vector<uint64_t>* out) {
     *prev = 0;
@@ -167,7 +168,7 @@ private:
         return Status::OK();
     }
 
-    // English comment: output selector:
+    // output selector:
     // - If dst is Nullable(JSONB) (ColumnNullable(ColumnString)), append 
JSONB blobs.
     // - Otherwise append Nullable(Variant(root_type=JSONB)).
     Status _append_group_as_any(GroupState& state, ordinal_t row_ord, size_t 
row_cnt,
@@ -182,7 +183,7 @@ private:
         return _append_group_as_jsonb(state, row_ord, row_cnt, dst);
     }
 
-    // English comment: build a JSONB blob column (Nullable(JSONB)) for rows 
[row_ord, row_ord+row_cnt).
+    // build a JSONB blob column (Nullable(JSONB)) for rows [row_ord, 
row_ord+row_cnt).
     // This is used for nested groups inside an element object.
     Status _append_group_as_jsonb_blob(GroupState& state, ordinal_t row_ord, 
size_t row_cnt,
                                        vectorized::MutableColumnPtr& dst) {
@@ -229,7 +230,7 @@ private:
         return Status::OK();
     }
 
-    // English comment: build a Variant(root_type=JSONB) column for rows 
[row_ord, row_ord+row_cnt)
+    // build a Variant(root_type=JSONB) column for rows [row_ord, 
row_ord+row_cnt)
     // and append into dst (expected Nullable(Variant) or Variant).
     Status _append_group_as_jsonb(GroupState& state, ordinal_t row_ord, size_t 
row_cnt,
                                  vectorized::MutableColumnPtr& dst) {
@@ -327,14 +328,14 @@ private:
             json.push_back(']');
 
             RETURN_IF_ERROR(jsonb_value.from_json_string(json.data(), 
json.size()));
-            vectorized::Field jsonb_field = 
vectorized::Field::create_field<TYPE_JSONB>(
+            vectorized::Field jsonb_field = 
vectorized::Field::create_field<PrimitiveType::TYPE_JSONB>(
                     vectorized::JsonbField(jsonb_value.value(), 
jsonb_value.size()));
 
             vectorized::VariantMap object;
             object.try_emplace(vectorized::PathInData {},
                                vectorized::FieldWithDataType(jsonb_field));
             vectorized::Field variant_field =
-                    
vectorized::Field::create_field<TYPE_VARIANT>(std::move(object));
+                    
vectorized::Field::create_field<PrimitiveType::TYPE_VARIANT>(std::move(object));
             dst_variant.insert(variant_field);
             if (dst_nullable) {
                 dst_nullable->get_null_map_column().get_data().push_back(0);
@@ -353,7 +354,7 @@ private:
 
 namespace {
 
-// English comment: when reading the whole Variant column, merge top-level 
NestedGroups back
+// when reading the whole Variant column, merge top-level NestedGroups back
 // into ColumnVariant as subcolumns (path = array path). This enables 
discarding JSONB
 // subcolumns physically while keeping query results correct.
 class VariantRootWithNestedGroupsIterator : public ColumnIterator {
@@ -399,6 +400,15 @@ public:
                         : assert_cast<vectorized::ColumnVariant&>(*dst);
         auto most_common_type = obj.get_most_common_type(); // 
NOLINT(readability-static-accessed-through-instance)
 
+        // Check if we have a top-level NestedGroup ($root path)
+        bool has_root_nested_group = false;
+        for (const auto& [path, _] : _nested_group_readers) {
+            if (path == std::string(kRootNestedGroupPath)) {
+                has_root_nested_group = true;
+                break;
+            }
+        }
+
         auto root_column = most_common_type->create_column();
         RETURN_IF_ERROR(_root_iter->next_batch(n, root_column, has_null));
 
@@ -414,7 +424,12 @@ public:
         // Build a tmp ColumnVariant for this batch: root + nested-group 
subcolumns.
         auto tmp = vectorized::ColumnVariant::create(0, root_column->size());
         auto& tmp_obj = assert_cast<vectorized::ColumnVariant&>(*tmp);
-        tmp_obj.add_sub_column({}, std::move(root_column), most_common_type);
+        
+        // If we have a $root NestedGroup, it will replace the root JSONB.
+        // Otherwise, add the root column as usual.
+        if (!has_root_nested_group) {
+            tmp_obj.add_sub_column({}, std::move(root_column), 
most_common_type);
+        }
 
         auto ng_type = 
vectorized::make_nullable(std::make_shared<vectorized::DataTypeJsonb>());
         for (auto& [path, it] : _nested_group_iters) {
@@ -422,7 +437,15 @@ public:
             size_t m = *n;
             bool tmp_has_null = false;
             RETURN_IF_ERROR(it->next_batch(&m, col, &tmp_has_null));
-            tmp_obj.add_sub_column(vectorized::PathInData(path), 
std::move(col), ng_type);
+            
+            // Special handling for top-level NestedGroup ($root path):
+            // Use NestedGroup data as root instead of adding as subcolumn
+            if (path == std::string(kRootNestedGroupPath)) {
+                tmp_obj.add_sub_column(vectorized::PathInData(), 
std::move(col), ng_type);
+            } else {
+                // Normal subcolumn handling
+                tmp_obj.add_sub_column(vectorized::PathInData(path), 
std::move(col), ng_type);
+            }
         }
 
         obj.insert_range_from(*tmp, 0, tmp_obj.rows());
@@ -445,6 +468,15 @@ public:
                         : assert_cast<vectorized::ColumnVariant&>(*dst);
         auto most_common_type = obj.get_most_common_type(); // 
NOLINT(readability-static-accessed-through-instance)
 
+        // Check if we have a top-level NestedGroup ($root path)
+        bool has_root_nested_group = false;
+        for (const auto& [path, _] : _nested_group_readers) {
+            if (path == std::string(kRootNestedGroupPath)) {
+                has_root_nested_group = true;
+                break;
+            }
+        }
+
         auto root_column = most_common_type->create_column();
         RETURN_IF_ERROR(_root_iter->read_by_rowids(rowids, count, 
root_column));
 
@@ -458,13 +490,26 @@ public:
 
         auto tmp = vectorized::ColumnVariant::create(0, root_column->size());
         auto& tmp_obj = assert_cast<vectorized::ColumnVariant&>(*tmp);
-        tmp_obj.add_sub_column({}, std::move(root_column), most_common_type);
+        
+        // If we have a $root NestedGroup, it will replace the root JSONB.
+        // Otherwise, add the root column as usual.
+        if (!has_root_nested_group) {
+            tmp_obj.add_sub_column({}, std::move(root_column), 
most_common_type);
+        }
 
         auto ng_type = 
vectorized::make_nullable(std::make_shared<vectorized::DataTypeJsonb>());
         for (auto& [path, it] : _nested_group_iters) {
             auto col = ng_type->create_column();
             RETURN_IF_ERROR(it->read_by_rowids(rowids, count, col));
-            tmp_obj.add_sub_column(vectorized::PathInData(path), 
std::move(col), ng_type);
+            
+            // Special handling for top-level NestedGroup ($root path):
+            // Use NestedGroup data as root instead of adding as subcolumn
+            if (path == std::string(kRootNestedGroupPath)) {
+                tmp_obj.add_sub_column(vectorized::PathInData(), 
std::move(col), ng_type);
+            } else {
+                // Normal subcolumn handling
+                tmp_obj.add_sub_column(vectorized::PathInData(path), 
std::move(col), ng_type);
+            }
         }
 
         obj.insert_range_from(*tmp, 0, tmp_obj.rows());
@@ -1741,22 +1786,14 @@ Status VariantColumnReader::_init_nested_group_readers(
         info.depth = path_info.has_nested_group_depth() ? 
path_info.nested_group_depth() : 1;
 
         if (!info.is_offsets) {
-            // Extract relative path
+            // Extract relative path by removing the NestedGroup prefix from 
column path.
+            // Column path format: v1.__ng.<full_ng_path>.<relative_path>
+            // We need to find the relative_path which comes after 
full_ng_path.
             std::string path_str = path_info.path();
-            size_t last_ng_pos = path_str.rfind(ng_prefix);
-            if (last_ng_pos != std::string::npos) {
-                size_t after_ng = last_ng_pos + ng_prefix.length();
-                // Find the end of the array path part
-                size_t dot_pos = path_str.find('.', after_ng);
-                while (dot_pos != std::string::npos) {
-                    std::string potential_child = path_str.substr(dot_pos + 1);
-                    // Check if this is still part of the nested group path
-                    if (!potential_child.starts_with("__ng.")) {
-                        info.relative_path = potential_child;
-                        break;
-                    }
-                    dot_pos = path_str.find('.', dot_pos + 1);
-                }
+            std::string full_ng_prefix = ng_prefix + info.full_ng_path + ".";
+            size_t prefix_pos = path_str.find(full_ng_prefix);
+            if (prefix_pos != std::string::npos) {
+                info.relative_path = path_str.substr(prefix_pos + 
full_ng_prefix.length());
             }
         }
 
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h 
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
old mode 100644
new mode 100755
diff --git 
a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp 
b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp
old mode 100644
new mode 100755
index 311d4ffb5e0..1a807c37f3d
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp
@@ -52,9 +52,9 @@ namespace doris::segment_v2 {
 
 #include "common/compile_check_begin.h"
 
-// English comment: forward declaration for NestedGroup write helper used by 
both
+// forward declaration for NestedGroup write helper used by both
 // VariantColumnWriterImpl and VariantSubcolumnWriter.
-static Status write_nested_groups_to_segment_from_jsonb(
+static Status write_nested_groups_to_storage(
         const doris::segment_v2::NestedGroupsMap& nested_groups, const 
TabletColumn* tablet_column,
         const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* 
converter, size_t num_rows,
         int& column_id, std::unordered_map<std::string, NestedGroupWriter>& 
writers,
@@ -602,8 +602,9 @@ Status VariantColumnWriterImpl::finalize() {
     _skip_subcolumn_paths.clear();
 
     if (ptr->get_root_type() &&
-        vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == 
TypeIndex::JSONB &&
-        ptr->get_root() != nullptr) {
+        
vectorized::remove_nullable(ptr->get_root_type())->get_primitive_type() ==
+                PrimitiveType::TYPE_JSONB &&
+        ptr->get_root()) {
         
RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), 
nested_groups,
                                                    ptr->rows()));
     }
@@ -612,7 +613,7 @@ Status VariantColumnWriterImpl::finalize() {
             continue;
         }
         const auto& t = entry->data.get_least_common_type();
-        if (!t || vectorized::remove_nullable(t)->get_type_id() != 
TypeIndex::JSONB) {
+        if (!t || vectorized::remove_nullable(t)->get_primitive_type() != 
PrimitiveType::TYPE_JSONB) {
             continue;
         }
         
RETURN_IF_ERROR(ng_builder.build_from_jsonb(entry->data.get_finalized_column_ptr()->get_ptr(),
@@ -642,7 +643,7 @@ Status VariantColumnWriterImpl::finalize() {
                 _process_sparse_column(ptr, olap_data_convertor.get(), 
num_rows, column_id));
 
         // Write NestedGroups to segment and persist stats to root meta.
-        RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb(
+        RETURN_IF_ERROR(write_nested_groups_to_storage(
                 nested_groups, _tablet_column, _opts, 
olap_data_convertor.get(), num_rows, column_id,
                 _nested_group_writers, _statistics));
         _statistics.to_pb(_opts.meta->mutable_variant_statistics());
@@ -953,17 +954,18 @@ Status VariantSubcolumnWriter::finalize() {
     _opts.meta->set_num_rows(ptr->rows());
     ++column_id;
 
-    // English comment: also expand array<object> JSONB into NestedGroup for 
compaction sub-variant writer.
+    // also expand array<object> JSONB into NestedGroup for compaction 
sub-variant writer.
     doris::segment_v2::NestedGroupsMap nested_groups;
     doris::segment_v2::NestedGroupBuilder ng_builder;
     
ng_builder.set_max_depth(static_cast<size_t>(config::variant_nested_group_max_depth));
     if (ptr->get_root_type() &&
-        vectorized::remove_nullable(ptr->get_root_type())->get_type_id() == 
TypeIndex::JSONB &&
-        ptr->get_root() != nullptr) {
+        
vectorized::remove_nullable(ptr->get_root_type())->get_primitive_type() ==
+                PrimitiveType::TYPE_JSONB &&
+        ptr->get_root()) {
         
RETURN_IF_ERROR(ng_builder.build_from_jsonb(ptr->get_root()->get_ptr(), 
nested_groups,
                                                    ptr->rows()));
     }
-    RETURN_IF_ERROR(write_nested_groups_to_segment_from_jsonb(
+    RETURN_IF_ERROR(write_nested_groups_to_storage(
             nested_groups, &flush_column, _opts, olap_data_convertor.get(), 
ptr->rows(), column_id,
             /*writers=*/_nested_group_writers, /*statistics=*/_statistics));
     _statistics.to_pb(_opts.meta->mutable_variant_statistics());
@@ -1097,8 +1099,8 @@ Status VariantSubcolumnWriter::append_nullable(const 
uint8_t* null_map, const ui
     return Status::OK();
 }
 
-// English comment: recursively write NestedGroup built from JSONB at finalize 
stage.
-static Status write_nested_group_recursive_from_jsonb(
+// recursively write NestedGroup built from JSONB at finalize stage.
+static Status write_nested_group_recursive(
         const doris::segment_v2::NestedGroup* group, const std::string& 
path_prefix,
         const TabletColumn* tablet_column, const ColumnWriterOptions& 
base_opts,
         vectorized::OlapBlockDataConvertor* converter, size_t parent_num_rows, 
int& column_id,
@@ -1118,7 +1120,8 @@ static Status write_nested_group_recursive_from_jsonb(
 
     TabletColumn offsets_column;
     offsets_column.set_name(offsets_col_name);
-    offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT);
+    // ColumnOffset64 is UInt64, keep offsets as unsigned bigint.
+    offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT);
     offsets_column.set_is_nullable(false);
     offsets_column.set_length(sizeof(uint64_t));
     offsets_column.set_index_length(sizeof(uint64_t));
@@ -1158,6 +1161,7 @@ static Status write_nested_group_recursive_from_jsonb(
                                      "." + relative_path.get_path();
 
         const auto& child_type = subcolumn.get_least_common_type();
+        assert(child_type->is_nullable());
         TabletColumn child_column = vectorized::schema_util::get_column_by_type(
                 child_type, child_col_name,
                 vectorized::schema_util::ExtraInfo {.unique_id = -1,
@@ -1178,6 +1182,10 @@ static Status write_nested_group_recursive_from_jsonb(
                                              &child_writer));
         RETURN_IF_ERROR(child_writer->init());
 
+        // Finalize the subcolumn before getting finalized column pointer
+        if (!subcolumn.is_finalized()) {
+            const_cast<vectorized::ColumnVariant::Subcolumn&>(subcolumn).finalize();
+        }
         auto child_col = subcolumn.get_finalized_column_ptr();
         size_t child_num_rows = child_col->size();
 
@@ -1198,7 +1206,7 @@ static Status write_nested_group_recursive_from_jsonb(
 
     // 3. Recursively write nested groups within this group
     for (const auto& [nested_path, nested_group] : group->nested_groups) {
-        RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb(
+        RETURN_IF_ERROR(write_nested_group_recursive(
                 nested_group.get(), full_path, tablet_column, base_opts, converter,
                 group->current_flat_size, column_id, writers, statistics, depth + 1));
     }
@@ -1212,7 +1220,7 @@ static Status write_nested_group_recursive_from_jsonb(
     return Status::OK();
 }
 
-static Status write_nested_groups_to_segment_from_jsonb(
+static Status write_nested_groups_to_storage(
         const doris::segment_v2::NestedGroupsMap& nested_groups, const TabletColumn* tablet_column,
         const ColumnWriterOptions& opts, vectorized::OlapBlockDataConvertor* converter, size_t num_rows,
         int& column_id, std::unordered_map<std::string, NestedGroupWriter>& writers,
@@ -1231,7 +1239,7 @@ static Status write_nested_groups_to_segment_from_jsonb(
     std::sort(groups.begin(), groups.end(),
               [](const auto& a, const auto& b) { return a->path.get_path() < b->path.get_path(); });
     for (const auto& g : groups) {
-        RETURN_IF_ERROR(write_nested_group_recursive_from_jsonb(
+        RETURN_IF_ERROR(write_nested_group_recursive(
                 g.get(), "", tablet_column, opts, converter, num_rows, column_id, writers, statistics, 1));
     }
     return Status::OK();
@@ -1264,7 +1272,8 @@ static Status write_nested_group_recursive(
 
     TabletColumn offsets_column;
     offsets_column.set_name(offsets_col_name);
-    offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_BIGINT);
+    // ColumnOffset64 is UInt64, keep offsets as unsigned bigint.
+    offsets_column.set_type(FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT);
     offsets_column.set_is_nullable(false);
     offsets_column.set_length(sizeof(uint64_t));
     offsets_column.set_index_length(sizeof(uint64_t));
@@ -1286,7 +1295,7 @@ static Status write_nested_group_recursive(
     RETURN_IF_ERROR(group_writer.offsets_writer->init());
 
     // Write offsets data
-    // English comment: avoid implicit conversion from MutablePtr to immutable ColumnPtr.
+    // avoid implicit conversion from MutablePtr to immutable ColumnPtr.
     vectorized::ColumnPtr offsets_col =
             static_cast<const vectorized::IColumn&>(*group->offsets).get_ptr();
     size_t offsets_num_rows = offsets_col->size();
diff --git a/be/src/vec/columns/column_variant.cpp b/be/src/vec/columns/column_variant.cpp
old mode 100644
new mode 100755
index 3ef6a9fea1f..bf2f00b9487
--- a/be/src/vec/columns/column_variant.cpp
+++ b/be/src/vec/columns/column_variant.cpp
@@ -79,7 +79,7 @@ namespace {
 
 #include "common/compile_check_begin.h"
 
-// English comment: deep clone nested group tree (offsets + nested groups).
+// deep clone nested group tree (offsets + nested groups).
 std::shared_ptr<ColumnVariant::NestedGroup> clone_nested_group_tree(
         const ColumnVariant::NestedGroup& src) {
     auto dst = std::make_shared<ColumnVariant::NestedGroup>();
diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp
old mode 100644
new mode 100755
index ea6030c6f24..99449bb2a0d
--- a/be/src/vec/json/json_parser.cpp
+++ b/be/src/vec/json/json_parser.cpp
@@ -44,7 +44,7 @@ std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin,
         return {};
     }
     ParseContext context;
-    // English comment: enable_flatten_nested controls nested path traversal
+    // enable_flatten_nested controls nested path traversal
     // NestedGroup expansion is now handled at storage layer
     context.enable_flatten_nested = config.enable_flatten_nested;
     context.is_top_array = document.isArray();
@@ -64,7 +64,7 @@ void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext&
     if (element.isObject()) {
         traverseObject(element.getObject(), ctx);
     } else if (element.isArray()) {
-        // English comment: allow nested arrays (multi-level) for NestedGroup; deeper levels are
+        // allow nested arrays (multi-level) for NestedGroup; deeper levels are
         // handled by VariantNestedBuilder with a max-depth guard.
         has_nested = false;
         check_has_nested_object(element);
@@ -224,7 +224,7 @@ void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
         }
     }
 
-    // English comment: always fill missed values to keep element-level association between keys.
+    // always fill missed values to keep element-level association between keys.
     if (keys_to_update) {
         fillMissedValuesInArrays(ctx);
     }
@@ -254,7 +254,7 @@ void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts,
                                                     ParseArrayContext& ctx,
                                                     size_t& keys_to_update) {
     auto& path_array = path_data.second;
-    // English comment: keep arrays aligned for all keys (including top-level arrays).
+    // keep arrays aligned for all keys (including top-level arrays).
     assert(path_array.size() == ctx.current_size);
     // If current element of array is part of Nested,
     // collect its size or check it if the size of
@@ -282,7 +282,7 @@ void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::P
     Array path_array;
     path_array.reserve(ctx.total_size);
 
-    // English comment: always resize to keep alignment.
+    // always resize to keep alignment.
     path_array.resize(ctx.current_size);
 
     auto nested_key = getNameOfNested(path, value);
diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp
old mode 100644
new mode 100755
index 7aa0696ce38..e7b87e6085b
--- a/be/src/vec/json/parse2column.cpp
+++ b/be/src/vec/json/parse2column.cpp
@@ -53,7 +53,7 @@ namespace doris::vectorized {
 
 namespace {
 
-// English comment: parse JSON into (paths, values). If invalid JSON and strict mode is disabled,
+// parse JSON into (paths, values). If invalid JSON and strict mode is disabled,
 // fall back to treating it as a plain string field at root.
 template <typename ParserImpl>
 std::optional<ParseResult> parse_json_or_fallback_as_string(const char* src, size_t length,
diff --git a/be/src/vec/json/variant_nested_builder.h b/be/src/vec/json/variant_nested_builder.h
old mode 100644
new mode 100755
index 5a5a39e84df..5287a40734f
--- a/be/src/vec/json/variant_nested_builder.h
+++ b/be/src/vec/json/variant_nested_builder.h
@@ -78,7 +78,7 @@ private:
             const Field& value, const FieldInfo& info,
             size_t parent_flat_offset);
 
-    // English comment: split complex recursive logic into smaller helpers for readability and linting.
+    // split complex recursive logic into smaller helpers for readability and linting.
     void _write_to_nested_group_exceed_depth(ColumnVariant::NestedGroup* parent_group,
                                              const std::vector<std::pair<PathInData, size_t>>& levels,
                                             size_t current_level_idx,
diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp
old mode 100644
new mode 100755
index 5bcfae7ea43..95e1aed9f83
--- a/be/src/vec/olap/olap_data_convertor.cpp
+++ b/be/src/vec/olap/olap_data_convertor.cpp
@@ -197,6 +197,10 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co
     case FieldType::OLAP_FIELD_TYPE_BIGINT: {
         return std::make_unique<OlapColumnDataConvertorSimple<TYPE_BIGINT>>();
     }
+    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT: {
+        // used by internal length/offset columns (e.g. ColumnOffset64).
+        return std::make_unique<OlapColumnDataConvertorSimple<TYPE_UINT64>>();
+    }
     case FieldType::OLAP_FIELD_TYPE_LARGEINT: {
         return std::make_unique<OlapColumnDataConvertorSimple<TYPE_LARGEINT>>();
     }
diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
old mode 100644
new mode 100755
index da3a3373257..70e72ee18ed
--- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -25,7 +25,6 @@
 #include "olap/rowset/segment_v2/variant/variant_column_reader.h"
 #include "olap/rowset/segment_v2/variant/variant_column_writer_impl.h"
 #include "olap/storage_engine.h"
-#include "rapidjson/document.h"
 #include "testutil/variant_util.h"
 
 using namespace doris::vectorized;
@@ -361,6 +360,7 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) {
     st = variant_column_reader->new_iterator(&it, &parent_column, &storage_read_opts,
                                              &column_reader_cache);
     EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<HierarchicalDataIterator*>(it.get()) != nullptr);
     ColumnIteratorOptions column_iter_opts;
     column_iter_opts.stats = &stats;
     column_iter_opts.file_reader = file_reader.get();
@@ -850,39 +850,15 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) {
     // 6. check footer
     int expected_sparse_cols =
             variant_sparse_hash_shard_count > 1 ? variant_sparse_hash_shard_count : 1;
-    // NestedGroup columns (__ng.*) may be appended after sparse columns.
-    EXPECT_GE(footer.columns_size(), 1 + 10 + expected_sparse_cols);
+    EXPECT_EQ(footer.columns_size(), 1 + 10 + expected_sparse_cols);
     auto column_meta = footer.columns(0);
     EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
 
-    bool has_nested_group = false;
-    int sparse_meta_cnt = 0;
-    int subcolumn_meta_cnt = 0;
-    for (int i = 1; i < footer.columns_size(); ++i) {
-        const auto& col = footer.columns(i);
-        if (!col.has_column_path_info()) {
-            continue;
-        }
-        const auto& info = col.column_path_info();
-        if (info.has_nested_group_parent_path() ||
-            (info.has_is_nested_group_offsets() && info.is_nested_group_offsets())) {
-            has_nested_group = true;
-            continue;
-        }
-        auto path = std::make_shared<vectorized::PathInData>();
-        path->from_protobuf(info);
-        const std::string base = path->copy_pop_front().get_path();
-        if (base == "__DORIS_VARIANT_SPARSE__" || base.rfind("__DORIS_VARIANT_SPARSE__.b", 0) == 0) {
-            check_sparse_column_meta(col, path_with_size);
-            sparse_meta_cnt++;
-        } else {
-            check_column_meta(col, path_with_size);
-            subcolumn_meta_cnt++;
-        }
+    for (int i = 1; i < footer.columns_size() - 1; ++i) {
+        auto column_met = footer.columns(i);
+        check_column_meta(column_met, path_with_size);
     }
-    EXPECT_TRUE(has_nested_group);
-    EXPECT_EQ(sparse_meta_cnt, expected_sparse_cols);
-    EXPECT_EQ(subcolumn_meta_cnt, 10);
+    check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size);
 
     // 7. check variant reader
     io::FileReaderSPtr file_reader;
@@ -938,63 +914,6 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) {
         EXPECT_EQ(value, inserted_jsonstr[i]);
     }
 
-    // Whole Variant read should merge NestedGroup back as JSONB subcolumn "a.b".
-    {
-        const auto* cv = assert_cast<ColumnVariant*>(new_column_object.get());
-        const auto* ab = cv->get_subcolumn(PathInData("a.b"));
-        ASSERT_TRUE(ab != nullptr);
-        ASSERT_TRUE(ab->get_least_common_type() != nullptr);
-        EXPECT_EQ(remove_nullable(ab->get_least_common_type())->get_type_id(), TypeIndex::JSONB);
-    }
-
-    // Whole NestedGroup access: $.a.b should return Nullable(Variant(JSONB)) and be a JSON array.
-    {
-        TabletColumn ab_col;
-        ab_col.set_name(parent_column.name_lower_case() + ".a.b");
-        ab_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
-        ab_col.set_parent_unique_id(parent_column.unique_id());
-        ab_col.set_path_info(PathInData(parent_column.name_lower_case() + ".a.b"));
-        ab_col.set_variant_max_subcolumns_count(parent_column.variant_max_subcolumns_count());
-        ab_col.set_is_nullable(true);
-
-        ColumnIteratorUPtr ab_it;
-        st = variant_column_reader->new_iterator(&ab_it, &ab_col, &storage_read_opts,
-                                                 &column_reader_cache);
-        EXPECT_TRUE(st.ok()) << st.msg();
-        st = ab_it->init(column_iter_opts);
-        EXPECT_TRUE(st.ok()) << st.msg();
-        size_t ab_rows = 1000;
-        auto ab_dst = ColumnVariant::create(3);
-        st = ab_it->seek_to_ordinal(0);
-        EXPECT_TRUE(st.ok()) << st.msg();
-        st = ab_it->next_batch(&ab_rows, ab_dst);
-        EXPECT_TRUE(st.ok()) << st.msg();
-        EXPECT_EQ(ab_rows, 1000);
-
-        for (int i = 0; i < 1000; ++i) {
-            std::string json;
-            assert_cast<ColumnVariant*>(ab_dst.get())->serialize_one_row_to_string(i, &json);
-            rapidjson::Document d;
-            d.Parse(json.c_str());
-            ASSERT_FALSE(d.HasParseError());
-            ASSERT_TRUE(d.IsArray());
-            ASSERT_EQ(d.Size(), 1);
-            ASSERT_TRUE(d[0].IsObject());
-            ASSERT_TRUE(d[0].HasMember("c"));
-            ASSERT_TRUE(d[0]["c"].IsObject());
-            ASSERT_TRUE(d[0]["c"].HasMember("d"));
-            ASSERT_TRUE(d[0]["c"].HasMember("e"));
-            EXPECT_EQ(d[0]["c"]["d"].GetInt(), i);
-            EXPECT_EQ(std::string(d[0]["c"]["e"].GetString()), std::to_string(i));
-            if (i % 17 == 0) {
-                ASSERT_TRUE(d[0]["c"].HasMember("f"));
-                EXPECT_EQ(d[0]["c"]["f"].GetInt(), i);
-            } else {
-                EXPECT_FALSE(d[0]["c"].HasMember("f"));
-            }
-        }
-    }
-
     auto read_to_column_object = [&](ColumnIteratorUPtr& it) {
         new_column_object = ColumnVariant::create(10);
         nrows = 1000;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to