This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f563b2eb382 [Improve](variant) Keep first duplicate Variant JSON path (#63082)
f563b2eb382 is described below

commit f563b2eb382a6a42c359d977707f7731d8e0d0a0
Author: lihangyu <[email protected]>
AuthorDate: Tue May 12 10:50:38 2026 +0800

    [Improve](variant) Keep first duplicate Variant JSON path (#63082)
    
    Add BE config variant_enable_duplicate_json_path_check to keep the first
    duplicate JSON object key during Variant parsing when enabled.
---
 be/src/common/config.cpp                           |   1 +
 be/src/common/config.h                             |   2 +
 .../data_type_serde/data_type_variant_serde.cpp    |   6 +-
 be/src/exec/common/variant_util.cpp                |  27 ++-
 be/src/util/json/json_parser.cpp                   |  54 ++++--
 be/src/util/json/json_parser.h                     |   5 +
 be/test/storage/segment/variant_util_test.cpp      | 206 +++++++++++++++++++++
 .../data/variant_p0/duplicate_json_path.json       |   7 +
 .../suites/variant_p0/duplicate_json_path.groovy   | 106 +++++++++++
 9 files changed, 395 insertions(+), 19 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 8b9158d8f67..b1e7949d8c1 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1164,6 +1164,7 @@ DEFINE_mBool(variant_use_cloud_schema_dict_cache, "true");
 DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048");
 DEFINE_mInt32(variant_max_json_key_length, "255");
 DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false");
+DEFINE_mBool(variant_enable_duplicate_json_path_check, "false");
 DEFINE_mBool(enable_vertical_compact_variant_subcolumns, "true");
 DEFINE_mBool(enable_variant_doc_sparse_write_subcolumns, "true");
 // Maximum depth of nested arrays to track with NestedGroup
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 930e9f67fb6..2814f0539ff 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1425,6 +1425,8 @@ 
DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column);
 DECLARE_mInt32(variant_max_json_key_length);
 // Treat invalid json format str as string, instead of throwing exception if false
 DECLARE_mBool(variant_throw_exeception_on_invalid_json);
+// Enable duplicate path check when parsing json into variant subcolumns/jsonb.
+DECLARE_mBool(variant_enable_duplicate_json_path_check);
 // Enable vertical compact subcolumns of variant column
 DECLARE_mBool(enable_vertical_compact_variant_subcolumns);
 DECLARE_mBool(enable_variant_doc_sparse_write_subcolumns);
diff --git a/be/src/core/data_type_serde/data_type_variant_serde.cpp 
b/be/src/core/data_type_serde/data_type_variant_serde.cpp
index 31538f9fcc5..93ec3edfdfb 100644
--- a/be/src/core/data_type_serde/data_type_variant_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_variant_serde.cpp
@@ -21,6 +21,7 @@
 #include <string>
 
 #include "common/cast_set.h"
+#include "common/config.h"
 #include "common/exception.h"
 #include "common/status.h"
 #include "core/assert_cast.h"
@@ -107,10 +108,11 @@ Status 
DataTypeVariantSerDe::serialize_one_cell_to_json(const IColumn& column, i
 
 Status DataTypeVariantSerDe::deserialize_one_cell_from_json(IColumn& column, 
Slice& slice,
                                                             const 
FormatOptions& options) const {
-    ParseConfig config;
+    ParseConfig parse_config;
+    parse_config.check_duplicate_json_path = 
config::variant_enable_duplicate_json_path_check;
     StringRef json_ref(slice.data, slice.size);
     RETURN_IF_CATCH_EXCEPTION(
-            variant_util::parse_json_to_variant(column, json_ref, nullptr, 
config));
+            variant_util::parse_json_to_variant(column, json_ref, nullptr, 
parse_config));
     return Status::OK();
 }
 
diff --git a/be/src/exec/common/variant_util.cpp 
b/be/src/exec/common/variant_util.cpp
index af10e0d4ff2..2f2ce53d8ea 100644
--- a/be/src/exec/common/variant_util.cpp
+++ b/be/src/exec/common/variant_util.cpp
@@ -1928,16 +1928,26 @@ void parse_json_to_variant_impl(IColumn& column, const 
char* src, size_t length,
         }
     };
 
+    auto is_plain_path = [](const PathInData& path) {
+        for (const auto& part : path.get_parts()) {
+            if (part.is_nested || part.anonymous_array_level != 0) {
+                return false;
+            }
+        }
+        return true;
+    };
+
     auto get_or_create_subcolumn = [&](const PathInData& path, size_t 
index_hint,
                                        const FieldInfo& field_info) -> 
ColumnVariant::Subcolumn* {
-        if (column_variant.get_subcolumn(path, index_hint) == nullptr) {
+        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
+        if (subcolumn == nullptr) {
             if (path.has_nested_part()) {
                 column_variant.add_nested_subcolumn(path, field_info, 
old_num_rows);
             } else {
                 column_variant.add_sub_column(path, old_num_rows);
             }
+            subcolumn = column_variant.get_subcolumn(path, index_hint);
         }
-        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
         if (!subcolumn) {
             throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to 
find sub column {}",
                                    path.get_path());
@@ -1945,6 +1955,13 @@ void parse_json_to_variant_impl(IColumn& column, const 
char* src, size_t length,
         return subcolumn;
     };
 
+    auto normalize_plain_path = [&](const PathInData& path) {
+        if (!config.check_duplicate_json_path || path.empty() || 
!is_plain_path(path)) {
+            return path;
+        }
+        return PathInData(path.get_path());
+    };
+
     auto insert_into_subcolumn = [&](size_t i,
                                      bool check_size_mismatch) -> 
ColumnVariant::Subcolumn* {
         FieldInfo field_info;
@@ -1952,12 +1969,13 @@ void parse_json_to_variant_impl(IColumn& column, const 
char* src, size_t length,
         if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
             return nullptr;
         }
-        auto* subcolumn = get_or_create_subcolumn(paths[i], i, field_info);
+        auto path = normalize_plain_path(paths[i]);
+        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
         flush_defaults(subcolumn);
         if (check_size_mismatch && subcolumn->size() != old_num_rows) {
             throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
                                    "subcolumn {} size missmatched, may 
contains duplicated entry",
-                                   paths[i].get_path());
+                                   path.get_path());
         }
         subcolumn->insert(std::move(values[i]), std::move(field_info));
         return subcolumn;
@@ -2221,6 +2239,7 @@ Status parse_and_materialize_variant_columns(Block& 
block, const TabletSchema& t
         // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
         configs[i].deprecated_enable_flatten_nested =
                 tablet_schema.deprecated_variant_flatten_nested();
+        configs[i].check_duplicate_json_path = 
config::variant_enable_duplicate_json_path_check;
         const auto& column = tablet_schema.column(variant_schema_pos[i]);
         if (!column.is_variant_type()) {
             return Status::InternalError("column is not variant type, column 
name: {}",
diff --git a/be/src/util/json/json_parser.cpp b/be/src/util/json/json_parser.cpp
index 2a7c401b9d8..3df723c3849 100644
--- a/be/src/util/json/json_parser.cpp
+++ b/be/src/util/json/json_parser.cpp
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <cassert>
 #include <string_view>
+#include <vector>
 
 #include "common/cast_set.h"
 // IWYU pragma: keep
@@ -46,6 +47,7 @@ std::optional<ParseResult> 
JSONDataParser<ParserImpl>::parse(const char* begin,
     // deprecated_enable_flatten_nested controls nested path traversal
     // NestedGroup expansion is now handled at storage layer
     context.deprecated_enable_flatten_nested = 
config.deprecated_enable_flatten_nested;
+    context.check_duplicate_json_path = config.check_duplicate_json_path;
     context.is_top_array = document.isArray();
     traverse(document, context);
     ParseResult result;
@@ -72,25 +74,39 @@ void JSONDataParser<ParserImpl>::traverse(const Element& 
element, ParseContext&
             // Parse nested arrays to JsonbField
             JsonbWriter writer;
             traverseArrayAsJsonb(element.getArray(), writer);
-            ctx.paths.push_back(ctx.builder.get_parts());
-            ctx.values.push_back(Field::create_field<TYPE_JSONB>(
-                    JsonbField(writer.getOutput()->getBuffer(), 
writer.getOutput()->getSize())));
+            appendValueIfNotDuplicate(
+                    ctx, ctx.builder.get_parts(),
+                    
Field::create_field<TYPE_JSONB>(JsonbField(writer.getOutput()->getBuffer(),
+                                                               
writer.getOutput()->getSize())));
         } else {
             traverseArray(element.getArray(), ctx);
         }
         // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
         ctx.has_nested_in_flatten = false;
     } else {
-        ctx.paths.push_back(ctx.builder.get_parts());
-        ctx.values.push_back(getValueAsField(element));
+        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), 
getValueAsField(element));
     }
 }
+
+template <typename ParserImpl>
+void JSONDataParser<ParserImpl>::appendValueIfNotDuplicate(ParseContext& ctx,
+                                                           const 
PathInData::Parts& path,
+                                                           Field&& value) {
+    if (ctx.check_duplicate_json_path) {
+        PathInData path_in_data(path);
+        if (!ctx.visited_path_names.emplace(path_in_data.get_path()).second) {
+            return;
+        }
+    }
+    ctx.paths.push_back(path);
+    ctx.values.push_back(std::move(value));
+}
+
 template <typename ParserImpl>
 void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, 
ParseContext& ctx) {
     ctx.paths.reserve(ctx.paths.size() + object.size());
     ctx.values.reserve(ctx.values.size() + object.size());
-    for (auto it = object.begin(); it != object.end(); ++it) {
-        const auto& [key, value] = *it;
+    auto check_key_length = [](const auto& key) {
         const size_t max_key_length = 
cast_set<size_t>(config::variant_max_json_key_length);
         if (key.size() > max_key_length) {
             throw doris::Exception(
@@ -98,9 +114,17 @@ void JSONDataParser<ParserImpl>::traverseObject(const 
JSONObject& object, ParseC
                     fmt::format("Key length exceeds maximum allowed size of {} 
bytes.",
                                 max_key_length));
         }
+    };
+    auto traverse_object_member = [&](const auto& key, const auto& value) {
+        check_key_length(key);
         ctx.builder.append(key, false);
         traverse(value, ctx);
         ctx.builder.pop_back();
+    };
+
+    for (auto it = object.begin(); it != object.end(); ++it) {
+        const auto& [key, value] = *it;
+        traverse_object_member(key, value);
     }
 }
 
@@ -176,6 +200,7 @@ void JSONDataParser<ParserImpl>::traverseArray(const 
JSONArray& array, ParseCont
     ParseArrayContext array_ctx;
     array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
     array_ctx.is_top_array = ctx.is_top_array;
+    array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
     array_ctx.total_size = array.size();
     for (auto it = array.begin(); it != array.end(); ++it) {
         traverseArrayElement(*it, array_ctx);
@@ -183,16 +208,17 @@ void JSONDataParser<ParserImpl>::traverseArray(const 
JSONArray& array, ParseCont
     }
     auto&& arrays_by_path = array_ctx.arrays_by_path;
     if (arrays_by_path.empty()) {
-        ctx.paths.push_back(ctx.builder.get_parts());
-        ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array()));
+        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
+                                  Field::create_field<TYPE_ARRAY>(Array()));
     } else {
         ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
         ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
         for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); 
++it) {
             auto&& [path, path_array] = it->second;
             /// Merge prefix path and path of array element.
-            ctx.paths.push_back(ctx.builder.append(path, true).get_parts());
-            
ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array)));
+            ctx.builder.append(path, true);
+            appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
+                                      
Field::create_field<TYPE_ARRAY>(std::move(path_array)));
             ctx.builder.pop_back(path.size());
         }
     }
@@ -204,10 +230,12 @@ void 
JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
     ParseContext element_ctx;
     element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
     element_ctx.is_top_array = ctx.is_top_array;
+    element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
     traverse(element, element_ctx);
-    auto& [_, paths, values, deprecated_flatten_nested, __, is_top_array] = 
element_ctx;
+    auto& paths = element_ctx.paths;
+    auto& values = element_ctx.values;
 
-    if (element_ctx.has_nested_in_flatten && is_top_array) {
+    if (element_ctx.has_nested_in_flatten && element_ctx.is_top_array) {
         checkAmbiguousStructure(ctx, paths);
     }
 
diff --git a/be/src/util/json/json_parser.h b/be/src/util/json/json_parser.h
index 4b49259588b..c4a165e8995 100644
--- a/be/src/util/json/json_parser.h
+++ b/be/src/util/json/json_parser.h
@@ -101,6 +101,7 @@ void writeValueAsJsonb(const Element& element, JsonbWriter& 
writer) {
 
 struct ParseConfig {
     bool deprecated_enable_flatten_nested = false;
+    bool check_duplicate_json_path = false;
     enum class ParseTo {
         OnlySubcolumns = 0,
         OnlyDocValueColumn = 1,
@@ -127,7 +128,9 @@ private:
         PathInDataBuilder builder;
         std::vector<PathInData::Parts> paths;
         std::vector<Field> values;
+        phmap::flat_hash_set<std::string> visited_path_names;
         bool deprecated_enable_flatten_nested = false;
+        bool check_duplicate_json_path = false;
         bool has_nested_in_flatten = false;
         bool is_top_array = false;
     };
@@ -141,10 +144,12 @@ private:
         KeyToSizes nested_sizes_by_key;
         bool has_nested_in_flatten = false;
         bool is_top_array = false;
+        bool check_duplicate_json_path = false;
     };
     void traverse(const Element& element, ParseContext& ctx);
     void traverseObject(const JSONObject& object, ParseContext& ctx);
     void traverseArray(const JSONArray& array, ParseContext& ctx);
+    void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& 
path, Field&& value);
     void traverseArrayElement(const Element& element, ParseArrayContext& ctx);
     void checkAmbiguousStructure(const ParseArrayContext& ctx,
                                  const std::vector<PathInData::Parts>& paths);
diff --git a/be/test/storage/segment/variant_util_test.cpp 
b/be/test/storage/segment/variant_util_test.cpp
index 597623c1018..902bf9c843b 100644
--- a/be/test/storage/segment/variant_util_test.cpp
+++ b/be/test/storage/segment/variant_util_test.cpp
@@ -23,11 +23,13 @@
 #include <string_view>
 #include <vector>
 
+#include "common/config.h"
 #include "core/block/block.h"
 #include "core/column/column_string.h"
 #include "core/column/column_variant.h"
 #include "core/data_type/data_type_variant.h"
 #include "core/field.h"
+#include "core/value/jsonb_value.h"
 #include "exec/common/variant_util.h"
 #include "gtest/gtest.h"
 #include "storage/tablet/tablet_schema.h"
@@ -42,6 +44,20 @@ static ColumnString::MutablePtr _make_json_column(const 
std::vector<std::string_
     return col;
 }
 
+class ScopedDuplicateJsonPathCheck {
+public:
+    explicit ScopedDuplicateJsonPathCheck(bool value)
+            : _old_value(config::variant_enable_duplicate_json_path_check) {
+        config::variant_enable_duplicate_json_path_check = value;
+    }
+    ~ScopedDuplicateJsonPathCheck() {
+        config::variant_enable_duplicate_json_path_check = _old_value;
+    }
+
+private:
+    bool _old_value;
+};
+
 TEST(VariantUtilTest, ParseDocValueToSubcolumns_FillsDefaultsAndValues) {
     const std::vector<std::string_view> jsons = {
             R"({"a":1,"b":"x"})", //
@@ -225,6 +241,196 @@ TEST(VariantUtilTest, 
ParseOnlyDocValueColumn_SerializesMixedTypes) {
     EXPECT_EQ(f.field.get<TYPE_STRING>(), "y");
 }
 
+TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstValue) {
+    ScopedDuplicateJsonPathCheck check_guard(true);
+    const std::vector<std::string_view> jsons = {
+            R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})",       
R"({"a.b":1,"a":{"b":2}})",
+            R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})",
+    };
+
+    auto variant = ColumnVariant::create(0, false);
+    auto json_col = _make_json_column(jsons);
+
+    ParseConfig cfg;
+    cfg.deprecated_enable_flatten_nested = false;
+    cfg.check_duplicate_json_path = true;
+    cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns;
+    parse_json_to_variant(*variant, *json_col, cfg);
+    ASSERT_TRUE(variant->sanitize().ok());
+
+    const auto* sub_a = variant->get_subcolumn(PathInData("a"));
+    const auto* sub_ab = variant->get_subcolumn(PathInData("a.b"));
+    const auto* sub_ac = variant->get_subcolumn(PathInData("a.c"));
+    ASSERT_NE(sub_a, nullptr);
+    ASSERT_NE(sub_ab, nullptr);
+    ASSERT_NE(sub_ac, nullptr);
+
+    FieldWithDataType f;
+    sub_a->get(0, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+    sub_a->get(1, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 123);
+
+    sub_ab->get(0, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+    sub_ab->get(1, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL);
+    sub_ab->get(2, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 1);
+    sub_ab->get(3, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 3);
+    sub_ab->get(4, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 5);
+
+    sub_ac->get(4, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 6);
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstArrayOrScalarValue) {
+    ScopedDuplicateJsonPathCheck check_guard(true);
+    const std::vector<std::string_view> jsons = {
+            R"({"a":[1],"a":2})",
+            R"({"a":2,"a":[1]})",
+    };
+
+    auto variant = ColumnVariant::create(0, false);
+    auto json_col = _make_json_column(jsons);
+
+    ParseConfig cfg;
+    cfg.deprecated_enable_flatten_nested = false;
+    cfg.check_duplicate_json_path = true;
+    cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns;
+    parse_json_to_variant(*variant, *json_col, cfg);
+    ASSERT_TRUE(variant->sanitize().ok());
+
+    const auto* sub_a = variant->get_subcolumn(PathInData("a"));
+    ASSERT_NE(sub_a, nullptr);
+
+    FieldWithDataType f;
+    sub_a->get(0, f);
+    ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+    const auto& first = f.field.get<TYPE_JSONB>();
+    EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(), 
first.get_size()), "[1]");
+
+    sub_a->get(1, f);
+    ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+    const auto& second = f.field.get<TYPE_JSONB>();
+    EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(), 
second.get_size()), "2");
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsInDocModeKeepsFirstValue) {
+    ScopedDuplicateJsonPathCheck check_guard(true);
+    const std::vector<std::string_view> jsons = {
+            R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})",       
R"({"a.b":1,"a":{"b":2}})",
+            R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})",
+    };
+
+    auto variant = ColumnVariant::create(0, true);
+    auto json_col = _make_json_column(jsons);
+
+    ParseConfig cfg;
+    cfg.deprecated_enable_flatten_nested = false;
+    cfg.check_duplicate_json_path = true;
+    cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn;
+    parse_json_to_variant(*variant, *json_col, cfg);
+    ASSERT_TRUE(variant->sanitize().ok());
+
+    auto subcolumns = materialize_docs_to_subcolumns_map(*variant);
+    ASSERT_TRUE(subcolumns.contains("a"));
+    ASSERT_TRUE(subcolumns.contains("a.b"));
+    ASSERT_TRUE(subcolumns.contains("a.c"));
+
+    auto& sub_a = subcolumns.at("a");
+    auto& sub_ab = subcolumns.at("a.b");
+    auto& sub_ac = subcolumns.at("a.c");
+    sub_a.finalize();
+    sub_ab.finalize();
+    sub_ac.finalize();
+
+    FieldWithDataType f;
+    sub_a.get(0, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+    sub_a.get(1, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 123);
+
+    sub_ab.get(0, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+    sub_ab.get(1, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL);
+    sub_ab.get(2, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 1);
+    sub_ab.get(3, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 3);
+    sub_ab.get(4, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 5);
+
+    sub_ac.get(4, f);
+    EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 6);
+}
+
+TEST(VariantUtilTest, 
ParseDuplicateJsonPathsInDocModeKeepsFirstArrayOrScalarValue) {
+    ScopedDuplicateJsonPathCheck check_guard(true);
+    const std::vector<std::string_view> jsons = {
+            R"({"a":[1],"a":2})",
+            R"({"a":2,"a":[1]})",
+    };
+
+    auto variant = ColumnVariant::create(0, true);
+    auto json_col = _make_json_column(jsons);
+
+    ParseConfig cfg;
+    cfg.deprecated_enable_flatten_nested = false;
+    cfg.check_duplicate_json_path = true;
+    cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn;
+    parse_json_to_variant(*variant, *json_col, cfg);
+    ASSERT_TRUE(variant->sanitize().ok());
+
+    auto subcolumns = materialize_docs_to_subcolumns_map(*variant);
+    ASSERT_TRUE(subcolumns.contains("a"));
+
+    auto& sub_a = subcolumns.at("a");
+    sub_a.finalize();
+
+    FieldWithDataType f;
+    sub_a.get(0, f);
+    ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+    const auto& first = f.field.get<TYPE_JSONB>();
+    EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(), 
first.get_size()), "[1]");
+
+    sub_a.get(1, f);
+    ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+    const auto& second = f.field.get<TYPE_JSONB>();
+    EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(), 
second.get_size()), "2");
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsCheckDisabledByDefault) {
+    ScopedDuplicateJsonPathCheck check_guard(false);
+    const std::vector<std::string_view> jsons = {
+            R"({"a":123,"a":"123"})",
+    };
+
+    auto variant = ColumnVariant::create(0, false);
+    auto json_col = _make_json_column(jsons);
+
+    ParseConfig cfg;
+    cfg.deprecated_enable_flatten_nested = false;
+    EXPECT_THROW(parse_json_to_variant(*variant, *json_col, cfg), Exception);
+}
+
 TEST(VariantUtilTest, ParseVariantColumns_ScalarJsonStringToSubcolumns) {
     TabletSchemaPB schema_pb;
     schema_pb.set_keys_type(KeysType::DUP_KEYS);
diff --git a/regression-test/data/variant_p0/duplicate_json_path.json 
b/regression-test/data/variant_p0/duplicate_json_path.json
new file mode 100644
index 00000000000..e065c9b2331
--- /dev/null
+++ b/regression-test/data/variant_p0/duplicate_json_path.json
@@ -0,0 +1,7 @@
+{"k":8,"v":{"a":42,"a":{"b":42}}}
+{"k":9,"v":{"a":123,"a":"123"}}
+{"k":10,"v":{"a.b":8,"a":{"b":9}}}
+{"k":11,"v":{"a":{"b":10},"a.b":11}}
+{"k":12,"v":{"a":{"b":11},"a":{"c":12}}}
+{"k":13,"v":{"a":[13],"a":14}}
+{"k":14,"v":{"a":14,"a":[13]}}
diff --git a/regression-test/suites/variant_p0/duplicate_json_path.groovy 
b/regression-test/suites/variant_p0/duplicate_json_path.groovy
new file mode 100644
index 00000000000..0c6802f461e
--- /dev/null
+++ b/regression-test/suites/variant_p0/duplicate_json_path.groovy
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("duplicate_json_path", "p0") {
+    def customBeConfig = [
+        variant_enable_duplicate_json_path_check: true
+    ]
+    setBeConfigTemporary(customBeConfig) {
+        sql "DROP TABLE IF EXISTS duplicate_json_path"
+        sql """
+            CREATE TABLE duplicate_json_path (
+                k int,
+                v variant
+            )
+            DUPLICATE KEY(k)
+            DISTRIBUTED BY HASH(k) BUCKETS 1
+            PROPERTIES (
+                "replication_num" = "1",
+                "group_commit_interval_ms" = "2000",
+                "disable_auto_compaction" = "true"
+            );
+        """
+
+        sql """insert into duplicate_json_path values (1, 
'{"a":42,"a":{"b":42}}')"""
+        sql """insert into duplicate_json_path values (2, '{"a" : 123, "a" : 
"123"}')"""
+        sql """insert into duplicate_json_path values (3, 
'{"a.b":1,"a":{"b":2}}')"""
+        sql """insert into duplicate_json_path values (4, 
'{"a":{"b":3},"a.b":4}')"""
+        sql """insert into duplicate_json_path values (5, 
'{"a":{"b":5},"a":{"c":6}}')"""
+        sql """insert into duplicate_json_path values (6, '{"a":[1],"a":2}')"""
+        sql """insert into duplicate_json_path values (7, '{"a":2,"a":[1]}')"""
+
+        streamLoad {
+            table "duplicate_json_path"
+            set 'read_json_by_line', 'true'
+            set 'format', 'json'
+            set 'group_commit', 'async_mode'
+            unset 'label'
+            file 'duplicate_json_path.json'
+            time 10000
+
+            check { result, exception, startTime, endTime ->
+                if (exception != null) {
+                    throw exception
+                }
+                def json = parseJson(result)
+                assertEquals("success", json.Status.toLowerCase())
+                assertEquals(7, json.NumberTotalRows)
+                assertEquals(7, json.NumberLoadedRows)
+            }
+        }
+
+        for (int i = 0; i < 30; i++) {
+            def count = sql "select count(*) from duplicate_json_path"
+            if (count[0][0] == 14) {
+                break
+            }
+            sleep(1000)
+        }
+        def totalRows = sql "select count(*) from duplicate_json_path"
+        assertEquals(14, totalRows[0][0])
+
+        // When duplicate path check is enabled, duplicate Variant paths keep the first value.
+        def expectedResult = [
+                [1, "{\"b\":42}", "42", null],
+                [2, "123", null, null],
+                [3, "{\"b\":1}", "1", null],
+                [4, "{\"b\":3}", "3", null],
+                [5, "{\"b\":5,\"c\":6}", "5", "6"],
+                [6, "[1]", null, null],
+                [7, "2", null, null],
+                [8, "{\"b\":42}", "42", null],
+                [9, "123", null, null],
+                [10, "{\"b\":8}", "8", null],
+                [11, "{\"b\":10}", "10", null],
+                [12, "{\"b\":11,\"c\":12}", "11", "12"],
+                [13, "[13]", null, null],
+                [14, "14", null, null]
+        ]
+
+        def queryResult = {
+            sql """
+            select k, cast(v['a'] as string), cast(v['a']['b'] as string), 
cast(v['a']['c'] as string)
+            from duplicate_json_path
+            order by k
+            """
+        }
+        assertEquals(expectedResult, queryResult())
+
+        trigger_and_wait_compaction("duplicate_json_path", "full")
+        assertEquals(expectedResult, queryResult())
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to