This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new eaab105a394 [fix](variant) fix batch insert into with 
structure-conflicts strings (#53923)
eaab105a394 is described below

commit eaab105a3945455f74dd1bb310fbc578531fd312
Author: amory <[email protected]>
AuthorDate: Tue Jul 29 10:06:21 2025 +0800

    [fix](variant) fix batch insert into with structure-conflicts strings 
(#53923)
    
    Fix batch INSERT INTO of strings whose structures conflict on the same path.
    Before this change the insert succeeded, but a later query on the table
    failed with an error like this:
    ```
    mysql> insert into var_nested_load_conflict values (3, '{"nested": [{"a": 
2.5, "b": "123.1"}]}'),  (4, '{"nested": {"a": 2.5, "b": "123.1"}}');
    Query OK, 2 rows affected (0.16 sec)
    {'label':'label_9279242ae3fd40e2_aabe077db2d37bb9', 'status':'VISIBLE', 
'txnId':'16028'}
    
    mysql> desc var_nested_load_conflict;
    +------------+---------------+------+-------+---------+-------+
    | Field      | Type          | Null | Key   | Default | Extra |
    +------------+---------------+------+-------+---------+-------+
    | k          | bigint        | Yes  | true  | NULL    |       |
    | v          | variant       | Yes  | false | NULL    | NONE  |
    | v.nested.a | json          | Yes  | false | NULL    | NONE  |
    | v.nested.b | json          | Yes  | false | NULL    | NONE  |
    | v.nested.c | array<double> | Yes  | false | NULL    | NONE  |
    +------------+---------------+------+-------+---------+-------+
    5 rows in set (0.10 sec)
    
    mysql> select * from var_nested_load_conflict;
    ERROR 1105 (HY000): errCode = 2, detailMessage = 
(10.16.10.6)[INTERNAL_ERROR]Meet none array column when flatten nested array, 
path nested.b, type Nullable(JSONB)
    ```
    
    So we now reject this kind of insertion:
    ```
    mysql> insert into var_nested_load_conflict values (3, '{"nested": [{"a": 
2.5, "b": "123.1"}]}'),  (4, '{"nested": {"a": 2.5, "b": "123.1"}}');
    ERROR 1105 (HY000): errCode = 2, detailMessage = 
(10.16.10.6)[DATA_QUALITY_ERROR][E46] Ambiguous paths: nested.b vs nested.b 
with different nested part false vs true
    ```
---
 be/src/vec/common/schema_util.cpp                |  3 --
 be/src/vec/json/parse2column.cpp                 | 10 ++++++
 be/test/common/schema_util_test.cpp              | 42 ++++++++++++++++++++++++
 regression-test/suites/variant_p0/nested2.groovy |  7 ++++
 4 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/common/schema_util.cpp 
b/be/src/vec/common/schema_util.cpp
index 044f6527e72..693451d1cdc 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -267,9 +267,6 @@ Status check_variant_has_no_ambiguous_paths(const 
PathsInData& tuple_paths) {
         path_groups[tuple_paths[i].get_path()].push_back(i);
         // print part of tuple_paths[i]
         VLOG_DEBUG << "tuple_paths[i]: " << tuple_paths[i].get_path();
-        for (const auto& part : tuple_paths[i].get_parts()) {
-            VLOG_DEBUG << "part: " << part.key << ", is_nested: " << 
part.is_nested;
-        }
     }
 
     // Only compare paths within the same group
diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp
index 858b0aeb78f..7b84e29504a 100644
--- a/be/src/vec/json/parse2column.cpp
+++ b/be/src/vec/json/parse2column.cpp
@@ -155,6 +155,16 @@ void parse_json_to_variant(IColumn& column, const char* 
src, size_t length,
     auto& [paths, values] = *result;
     assert(paths.size() == values.size());
     size_t old_num_rows = column_variant.size();
+    if (config.enable_flatten_nested) {
+        // here we should check the paths in variant and paths in result,
+        // if two paths which same prefix have different structure, we should 
throw an exception
+        std::vector<PathInData> check_paths;
+        for (const auto& entry : column_variant.get_subcolumns()) {
+            check_paths.push_back(entry->path);
+        }
+        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
+        
THROW_IF_ERROR(vectorized::schema_util::check_variant_has_no_ambiguous_paths(check_paths));
+    }
     for (size_t i = 0; i < paths.size(); ++i) {
         FieldInfo field_info;
         get_field_info(values[i], &field_info);
diff --git a/be/test/common/schema_util_test.cpp 
b/be/test/common/schema_util_test.cpp
index 743db751dc1..2fa3fd11f87 100644
--- a/be/test/common/schema_util_test.cpp
+++ b/be/test/common/schema_util_test.cpp
@@ -19,6 +19,14 @@
 
 #include <gtest/gtest.h>
 
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_variant.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/data_types/data_type_variant.h"
+#include "vec/json/json_parser.h"
+
 namespace doris {
 
 class SchemaUtilTest : public testing::Test {};
@@ -387,4 +395,38 @@ TEST_F(SchemaUtilTest, check_path_conflicts_with_existing) 
{
     }
 }
 
+TEST_F(SchemaUtilTest, parse_variant_columns_ambiguous_paths) {
+    using namespace doris::vectorized;
+    // Prepare the string column with two rows
+    auto string_col = ColumnString::create();
+    string_col->insert(doris::vectorized::Field::create_field<TYPE_STRING>(
+            String("{\"nested\": [{\"a\": 2.5, \"b\": \"123.1\"}]}")));
+    string_col->insert(doris::vectorized::Field::create_field<TYPE_STRING>(
+            String("{\"nested\": {\"a\": 2.5, \"b\": \"123.1\"}}")));
+    auto string_type = std::make_shared<DataTypeString>();
+
+    // Prepare the variant column with the string column as root
+    vectorized::ColumnVariant::Subcolumns dynamic_subcolumns;
+    dynamic_subcolumns.create_root(
+            vectorized::ColumnVariant::Subcolumn(string_col->assume_mutable(), 
string_type, true));
+
+    auto variant_col = ColumnVariant::create(std::move(dynamic_subcolumns), 
true);
+    auto variant_type = std::make_shared<DataTypeVariant>();
+
+    // Construct the block
+    Block block;
+    block.insert(
+            vectorized::ColumnWithTypeAndName(variant_col->assume_mutable(), 
variant_type, "v"));
+
+    // The variant column is at index 0
+    std::vector<int> variant_pos = {0};
+    ParseConfig config;
+    config.enable_flatten_nested = true;
+
+    // Should throw due to ambiguous paths
+    Status st = schema_util::parse_variant_columns(block, variant_pos, config);
+    EXPECT_FALSE(st.ok());
+    EXPECT_TRUE(st.to_string().find("Ambiguous paths") != std::string::npos);
+}
+
 } // namespace doris
diff --git a/regression-test/suites/variant_p0/nested2.groovy 
b/regression-test/suites/variant_p0/nested2.groovy
index 75d84a664c2..099b1c903f1 100644
--- a/regression-test/suites/variant_p0/nested2.groovy
+++ b/regression-test/suites/variant_p0/nested2.groovy
@@ -68,6 +68,13 @@ suite("variant_nested_type_conflict", "p0"){
                 """
             exception "Nesting of array in Nested array within variant 
subcolumns is currently not supported."
         }
+        // insert batch different structure in same path
+        test {
+            sql """
+                insert into ${table_name} values (3, '{"nested": [{"a": 2.5, 
"b": "123.1"}]}'),  (4, '{"nested": {"a": 2.5, "b": "123.1"}}');
+                """
+            exception "Ambiguous paths"
+        }
         /// insert a array of object for a, b, c 
         // insert type conflict in multiple rows
         sql """


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to