This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 35492968ea9 [refactor](variant) reimplement function insert field (#51307)
35492968ea9 is described below

commit 35492968ea9d725e25951041e0283d10b3119596
Author: Sun Chenyang <[email protected]>
AuthorDate: Wed Aug 13 11:28:21 2025 +0800

    [refactor](variant) reimplement function insert field (#51307)
    
    reimplement function insert field
---
 be/src/vec/columns/column_variant.cpp       |  76 ++++-----
 be/test/vec/columns/column_variant_test.cpp | 234 +++++++++++++++++++++++++++-
 2 files changed, 267 insertions(+), 43 deletions(-)

diff --git a/be/src/vec/columns/column_variant.cpp b/be/src/vec/columns/column_variant.cpp
index d94f02de0bc..54068ac5c78 100644
--- a/be/src/vec/columns/column_variant.cpp
+++ b/be/src/vec/columns/column_variant.cpp
@@ -45,6 +45,7 @@
 #include "common/status.h"
 #include "exprs/json_functions.h"
 #include "olap/olap_common.h"
+#include "runtime/define_primitive_type.h"
 #include "runtime/jsonb_value.h"
 #include "runtime/primitive_type.h"
 #include "util/defer_op.h"
@@ -206,60 +207,51 @@ void ColumnVariant::Subcolumn::add_new_column_part(DataTypePtr type) {
 }
 
 void ColumnVariant::Subcolumn::insert(Field field, FieldInfo info) {
-    auto base_type = info.scalar_type_id;
-    if (base_type == PrimitiveType::INVALID_TYPE && info.num_dimensions == 0) {
+    if (field.is_null()) {
         insert_default();
         return;
     }
-    ++num_rows;
-    auto column_dim = least_common_type.get_dimensions();
-    auto value_dim = info.num_dimensions;
-    if (least_common_type.get_base()->get_primitive_type() == INVALID_TYPE) {
-        column_dim = value_dim;
-    }
-    if (base_type == PrimitiveType::INVALID_TYPE) {
-        value_dim = column_dim;
-    }
-    bool type_changed = false;
-    if (value_dim != column_dim || info.num_dimensions >= 2) {
-        // Deduce to JSONB
-        VLOG_DEBUG << fmt::format(
-                "Dimension of types mismatched between inserted value and 
column, "
-                "expected:{}, but meet:{} for type:{}",
-                column_dim, value_dim, least_common_type.get()->get_name());
-        base_type = MOST_COMMON_TYPE_ID;
-        value_dim = 0;
-        type_changed = true;
-    }
-    // Currently we support specify predefined schema for other types include 
decimal, datetime ...etc
-    // so we should set specified info to create correct types, and those 
predefined types are static and
-    // no conflict, so we can set them directly.
-    auto base_data_type =
-            create_array_of_type(base_type, value_dim, is_nullable, 
info.precision, info.scale);
+    auto from_type_id = info.scalar_type_id;
+    auto from_dim = info.num_dimensions;
+    auto least_common_type_id = least_common_type.get_base_type_id();
+    auto least_common_type_dim = least_common_type.get_dimensions();
+    bool type_changed = info.need_convert;
     if (data.empty()) {
-        add_new_column_part(base_data_type);
-    } else if ((least_common_type.get_base_type_id() != base_type &&
-                base_type != PrimitiveType::INVALID_TYPE) ||
-               type_changed) {
-        if (schema_util::is_conversion_required_between_integers(
-                    base_type, least_common_type.get_base_type_id())) {
-            DataTypePtr least_type;
-            get_least_supertype_jsonb(DataTypes {base_data_type, 
least_common_type.get()},
-                                      &least_type);
-            if (!least_type->equals(*base_data_type)) {
+        if (from_dim > 1) {
+            
add_new_column_part(create_array_of_type(PrimitiveType::TYPE_JSONB, 0, 
is_nullable));
+            type_changed = true;
+        } else {
+            add_new_column_part(create_array_of_type(from_type_id, from_dim, 
is_nullable));
+        }
+    } else {
+        if (least_common_type_dim != from_dim) {
+            
add_new_column_part(create_array_of_type(PrimitiveType::TYPE_JSONB, 0, 
is_nullable));
+            if (from_type_id != PrimitiveType::TYPE_JSONB || from_dim != 0) {
                 type_changed = true;
             }
-            add_new_column_part(least_type);
+        } else {
+            if (least_common_type_id != from_type_id &&
+                
schema_util::is_conversion_required_between_integers(from_type_id,
+                                                                     
least_common_type_id)) {
+                type_changed = true;
+                DataTypePtr new_least_common_base_type;
+                get_least_supertype_jsonb(PrimitiveTypeSet {from_type_id, 
least_common_type_id},
+                                          &new_least_common_base_type);
+                if (new_least_common_base_type->get_primitive_type() != 
least_common_type_id) {
+                    add_new_column_part(
+                            
create_array_of_type(new_least_common_base_type->get_primitive_type(),
+                                                 least_common_type_dim, 
is_nullable));
+                }
+            }
         }
     }
-    // 1. type changed means encounter different type, we need to convert it 
to the least common type
-    // 2. need_convert means the type is not the same as the least common 
type, we need to convert it
-    if (type_changed || info.need_convert) {
+
+    if (type_changed) {
         Field new_field;
         convert_field_to_type(field, *least_common_type.get(), &new_field);
         field = new_field;
     }
-
+    ++num_rows;
     data.back()->insert(field);
 }
 
diff --git a/be/test/vec/columns/column_variant_test.cpp b/be/test/vec/columns/column_variant_test.cpp
index 9d686d574ea..8bb2ce4ce73 100644
--- a/be/test/vec/columns/column_variant_test.cpp
+++ b/be/test/vec/columns/column_variant_test.cpp
@@ -3567,4 +3567,236 @@ TEST_F(ColumnVariantTest, compatibility_deserialize_and_verify) {
 
     std::cout << "Successfully verified deserialized data integrity!" << 
std::endl;
 }
-}
\ No newline at end of file
+
+TEST_F(ColumnVariantTest, subcolumn_insert_range_from_test) {
+    ColumnVariant::Subcolumn subcolumn(0, true /* is_nullable */, false /* 
is_root */);
+    Field int_field = Field::create_field<TYPE_INT>(200000);
+    Field string_field = Field::create_field<TYPE_STRING>("hello");
+
+    Array array_int(2);
+    array_int[0] = int_field;
+    array_int[1] = int_field;
+    Field array_int_field = Field::create_field<TYPE_ARRAY>(array_int);
+    ColumnVariant::Subcolumn subcolumn2(0, true /* is_nullable */, false /* 
is_root */);
+    subcolumn2.insert(array_int_field);
+    subcolumn2.finalize();
+
+    Array array_tiny_int(2);
+    Field tiny_int = Field::create_field<TYPE_TINYINT>(100);
+    array_tiny_int[0] = tiny_int;
+    array_tiny_int[1] = tiny_int;
+    Field array_tiny_int_field = 
Field::create_field<TYPE_ARRAY>(array_tiny_int);
+    ColumnVariant::Subcolumn subcolumn1(0, true /* is_nullable */, false /* 
is_root */);
+    subcolumn1.insert(array_tiny_int_field);
+    subcolumn1.finalize();
+
+    Array array_string(2);
+    array_string[0] = string_field;
+    array_string[1] = string_field;
+    Field array_string_field = Field::create_field<TYPE_ARRAY>(array_string);
+    ColumnVariant::Subcolumn subcolumn3(0, true /* is_nullable */, false /* 
is_root */);
+    subcolumn3.insert(array_string_field);
+    subcolumn3.finalize();
+
+    subcolumn.insert_range_from(subcolumn1, 0, 1);
+    subcolumn.insert_range_from(subcolumn2, 0, 1);
+    subcolumn.insert_range_from(subcolumn3, 0, 1);
+    subcolumn.finalize();
+    EXPECT_EQ(subcolumn.data.size(), 1);
+    std::cout << subcolumn.get_least_common_type()->get_name() << std::endl;
+    EXPECT_EQ(subcolumn.get_least_common_type()->get_primitive_type(), 
PrimitiveType::TYPE_ARRAY);
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_test) {
+    ColumnVariant::Subcolumn subcolumn(0, true /* is_nullable */, false /* 
is_root */);
+    Field int_field = Field::create_field<TYPE_INT>(200000);
+    Field string_field = Field::create_field<TYPE_STRING>("hello");
+    Array array_int(2);
+    array_int[0] = int_field;
+    array_int[1] = int_field;
+    Field array_int_field = Field::create_field<TYPE_ARRAY>(array_int);
+
+    Array array_int2(2);
+    Field tiny_int = Field::create_field<TYPE_TINYINT>(100);
+    array_int2[0] = tiny_int;
+    array_int2[1] = tiny_int;
+    Field array_int2_field = Field::create_field<TYPE_ARRAY>(array_int2);
+
+    Array array_string(2);
+    array_string[0] = string_field;
+    array_string[1] = string_field;
+    Field array_string_field = Field::create_field<TYPE_ARRAY>(array_string);
+
+    subcolumn.insert(array_int2_field);
+    subcolumn.insert(array_int_field);
+    subcolumn.insert(array_string_field);
+    subcolumn.finalize();
+    EXPECT_EQ(subcolumn.data.size(), 1);
+    EXPECT_EQ(subcolumn.get_least_common_type()->get_primitive_type(), 
PrimitiveType::TYPE_ARRAY);
+
+    subcolumn.insert(string_field);
+    subcolumn.insert(int_field);
+    EXPECT_EQ(subcolumn.data.size(), 2);
+    
EXPECT_EQ(remove_nullable(subcolumn.get_least_common_type())->get_primitive_type(),
+              PrimitiveType::TYPE_JSONB);
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_test_advanced) {
+    std::vector<Field> fields;
+
+    fields.emplace_back(Field::create_field<TYPE_NULL>(Null()));
+
+    fields.emplace_back(Field::create_field<TYPE_BOOLEAN>(true));
+
+    fields.emplace_back(Field::create_field<TYPE_BIGINT>(922337203685477588));
+
+    
fields.emplace_back(Field::create_field<TYPE_LARGEINT>(922337203685477588));
+
+    fields.emplace_back(Field::create_field<TYPE_DOUBLE>(-3.14159265359));
+
+    fields.emplace_back(Field::create_field<TYPE_STRING>("hello world"));
+
+    Array arr_boolean(2);
+    arr_boolean[0] = Field::create_field<TYPE_BOOLEAN>(true);
+    arr_boolean[1] = Field::create_field<TYPE_BOOLEAN>(false);
+    Field arr_boolean_field = Field::create_field<TYPE_ARRAY>(arr_boolean);
+    fields.emplace_back(arr_boolean_field);
+
+    Array arr_int64(2);
+    arr_int64[0] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+    arr_int64[1] = Field::create_field<TYPE_BIGINT>(2232323223232323232);
+    Field arr_int64_field = Field::create_field<TYPE_ARRAY>(arr_int64);
+    fields.emplace_back(arr_int64_field);
+
+    Array arr_double(2);
+    arr_double[0] = Field::create_field<TYPE_DOUBLE>(1.1);
+    arr_double[1] = Field::create_field<TYPE_DOUBLE>(2.2);
+    Field arr_double_field = Field::create_field<TYPE_ARRAY>(arr_double);
+    fields.emplace_back(arr_double_field);
+
+    Array arr_string(2);
+    arr_string[0] = Field::create_field<TYPE_STRING>("one");
+    arr_string[1] = Field::create_field<TYPE_STRING>("two");
+    Field arr_string_field = Field::create_field<TYPE_ARRAY>(arr_string);
+    fields.emplace_back(arr_string_field);
+
+    Array arr_jsonb(5);
+    arr_jsonb[0] = Field::create_field<TYPE_STRING>("one");
+    arr_jsonb[1] = Field::create_field<TYPE_DOUBLE>(1.1);
+    arr_jsonb[2] = Field::create_field<TYPE_BOOLEAN>(true);
+    arr_jsonb[3] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+    arr_jsonb[4] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+    Field arr_jsonb_field = Field::create_field<TYPE_ARRAY>(arr_jsonb);
+    fields.emplace_back(arr_jsonb_field);
+
+    std::random_device rd;
+    std::mt19937 g(rd());
+
+    for (int i = 0; i < (1 << fields.size()); i++) {
+        std::shuffle(fields.begin(), fields.end(), g);
+        auto subcolumn = ColumnVariant::Subcolumn(0, true, false);
+
+        for (const auto& field : fields) {
+            subcolumn.insert(field);
+        }
+
+        subcolumn.finalize();
+        EXPECT_EQ(subcolumn.data.size(), 1);
+        // std::cout << "least common type: " << 
subcolumn.get_least_common_type()->get_name() << std::endl;
+        EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), 
PrimitiveType::TYPE_JSONB);
+
+        for (const auto& field : fields) {
+            subcolumn.insert(field);
+        }
+        EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), 
PrimitiveType::TYPE_JSONB);
+
+        if (i % 1000 == 0) {
+            std::cout << "insert count " << i << std::endl;
+        }
+    }
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_range_from_test_advanced) {
+    std::vector<Field> fields;
+
+    fields.emplace_back(Field::create_field<TYPE_NULL>(Null()));
+
+    fields.emplace_back(Field::create_field<TYPE_BOOLEAN>(true));
+
+    fields.emplace_back(Field::create_field<TYPE_BIGINT>(922337203685477588));
+
+    
fields.emplace_back(Field::create_field<TYPE_LARGEINT>(922337203685477588));
+
+    fields.emplace_back(Field::create_field<TYPE_DOUBLE>(-3.14159265359));
+
+    fields.emplace_back(Field::create_field<TYPE_STRING>("hello world"));
+
+    Array arr_boolean(2);
+    arr_boolean[0] = Field::create_field<TYPE_BOOLEAN>(true);
+    arr_boolean[1] = Field::create_field<TYPE_BOOLEAN>(false);
+    Field arr_boolean_field = Field::create_field<TYPE_ARRAY>(arr_boolean);
+    fields.emplace_back(arr_boolean_field);
+
+    Array arr_int64(2);
+    arr_int64[0] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+    arr_int64[1] = Field::create_field<TYPE_BIGINT>(2232323223232323232);
+    Field arr_int64_field = Field::create_field<TYPE_ARRAY>(arr_int64);
+    fields.emplace_back(arr_int64_field);
+
+    Array arr_largeint(2);
+    arr_largeint[0] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+    arr_largeint[1] = Field::create_field<TYPE_LARGEINT>(2232323223232323232);
+    Field arr_largeint_field = Field::create_field<TYPE_ARRAY>(arr_largeint);
+    fields.emplace_back(arr_largeint_field);
+
+    Array arr_double(2);
+    arr_double[0] = Field::create_field<TYPE_DOUBLE>(1.1);
+    arr_double[1] = Field::create_field<TYPE_DOUBLE>(2.2);
+    Field arr_double_field = Field::create_field<TYPE_ARRAY>(arr_double);
+    fields.emplace_back(arr_double_field);
+
+    Array arr_string(2);
+    arr_string[0] = Field::create_field<TYPE_STRING>("one");
+    arr_string[1] = Field::create_field<TYPE_STRING>("two");
+    Field arr_string_field = Field::create_field<TYPE_ARRAY>(arr_string);
+    fields.emplace_back(arr_string_field);
+
+    Array arr_jsonb(5);
+    arr_jsonb[0] = Field::create_field<TYPE_STRING>("one");
+    arr_jsonb[1] = Field::create_field<TYPE_DOUBLE>(1.1);
+    arr_jsonb[2] = Field::create_field<TYPE_BOOLEAN>(true);
+    arr_jsonb[3] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+    arr_jsonb[4] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+    Field arr_jsonb_field = Field::create_field<TYPE_ARRAY>(arr_jsonb);
+    fields.emplace_back(arr_jsonb_field);
+
+    std::random_device rd;
+    std::mt19937 g(rd());
+
+    for (int i = 0; i < (1 << fields.size()); i++) {
+        std::shuffle(fields.begin(), fields.end(), g);
+        auto subcolumn = ColumnVariant::Subcolumn(0, true, false);
+
+        for (const auto& field : fields) {
+            auto subcolumn_tmp = ColumnVariant::Subcolumn(0, true, false);
+            subcolumn_tmp.insert(field);
+            subcolumn.insert_range_from(subcolumn_tmp, 0, 1);
+        }
+
+        subcolumn.finalize();
+        EXPECT_EQ(subcolumn.data.size(), 1);
+        // std::cout << "least common type: " << 
subcolumn.get_least_common_type()->get_name() << std::endl;
+        EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), 
PrimitiveType::TYPE_JSONB);
+
+        for (const auto& field : fields) {
+            subcolumn.insert(field);
+        }
+        EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), 
PrimitiveType::TYPE_JSONB);
+
+        if (i % 1000 == 0) {
+            std::cout << "insert count " << i << std::endl;
+        }
+    }
+}
+
+} // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to