This is an automated email from the ASF dual-hosted git repository.

csun5285 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b8bfe9b7ca0 [fix](variant) fix wrong element type inference for 
mixed-type arrays in VARIANT sparse columns (#64273)
b8bfe9b7ca0 is described below

commit b8bfe9b7ca013f2709c0c93b3d240e61ef96a407
Author: Chenyang Sun <[email protected]>
AuthorDate: Tue Jun 9 20:19:14 2026 +0800

    [fix](variant) fix wrong element type inference for mixed-type arrays in 
VARIANT sparse columns (#64273)
    
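    A mixed-type JSON array stored in a `VARIANT` column, e.g.
    `{"a": ["1", 2, 1.1]}` (string + int + double), could be reconstructed
    with a **wrong element type**: the array deserializer used the type of
    the last non-null element instead of the common supertype of all elements.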
---
 be/src/core/column/column_variant.cpp              |   4 +-
 .../core/data_type_serde/data_type_array_serde.cpp |  20 ++-
 be/src/core/field.h                                |   4 +
 .../data_type_serde/data_type_serde_array_test.cpp | 178 +++++++++++++++++++++
 .../data/variant_p0/compaction/test_compaction.out |   4 +
 .../variant_p0/compaction/test_compaction.groovy   |   4 +-
 6 files changed, 208 insertions(+), 6 deletions(-)

diff --git a/be/src/core/column/column_variant.cpp 
b/be/src/core/column/column_variant.cpp
index ef0c6c8d9c3..723d52d46b9 100644
--- a/be/src/core/column/column_variant.cpp
+++ b/be/src/core/column/column_variant.cpp
@@ -229,6 +229,7 @@ void ColumnVariant::Subcolumn::insert(FieldWithDataType 
field) {
     info.scale = field.scale;
     info.scalar_type_id = field.base_scalar_type_id;
     info.num_dimensions = field.num_dimensions;
+    info.need_convert = field.need_convert;
     insert(std::move(field.field), info);
 }
 
@@ -1080,7 +1081,8 @@ void ColumnVariant::get(size_t n, Field& res) const {
                                               .num_dimensions = 
static_cast<uint8_t>(
                                                       
data.second.num_dimensions),
                                               .precision = 
data.second.precision,
-                                              .scale = data.second.scale});
+                                              .scale = data.second.scale,
+                                              .need_convert = 
data.second.need_convert});
     }
     try_get_from_doc_value_column(n, res);
     if (object.empty()) {
diff --git a/be/src/core/data_type_serde/data_type_array_serde.cpp 
b/be/src/core/data_type_serde/data_type_array_serde.cpp
index 8acb6f47dc1..42cd2e9df6b 100644
--- a/be/src/core/data_type_serde/data_type_array_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_array_serde.cpp
@@ -26,6 +26,7 @@
 #include "core/column/column_const.h"
 #include "core/data_type/data_type.h"
 #include "core/data_type/data_type_array.h"
+#include "core/data_type/get_least_supertype.h"
 #include "core/data_type_serde/complex_type_deserialize_util.h"
 #include "core/string_ref.h"
 #include "exprs/function/function_helpers.h"
@@ -504,16 +505,29 @@ const uint8_t* 
DataTypeArraySerDe::deserialize_binary_to_field(const uint8_t* da
     field = Field::create_field<TYPE_ARRAY>(Array(nested_size));
     info.num_dimensions++;
     auto& array = field.get<TYPE_ARRAY>();
-    PrimitiveType nested_type = PrimitiveType::TYPE_NULL;
+    // Element type is the common type of all elements, not the last element's 
type.
+    // For a mixed-type array like ["1", 2, 1.1] the last-element rule picks 
array<double>
+    // and loses the string, which crashes later when the field is re-inserted.
+    PrimitiveTypeSet element_types;
     for (size_t i = 0; i < nested_size; ++i) {
         Field nested_field;
         data = DataTypeSerDe::deserialize_binary_to_field(data, nested_field, 
info);
         array[i] = std::move(nested_field);
         if (info.scalar_type_id != PrimitiveType::TYPE_NULL) {
-            nested_type = info.scalar_type_id;
+            element_types.insert(info.scalar_type_id);
         }
     }
-    info.scalar_type_id = nested_type;
+    if (element_types.empty()) {
+        info.scalar_type_id = PrimitiveType::TYPE_NULL;
+    } else if (element_types.size() == 1) {
+        info.scalar_type_id = *element_types.begin();
+    } else {
+        DataTypePtr common_type;
+        get_least_supertype_jsonb(element_types, &common_type);
+        info.scalar_type_id = common_type->get_primitive_type();
+        // Mixed-type elements need converting to the common type on insert.
+        info.need_convert = true;
+    }
     return data;
 }
 
diff --git a/be/src/core/field.h b/be/src/core/field.h
index 1bb4160d0dd..d39d8208777 100644
--- a/be/src/core/field.h
+++ b/be/src/core/field.h
@@ -323,6 +323,10 @@ struct FieldWithDataType {
     uint8_t num_dimensions = 0;
     int precision = -1;
     int scale = -1;
+    // True when the array elements are mixed-type and must be converted to 
the common base
+    // type on insert. Mirrors FieldInfo::need_convert so it survives the 
FieldWithDataType
+    // round trip in the sparse read path.
+    bool need_convert = false;
 };
 
 } // namespace doris
diff --git a/be/test/core/data_type_serde/data_type_serde_array_test.cpp 
b/be/test/core/data_type_serde/data_type_serde_array_test.cpp
new file mode 100644
index 00000000000..f6b4a666f96
--- /dev/null
+++ b/be/test/core/data_type_serde/data_type_serde_array_test.cpp
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "core/column/column_string.h"
+#include "core/column/column_vector.h"
+#include "core/data_type_serde/data_type_number_serde.h"
+#include "core/data_type_serde/data_type_serde.h"
+#include "core/data_type_serde/data_type_string_serde.h"
+#include "core/field.h"
+#include "storage/olap_common.h"
+
+namespace doris {
+
+// Append one tagged scalar cell (the variant sparse-column binary layout: a 
1-byte
+// FieldType tag followed by the value) by serializing a single-element column.
+static void append_string(ColumnString::Chars& chars, const std::string& v) {
+    auto col = ColumnString::create();
+    col->insert_data(v.data(), v.size());
+    DataTypeStringSerDe(TYPE_STRING).write_one_cell_to_binary(*col, chars, 0);
+}
+static void append_bigint(ColumnString::Chars& chars, int64_t v) {
+    auto col = ColumnInt64::create();
+    col->insert_value(v);
+    DataTypeNumberSerDe<TYPE_BIGINT>().write_one_cell_to_binary(*col, chars, 
0);
+}
+static void append_int(ColumnString::Chars& chars, int32_t v) {
+    auto col = ColumnInt32::create();
+    col->insert_value(v);
+    DataTypeNumberSerDe<TYPE_INT>().write_one_cell_to_binary(*col, chars, 0);
+}
+static void append_double(ColumnString::Chars& chars, double v) {
+    auto col = ColumnFloat64::create();
+    col->insert_value(v);
+    DataTypeNumberSerDe<TYPE_DOUBLE>().write_one_cell_to_binary(*col, chars, 
0);
+}
+// Write the array header (ARRAY tag + element count); elements are appended 
by the caller.
+static void append_array_header(ColumnString::Chars& chars, size_t n) {
+    const auto tag = static_cast<uint8_t>(FieldType::OLAP_FIELD_TYPE_ARRAY);
+    chars.push_back(tag);
+    const size_t old_size = chars.size();
+    chars.resize(old_size + sizeof(size_t));
+    memcpy(chars.data() + old_size, &n, sizeof(size_t));
+}
+
+// Regression for DORIS-26221: DataTypeArraySerDe::deserialize_binary_to_field 
used to set the
+// array element type to the last non-null element's type. For a mixed-type 
array such as
+// ["1", 2, 1.1] that yielded array<double> and dropped the string element, 
crashing later when
+// the field was re-inserted (e.g. AGGREGATE-key merge over a variant sparse 
column). The element
+// type must be the least common (JSONB-aware) supertype instead.
+class DataTypeArraySerDeFieldTest : public ::testing::Test {};
+
+TEST_F(DataTypeArraySerDeFieldTest, mixed_type_array_resolves_to_jsonb) {
+    // ["1", 2, 1.1]  -> string, bigint, double
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 3);
+    append_string(chars, "1");
+    append_bigint(chars, 2);
+    append_double(chars, 1.1);
+
+    Field field;
+    FieldInfo info;
+    const uint8_t* end = 
DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(end, chars.data() + chars.size());
+
+    // Element type is the common JSONB supertype, NOT the last element's 
DOUBLE.
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+    EXPECT_TRUE(info.need_convert);
+    EXPECT_EQ(info.num_dimensions, 1);
+
+    // The element values keep their original types in the reconstructed field.
+    ASSERT_EQ(field.get_type(), PrimitiveType::TYPE_ARRAY);
+    const auto& arr = field.get<TYPE_ARRAY>();
+    ASSERT_EQ(arr.size(), 3);
+    EXPECT_EQ(arr[0].get_type(), PrimitiveType::TYPE_STRING);
+    EXPECT_EQ(arr[1].get_type(), PrimitiveType::TYPE_BIGINT);
+    EXPECT_EQ(arr[2].get_type(), PrimitiveType::TYPE_DOUBLE);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, 
mixed_type_array_last_element_string_resolves_to_jsonb) {
+    // [1, 2, "3"]: last element is string; last-wins would wrongly pick 
array<string>.
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 3);
+    append_bigint(chars, 1);
+    append_bigint(chars, 2);
+    append_string(chars, "3");
+
+    Field field;
+    FieldInfo info;
+    DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+    EXPECT_TRUE(info.need_convert);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, numeric_only_array_promotes_to_double) {
+    // [2, 1.1]: int + double unify to the numeric supertype double (not 
JSONB) -- numeric
+    // promotion still works, only string/number mixes fall back to JSONB.
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 2);
+    append_int(chars, 2);
+    append_double(chars, 1.1);
+
+    Field field;
+    FieldInfo info;
+    DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_DOUBLE);
+    EXPECT_TRUE(info.need_convert);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, homogeneous_array_keeps_element_type) {
+    // [1, 2, 3]: single element type, no conversion needed.
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 3);
+    append_bigint(chars, 1);
+    append_bigint(chars, 2);
+    append_bigint(chars, 3);
+
+    Field field;
+    FieldInfo info;
+    DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_BIGINT);
+    EXPECT_FALSE(info.need_convert);
+    EXPECT_EQ(info.num_dimensions, 1);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, nested_mixed_array_resolves_to_jsonb) {
+    // [[1], ["a"]]: inner arrays have different element types -> outer base 
type is JSONB.
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 2);
+    append_array_header(chars, 1); // [1]
+    append_bigint(chars, 1);
+    append_array_header(chars, 1); // ["a"]
+    append_string(chars, "a");
+
+    Field field;
+    FieldInfo info;
+    DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+    EXPECT_TRUE(info.need_convert);
+    ASSERT_EQ(field.get_type(), PrimitiveType::TYPE_ARRAY);
+    EXPECT_EQ(field.get<TYPE_ARRAY>().size(), 2);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, empty_array_is_null_element_type) {
+    auto chars_col = ColumnString::create();
+    ColumnString::Chars& chars = chars_col->get_chars();
+    append_array_header(chars, 0);
+
+    Field field;
+    FieldInfo info;
+    DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+    EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_NULL);
+    EXPECT_FALSE(info.need_convert);
+}
+
+} // namespace doris
diff --git a/regression-test/data/variant_p0/compaction/test_compaction.out 
b/regression-test/data/variant_p0/compaction/test_compaction.out
index 34a0266b216..7ccf1277bc0 100644
--- a/regression-test/data/variant_p0/compaction/test_compaction.out
+++ b/regression-test/data/variant_p0/compaction/test_compaction.out
@@ -122,6 +122,8 @@
 16     {"a":"1223"}
 17     {"a":[1]}
 17     {"a":[1]}
+18     {"a":["1",2,1.1]}
+18     {"a":["1",2,1.1]}
 19     {"a":1,"b":{"c":1}}
 19     {"a":1,"b":{"c":1}}
 20     {"a":1,"b":{"c":[{"a":1}]}}
@@ -244,6 +246,7 @@
 15     {"a":1}
 16     {"a":"1223"}
 17     {"a":[1]}
+18     {"a":["1",2,1.1]}
 19     {"a":1,"b":{"c":1}}
 20     {"a":1,"b":{"c":[{"a":1}]}}
 21     {"a":1,"b":{"c":[{"a":1}]}}
@@ -347,6 +350,7 @@
 15     {"a":1}
 16     {"a":"1223"}
 17     {"a":[1]}
+18     {"a":["1",2,1.1]}
 19     {"a":1,"b":{"c":1}}
 20     {"a":1,"b":{"c":[{"a":1}]}}
 21     {"a":1,"b":{"c":[{"a":1}]}}
diff --git 
a/regression-test/suites/variant_p0/compaction/test_compaction.groovy 
b/regression-test/suites/variant_p0/compaction/test_compaction.groovy
index 1e427c7e3eb..d354bdd0890 100644
--- a/regression-test/suites/variant_p0/compaction/test_compaction.groovy
+++ b/regression-test/suites/variant_p0/compaction/test_compaction.groovy
@@ -81,7 +81,7 @@ suite("test_compaction_variant") {
             }
             insert.call();
             insert.call();
-            qt_sql_1 "SELECT * FROM ${tableName} ORDER BY k, cast(v as 
string); "
+            qt_sql_1 "SELECT k, cast(v as json) FROM ${tableName} ORDER BY k, 
cast(v as string); "
             qt_sql_2 "select k, cast(v['a'] as array<int>) from  ${tableName} 
where  size(cast(v['a'] as array<int>)) > 0 order by k"
             qt_sql_3 "select k, v['a'], cast(v['b'] as string) from  
${tableName} where  length(cast(v['b'] as string)) > 4 order  by k"
             qt_sql_5 "select cast(v['b'] as string), cast(v['b']['c'] as 
string) from  ${tableName} where cast(v['b'] as string) != 'null' or 
cast(v['b'] as string) != '{}' order by k desc, 1, 2 limit 10;"
@@ -106,7 +106,7 @@ suite("test_compaction_variant") {
                 }
             }
             // assert (rowCount < 8)
-            qt_sql_11 "SELECT * FROM ${tableName} where k != 18 ORDER BY k, 
cast(v as string); "
+            qt_sql_11 "SELECT k, cast(v as json) FROM ${tableName} ORDER BY k, 
cast(v as string); "
             qt_sql_22 "select k, cast(v['a'] as array<int>) from  ${tableName} 
where  size(cast(v['a'] as array<int>)) > 0 order by k"
             qt_sql_33 "select k, v['a'], cast(v['b'] as string) from  
${tableName} where  length(cast(v['b'] as string)) > 4 order  by k"
             qt_sql_55 "select cast(v['b'] as string), cast(v['b']['c'] as 
string) from  ${tableName} where cast(v['b'] as string) != 'null' and 
cast(v['b'] as string) != '{}' order by k desc limit 10;"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to