This is an automated email from the ASF dual-hosted git repository.
csun5285 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b8bfe9b7ca0 [fix](variant) fix wrong element type inference for
mixed-type arrays in VARIANT sparse columns (#64273)
b8bfe9b7ca0 is described below
commit b8bfe9b7ca013f2709c0c93b3d240e61ef96a407
Author: Chenyang Sun <[email protected]>
AuthorDate: Tue Jun 9 20:19:14 2026 +0800
[fix](variant) fix wrong element type inference for mixed-type arrays in
VARIANT sparse columns (#64273)
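A mixed-type JSON array stored in a `VARIANT` column, e.g.
`{"a": ["1", 2, 1.1]}` (string + int + double), could be reconstructed
with a **wrong element type**: when the array was deserialized from a
sparse column, the element type was taken from the last non-null element
(here `double`) instead of the common supertype of all elements (JSONB).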
---
be/src/core/column/column_variant.cpp | 4 +-
.../core/data_type_serde/data_type_array_serde.cpp | 20 ++-
be/src/core/field.h | 4 +
.../data_type_serde/data_type_serde_array_test.cpp | 178 +++++++++++++++++++++
.../data/variant_p0/compaction/test_compaction.out | 4 +
.../variant_p0/compaction/test_compaction.groovy | 4 +-
6 files changed, 208 insertions(+), 6 deletions(-)
diff --git a/be/src/core/column/column_variant.cpp
b/be/src/core/column/column_variant.cpp
index ef0c6c8d9c3..723d52d46b9 100644
--- a/be/src/core/column/column_variant.cpp
+++ b/be/src/core/column/column_variant.cpp
@@ -229,6 +229,7 @@ void ColumnVariant::Subcolumn::insert(FieldWithDataType
field) {
info.scale = field.scale;
info.scalar_type_id = field.base_scalar_type_id;
info.num_dimensions = field.num_dimensions;
+ info.need_convert = field.need_convert;
insert(std::move(field.field), info);
}
@@ -1080,7 +1081,8 @@ void ColumnVariant::get(size_t n, Field& res) const {
.num_dimensions =
static_cast<uint8_t>(
data.second.num_dimensions),
.precision =
data.second.precision,
- .scale = data.second.scale});
+ .scale = data.second.scale,
+ .need_convert =
data.second.need_convert});
}
try_get_from_doc_value_column(n, res);
if (object.empty()) {
diff --git a/be/src/core/data_type_serde/data_type_array_serde.cpp
b/be/src/core/data_type_serde/data_type_array_serde.cpp
index 8acb6f47dc1..42cd2e9df6b 100644
--- a/be/src/core/data_type_serde/data_type_array_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_array_serde.cpp
@@ -26,6 +26,7 @@
#include "core/column/column_const.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_array.h"
+#include "core/data_type/get_least_supertype.h"
#include "core/data_type_serde/complex_type_deserialize_util.h"
#include "core/string_ref.h"
#include "exprs/function/function_helpers.h"
@@ -504,16 +505,29 @@ const uint8_t*
DataTypeArraySerDe::deserialize_binary_to_field(const uint8_t* da
field = Field::create_field<TYPE_ARRAY>(Array(nested_size));
info.num_dimensions++;
auto& array = field.get<TYPE_ARRAY>();
- PrimitiveType nested_type = PrimitiveType::TYPE_NULL;
+ // Element type is the common type of all elements, not the last element's
type.
+ // For a mixed-type array like ["1", 2, 1.1] the last-element rule picks
array<double>
+ // and loses the string, which crashes later when the field is re-inserted.
+ PrimitiveTypeSet element_types;
for (size_t i = 0; i < nested_size; ++i) {
Field nested_field;
data = DataTypeSerDe::deserialize_binary_to_field(data, nested_field,
info);
array[i] = std::move(nested_field);
if (info.scalar_type_id != PrimitiveType::TYPE_NULL) {
- nested_type = info.scalar_type_id;
+ element_types.insert(info.scalar_type_id);
}
}
- info.scalar_type_id = nested_type;
+ if (element_types.empty()) {
+ info.scalar_type_id = PrimitiveType::TYPE_NULL;
+ } else if (element_types.size() == 1) {
+ info.scalar_type_id = *element_types.begin();
+ } else {
+ DataTypePtr common_type;
+ get_least_supertype_jsonb(element_types, &common_type);
+ info.scalar_type_id = common_type->get_primitive_type();
+ // Mixed-type elements need converting to the common type on insert.
+ info.need_convert = true;
+ }
return data;
}
diff --git a/be/src/core/field.h b/be/src/core/field.h
index 1bb4160d0dd..d39d8208777 100644
--- a/be/src/core/field.h
+++ b/be/src/core/field.h
@@ -323,6 +323,10 @@ struct FieldWithDataType {
uint8_t num_dimensions = 0;
int precision = -1;
int scale = -1;
+ // True when the array elements are mixed-type and must be converted to
the common base
+ // type on insert. Mirrors FieldInfo::need_convert so it survives the
FieldWithDataType
+ // round trip in the sparse read path.
+ bool need_convert = false;
};
} // namespace doris
diff --git a/be/test/core/data_type_serde/data_type_serde_array_test.cpp
b/be/test/core/data_type_serde/data_type_serde_array_test.cpp
new file mode 100644
index 00000000000..f6b4a666f96
--- /dev/null
+++ b/be/test/core/data_type_serde/data_type_serde_array_test.cpp
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+#include "core/column/column_string.h"
+#include "core/column/column_vector.h"
+#include "core/data_type_serde/data_type_number_serde.h"
+#include "core/data_type_serde/data_type_serde.h"
+#include "core/data_type_serde/data_type_string_serde.h"
+#include "core/field.h"
+#include "storage/olap_common.h"
+
+namespace doris {
+
+// Append one tagged scalar cell (the variant sparse-column binary layout: a
1-byte
+// FieldType tag followed by the value) by serializing a single-element column.
+static void append_string(ColumnString::Chars& chars, const std::string& v) {
+ auto col = ColumnString::create();
+ col->insert_data(v.data(), v.size());
+ DataTypeStringSerDe(TYPE_STRING).write_one_cell_to_binary(*col, chars, 0);
+}
+static void append_bigint(ColumnString::Chars& chars, int64_t v) {
+ auto col = ColumnInt64::create();
+ col->insert_value(v);
+ DataTypeNumberSerDe<TYPE_BIGINT>().write_one_cell_to_binary(*col, chars,
0);
+}
+static void append_int(ColumnString::Chars& chars, int32_t v) {
+ auto col = ColumnInt32::create();
+ col->insert_value(v);
+ DataTypeNumberSerDe<TYPE_INT>().write_one_cell_to_binary(*col, chars, 0);
+}
+static void append_double(ColumnString::Chars& chars, double v) {
+ auto col = ColumnFloat64::create();
+ col->insert_value(v);
+ DataTypeNumberSerDe<TYPE_DOUBLE>().write_one_cell_to_binary(*col, chars,
0);
+}
+// Write the array header (ARRAY tag + element count); elements are appended
by the caller.
+static void append_array_header(ColumnString::Chars& chars, size_t n) {
+ const auto tag = static_cast<uint8_t>(FieldType::OLAP_FIELD_TYPE_ARRAY);
+ chars.push_back(tag);
+ const size_t old_size = chars.size();
+ chars.resize(old_size + sizeof(size_t));
+ memcpy(chars.data() + old_size, &n, sizeof(size_t));
+}
+
+// Regression for DORIS-26221: DataTypeArraySerDe::deserialize_binary_to_field
used to set the
+// array element type to the last non-null element's type. For a mixed-type
array such as
+// ["1", 2, 1.1] that yielded array<double> and dropped the string element,
crashing later when
+// the field was re-inserted (e.g. AGGREGATE-key merge over a variant sparse
column). The element
+// type must be the least common (JSONB-aware) supertype instead.
+class DataTypeArraySerDeFieldTest : public ::testing::Test {};
+
+TEST_F(DataTypeArraySerDeFieldTest, mixed_type_array_resolves_to_jsonb) {
+ // ["1", 2, 1.1] -> string, bigint, double
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 3);
+ append_string(chars, "1");
+ append_bigint(chars, 2);
+ append_double(chars, 1.1);
+
+ Field field;
+ FieldInfo info;
+ const uint8_t* end =
DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(end, chars.data() + chars.size());
+
+ // Element type is the common JSONB supertype, NOT the last element's
DOUBLE.
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+ EXPECT_TRUE(info.need_convert);
+ EXPECT_EQ(info.num_dimensions, 1);
+
+ // The element values keep their original types in the reconstructed field.
+ ASSERT_EQ(field.get_type(), PrimitiveType::TYPE_ARRAY);
+ const auto& arr = field.get<TYPE_ARRAY>();
+ ASSERT_EQ(arr.size(), 3);
+ EXPECT_EQ(arr[0].get_type(), PrimitiveType::TYPE_STRING);
+ EXPECT_EQ(arr[1].get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(arr[2].get_type(), PrimitiveType::TYPE_DOUBLE);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest,
mixed_type_array_last_element_string_resolves_to_jsonb) {
+ // [1, 2, "3"]: last element is string; last-wins would wrongly pick
array<string>.
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 3);
+ append_bigint(chars, 1);
+ append_bigint(chars, 2);
+ append_string(chars, "3");
+
+ Field field;
+ FieldInfo info;
+ DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+ EXPECT_TRUE(info.need_convert);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, numeric_only_array_promotes_to_double) {
+ // [2, 1.1]: int + double unify to the numeric supertype double (not
JSONB) -- numeric
+ // promotion still works, only string/number mixes fall back to JSONB.
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 2);
+ append_int(chars, 2);
+ append_double(chars, 1.1);
+
+ Field field;
+ FieldInfo info;
+ DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_DOUBLE);
+ EXPECT_TRUE(info.need_convert);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, homogeneous_array_keeps_element_type) {
+ // [1, 2, 3]: single element type, no conversion needed.
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 3);
+ append_bigint(chars, 1);
+ append_bigint(chars, 2);
+ append_bigint(chars, 3);
+
+ Field field;
+ FieldInfo info;
+ DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_BIGINT);
+ EXPECT_FALSE(info.need_convert);
+ EXPECT_EQ(info.num_dimensions, 1);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, nested_mixed_array_resolves_to_jsonb) {
+ // [[1], ["a"]]: inner arrays have different element types -> outer base
type is JSONB.
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 2);
+ append_array_header(chars, 1); // [1]
+ append_bigint(chars, 1);
+ append_array_header(chars, 1); // ["a"]
+ append_string(chars, "a");
+
+ Field field;
+ FieldInfo info;
+ DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_JSONB);
+ EXPECT_TRUE(info.need_convert);
+ ASSERT_EQ(field.get_type(), PrimitiveType::TYPE_ARRAY);
+ EXPECT_EQ(field.get<TYPE_ARRAY>().size(), 2);
+}
+
+TEST_F(DataTypeArraySerDeFieldTest, empty_array_is_null_element_type) {
+ auto chars_col = ColumnString::create();
+ ColumnString::Chars& chars = chars_col->get_chars();
+ append_array_header(chars, 0);
+
+ Field field;
+ FieldInfo info;
+ DataTypeSerDe::deserialize_binary_to_field(chars.data(), field, info);
+ EXPECT_EQ(info.scalar_type_id, PrimitiveType::TYPE_NULL);
+ EXPECT_FALSE(info.need_convert);
+}
+
+} // namespace doris
diff --git a/regression-test/data/variant_p0/compaction/test_compaction.out
b/regression-test/data/variant_p0/compaction/test_compaction.out
index 34a0266b216..7ccf1277bc0 100644
--- a/regression-test/data/variant_p0/compaction/test_compaction.out
+++ b/regression-test/data/variant_p0/compaction/test_compaction.out
@@ -122,6 +122,8 @@
16 {"a":"1223"}
17 {"a":[1]}
17 {"a":[1]}
+18 {"a":["1",2,1.1]}
+18 {"a":["1",2,1.1]}
19 {"a":1,"b":{"c":1}}
19 {"a":1,"b":{"c":1}}
20 {"a":1,"b":{"c":[{"a":1}]}}
@@ -244,6 +246,7 @@
15 {"a":1}
16 {"a":"1223"}
17 {"a":[1]}
+18 {"a":["1",2,1.1]}
19 {"a":1,"b":{"c":1}}
20 {"a":1,"b":{"c":[{"a":1}]}}
21 {"a":1,"b":{"c":[{"a":1}]}}
@@ -347,6 +350,7 @@
15 {"a":1}
16 {"a":"1223"}
17 {"a":[1]}
+18 {"a":["1",2,1.1]}
19 {"a":1,"b":{"c":1}}
20 {"a":1,"b":{"c":[{"a":1}]}}
21 {"a":1,"b":{"c":[{"a":1}]}}
diff --git
a/regression-test/suites/variant_p0/compaction/test_compaction.groovy
b/regression-test/suites/variant_p0/compaction/test_compaction.groovy
index 1e427c7e3eb..d354bdd0890 100644
--- a/regression-test/suites/variant_p0/compaction/test_compaction.groovy
+++ b/regression-test/suites/variant_p0/compaction/test_compaction.groovy
@@ -81,7 +81,7 @@ suite("test_compaction_variant") {
}
insert.call();
insert.call();
- qt_sql_1 "SELECT * FROM ${tableName} ORDER BY k, cast(v as
string); "
+ qt_sql_1 "SELECT k, cast(v as json) FROM ${tableName} ORDER BY k,
cast(v as string); "
qt_sql_2 "select k, cast(v['a'] as array<int>) from ${tableName}
where size(cast(v['a'] as array<int>)) > 0 order by k"
qt_sql_3 "select k, v['a'], cast(v['b'] as string) from
${tableName} where length(cast(v['b'] as string)) > 4 order by k"
qt_sql_5 "select cast(v['b'] as string), cast(v['b']['c'] as
string) from ${tableName} where cast(v['b'] as string) != 'null' or
cast(v['b'] as string) != '{}' order by k desc, 1, 2 limit 10;"
@@ -106,7 +106,7 @@ suite("test_compaction_variant") {
}
}
// assert (rowCount < 8)
- qt_sql_11 "SELECT * FROM ${tableName} where k != 18 ORDER BY k,
cast(v as string); "
+ qt_sql_11 "SELECT k, cast(v as json) FROM ${tableName} ORDER BY k,
cast(v as string); "
qt_sql_22 "select k, cast(v['a'] as array<int>) from ${tableName}
where size(cast(v['a'] as array<int>)) > 0 order by k"
qt_sql_33 "select k, v['a'], cast(v['b'] as string) from
${tableName} where length(cast(v['b'] as string)) > 4 order by k"
qt_sql_55 "select cast(v['b'] as string), cast(v['b']['c'] as
string) from ${tableName} where cast(v['b'] as string) != 'null' and
cast(v['b'] as string) != '{}' order by k desc limit 10;"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]