This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 40ddeabf42f [Fix](Variant) fix serialize with json key contains `.` as
name (#51857)
40ddeabf42f is described below
commit 40ddeabf42f73638da78c49c8736cb670dbb663e
Author: lihangyu <[email protected]>
AuthorDate: Thu Jun 19 14:55:18 2025 +0800
[Fix](Variant) fix serialize with json key contains `.` as name (#51857)
1. get_path loses the object nesting level information when calling
ColumnObject::get while VariantMap is std::map<std::string, Field>, so
change VariantMap to std::map<PathInData, Field> to maintain the nesting level
2. serialize/deserialize should also convert PathInData to/from
ColumnPathInfo to maintain the nesting level
---
.../rowset/segment_v2/hierarchical_data_reader.h | 8 ++++++++
be/src/vec/columns/column_variant.cpp | 8 ++------
be/src/vec/core/field.h | 3 ++-
be/src/vec/data_types/data_type_variant.cpp | 10 ++++++++--
be/src/vec/data_types/serde/data_type_serde.cpp | 3 ++-
be/src/vec/json/json_parser.h | 8 ++++++++
be/src/vec/json/path_in_data.h | 19 -------------------
be/test/vec/columns/column_object_test.cpp | 19 ++++++++++---------
gensrc/proto/data.proto | 1 +
regression-test/data/variant_p0/column_name.out | Bin 469 -> 545 bytes
regression-test/suites/variant_p0/column_name.groovy | 15 +++++++++++++--
11 files changed, 54 insertions(+), 40 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
index 5aaba4a5265..160c2946497 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
@@ -45,6 +45,14 @@
namespace doris::segment_v2 {
+struct PathWithColumnAndType {
+ vectorized::PathInData path;
+ vectorized::ColumnPtr column;
+ vectorized::DataTypePtr type;
+};
+
+using PathsWithColumnAndType = std::vector<PathWithColumnAndType>;
+
// Reader for hierarchical data for variant, merge with root(sparse encoded
columns)
class HierarchicalDataReader : public ColumnIterator {
public:
diff --git a/be/src/vec/columns/column_variant.cpp
b/be/src/vec/columns/column_variant.cpp
index 62a13488760..432b1327374 100644
--- a/be/src/vec/columns/column_variant.cpp
+++ b/be/src/vec/columns/column_variant.cpp
@@ -861,11 +861,7 @@ void ColumnVariant::try_insert(const Field& field) {
}
const auto& object = field.get<const VariantMap&>();
size_t old_size = size();
- for (const auto& [key_str, value] : object) {
- PathInData key;
- if (!key_str.empty()) {
- key = PathInData(key_str);
- }
+ for (const auto& [key, value] : object) {
if (!has_subcolumn(key)) {
bool succ = add_sub_column(key, old_size);
if (!succ) {
@@ -958,7 +954,7 @@ void ColumnVariant::get(size_t n, Field& res) const {
entry->data.get(n, field);
// Notice: we treat null as empty field, since we do not distinguish
null and empty for Variant type.
if (field.get_type() != PrimitiveType::TYPE_NULL) {
- object.try_emplace(entry->path.get_path(), field);
+ object.try_emplace(entry->path, field);
}
}
if (object.empty()) {
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 27ae9a7f430..bca1273cdc2 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -40,6 +40,7 @@
#include "util/quantile_state.h"
#include "vec/common/uint128.h"
#include "vec/core/types.h"
+#include "vec/json/path_in_data.h"
namespace doris {
template <PrimitiveType type>
@@ -82,7 +83,7 @@ struct Map : public FieldVector {
using FieldVector::FieldVector;
};
-using VariantMap = std::map<String, Field>;
+using VariantMap = std::map<PathInData, Field>;
//TODO: rethink if we really need this? it only save one pointer from
std::string
// not POD type so could only use read/write_json_binary instead of
read/write_binary
diff --git a/be/src/vec/data_types/data_type_variant.cpp
b/be/src/vec/data_types/data_type_variant.cpp
index 3625f895c28..57e9e956b29 100644
--- a/be/src/vec/data_types/data_type_variant.cpp
+++ b/be/src/vec/data_types/data_type_variant.cpp
@@ -70,6 +70,7 @@ int64_t
DataTypeVariant::get_uncompressed_serialized_bytes(const IColumn& column
}
PColumnMeta column_meta_pb;
column_meta_pb.set_name(entry->path.get_path());
+ entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not
used here*/);
type->to_pb_column_meta(&column_meta_pb);
std::string meta_binary;
column_meta_pb.SerializeToString(&meta_binary);
@@ -113,6 +114,7 @@ char* DataTypeVariant::serialize(const IColumn& column,
char* buf, int be_exec_v
++num_of_columns;
PColumnMeta column_meta_pb;
column_meta_pb.set_name(entry->path.get_path());
+ entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not
used here*/);
type->to_pb_column_meta(&column_meta_pb);
std::string meta_binary;
column_meta_pb.SerializeToString(&meta_binary);
@@ -173,11 +175,15 @@ const char* DataTypeVariant::deserialize(const char* buf,
MutableColumnPtr* colu
MutableColumnPtr sub_column = type->create_column();
buf = type->deserialize(buf, &sub_column, be_exec_version);
- // add subcolumn to column_object
PathInData key;
- if (!column_meta_pb.name().empty()) {
+ if (column_meta_pb.has_column_path()) {
+ // init from path pb
+ key.from_protobuf(column_meta_pb.column_path());
+ } else if (!column_meta_pb.name().empty()) {
+ // init from name for compatible
key = PathInData {column_meta_pb.name()};
}
+ // add subcolumn to column_object
column_object->add_sub_column(key, std::move(sub_column), type);
}
size_t num_rows = 0;
diff --git a/be/src/vec/data_types/serde/data_type_serde.cpp
b/be/src/vec/data_types/serde/data_type_serde.cpp
index dfc81c03fa2..12b9e4e2c47 100644
--- a/be/src/vec/data_types/serde/data_type_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_serde.cpp
@@ -57,7 +57,8 @@ void DataTypeSerDe::convert_variant_map_to_rapidjson(
continue;
}
rapidjson::Value key;
- key.SetString(item.first.data(),
cast_set<rapidjson::SizeType>(item.first.size()));
+ key.SetString(item.first.get_path().data(),
+
cast_set<rapidjson::SizeType>(item.first.get_path().size()));
rapidjson::Value val;
convert_field_to_rapidjson(item.second, val, allocator);
if (val.IsNull() && item.first.empty()) {
diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h
index 897ba043b05..845aaea178d 100644
--- a/be/src/vec/json/json_parser.h
+++ b/be/src/vec/json/json_parser.h
@@ -28,6 +28,7 @@
#include <utility>
#include <vector>
+#include "runtime/primitive_type.h"
#include "util/jsonb_writer.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
@@ -124,6 +125,13 @@ enum class ExtractType {
struct ParseConfig {
bool enable_flatten_nested = false;
};
+/// Result of parsing of a document.
+/// Contains all paths extracted from document
+/// and values which are related to them.
+struct ParseResult {
+ std::vector<PathInData> paths;
+ std::vector<Field> values;
+};
template <typename ParserImpl>
class JSONDataParser {
public:
diff --git a/be/src/vec/json/path_in_data.h b/be/src/vec/json/path_in_data.h
index 8d94b02f37a..769a4b186c7 100644
--- a/be/src/vec/json/path_in_data.h
+++ b/be/src/vec/json/path_in_data.h
@@ -29,11 +29,7 @@
#include <vector>
#include "gen_cpp/segment_v2.pb.h"
-#include "vec/columns/column.h"
#include "vec/common/uint128.h"
-#include "vec/core/field.h"
-#include "vec/core/types.h"
-#include "vec/data_types/data_type.h"
namespace doris::vectorized {
@@ -129,13 +125,6 @@ private:
size_t current_anonymous_array_level = 0;
};
using PathsInData = std::vector<PathInData>;
-/// Result of parsing of a document.
-/// Contains all paths extracted from document
-/// and values which are related to them.
-struct ParseResult {
- std::vector<PathInData> paths;
- std::vector<Field> values;
-};
struct PathInDataRef {
const PathInData* ref;
@@ -148,12 +137,4 @@ struct PathInDataRef {
bool operator==(const PathInDataRef& other) const { return *this->ref ==
*other.ref; }
};
-struct PathWithColumnAndType {
- PathInData path;
- ColumnPtr column;
- DataTypePtr type;
-};
-
-using PathsWithColumnAndType = std::vector<PathWithColumnAndType>;
-
} // namespace doris::vectorized
diff --git a/be/test/vec/columns/column_object_test.cpp
b/be/test/vec/columns/column_object_test.cpp
index 056e1ae4a2f..8939fe75983 100644
--- a/be/test/vec/columns/column_object_test.cpp
+++ b/be/test/vec/columns/column_object_test.cpp
@@ -21,6 +21,7 @@
#include "vec/columns/column_variant.h"
#include "vec/columns/common_column_test.h"
+#include "vec/json/path_in_data.h"
namespace doris::vectorized {
@@ -196,11 +197,11 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) {
Field result1;
dst_column->get(0, result1);
- EXPECT_EQ(result1.get<VariantMap>().at("").get<Int64>(), 123);
+ EXPECT_EQ(result1.get<VariantMap>().at({}).get<Int64>(), 123);
Field result2;
dst_column->get(1, result2);
- EXPECT_EQ(result2.get<VariantMap>().at("").get<Int64>(), 456);
+ EXPECT_EQ(result2.get<VariantMap>().at({}).get<Int64>(), 456);
}
// Test case 2: Insert from scalar variant source to non-empty destination
of same type
@@ -237,9 +238,9 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) {
dst_column->get(1, result2);
dst_column->get(2, result3);
- EXPECT_EQ(result1.get<VariantMap>().at("").get<Int64>(), 789);
- EXPECT_EQ(result2.get<VariantMap>().at("").get<Int64>(), 456);
- EXPECT_EQ(result3.get<VariantMap>().at("").get<Int64>(), 123);
+ EXPECT_EQ(result1.get<VariantMap>().at({}).get<Int64>(), 789);
+ EXPECT_EQ(result2.get<VariantMap>().at({}).get<Int64>(), 456);
+ EXPECT_EQ(result3.get<VariantMap>().at({}).get<Int64>(), 123);
}
// Test case 3: Insert from non-scalar or different type source (fallback
to try_insert)
@@ -250,13 +251,13 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) {
// Create a map with {"a": 123}
Field field_map = Field::create_field<TYPE_VARIANT>(VariantMap());
auto& map1 = field_map.get<VariantMap&>();
- map1["a"] = Field::create_field<TYPE_INT>(123);
+ map1[PathInData("a")] = Field::create_field<TYPE_INT>(123);
src_column->try_insert(field_map);
// Create another map with {"b": "hello"}
field_map = Field::create_field<TYPE_VARIANT>(VariantMap());
auto& map2 = field_map.get<VariantMap&>();
- map2["b"] = Field::create_field<TYPE_STRING>(String("hello"));
+ map2[PathInData("b")] =
Field::create_field<TYPE_STRING>(String("hello"));
src_column->try_insert(field_map);
src_column->finalize();
@@ -285,8 +286,8 @@ TEST_F(ColumnObjectTest, test_insert_indices_from) {
const auto& result1_map = result1.get<const VariantMap&>();
const auto& result2_map = result2.get<const VariantMap&>();
- EXPECT_EQ(result1_map.at("b").get<const String&>(), "hello");
- EXPECT_EQ(result2_map.at("a").get<Int64>(), 123);
+ EXPECT_EQ(result1_map.at(PathInData("b")).get<const String&>(),
"hello");
+ EXPECT_EQ(result2_map.at(PathInData("a")).get<Int64>(), 123);
}
}
diff --git a/gensrc/proto/data.proto b/gensrc/proto/data.proto
index 95fb522289e..54cbed7f427 100644
--- a/gensrc/proto/data.proto
+++ b/gensrc/proto/data.proto
@@ -65,6 +65,7 @@ message PColumnMeta {
optional bool result_is_nullable = 6;
optional string function_name = 7;
optional int32 be_exec_version = 8;
+ optional segment_v2.ColumnPathInfo column_path = 9;
}
message PBlock {
diff --git a/regression-test/data/variant_p0/column_name.out
b/regression-test/data/variant_p0/column_name.out
index 6ac882d2922..0f54df05d91 100644
Binary files a/regression-test/data/variant_p0/column_name.out and
b/regression-test/data/variant_p0/column_name.out differ
diff --git a/regression-test/suites/variant_p0/column_name.groovy
b/regression-test/suites/variant_p0/column_name.groovy
index 7962112ff75..7cf7fe198b1 100644
--- a/regression-test/suites/variant_p0/column_name.groovy
+++ b/regression-test/suites/variant_p0/column_name.groovy
@@ -25,7 +25,7 @@ suite("regression_test_variant_column_name", "variant_type"){
)
DUPLICATE KEY(`k`)
DISTRIBUTED BY HASH(k) BUCKETS 1
- properties("replication_num" = "1", "disable_auto_compaction" =
"true");
+ properties("replication_num" = "1", "disable_auto_compaction" =
"false");
"""
sql """insert into ${table_name} values (1, '{"中文" : "中文",
"\\\u4E2C\\\u6587": "unicode"}')"""
@@ -61,7 +61,18 @@ suite("regression_test_variant_column_name", "variant_type"){
sql """insert into var_column_name values (7, '{"": 1234566}')"""
sql """insert into var_column_name values (7, '{"": 8888888}')"""
- qt_sql "select Tags[''] from var_column_name order by cast(Tags[''] as
string)"
+ qt_sql "select cast(Tags[''] as text) from var_column_name order by
cast(Tags[''] as string)"
+
+ // name with `.`
+ sql "truncate table var_column_name"
+ sql """insert into var_column_name values (7, '{"a.b": "UPPER CASE",
"a.c": "lower case", "a" : {"b" : 123}, "a" : {"c" : 456}}')"""
+ for (int i = 0; i < 7; i++) {
+ sql """insert into var_column_name select * from var_column_name"""
+ }
+ qt_sql_cnt_1 "select count(Tags['a.b']) from var_column_name"
+ qt_sql_cnt_2 "select count(Tags['a.c']) from var_column_name"
+ qt_sql_cnt_3 "select count(Tags['a']['b']) from var_column_name"
+ qt_sql_cnt_4 "select count(Tags['a']['c']) from var_column_name"
try {
sql """insert into var_column_name values (7, '{"": "UPPER CASE", "":
"lower case"}')"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]