This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new cbddff0694 [FIX](map) fix map key-column nullable for arrow serde
#21762
cbddff0694 is described below
commit cbddff0694ab7497ae4d5666c8f2d3a930f9f8a0
Author: amory <[email protected]>
AuthorDate: Fri Jul 14 00:30:07 2023 +0800
[FIX](map) fix map key-column nullable for arrow serde #21762
Arrow does not support null elements in a map's key column, but Doris map
key columns are nullable by default. So when a Doris map row has a null
element in its key column, we write the whole row to Arrow as null.
---
.../vec/data_types/serde/data_type_map_serde.cpp | 15 +++-
.../serde/data_type_serde_arrow_test.cpp | 84 +++++++++++++++++++++-
2 files changed, 96 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index fb3eddc4ff..edb21a60ef 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -19,6 +19,7 @@
#include "arrow/array/builder_nested.h"
#include "util/jsonb_document.h"
+#include "util/simd/bits.h"
#include "vec/columns/column.h"
#include "vec/columns/column_const.h"
#include "vec/columns/column_map.h"
@@ -50,13 +51,23 @@ void DataTypeMapSerDe::write_column_to_arrow(const IColumn&
column, const NullMa
auto& builder = assert_cast<arrow::MapBuilder&>(*array_builder);
auto& map_column = assert_cast<const ColumnMap&>(column);
const IColumn& nested_keys_column = map_column.get_keys();
- CHECK(!nested_keys_column.is_nullable());
const IColumn& nested_values_column = map_column.get_values();
+ // now we default set key value in map is nullable
+ DCHECK(nested_keys_column.is_nullable());
+ DCHECK(nested_values_column.is_nullable());
+ auto keys_nullmap_data =
+
check_and_get_column<ColumnNullable>(nested_keys_column)->get_null_map_data().data();
auto& offsets = map_column.get_offsets();
auto key_builder = builder.key_builder();
auto value_builder = builder.item_builder();
+
for (size_t r = start; r < end; ++r) {
- if (null_map && (*null_map)[r]) {
+ if ((null_map && (*null_map)[r])) {
+ checkArrowStatus(builder.AppendNull(), column.get_name(),
+ array_builder->type()->name());
+ } else if (simd::contain_byte(keys_nullmap_data + offsets[r - 1],
+ offsets[r] - offsets[r - 1], 1)) {
+ // arrow do not support key is null so we just put null with this
row
checkArrowStatus(builder.AppendNull(), column.get_name(),
array_builder->type()->name());
} else {
diff --git a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
index 7792d40839..c1913e6d86 100644
--- a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
@@ -356,7 +356,9 @@ void serialize_and_deserialize_arrow_test() {
type_desc.add_sub_type(TYPE_STRING, true);
tslot.__set_slotType(type_desc.to_thrift());
{
- DataTypePtr s = std::make_shared<DataTypeString>();
+ DataTypePtr s =
+
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
+ ;
DataTypePtr d =
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
DataTypePtr m = std::make_shared<DataTypeMap>(s, d);
@@ -503,4 +505,84 @@ TEST(DataTypeSerDeArrowTest, DataTypeCollectionSerDeTest) {
serialize_and_deserialize_arrow_test<false>();
}
+TEST(DataTypeSerDeArrowTest, DataTypeMapNullKeySerDeTest) { // Map column -> Arrow batch -> Doris column; rows 0 and 2 are expected to dump as "{}" afterwards.
+ TupleDescriptor tuple_desc(PTupleDescriptor(), true);
+ TSlotDescriptor tslot;
+ std::string col_name = "map_null_key";
+ tslot.__set_colName(col_name);
+ TypeDescriptor type_desc(TYPE_MAP);
+ type_desc.add_sub_type(TYPE_STRING, true);
+ type_desc.add_sub_type(TYPE_INT, true);
+ tslot.__set_slotType(type_desc.to_thrift());
+ vectorized::Block block;
+ {
+ DataTypePtr s = // nullable string type; passed as the first argument to DataTypeMap below
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
+ ; // stray empty statement, has no effect
+ DataTypePtr d = // nullable int32 type; passed as the second argument to DataTypeMap below
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
+ DataTypePtr m = std::make_shared<DataTypeMap>(s, d);
+ Array k1, k2, v1, v2, k3, v3;
+ k1.push_back(Null()); // Null() entry in k1 (the first array pushed into m1)
+ k1.push_back("doris");
+ k1.push_back("clever amory");
+ v1.push_back(11);
+ v1.push_back(Null());
+ v1.push_back(30);
+ k2.push_back("hello amory");
+ k2.push_back("NULL"); // the string literal "NULL", not a Null() value
+ k2.push_back("cute amory");
+ k2.push_back("doris");
+ v2.push_back(26);
+ v2.push_back(Null());
+ v2.push_back(6);
+ v2.push_back(7);
+ k3.push_back("test");
+ k3.push_back(Null()); // Null() entry in k3 (the first array pushed into m3)
+ v3.push_back(11);
+ v3.push_back(30);
+ Map m1, m2, m3; // each Map receives its k* array first, then its v* array
+ m1.push_back(k1);
+ m1.push_back(v1);
+ m2.push_back(k2);
+ m2.push_back(v2);
+ m3.push_back(k3);
+ m3.push_back(v3);
+ MutableColumnPtr map_column = m->create_column();
+ map_column->reserve(3);
+ map_column->insert(m1); // row 0
+ map_column->insert(m2); // row 1
+ map_column->insert(m3); // row 2
+ vectorized::ColumnWithTypeAndName type_and_name(map_column->get_ptr(),
m, col_name);
+ block.insert(type_and_name);
+ }
+
+ tslot.__set_col_unique_id(1);
+ SlotDescriptor* slot = new SlotDescriptor(tslot); // NOTE(review): raw new with no delete in this test - confirm tuple_desc takes ownership
+ tuple_desc.add_slot(slot);
+ RowDescriptor row_desc(&tuple_desc, true);
+ // arrow schema
+ std::shared_ptr<arrow::Schema> _arrow_schema;
+ EXPECT_EQ(convert_to_arrow_schema(row_desc, &_arrow_schema), Status::OK());
+
+ // serialize
+ std::shared_ptr<arrow::RecordBatch> result;
+ std::cout << "block structure: " << block.dump_structure() << std::endl;
+ std::cout << "_arrow_schema: " << _arrow_schema->ToString(true) <<
std::endl;
+
+ convert_to_arrow_batch(block, _arrow_schema, arrow::default_memory_pool(),
&result); // return value is not checked here; only `result` is tested for non-null below
+ Block new_block = block.clone_empty();
+ EXPECT_TRUE(result != nullptr);
+ std::cout << "result: " << result->ToString() << std::endl;
+ // deserialize
+ auto* array = result->GetColumnByName(col_name).get();
+ auto& column_with_type_and_name = new_block.get_by_name(col_name);
+ arrow_column_to_doris_column(array, 0, column_with_type_and_name.column,
+ column_with_type_and_name.type, block.rows(),
"UTC");
+ std::cout << block.dump_data() << std::endl;
+ std::cout << new_block.dump_data() << std::endl;
+ // rows 0 and 2 of new_block are expected to be empty
+ EXPECT_EQ(new_block.dump_one_line(0, 1), "{}");
+ EXPECT_EQ(new_block.dump_one_line(2, 1), "{}");
+ EXPECT_EQ(block.dump_data(1, 1), new_block.dump_data(1, 1)); // presumably compares row 1 of both blocks - confirm dump_data argument semantics
+}
+
} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]