This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new fb8c54092c4 [fix](datatype)Fix for unaligned memory in arrow MapArray 
parsing. (#58248)
fb8c54092c4 is described below

commit fb8c54092c4b47a972248a263eadcdd01d14346d
Author: Chen768959 <[email protected]>
AuthorDate: Sat Nov 22 09:09:37 2025 +0800

    [fix](datatype)Fix for unaligned memory in arrow MapArray parsing. (#58248)
    
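    1. Fix unaligned memory access when reading the offsets of an Arrow MapArray.
    2. Add a unit test that reads an Arrow Map backed by unaligned memory.
    3. Add a unit test that reads an Arrow Struct backed by unaligned memory.
    Follow-up to #55274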
---
 .../vec/data_types/serde/data_type_map_serde.cpp   |  16 +-
 .../data_types/serde/data_type_serde_map_test.cpp  | 182 +++++++++++++++++++++
 .../serde/data_type_serde_struct_test.cpp          | 162 ++++++++++++++++++
 3 files changed, 357 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp 
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index d6a1b8f5a22..22a6075f0d5 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -394,11 +394,21 @@ Status DataTypeMapSerDe::read_column_from_arrow(IColumn& 
column, const arrow::Ar
     auto arrow_offsets_array = concrete_map->offsets();
     auto* arrow_offsets = 
dynamic_cast<arrow::Int32Array*>(arrow_offsets_array.get());
     auto prev_size = offsets_data.back();
-    auto arrow_nested_start_offset = arrow_offsets->Value(start);
-    auto arrow_nested_end_offset = arrow_offsets->Value(end);
+
+    const auto* base_offsets_ptr = reinterpret_cast<const 
uint8_t*>(arrow_offsets->raw_values());
+    const size_t offset_element_size = sizeof(int32_t);
+    int32_t arrow_nested_start_offset = 0;
+    int32_t arrow_nested_end_offset = 0;
+    const uint8_t* start_offset_ptr = base_offsets_ptr + start * 
offset_element_size;
+    const uint8_t* end_offset_ptr = base_offsets_ptr + end * 
offset_element_size;
+    memcpy(&arrow_nested_start_offset, start_offset_ptr, offset_element_size);
+    memcpy(&arrow_nested_end_offset, end_offset_ptr, offset_element_size);
     for (int64_t i = start + 1; i < end + 1; ++i) {
+        int32_t current_offset = 0;
+        const uint8_t* current_offset_ptr = base_offsets_ptr + i * 
offset_element_size;
+        memcpy(&current_offset, current_offset_ptr, offset_element_size);
         // convert to doris offset, start from offsets.back()
-        offsets_data.emplace_back(prev_size + arrow_offsets->Value(i) - 
arrow_nested_start_offset);
+        offsets_data.emplace_back(prev_size + current_offset - 
arrow_nested_start_offset);
     }
     RETURN_IF_ERROR(key_serde->read_column_from_arrow(
             column_map.get_keys(), concrete_map->keys().get(), 
arrow_nested_start_offset,
diff --git a/be/test/vec/data_types/serde/data_type_serde_map_test.cpp 
b/be/test/vec/data_types/serde/data_type_serde_map_test.cpp
new file mode 100644
index 00000000000..322d923c5c8
--- /dev/null
+++ b/be/test/vec/data_types/serde/data_type_serde_map_test.cpp
@@ -0,0 +1,182 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+#include <gtest/gtest.h>
+#include <lz4/lz4.h>
+#include <streamvbyte.h>
+
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
+#include "agent/be_exec_version_manager.h"
+#include "olap/olap_common.h"
+#include "runtime/define_primitive_type.h"
+#include "runtime/types.h"
+#include "testutil/test_util.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_map.h"
+#include "vec/common/assert_cast.h"
+#include "vec/core/field.h"
+#include "vec/core/types.h"
+#include "vec/data_types/common_data_type_serder_test.h"
+#include "vec/data_types/common_data_type_test.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_factory.hpp"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris::vectorized {
+static auto serde_str_key = std::make_shared<DataTypeStringSerDe>();
+static auto serde_str_value = std::make_shared<DataTypeStringSerDe>();
+
+class DataTypeMapSerDeTest : public ::testing::Test {
+protected:
+    static void SetUpTestSuite() {}
+};
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeMapSerDeTest, ArrowMemNotAligned) {
+    // 1. Prepare the data.
+    std::vector<std::string> key_data = {"key1", "key2", "key3", "key4", 
"key5", "key6"};
+    std::vector<std::string> value_data = {"val1", "val2", "val3", "val4", 
"val5", "val6"};
+
+    std::vector<int32_t> key_offsets = {0};
+    std::vector<int32_t> value_offsets = {0};
+
+    int32_t current_key_offset = 0;
+    for (const auto& key : key_data) {
+        current_key_offset += static_cast<int32_t>(key.length());
+        key_offsets.push_back(current_key_offset);
+    }
+
+    int32_t current_value_offset = 0;
+    for (const auto& value : value_data) {
+        current_value_offset += static_cast<int32_t>(value.length());
+        value_offsets.push_back(current_value_offset);
+    }
+
+    std::vector<int32_t> map_offsets = {0, 2, 3, 6, 6};
+    std::vector<int8_t> validity_bitmap = {0x0B};
+
+    std::vector<uint8_t> key_value_data;
+    for (const auto& key : key_data) {
+        key_value_data.insert(key_value_data.end(), key.begin(), key.end());
+    }
+
+    std::vector<uint8_t> value_value_data;
+    for (const auto& value : value_data) {
+        value_value_data.insert(value_value_data.end(), value.begin(), 
value.end());
+    }
+
+    const int64_t num_maps = map_offsets.size() - 1;
+    const int64_t offset_element_size = sizeof(int32_t);
+
+    // 2. Create unaligned buffers: each pointer sits one byte into its backing vector.
+    std::vector<uint8_t> map_offset_storage(map_offsets.size() * 
offset_element_size + 10);
+    uint8_t* unaligned_map_offsets = map_offset_storage.data() + 1;
+
+    std::vector<uint8_t> key_offset_storage(key_offsets.size() * 
offset_element_size + 10);
+    uint8_t* unaligned_key_offsets = key_offset_storage.data() + 1;
+
+    std::vector<uint8_t> value_offset_storage(value_offsets.size() * 
offset_element_size + 10);
+    uint8_t* unaligned_value_offsets = value_offset_storage.data() + 1;
+
+    std::vector<uint8_t> key_value_storage(key_value_data.size() + 10);
+    uint8_t* unaligned_key_values = key_value_storage.data() + 1;
+
+    std::vector<uint8_t> value_value_storage(value_value_data.size() + 10);
+    uint8_t* unaligned_value_values = value_value_storage.data() + 1;
+
+    std::vector<uint8_t> validity_storage(validity_bitmap.size() + 10);
+    uint8_t* unaligned_validity = validity_storage.data() + 1;
+
+    // 3. Copy data to unaligned memory
+    for (size_t i = 0; i < map_offsets.size(); ++i) {
+        memcpy(unaligned_map_offsets + i * offset_element_size, 
&map_offsets[i],
+               offset_element_size);
+    }
+
+    for (size_t i = 0; i < key_offsets.size(); ++i) {
+        memcpy(unaligned_key_offsets + i * offset_element_size, 
&key_offsets[i],
+               offset_element_size);
+    }
+
+    for (size_t i = 0; i < value_offsets.size(); ++i) {
+        memcpy(unaligned_value_offsets + i * offset_element_size, 
&value_offsets[i],
+               offset_element_size);
+    }
+
+    memcpy(unaligned_key_values, key_value_data.data(), key_value_data.size());
+    memcpy(unaligned_value_values, value_value_data.data(), 
value_value_data.size());
+    memcpy(unaligned_validity, validity_bitmap.data(), validity_bitmap.size());
+
+    // 4. Create Arrow arrays that wrap the unaligned memory.
+    auto key_value_buffer = arrow::Buffer::Wrap(unaligned_key_values, 
key_value_data.size());
+    auto key_offsets_buffer =
+            arrow::Buffer::Wrap(unaligned_key_offsets, key_offsets.size() * 
sizeof(int32_t));
+    auto key_array = std::make_shared<arrow::StringArray>(key_offsets.size() - 
1,
+                                                          key_offsets_buffer, 
key_value_buffer);
+
+    auto value_value_buffer = arrow::Buffer::Wrap(unaligned_value_values, 
value_value_data.size());
+    auto value_offsets_buffer =
+            arrow::Buffer::Wrap(unaligned_value_offsets, value_offsets.size() 
* sizeof(int32_t));
+    auto value_array = std::make_shared<arrow::StringArray>(
+            value_offsets.size() - 1, value_offsets_buffer, 
value_value_buffer);
+
+    auto map_offsets_buffer =
+            arrow::Buffer::Wrap(unaligned_map_offsets, map_offsets.size() * 
offset_element_size);
+    auto validity_buffer = arrow::Buffer::Wrap(unaligned_validity, 
validity_bitmap.size());
+
+    auto map_type = arrow::map(arrow::utf8(), arrow::utf8());
+
+    auto arr = std::make_shared<arrow::MapArray>(map_type, num_maps, 
map_offsets_buffer, key_array,
+                                                 value_array, validity_buffer);
+
+    const auto* concrete_array = dynamic_cast<const 
arrow::MapArray*>(arr.get());
+    auto arrow_offsets_array = concrete_array->offsets();
+    auto* arrow_offsets = 
dynamic_cast<arrow::Int32Array*>(arrow_offsets_array.get());
+
+    const auto* offsets_ptr = arrow_offsets->raw_values();
+    uintptr_t offsets_address = reinterpret_cast<uintptr_t>(offsets_ptr);
+    EXPECT_EQ(offsets_address % 4, 1);
+
+    const auto* keys_ptr = key_array->value_data()->data();
+    uintptr_t keys_address = reinterpret_cast<uintptr_t>(keys_ptr);
+    EXPECT_EQ(keys_address % 4, 1);
+
+    const auto* values_ptr = value_array->value_data()->data();
+    uintptr_t values_address = reinterpret_cast<uintptr_t>(values_ptr);
+    EXPECT_EQ(values_address % 4, 1);
+
+    // 5. Test read_column_from_arrow against the unaligned Arrow array.
+    auto ser_col = ColumnMap::create(ColumnString::create(), 
ColumnString::create(),
+                                     ColumnOffset64::create());
+    cctz::time_zone tz;
+    auto serde_map = std::make_shared<DataTypeMapSerDe>(serde_str_key, 
serde_str_value);
+    auto st = serde_map->read_column_from_arrow(*ser_col, arr.get(), 0, 1, tz);
+    EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_struct_test.cpp 
b/be/test/vec/data_types/serde/data_type_serde_struct_test.cpp
new file mode 100644
index 00000000000..74af068bca0
--- /dev/null
+++ b/be/test/vec/data_types/serde/data_type_serde_struct_test.cpp
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+#include <gtest/gtest.h>
+#include <lz4/lz4.h>
+#include <streamvbyte.h>
+
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
+#include "agent/be_exec_version_manager.h"
+#include "olap/olap_common.h"
+#include "runtime/define_primitive_type.h"
+#include "runtime/types.h"
+#include "testutil/test_util.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_struct.h"
+#include "vec/common/assert_cast.h"
+#include "vec/core/field.h"
+#include "vec/core/types.h"
+#include "vec/data_types/common_data_type_serder_test.h"
+#include "vec/data_types/common_data_type_test.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_factory.hpp"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/data_types/data_type_struct.h"
+
+namespace doris::vectorized {
+static auto serde_int32 = std::make_shared<DataTypeNumberSerDe<TYPE_INT>>();
+static auto serde_str = std::make_shared<DataTypeStringSerDe>();
+
+class DataTypeStructSerDeTest : public ::testing::Test {
+protected:
+    static void SetUpTestSuite() {}
+};
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeStructSerDeTest, ArrowMemNotAligned) {
+    // 1. Prepare the data.
+    std::vector<int32_t> int_data = {1, 2, 3, 4, 5, 6};
+    std::vector<std::string> string_data = {"hello", "world", "test", "data", 
"arrow", "struct"};
+
+    std::vector<int32_t> string_offsets = {0};
+    int32_t current_string_offset = 0;
+    for (const auto& str : string_data) {
+        current_string_offset += static_cast<int32_t>(str.length());
+        string_offsets.push_back(current_string_offset);
+    }
+
+    std::vector<uint8_t> string_value_data;
+    for (const auto& str : string_data) {
+        string_value_data.insert(string_value_data.end(), str.begin(), 
str.end());
+    }
+
+    std::vector<int8_t> validity_bitmap = {0x3F};
+
+    const int64_t num_elements = int_data.size();
+    const int64_t int_element_size = sizeof(int32_t);
+    const int64_t offset_element_size = sizeof(int32_t);
+
+    // 2. Create unaligned buffers: each pointer sits one byte into its backing vector.
+    std::vector<uint8_t> int_storage(int_data.size() * int_element_size + 10);
+    uint8_t* unaligned_ints = int_storage.data() + 1;
+
+    std::vector<uint8_t> string_offset_storage(string_offsets.size() * 
offset_element_size + 10);
+    uint8_t* unaligned_string_offsets = string_offset_storage.data() + 1;
+
+    std::vector<uint8_t> string_value_storage(string_value_data.size() + 10);
+    uint8_t* unaligned_string_values = string_value_storage.data() + 1;
+
+    std::vector<uint8_t> validity_storage(validity_bitmap.size() + 10);
+    uint8_t* unaligned_validity = validity_storage.data() + 1;
+
+    // 3. Copy data to unaligned memory
+    for (size_t i = 0; i < int_data.size(); ++i) {
+        memcpy(unaligned_ints + i * int_element_size, &int_data[i], 
int_element_size);
+    }
+
+    for (size_t i = 0; i < string_offsets.size(); ++i) {
+        memcpy(unaligned_string_offsets + i * offset_element_size, 
&string_offsets[i],
+               offset_element_size);
+    }
+
+    memcpy(unaligned_string_values, string_value_data.data(), 
string_value_data.size());
+    memcpy(unaligned_validity, validity_bitmap.data(), validity_bitmap.size());
+
+    // 4. Create Arrow arrays that wrap the unaligned memory.
+    auto int_buffer = arrow::Buffer::Wrap(unaligned_ints, int_data.size() * 
int_element_size);
+    auto int_array = std::make_shared<arrow::Int32Array>(num_elements, 
int_buffer, nullptr, 0);
+
+    auto string_value_buffer =
+            arrow::Buffer::Wrap(unaligned_string_values, 
string_value_data.size());
+    auto string_offsets_buffer = arrow::Buffer::Wrap(unaligned_string_offsets,
+                                                     string_offsets.size() * 
offset_element_size);
+    auto string_array = std::make_shared<arrow::StringArray>(num_elements, 
string_offsets_buffer,
+                                                             
string_value_buffer, nullptr, 0);
+
+    auto validity_buffer = arrow::Buffer::Wrap(unaligned_validity, 
validity_bitmap.size());
+
+    auto field_int = arrow::field("int_field", arrow::int32());
+    auto field_string = arrow::field("string_field", arrow::utf8());
+
+    auto struct_type = arrow::struct_({field_int, field_string});
+
+    arrow::ArrayVector field_arrays = {int_array, string_array};
+
+    auto arr = std::make_shared<arrow::StructArray>(struct_type, num_elements, 
field_arrays,
+                                                    validity_buffer);
+
+    const auto* concrete_array = dynamic_cast<const 
arrow::StructArray*>(arr.get());
+
+    const auto* int_field_array =
+            dynamic_cast<const 
arrow::Int32Array*>(concrete_array->field(0).get());
+    const auto* ints_ptr = int_field_array->raw_values();
+    uintptr_t ints_address = reinterpret_cast<uintptr_t>(ints_ptr);
+    EXPECT_EQ(ints_address % 4, 1);
+
+    const auto* string_field_array =
+            dynamic_cast<const 
arrow::StringArray*>(concrete_array->field(1).get());
+    const auto* string_values_ptr = string_field_array->value_data()->data();
+    uintptr_t string_values_address = 
reinterpret_cast<uintptr_t>(string_values_ptr);
+    EXPECT_EQ(string_values_address % 4, 1);
+
+    // 5. Test read_column_from_arrow against the unaligned Arrow array.
+    std::vector<ColumnPtr> vector_columns;
+    vector_columns.emplace_back(ColumnInt32::create());
+    vector_columns.emplace_back(ColumnString::create());
+    auto ser_col = ColumnStruct::create(vector_columns);
+    cctz::time_zone tz;
+    DataTypeSerDeSPtrs elem_serdes = {serde_int32, serde_str};
+    Strings field_names = {"int_field", "string_field"};
+
+    auto serde_struct = std::make_shared<DataTypeStructSerDe>(elem_serdes, 
field_names);
+
+    auto st = serde_struct->read_column_from_arrow(*ser_col, arr.get(), 0, 
num_elements, tz);
+    EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to