This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 40379b7a8eb branch-4.0: [fix](datatype)Fix unaligned memory access in
read_column_from_arrow #58002 (#58202)
40379b7a8eb is described below
commit 40379b7a8eb2d0520cfbb147baf400cd823fa67d
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Nov 21 09:40:37 2025 +0800
branch-4.0: [fix](datatype)Fix unaligned memory access in
read_column_from_arrow #58002 (#58202)
Cherry-picked from #58002
Co-authored-by: Chen768959 <[email protected]>
---
.../vec/data_types/serde/data_type_array_serde.cpp | 16 ++-
.../serde/data_type_datetimev2_serde.cpp | 7 +-
.../data_types/serde/data_type_datev2_serde.cpp | 8 +-
.../data_types/serde/data_type_decimal_serde.cpp | 6 +-
.../data_types/serde/data_type_number_serde.cpp | 11 +-
.../data_types/serde/data_type_string_serde.cpp | 15 ++-
.../serde/data_type_serde_datetime_v2_test.cpp | 66 ++++++++++-
.../serde/data_type_serde_decimal_test.cpp | 48 +++++++-
.../serde/data_type_serde_number_test.cpp | 53 ++++++++-
.../serde/data_type_serde_string_test.cpp | 129 ++++++++++++++++++++-
10 files changed, 343 insertions(+), 16 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp
b/be/src/vec/data_types/serde/data_type_array_serde.cpp
index 80082da89ef..431fc153a4d 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp
@@ -326,11 +326,21 @@ Status
DataTypeArraySerDe::read_column_from_arrow(IColumn& column, const arrow::
auto arrow_offsets_array = concrete_array->offsets();
auto* arrow_offsets =
dynamic_cast<arrow::Int32Array*>(arrow_offsets_array.get());
auto prev_size = offsets_data.back();
- auto arrow_nested_start_offset = arrow_offsets->Value(start);
- auto arrow_nested_end_offset = arrow_offsets->Value(end);
+ const auto* base_offsets_ptr = reinterpret_cast<const
uint8_t*>(arrow_offsets->raw_values());
+ const size_t offset_element_size = sizeof(int32_t);
+ int32_t arrow_nested_start_offset = 0;
+ int32_t arrow_nested_end_offset = 0;
+ const uint8_t* start_offset_ptr = base_offsets_ptr + start *
offset_element_size;
+ const uint8_t* end_offset_ptr = base_offsets_ptr + end *
offset_element_size;
+ memcpy(&arrow_nested_start_offset, start_offset_ptr, offset_element_size);
+ memcpy(&arrow_nested_end_offset, end_offset_ptr, offset_element_size);
+
for (auto i = start + 1; i < end + 1; ++i) {
+ int32_t current_offset = 0;
+ const uint8_t* current_offset_ptr = base_offsets_ptr + i *
offset_element_size;
+ memcpy(¤t_offset, current_offset_ptr, offset_element_size);
// convert to doris offset, start from offsets.back()
- offsets_data.emplace_back(prev_size + arrow_offsets->Value(i) -
arrow_nested_start_offset);
+ offsets_data.emplace_back(prev_size + current_offset -
arrow_nested_start_offset);
}
return nested_serde->read_column_from_arrow(
column_array.get_data(), concrete_array->values().get(),
arrow_nested_start_offset,
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
index 6f467d83076..f44c5d38c40 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
@@ -397,8 +397,13 @@ Status
DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column,
type->unit());
}
}
+ const auto* base_ptr = reinterpret_cast<const
uint8_t*>(concrete_array->raw_values());
+ const size_t element_size = sizeof(int64_t);
for (auto value_i = start; value_i < end; ++value_i) {
- auto utc_epoch =
static_cast<UInt64>(concrete_array->Value(value_i));
+ int64_t date_value = 0;
+ const uint8_t* raw_byte_ptr = base_ptr + value_i * element_size;
+ memcpy(&date_value, raw_byte_ptr, element_size);
+ auto utc_epoch = static_cast<UInt64>(date_value);
DateV2Value<DateTimeV2ValueType> v;
// convert second
diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
index d6e77bc9b79..620ac636073 100644
--- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
@@ -114,9 +114,15 @@ Status
DataTypeDateV2SerDe::read_column_from_arrow(IColumn& column, const arrow:
const cctz::time_zone& ctz)
const {
auto& col_data = static_cast<ColumnDateV2&>(column).get_data();
const auto* concrete_array = dynamic_cast<const
arrow::Date32Array*>(arrow_array);
+ const auto* base_ptr = reinterpret_cast<const
uint8_t*>(concrete_array->raw_values());
+ const size_t element_size = sizeof(int32_t);
for (auto value_i = start; value_i < end; ++value_i) {
+ int32_t date_value = 0;
+ const uint8_t* raw_byte_ptr = base_ptr + value_i * element_size;
+ memcpy(&date_value, raw_byte_ptr, element_size);
+
DateV2Value<DateV2ValueType> v;
- v.get_date_from_daynr(concrete_array->Value(value_i) + date_threshold);
+ v.get_date_from_daynr(date_value + date_threshold);
col_data.emplace_back(binary_cast<DateV2Value<DateV2ValueType>,
UInt32>(v));
}
return Status::OK();
diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
index 0af0c5acf7d..464bf73abf8 100644
--- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
@@ -337,8 +337,10 @@ Status
DataTypeDecimalSerDe<T>::read_column_from_arrow(IColumn& column,
} else if constexpr (T == TYPE_DECIMAL32 || T == TYPE_DECIMAL64 || T ==
TYPE_DECIMAL128I) {
const auto* concrete_array = dynamic_cast<const
arrow::DecimalArray*>(arrow_array);
for (auto value_i = start; value_i < end; ++value_i) {
- column_data.emplace_back(
- *reinterpret_cast<const
FieldType*>(concrete_array->Value(value_i)));
+ const auto* value = concrete_array->Value(value_i);
+ FieldType decimal_value = FieldType {};
+ memcpy(&decimal_value, value, sizeof(FieldType));
+ column_data.emplace_back(decimal_value);
}
} else if constexpr (T == TYPE_DECIMAL256) {
const auto* concrete_array = dynamic_cast<const
arrow::Decimal256Array*>(arrow_array);
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index d483ed4c647..d35f2063512 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -219,10 +219,17 @@ Status
DataTypeNumberSerDe<T>::read_column_from_arrow(IColumn& column,
const auto* concrete_array = dynamic_cast<const
arrow::StringArray*>(arrow_array);
std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
+ const auto* offsets_data = concrete_array->value_offsets()->data();
+ const size_t offset_size = sizeof(int32_t);
for (size_t offset_i = start; offset_i < end; ++offset_i) {
if (!concrete_array->IsNull(offset_i)) {
- const auto* raw_data = buffer->data() +
concrete_array->value_offset(offset_i);
- const auto raw_data_len =
concrete_array->value_length(offset_i);
+ int32_t start_offset = 0;
+ int32_t end_offset = 0;
+ memcpy(&start_offset, offsets_data + offset_i * offset_size,
offset_size);
+ memcpy(&end_offset, offsets_data + (offset_i + 1) *
offset_size, offset_size);
+
+ const auto* raw_data = buffer->data() + start_offset;
+ const auto raw_data_len = end_offset - start_offset;
if (raw_data_len == 0) {
col_data.emplace_back(Int128()); // Int128() is NULL
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.cpp
b/be/src/vec/data_types/serde/data_type_string_serde.cpp
index 9fe0d61c28c..3b80d707151 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_string_serde.cpp
@@ -262,12 +262,21 @@ Status
DataTypeStringSerDeBase<ColumnType>::read_column_from_arrow(
arrow_array->type_id() == arrow::Type::BINARY) {
const auto* concrete_array = dynamic_cast<const
arrow::BinaryArray*>(arrow_array);
std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
+ const uint8_t* offsets_data = concrete_array->value_offsets()->data();
+ const size_t offset_size = sizeof(int32_t);
for (auto offset_i = start; offset_i < end; ++offset_i) {
if (!concrete_array->IsNull(offset_i)) {
- const auto* raw_data = buffer->data() +
concrete_array->value_offset(offset_i);
+ int32_t start_offset = 0;
+ int32_t end_offset = 0;
+ memcpy(&start_offset, offsets_data + offset_i * offset_size,
offset_size);
+ memcpy(&end_offset, offsets_data + (offset_i + 1) *
offset_size, offset_size);
+
+ int32_t length = end_offset - start_offset;
+ const auto* raw_data = buffer->data() + start_offset;
+
assert_cast<ColumnType&>(column).insert_data(
- (char*)raw_data,
concrete_array->value_length(offset_i));
+ reinterpret_cast<const char*>(raw_data), length);
} else {
assert_cast<ColumnType&>(column).insert_default();
}
@@ -450,4 +459,4 @@ template class DataTypeStringSerDeBase<ColumnString>;
template class DataTypeStringSerDeBase<ColumnString64>;
template class DataTypeStringSerDeBase<ColumnFixedLengthObject>;
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_datetime_v2_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_datetime_v2_test.cpp
index 57e0dd76a7b..6226d5cfd4f 100644
--- a/be/test/vec/data_types/serde/data_type_serde_datetime_v2_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_datetime_v2_test.cpp
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
@@ -221,4 +223,66 @@ TEST_F(DataTypeDateTimeV2SerDeTest, serdes) {
test_func(*serde_time_v2_0, column_time_v2_0);
}
-} // namespace doris::vectorized
\ No newline at end of file
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeDateTimeV2SerDeTest, ArrowMemNotAlignedDate) {
+ // 1.Prepare the data.
+ std::vector<int32_t> dates = {0, 365, 1000, 5000, 10000};
+ const int64_t num_elements = dates.size();
+ const int64_t element_size = sizeof(int32_t);
+
+ // 2.Create an unaligned memory buffer.
+ std::vector<uint8_t> data_storage(num_elements * element_size + 10);
+ uint8_t* unaligned_data = data_storage.data() + 1;
+
+ // 3.Copy data to unaligned memory
+ for (size_t i = 0; i < dates.size(); ++i) {
+ memcpy(unaligned_data + i * element_size, &dates[i], element_size);
+ }
+
+ // 4. Create Arrow array with unaligned memory
+ auto unaligned_buffer = arrow::Buffer::Wrap(unaligned_data, num_elements *
element_size);
+ auto arr = std::make_shared<arrow::Date32Array>(num_elements,
unaligned_buffer);
+ const auto* raw_values_ptr = arr->raw_values();
+ uintptr_t address = reinterpret_cast<uintptr_t>(raw_values_ptr);
+ EXPECT_EQ(address % 4, 1);
+
+ // 5.Test read_column_from_arrow
+ cctz::time_zone tz;
+ auto st = serde_date_v2->read_column_from_arrow(*column_date_v2,
arr.get(), 0, 1, tz);
+ EXPECT_TRUE(st.ok());
+}
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeDateTimeV2SerDeTest, ArrowMemNotAlignedDateTime) {
+ // 1.Prepare the data.
+ std::vector<int64_t> timestamps = {0, 86400000000, 31536000000000,
100000000000, 500000000000};
+ const int64_t num_elements = timestamps.size();
+ const int64_t element_size = sizeof(int64_t);
+
+ // 2.Create an unaligned memory buffer.
+ std::vector<uint8_t> data_storage(num_elements * element_size + 10);
+ uint8_t* unaligned_data = data_storage.data() + 1;
+
+ // 3. Copy data to unaligned memory
+ for (size_t i = 0; i < timestamps.size(); ++i) {
+ memcpy(unaligned_data + i * element_size, ×tamps[i],
element_size);
+ }
+
+ // 4. Create Arrow array with unaligned memory
+ auto unaligned_buffer = arrow::Buffer::Wrap(unaligned_data, num_elements *
element_size);
+ auto timestamp_type = arrow::timestamp(arrow::TimeUnit::MICRO);
+ auto arr =
+ std::make_shared<arrow::TimestampArray>(timestamp_type,
num_elements, unaligned_buffer);
+
+ const auto* raw_values_ptr = arr->raw_values();
+ uintptr_t address = reinterpret_cast<uintptr_t>(raw_values_ptr);
+ EXPECT_EQ(address % 4, 1);
+
+ // 5.Test read_column_from_arrow
+ cctz::time_zone tz;
+ auto st =
+ serde_datetime_v2_6->read_column_from_arrow(*column_datetime_v2_6,
arr.get(), 0, 1, tz);
+ EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_decimal_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_decimal_test.cpp
index 994a59aacbc..697272ef847 100644
--- a/be/test/vec/data_types/serde/data_type_serde_decimal_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_decimal_test.cpp
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
@@ -291,4 +293,48 @@ TEST_F(DataTypeDecimalSerDeTest, serdes) {
test_func(*serde_decimal128v2, column_decimal128_v2);
}
-} // namespace doris::vectorized
\ No newline at end of file
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeDecimalSerDeTest, ArrowMemNotAligned) {
+ // 1.Prepare the data.
+ arrow::Decimal128Builder builder(arrow::decimal(38, 30));
+ std::vector<std::string> decimal_strings = {"12345.67", "89.10",
"1112.13", "1415.16",
+ "1718.19"};
+
+ for (const auto& str : decimal_strings) {
+ EXPECT_TRUE(builder.Append(arrow::Decimal128(str)).ok());
+ }
+
+ std::shared_ptr<arrow::Array> aligned_array;
+ EXPECT_TRUE(builder.Finish(&aligned_array).ok());
+ auto decimal_array =
std::static_pointer_cast<arrow::DecimalArray>(aligned_array);
+
+ // 2.Create an unaligned memory buffer.
+ const int64_t num_elements = decimal_array->length();
+ const int64_t element_size = decimal_array->byte_width();
+
+ std::vector<uint8_t> data_storage(num_elements * element_size + 10);
+ uint8_t* unaligned_data = data_storage.data() + 1;
+
+ // 3. Copy data to unaligned memory
+ const uint8_t* original_data = decimal_array->raw_values();
+ memcpy(unaligned_data, original_data, num_elements * element_size);
+
+ // 4. Create Arrow array with unaligned memory
+ auto unaligned_buffer = arrow::Buffer::Wrap(unaligned_data, num_elements *
element_size);
+
+ auto arr = std::make_shared<arrow::DecimalArray>(arrow::decimal(38, 30),
num_elements,
+ unaligned_buffer);
+
+ const auto* raw_values_ptr = arr->raw_values();
+ uintptr_t address = reinterpret_cast<uintptr_t>(raw_values_ptr);
+ EXPECT_EQ(address % 4, 1);
+
+ // 5.Test read_column_from_arrow
+ cctz::time_zone tz;
+ auto st =
serde_decimal128v3_2->read_column_from_arrow(*column_decimal128v3_2, arr.get(),
0, 1,
+ tz);
+ EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_number_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_number_test.cpp
index b0a1a58a9e5..360671eaa97 100644
--- a/be/test/vec/data_types/serde/data_type_serde_number_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_number_test.cpp
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
@@ -291,4 +293,53 @@ TEST_F(DataTypeNumberSerDeTest, serdes) {
test_func(*serde_int128, column_int128);
test_func(*serde_uint8, column_uint8);
}
-} // namespace doris::vectorized
\ No newline at end of file
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeNumberSerDeTest, ArrowMemNotAligned) {
+ // 1.Prepare the data.
+ std::vector<std::string> strings = {"9223372036854775807",
"9223372036854775806",
+ "9223372036854775805",
"9223372036854775804",
+ "9223372036854775803"};
+
+ int32_t total_length = 0;
+ std::vector<int32_t> offsets = {0};
+ for (const auto& str : strings) {
+ total_length += static_cast<int32_t>(str.length());
+ offsets.push_back(total_length);
+ }
+
+ // 2.Create an unaligned memory buffer.
+ std::vector<uint8_t> value_storage(total_length + 10);
+ std::vector<uint8_t> offset_storage((strings.size() + 1) * sizeof(int32_t)
+ 10);
+
+ uint8_t* unaligned_value_data = value_storage.data() + 1;
+ uint8_t* unaligned_offset_data = offset_storage.data() + 1;
+
+ // 3. Copy data to unaligned memory
+ int32_t current_pos = 0;
+ for (size_t i = 0; i < strings.size(); ++i) {
+ memcpy(unaligned_value_data + current_pos, strings[i].data(),
strings[i].length());
+ current_pos += strings[i].length();
+ }
+
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ memcpy(unaligned_offset_data + i * sizeof(int32_t), &offsets[i],
sizeof(int32_t));
+ }
+
+ // 4. Create Arrow array with unaligned memory
+ auto value_buffer = arrow::Buffer::Wrap(unaligned_value_data,
total_length);
+ auto offset_buffer =
+ arrow::Buffer::Wrap(unaligned_offset_data, offsets.size() *
sizeof(int32_t));
+ auto arr = std::make_shared<arrow::StringArray>(strings.size(),
offset_buffer, value_buffer);
+
+ const auto* offsets_ptr = arr->raw_value_offsets();
+ uintptr_t address = reinterpret_cast<uintptr_t>(offsets_ptr);
+ EXPECT_EQ((reinterpret_cast<uintptr_t>(address) % 4), 1);
+
+ // 5.Test read_column_from_arrow
+ cctz::time_zone tz;
+ auto st = serde_int128->read_column_from_arrow(*column_int128, arr.get(),
0, 1, tz);
+ EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_string_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_string_test.cpp
index 125d870d5b8..06e8f219c7a 100644
--- a/be/test/vec/data_types/serde/data_type_serde_string_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_string_test.cpp
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
@@ -32,12 +34,14 @@
#include "runtime/types.h"
#include "testutil/test_util.h"
#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
#include "vec/common/assert_cast.h"
#include "vec/core/field.h"
#include "vec/core/types.h"
#include "vec/data_types/common_data_type_serder_test.h"
#include "vec/data_types/common_data_type_test.h"
#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_string.h"
@@ -195,4 +199,127 @@ TEST_F(DataTypeStringSerDeTest, serdes) {
test_func(*serde_str, column_str32);
}
-} // namespace doris::vectorized
\ No newline at end of file
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeStringSerDeTest, ArrowMemNotAligned) {
+ // 1.Prepare the data.
+ std::vector<std::string> strings = {"hello", "world!", "test",
"unaligned", "memory"};
+
+ int32_t total_length = 0;
+ std::vector<int32_t> offsets = {0};
+ for (const auto& str : strings) {
+ total_length += static_cast<int32_t>(str.length());
+ offsets.push_back(total_length);
+ }
+
+ // 2.Create an unaligned memory buffer.
+ std::vector<uint8_t> value_storage(total_length + 10);
+ std::vector<uint8_t> offset_storage((strings.size() + 1) * sizeof(int32_t)
+ 10);
+
+ uint8_t* unaligned_value_data = value_storage.data() + 1;
+ uint8_t* unaligned_offset_data = offset_storage.data() + 1;
+
+ // 3. Copy data to unaligned memory
+ int32_t current_pos = 0;
+ for (size_t i = 0; i < strings.size(); ++i) {
+ memcpy(unaligned_value_data + current_pos, strings[i].data(),
strings[i].length());
+ current_pos += strings[i].length();
+ }
+
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ memcpy(unaligned_offset_data + i * sizeof(int32_t), &offsets[i],
sizeof(int32_t));
+ }
+
+ // 4. Create Arrow array with unaligned memory
+ auto value_buffer = arrow::Buffer::Wrap(unaligned_value_data,
total_length);
+ auto offset_buffer =
+ arrow::Buffer::Wrap(unaligned_offset_data, offsets.size() *
sizeof(int32_t));
+ auto arr = std::make_shared<arrow::StringArray>(strings.size(),
offset_buffer, value_buffer);
+
+ const auto* offsets_ptr = arr->raw_value_offsets();
+ uintptr_t address = reinterpret_cast<uintptr_t>(offsets_ptr);
+ EXPECT_EQ((reinterpret_cast<uintptr_t>(address) % 4), 1);
+
+ // 5.Test read_column_from_arrow
+ cctz::time_zone tz;
+ auto st = serde_str->read_column_from_arrow(*column_str32, arr.get(), 0,
1, tz);
+ EXPECT_TRUE(st.ok());
+}
+
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeStringSerDeTest, ArrowMemNotAlignedNestedArr) {
+ // 1.Prepare the data.
+ std::vector<std::string> string_data = {"hello", "world", "test", "a",
"b", "c"};
+ std::vector<int32_t> string_offsets = {0};
+ int32_t current_offset = 0;
+ for (const auto& str : string_data) {
+ current_offset += static_cast<int32_t>(str.length());
+ string_offsets.push_back(current_offset);
+ }
+
+ std::vector<int32_t> list_offsets = {0, 2, 3, 6, 6};
+ std::vector<uint8_t> value_data;
+ for (const auto& str : string_data) {
+ value_data.insert(value_data.end(), str.begin(), str.end());
+ }
+
+ const int64_t num_lists = list_offsets.size() - 1;
+ const int64_t offset_element_size = sizeof(int32_t);
+
+ // 2.Create an unaligned memory buffer.
+ std::vector<uint8_t> list_offset_storage(list_offsets.size() *
offset_element_size + 10);
+ uint8_t* unaligned_list_offsets = list_offset_storage.data() + 1;
+
+ std::vector<uint8_t> string_offset_storage(string_offsets.size() *
offset_element_size + 10);
+ uint8_t* unaligned_string_offsets = string_offset_storage.data() + 1;
+
+ std::vector<uint8_t> value_storage(value_data.size() + 10);
+ uint8_t* unaligned_values = value_storage.data() + 1;
+
+ // 3. Copy data to unaligned memory
+ for (size_t i = 0; i < list_offsets.size(); ++i) {
+ memcpy(unaligned_list_offsets + i * offset_element_size,
&list_offsets[i],
+ offset_element_size);
+ }
+
+ for (size_t i = 0; i < string_offsets.size(); ++i) {
+ memcpy(unaligned_string_offsets + i * offset_element_size,
&string_offsets[i],
+ offset_element_size);
+ }
+
+ memcpy(unaligned_values, value_data.data(), value_data.size());
+
+ // 4. Create Arrow array with unaligned memory
+ auto value_buffer = arrow::Buffer::Wrap(unaligned_values,
value_data.size());
+ auto string_offsets_buffer =
+ arrow::Buffer::Wrap(unaligned_string_offsets,
string_offsets.size() * sizeof(int32_t));
+ auto string_array =
std::make_shared<arrow::StringArray>(string_offsets.size() - 1,
+
string_offsets_buffer, value_buffer);
+ auto list_offsets_buffer =
+ arrow::Buffer::Wrap(unaligned_list_offsets, list_offsets.size() *
offset_element_size);
+ auto list_offsets_array =
+ std::make_shared<arrow::Int32Array>(list_offsets.size(),
list_offsets_buffer);
+
+ auto arr = std::make_shared<arrow::ListArray>(arrow::list(arrow::utf8()),
num_lists,
+ list_offsets_buffer,
string_array);
+
+ const auto* concrete_array = dynamic_cast<const
arrow::ListArray*>(arr.get());
+ auto arrow_offsets_array = concrete_array->offsets();
+ auto* arrow_offsets =
dynamic_cast<arrow::Int32Array*>(arrow_offsets_array.get());
+
+ const auto* offsets_ptr = arrow_offsets->raw_values();
+ uintptr_t offsets_address = reinterpret_cast<uintptr_t>(offsets_ptr);
+ EXPECT_EQ(offsets_address % 4, 1);
+
+ const auto* values_ptr = string_array->value_data()->data();
+ uintptr_t values_address = reinterpret_cast<uintptr_t>(values_ptr);
+ EXPECT_EQ(values_address % 4, 1);
+
+ // 5.Test read_column_from_arrow
+ auto ser_col = ColumnArray::create(ColumnString::create(),
ColumnOffset64::create());
+ cctz::time_zone tz;
+ auto serde_list = std::make_shared<DataTypeArraySerDe>(serde_str);
+ auto st = serde_list->read_column_from_arrow(*ser_col, arr.get(), 0, 1,
tz);
+ EXPECT_TRUE(st.ok());
+}
+
+} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]