This is an automated email from the ASF dual-hosted git repository.
jacktengg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 07b497ab219 [fix](be) Avoid UB from unaligned __int128 dereference
(#63703)
07b497ab219 is described below
commit 07b497ab2192a8eb3ccfe1e0c41a48123c646e1e
Author: TengJianPing <[email protected]>
AuthorDate: Tue Jun 2 16:22:25 2026 +0800
[fix](be) Avoid UB from unaligned __int128 dereference (#63703)
Issue Number: close #xxx
Problem Summary: Several BE call sites obtained a byte pointer from
StringRef::data / Slice::data / a generic const void* (e.g. ORC pushdown
literal value, JSONB serde, runtime filter literal builder, meta_tool
column dump) and dereferenced it as `__int128*` / `int128_t*` /
`DecimalV2Value*` / `Decimal<int128_t>*`.
Because those buffers carry no 16-byte alignment guarantee, the load is
undefined behavior. On alignment-strict targets (some aarch64 / SPARC
builds) and under UBSan -fsanitize=alignment the read can SIGBUS, abort,
or - with SSE codegen for __int128 - fault on a movdqa instruction.
Sites fixed:
- be/src/core/data_type_serde/data_type_number_serde.cpp (LARGEINT
JSONB)
- be/src/format/orc/vorc_reader.cpp (TYPE_DECIMALV2 / TYPE_DECIMAL128I
literal conversion for ORC predicate push-down)
- be/src/tools/meta_tool.cpp (LARGEINT and DECIMAL128I dump)
- be/src/exprs/vexpr.h create_texpr_literal_node<>: TYPE_LARGEINT,
TYPE_DECIMALV2 and TYPE_DECIMAL128I literal construction
All these sites now load the 16-byte value through the
`unaligned_load<T>` helper from `util/unaligned.h` into a local __int128
/ DecimalV2Value / Decimal<int128_t> before use. Modern compilers reduce
the helper's memcpy to a load, so there is no measurable performance
impact, but the semantics become well-defined regardless of the
producer's alignment.
Note: be/src/runtime/fold_constant_executor.cpp also contains unaligned
__int128 reads for TYPE_LARGEINT and TYPE_DECIMALV2 in `_get_result`,
but that branch is unreachable under the current Nereids planner (which
always sets `is_nereids = true` and uses `be_exec_version >= 4`, taking
the protobuf serde path). It is left untouched here to keep the diff
focused; the dead branch can be cleaned up separately.
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
.../data_type_serde/data_type_number_serde.cpp | 5 +-
be/src/exprs/vexpr.h | 17 ++--
be/src/format/orc/vorc_reader.cpp | 6 +-
be/src/tools/meta_tool.cpp | 7 +-
be/test/exprs/vexpr_unaligned_int128_test.cpp | 107 +++++++++++++++++++++
.../largeint/test_int128_unaligned_access.out | 42 ++++++++
.../largeint/test_int128_unaligned_access.groovy | 88 +++++++++++++++++
7 files changed, 261 insertions(+), 11 deletions(-)
diff --git a/be/src/core/data_type_serde/data_type_number_serde.cpp
b/be/src/core/data_type_serde/data_type_number_serde.cpp
index 39e9c0726c4..a8e379578d0 100644
--- a/be/src/core/data_type_serde/data_type_number_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_number_serde.cpp
@@ -40,6 +40,7 @@
#include "util/jsonb_writer.h"
#include "util/mysql_global.h"
#include "util/to_string.h"
+#include "util/unaligned.h"
namespace doris {
// Type map的基本结构
@@ -708,7 +709,9 @@ void DataTypeNumberSerDe<T>::write_one_cell_to_jsonb(const
IColumn& column,
int64_t val = *reinterpret_cast<const int64_t*>(data_ref.data);
result.writeInt64(val);
} else if constexpr (T == TYPE_LARGEINT) {
- __int128_t val = *reinterpret_cast<const __int128_t*>(data_ref.data);
+ // data_ref.data may not be 16-byte aligned; dereferencing __int128*
+ // directly is UB and may SIGBUS on alignment-strict platforms.
+ __int128_t val = unaligned_load<__int128_t>(data_ref.data);
result.writeInt128(val);
} else if constexpr (T == TYPE_FLOAT) {
float val = *reinterpret_cast<const float*>(data_ref.data);
diff --git a/be/src/exprs/vexpr.h b/be/src/exprs/vexpr.h
index 1576686fcc7..79f3485b3be 100644
--- a/be/src/exprs/vexpr.h
+++ b/be/src/exprs/vexpr.h
@@ -51,6 +51,7 @@
#include "storage/index/index_reader.h"
#include "storage/index/inverted/inverted_index_reader.h"
#include "util/date_func.h"
+#include "util/unaligned.h"
namespace doris {
class BitmapFilterFuncBase;
@@ -489,10 +490,11 @@ Status create_texpr_literal_node(const void* data,
TExprNode* node, int precisio
(*node).__set_int_literal(intLiteral);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_BIGINT));
} else if constexpr (T == TYPE_LARGEINT) {
- const auto* origin_value = reinterpret_cast<const int128_t*>(data);
+ // data may not be 16-byte aligned; use unaligned_load to avoid UB.
+ int128_t origin_value = unaligned_load<int128_t>(data);
(*node).__set_node_type(TExprNodeType::LARGE_INT_LITERAL);
TLargeIntLiteral large_int_literal;
- large_int_literal.__set_value(LargeIntValue::to_string(*origin_value));
+ large_int_literal.__set_value(LargeIntValue::to_string(origin_value));
(*node).__set_large_int_literal(large_int_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_LARGEINT));
} else if constexpr ((T == TYPE_DATE) || (T == TYPE_DATETIME)) {
@@ -536,10 +538,12 @@ Status create_texpr_literal_node(const void* data,
TExprNode* node, int precisio
(*node).__set_node_type(TExprNodeType::DATE_LITERAL);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_TIMESTAMPTZ,
precision, scale));
} else if constexpr (T == TYPE_DECIMALV2) {
- const auto* origin_value = reinterpret_cast<const
DecimalV2Value*>(data);
+ // data may not be 16-byte aligned (DecimalV2Value stores int128_t);
+ // use unaligned_load to avoid UB.
+ DecimalV2Value origin_value = unaligned_load<DecimalV2Value>(data);
(*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
TDecimalLiteral decimal_literal;
- decimal_literal.__set_value(origin_value->to_string());
+ decimal_literal.__set_value(origin_value.to_string());
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMALV2,
precision, scale));
} else if constexpr (T == TYPE_DECIMAL32) {
@@ -557,7 +561,8 @@ Status create_texpr_literal_node(const void* data,
TExprNode* node, int precisio
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL64,
precision, scale));
} else if constexpr (T == TYPE_DECIMAL128I) {
- const auto* origin_value = reinterpret_cast<const
Decimal<int128_t>*>(data);
+ // data may not be 16-byte aligned; use unaligned_load to avoid UB.
+ Decimal<int128_t> origin_value =
unaligned_load<Decimal<int128_t>>(data);
(*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
TDecimalLiteral decimal_literal;
// e.g. For a decimal(26,6) column, the initial value of the _min of
the MinMax RF
@@ -567,7 +572,7 @@ Status create_texpr_literal_node(const void* data,
TExprNode* node, int precisio
// error when casting string back to decimal later.
// TODO: this is a temporary solution, the best solution is to produce
the
// right min max value at the producer side.
- decimal_literal.__set_value(origin_value->to_string(precision, scale));
+ decimal_literal.__set_value(origin_value.to_string(precision, scale));
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL128I,
precision, scale));
} else if constexpr (T == TYPE_DECIMAL256) {
diff --git a/be/src/format/orc/vorc_reader.cpp
b/be/src/format/orc/vorc_reader.cpp
index 47eb9a23f4f..f41b4d0a7e5 100644
--- a/be/src/format/orc/vorc_reader.cpp
+++ b/be/src/format/orc/vorc_reader.cpp
@@ -102,6 +102,7 @@
#include "storage/utils.h"
#include "util/slice.h"
#include "util/timezone_utils.h"
+#include "util/unaligned.h"
namespace doris {
class RuntimeState;
@@ -781,7 +782,8 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const
orc::Type* type,
case orc::TypeKind::DECIMAL: {
int128_t decimal_value;
if constexpr (primitive_type == TYPE_DECIMALV2) {
- decimal_value = *reinterpret_cast<const int128_t*>(value);
+ // value may not be 16-byte aligned; use unaligned_load to
avoid UB.
+ decimal_value = unaligned_load<int128_t>(value);
precision = DecimalV2Value::PRECISION;
scale = DecimalV2Value::SCALE;
} else if constexpr (primitive_type == TYPE_DECIMAL32) {
@@ -789,7 +791,7 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const
orc::Type* type,
} else if constexpr (primitive_type == TYPE_DECIMAL64) {
decimal_value = *((int64_t*)value);
} else if constexpr (primitive_type == TYPE_DECIMAL128I) {
- decimal_value = *((int128_t*)value);
+ decimal_value = unaligned_load<int128_t>(value);
} else {
return std::make_tuple(false, orc::Literal(false));
}
diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp
index 577e70ff053..18bc27c1d83 100644
--- a/be/src/tools/meta_tool.cpp
+++ b/be/src/tools/meta_tool.cpp
@@ -63,6 +63,7 @@
#include "storage/tablet/tablet_schema_cache.h"
#include "storage/types.h"
#include "util/coding.h"
+#include "util/unaligned.h"
using doris::DataDir;
using doris::StorageEngine;
@@ -472,7 +473,8 @@ std::string format_column_value(const doris::IColumn&
column, size_t row,
// LargeInt is stored as Int128
const StringRef& data = column.get_data_at(row);
if (data.size == sizeof(__int128)) {
- __int128 val = *reinterpret_cast<const __int128*>(data.data);
+ // data.data may not be 16-byte aligned; use unaligned_load to
avoid UB.
+ __int128 val = unaligned_load<__int128>(data.data);
return doris::LargeIntValue::to_string(val);
}
return "<invalid largeint>";
@@ -556,7 +558,8 @@ std::string format_column_value(const doris::IColumn&
column, size_t row,
case FieldType::OLAP_FIELD_TYPE_DECIMAL128I: {
const StringRef& data = column.get_data_at(row);
if (data.size == sizeof(__int128)) {
- __int128 val = *reinterpret_cast<const __int128*>(data.data);
+ // data.data may not be 16-byte aligned; use unaligned_load to
avoid UB.
+ __int128 val = unaligned_load<__int128>(data.data);
return doris::LargeIntValue::to_string(val);
}
return "<invalid decimal>";
diff --git a/be/test/exprs/vexpr_unaligned_int128_test.cpp
b/be/test/exprs/vexpr_unaligned_int128_test.cpp
new file mode 100644
index 00000000000..d34a6da7f94
--- /dev/null
+++ b/be/test/exprs/vexpr_unaligned_int128_test.cpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression test for unaligned __int128 dereference UB.
+//
+// Several call sites used to dereference a `__int128*` produced from a
+// `StringRef::data` (or similar byte pointer) without any alignment
+// guarantee. This file pins the contract that the helpers that build
+// literal TExprNodes from a raw `const void* data` pointer must accept
+// pointers that are *not* 16-byte aligned, since on alignment-strict
+// platforms (e.g. some aarch64 / SPARC builds, and UBSan
+// -fsanitize=alignment) such reads are undefined behavior and may
+// SIGBUS.
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "core/value/decimalv2_value.h"
+#include "core/value/large_int_value.h"
+#include "exprs/vexpr.h"
+
+namespace doris {
+
+// Resizes `buf` and returns a pointer into it that is exactly 1 byte past a 16-byte boundary.
+static char* misaligned_slot(std::vector<char>& buf, std::size_t bytes) {
+ buf.assign(bytes + 32, 0);
+ char* base = buf.data();
+ // Advance until (address & 0xF) == 1; at most 15 steps, within the 32 spare bytes.
+ std::size_t off = 0;
+ while ((reinterpret_cast<std::uintptr_t>(base + off) & 0xF) != 1) {
+ ++off;
+ }
+ return base + off;
+}
+
+TEST(UnalignedInt128Test, LargeIntLiteralFromUnalignedBuffer) {
+ std::vector<char> buf;
+ char* p = misaligned_slot(buf, sizeof(__int128));
+ ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+ // A positive value just below 2^126 that uses both 64-bit halves.
+ __int128 expected = (static_cast<__int128>(0x3FFFFFFFFFFFFFFFLL) << 64) |
+ static_cast<__int128>(0xFEEDFACECAFEBEEFULL);
+ std::memcpy(p, &expected, sizeof(expected));
+
+ TExprNode node;
+ Status st = create_texpr_literal_node<TYPE_LARGEINT>(p, &node);
+ ASSERT_TRUE(st.ok()) << st;
+ ASSERT_TRUE(node.__isset.large_int_literal);
+ EXPECT_EQ(node.large_int_literal.value,
LargeIntValue::to_string(expected));
+}
+
+TEST(UnalignedInt128Test, Decimal128ILiteralFromUnalignedBuffer) {
+ std::vector<char> buf;
+ char* p = misaligned_slot(buf, sizeof(__int128));
+ ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+ // Decimal(20, 4) value: 1234567890123456.7891 (raw 12345678901234567891)
+ __int128 raw = static_cast<__int128>(1234567890123456789LL) * 10 + 1;
+ std::memcpy(p, &raw, sizeof(raw));
+
+ TExprNode node;
+ Status st = create_texpr_literal_node<TYPE_DECIMAL128I>(p, &node,
/*precision=*/20,
+ /*scale=*/4);
+ ASSERT_TRUE(st.ok()) << st;
+ ASSERT_TRUE(node.__isset.decimal_literal);
+ // Sanity: the formatted value must contain the integer part "1234567890123456".
+ EXPECT_NE(node.decimal_literal.value.find("1234567890123456"),
std::string::npos)
+ << node.decimal_literal.value;
+}
+
+TEST(UnalignedInt128Test, DecimalV2LiteralFromUnalignedBuffer) {
+ std::vector<char> buf;
+ char* p = misaligned_slot(buf, sizeof(DecimalV2Value));
+ ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+ DecimalV2Value src;
+ // 12345.6789 * 1e9 = 12345678900000 (DecimalV2 internal scale = 9).
+ src.set_value(static_cast<__int128>(12345678900000LL));
+ std::memcpy(p, &src, sizeof(src));
+
+ TExprNode node;
+ Status st = create_texpr_literal_node<TYPE_DECIMALV2>(p, &node,
/*precision=*/27,
+ /*scale=*/9);
+ ASSERT_TRUE(st.ok()) << st;
+ ASSERT_TRUE(node.__isset.decimal_literal);
+ EXPECT_EQ(node.decimal_literal.value, src.to_string());
+}
+
+} // namespace doris
diff --git
a/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out
b/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out
new file mode 100644
index 00000000000..66c56c6a8d1
--- /dev/null
+++ b/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out
@@ -0,0 +1,42 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !largeint_select --
+1 170141183460469231731687303715884105727
+2 -170141183460469231731687303715884105728
+3 0
+4 1
+5 \N
+
+-- !decimal128_select --
+1 12345678901234567890.1234567890
+2 -12345678901234567890.1234567890
+3 0E-10
+4 1E-10
+5 \N
+
+-- !decimalv2_select --
+1 1234567890.123456789
+2 -1234567890.123456789
+3 0E-9
+4 1E-9
+5 \N
+
+-- !largeint_groupby --
+\N 1
+-170141183460469231731687303715884105728 1
+0 1
+1 1
+170141183460469231731687303715884105727 1
+
+-- !decimal128_groupby --
+\N 1
+-12345678901234567890.1234567890 1
+0E-10 1
+12345678901234567890.1234567890 1
+1E-10 1
+
+-- !largeint_to_json --
+1 170141183460469231731687303715884105727
+2 -170141183460469231731687303715884105728
+3 0
+4 1
+
diff --git
a/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
b/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
new file mode 100644
index 00000000000..2ddca0a92fc
--- /dev/null
+++
b/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression for the unaligned __int128 dereference fix. Exercises the
+// real-runtime paths that previously dereferenced a __int128* through a
+// non 16-byte aligned StringRef::data / Slice::data pointer:
+//
+// * cast LARGEINT column -> JSON: DataTypeNumberSerDe<TYPE_LARGEINT>::
+// write_one_cell_to_jsonb
+// * LARGEINT / DECIMALV2 / DECIMAL128 round-trip through a table and
+// group-by: column-data-to-string paths used by vexpr literal
+// construction at runtime (e.g. runtime filter min/max push), and
+// by meta_tool.
+//
+// Under UBSan -fsanitize=alignment or on strict-alignment platforms
+// (e.g. aarch64) the previous code could SIGBUS / abort.
+//
+// Note: FoldConstantExecutor::_get_result is not exercised here because
+// it is unreachable under the current Nereids planner with default
+// be_exec_version (>= 4); the new BE-side fold path serializes the
+// result via DataTypeSerDe::write_column_to_pb which does not contain an
+// unaligned __int128 dereference. _get_result itself is left unchanged
+// by this fix. The BE unit test (vexpr_unaligned_int128_test.cpp)
+// covers the create_texpr_literal_node LARGEINT / DECIMAL128I /
+// DECIMALV2 branches directly from a deliberately misaligned buffer.
+suite("test_int128_unaligned_access") {
+ sql "set enable_sql_cache=false;"
+
+ // Storage path: round-trip largeint / decimal128 / decimalv2 through
+ // a table to exercise the column-data-to-string code paths used by
+ // vexpr literal construction and meta_tool at runtime.
+ sql "drop table if exists test_int128_unaligned"
+ sql """
+ CREATE TABLE test_int128_unaligned (
+ id INT NOT NULL,
+ v_largeint LARGEINT NULL,
+ v_decimal128 DECIMALV3(38, 10) NULL,
+ v_decimalv2 DECIMALV2(27, 9) NULL
+ ) DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1")
+ """
+
+ sql """
+ INSERT INTO test_int128_unaligned VALUES
+ (1, 170141183460469231731687303715884105727,
+ 12345678901234567890.1234567890,
+ 1234567890.123456789),
+ (2, -170141183460469231731687303715884105728,
+ -12345678901234567890.1234567890,
+ -1234567890.123456789),
+ (3, 0, 0, 0),
+ (4, 1, 0.0000000001, 0.000000001),
+ (5, NULL, NULL, NULL)
+ """
+
+ order_qt_largeint_select "select id, v_largeint from test_int128_unaligned"
+ order_qt_decimal128_select "select id, v_decimal128 from
test_int128_unaligned"
+ order_qt_decimalv2_select "select id, v_decimalv2 from
test_int128_unaligned"
+
+ // Aggregation + group by exercises hash-table key serialization where
+ // largeint values are packed into non-aligned byte buffers.
+ order_qt_largeint_groupby """
+ select v_largeint, count(*) from test_int128_unaligned group by
v_largeint
+ """
+ order_qt_decimal128_groupby """
+ select v_decimal128, count(*) from test_int128_unaligned group by
v_decimal128
+ """
+
+ // JSON serialization path: DataTypeNumberSerDe<TYPE_LARGEINT>::
+ // write_one_cell_to_jsonb reads __int128 from StringRef::data.
+ order_qt_largeint_to_json """
+ select id, cast(v_largeint as JSON) from test_int128_unaligned where
v_largeint is not null
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]