This is an automated email from the ASF dual-hosted git repository.

jacktengg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 07b497ab219 [fix](be) Avoid UB from unaligned __int128 dereference 
(#63703)
07b497ab219 is described below

commit 07b497ab2192a8eb3ccfe1e0c41a48123c646e1e
Author: TengJianPing <[email protected]>
AuthorDate: Tue Jun 2 16:22:25 2026 +0800

    [fix](be) Avoid UB from unaligned __int128 dereference (#63703)
    
    Issue Number: close #xxx
    
    Problem Summary: Several BE call sites obtained a byte pointer from
    StringRef::data / Slice::data / a generic const void* (e.g. ORC pushdown
    literal value, JSONB serde, runtime filter literal builder, meta_tool
    column dump) and dereferenced it as `__int128*` / `int128_t*` /
    `DecimalV2Value*` / `Decimal<int128_t>*`.
    
    Because those buffers carry no 16-byte alignment guarantee, the load is
    undefined behavior. On alignment-strict targets (some aarch64 / SPARC
    builds) and under UBSan -fsanitize=alignment the read can SIGBUS, abort,
    or - with SSE codegen for __int128 - fault on a movdqa instruction.
    
    Sites fixed:
    - be/src/core/data_type_serde/data_type_number_serde.cpp (LARGEINT
    JSONB)
    - be/src/format/orc/vorc_reader.cpp (TYPE_DECIMALV2 / TYPE_DECIMAL128I
    literal conversion for ORC predicate push-down)
    - be/src/tools/meta_tool.cpp (LARGEINT and DECIMAL128I dump)
    - be/src/exprs/vexpr.h create_texpr_literal_node<>: TYPE_LARGEINT,
    TYPE_DECIMALV2 and TYPE_DECIMAL128I literal construction
    
    All these sites now load the 16-byte value through the
    `unaligned_load<T>` helper from `util/unaligned.h` into a local __int128
    / DecimalV2Value / Decimal<int128_t> before use. Modern compilers reduce
    the helper's memcpy to a load, so there is no measurable performance
    impact, but the semantics become well-defined regardless of the
    producer's alignment.
    
    Note: be/src/runtime/fold_constant_executor.cpp also contains unaligned
    __int128 reads for TYPE_LARGEINT and TYPE_DECIMALV2 in `_get_result`,
    but that branch is unreachable under the current Nereids planner (which
    always sets `is_nereids = true` and uses `be_exec_version >= 4`, taking
    the protobuf serde path). It is left untouched here to keep the diff
    focused; the dead branch can be cleaned up separately.
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 .../data_type_serde/data_type_number_serde.cpp     |   5 +-
 be/src/exprs/vexpr.h                               |  17 ++--
 be/src/format/orc/vorc_reader.cpp                  |   6 +-
 be/src/tools/meta_tool.cpp                         |   7 +-
 be/test/exprs/vexpr_unaligned_int128_test.cpp      | 107 +++++++++++++++++++++
 .../largeint/test_int128_unaligned_access.out      |  42 ++++++++
 .../largeint/test_int128_unaligned_access.groovy   |  88 +++++++++++++++++
 7 files changed, 261 insertions(+), 11 deletions(-)

diff --git a/be/src/core/data_type_serde/data_type_number_serde.cpp 
b/be/src/core/data_type_serde/data_type_number_serde.cpp
index 39e9c0726c4..a8e379578d0 100644
--- a/be/src/core/data_type_serde/data_type_number_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_number_serde.cpp
@@ -40,6 +40,7 @@
 #include "util/jsonb_writer.h"
 #include "util/mysql_global.h"
 #include "util/to_string.h"
+#include "util/unaligned.h"
 
 namespace doris {
 // Type map的基本结构
@@ -708,7 +709,9 @@ void DataTypeNumberSerDe<T>::write_one_cell_to_jsonb(const 
IColumn& column,
         int64_t val = *reinterpret_cast<const int64_t*>(data_ref.data);
         result.writeInt64(val);
     } else if constexpr (T == TYPE_LARGEINT) {
-        __int128_t val = *reinterpret_cast<const __int128_t*>(data_ref.data);
+        // data_ref.data may not be 16-byte aligned; dereferencing __int128*
+        // directly is UB and may SIGBUS on alignment-strict platforms.
+        __int128_t val = unaligned_load<__int128_t>(data_ref.data);
         result.writeInt128(val);
     } else if constexpr (T == TYPE_FLOAT) {
         float val = *reinterpret_cast<const float*>(data_ref.data);
diff --git a/be/src/exprs/vexpr.h b/be/src/exprs/vexpr.h
index 1576686fcc7..79f3485b3be 100644
--- a/be/src/exprs/vexpr.h
+++ b/be/src/exprs/vexpr.h
@@ -51,6 +51,7 @@
 #include "storage/index/index_reader.h"
 #include "storage/index/inverted/inverted_index_reader.h"
 #include "util/date_func.h"
+#include "util/unaligned.h"
 
 namespace doris {
 class BitmapFilterFuncBase;
@@ -489,10 +490,11 @@ Status create_texpr_literal_node(const void* data, 
TExprNode* node, int precisio
         (*node).__set_int_literal(intLiteral);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_BIGINT));
     } else if constexpr (T == TYPE_LARGEINT) {
-        const auto* origin_value = reinterpret_cast<const int128_t*>(data);
+        // data may not be 16-byte aligned; use unaligned_load to avoid UB.
+        int128_t origin_value = unaligned_load<int128_t>(data);
         (*node).__set_node_type(TExprNodeType::LARGE_INT_LITERAL);
         TLargeIntLiteral large_int_literal;
-        large_int_literal.__set_value(LargeIntValue::to_string(*origin_value));
+        large_int_literal.__set_value(LargeIntValue::to_string(origin_value));
         (*node).__set_large_int_literal(large_int_literal);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_LARGEINT));
     } else if constexpr ((T == TYPE_DATE) || (T == TYPE_DATETIME)) {
@@ -536,10 +538,12 @@ Status create_texpr_literal_node(const void* data, 
TExprNode* node, int precisio
         (*node).__set_node_type(TExprNodeType::DATE_LITERAL);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_TIMESTAMPTZ, 
precision, scale));
     } else if constexpr (T == TYPE_DECIMALV2) {
-        const auto* origin_value = reinterpret_cast<const 
DecimalV2Value*>(data);
+        // data may not be 16-byte aligned (DecimalV2Value stores int128_t);
+        // use unaligned_load to avoid UB.
+        DecimalV2Value origin_value = unaligned_load<DecimalV2Value>(data);
         (*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
         TDecimalLiteral decimal_literal;
-        decimal_literal.__set_value(origin_value->to_string());
+        decimal_literal.__set_value(origin_value.to_string());
         (*node).__set_decimal_literal(decimal_literal);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMALV2, 
precision, scale));
     } else if constexpr (T == TYPE_DECIMAL32) {
@@ -557,7 +561,8 @@ Status create_texpr_literal_node(const void* data, 
TExprNode* node, int precisio
         (*node).__set_decimal_literal(decimal_literal);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL64, 
precision, scale));
     } else if constexpr (T == TYPE_DECIMAL128I) {
-        const auto* origin_value = reinterpret_cast<const 
Decimal<int128_t>*>(data);
+        // data may not be 16-byte aligned; use unaligned_load to avoid UB.
+        Decimal<int128_t> origin_value = 
unaligned_load<Decimal<int128_t>>(data);
         (*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
         TDecimalLiteral decimal_literal;
         // e.g. For a decimal(26,6) column, the initial value of the _min of 
the MinMax RF
@@ -567,7 +572,7 @@ Status create_texpr_literal_node(const void* data, 
TExprNode* node, int precisio
         // error when casting string back to decimal later.
         // TODO: this is a temporary solution, the best solution is to produce 
the
         // right min max value at the producer side.
-        decimal_literal.__set_value(origin_value->to_string(precision, scale));
+        decimal_literal.__set_value(origin_value.to_string(precision, scale));
         (*node).__set_decimal_literal(decimal_literal);
         (*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL128I, 
precision, scale));
     } else if constexpr (T == TYPE_DECIMAL256) {
diff --git a/be/src/format/orc/vorc_reader.cpp 
b/be/src/format/orc/vorc_reader.cpp
index 47eb9a23f4f..f41b4d0a7e5 100644
--- a/be/src/format/orc/vorc_reader.cpp
+++ b/be/src/format/orc/vorc_reader.cpp
@@ -102,6 +102,7 @@
 #include "storage/utils.h"
 #include "util/slice.h"
 #include "util/timezone_utils.h"
+#include "util/unaligned.h"
 
 namespace doris {
 class RuntimeState;
@@ -781,7 +782,8 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const 
orc::Type* type,
         case orc::TypeKind::DECIMAL: {
             int128_t decimal_value;
             if constexpr (primitive_type == TYPE_DECIMALV2) {
-                decimal_value = *reinterpret_cast<const int128_t*>(value);
+                // value may not be 16-byte aligned; use unaligned_load to 
avoid UB.
+                decimal_value = unaligned_load<int128_t>(value);
                 precision = DecimalV2Value::PRECISION;
                 scale = DecimalV2Value::SCALE;
             } else if constexpr (primitive_type == TYPE_DECIMAL32) {
@@ -789,7 +791,7 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const 
orc::Type* type,
             } else if constexpr (primitive_type == TYPE_DECIMAL64) {
                 decimal_value = *((int64_t*)value);
             } else if constexpr (primitive_type == TYPE_DECIMAL128I) {
-                decimal_value = *((int128_t*)value);
+                decimal_value = unaligned_load<int128_t>(value);
             } else {
                 return std::make_tuple(false, orc::Literal(false));
             }
diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp
index 577e70ff053..18bc27c1d83 100644
--- a/be/src/tools/meta_tool.cpp
+++ b/be/src/tools/meta_tool.cpp
@@ -63,6 +63,7 @@
 #include "storage/tablet/tablet_schema_cache.h"
 #include "storage/types.h"
 #include "util/coding.h"
+#include "util/unaligned.h"
 
 using doris::DataDir;
 using doris::StorageEngine;
@@ -472,7 +473,8 @@ std::string format_column_value(const doris::IColumn& 
column, size_t row,
             // LargeInt is stored as Int128
             const StringRef& data = column.get_data_at(row);
             if (data.size == sizeof(__int128)) {
-                __int128 val = *reinterpret_cast<const __int128*>(data.data);
+                // data.data may not be 16-byte aligned; use unaligned_load to 
avoid UB.
+                __int128 val = unaligned_load<__int128>(data.data);
                 return doris::LargeIntValue::to_string(val);
             }
             return "<invalid largeint>";
@@ -556,7 +558,8 @@ std::string format_column_value(const doris::IColumn& 
column, size_t row,
         case FieldType::OLAP_FIELD_TYPE_DECIMAL128I: {
             const StringRef& data = column.get_data_at(row);
             if (data.size == sizeof(__int128)) {
-                __int128 val = *reinterpret_cast<const __int128*>(data.data);
+                // data.data may not be 16-byte aligned; use unaligned_load to 
avoid UB.
+                __int128 val = unaligned_load<__int128>(data.data);
                 return doris::LargeIntValue::to_string(val);
             }
             return "<invalid decimal>";
diff --git a/be/test/exprs/vexpr_unaligned_int128_test.cpp 
b/be/test/exprs/vexpr_unaligned_int128_test.cpp
new file mode 100644
index 00000000000..d34a6da7f94
--- /dev/null
+++ b/be/test/exprs/vexpr_unaligned_int128_test.cpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression test for unaligned __int128 dereference UB.
+//
+// Several call sites used to dereference a `__int128*` produced from a
+// `StringRef::data` (or similar byte pointer) without any alignment
+// guarantee. This file pins the contract that the helpers that build
+// literal TExprNodes from a raw `const void* data` pointer must accept
+// pointers that are *not* 16-byte aligned, since on alignment-strict
+// platforms (e.g. some aarch64 / SPARC builds, and UBSan
+// -fsanitize=alignment) such reads are undefined behavior and may
+// SIGBUS.
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "core/value/decimalv2_value.h"
+#include "core/value/large_int_value.h"
+#include "exprs/vexpr.h"
+
+namespace doris {
+
+// Returns a pointer guaranteed to be 1 byte off any 16-byte boundary.
+static char* misaligned_slot(std::vector<char>& buf, std::size_t bytes) {
+    buf.assign(bytes + 32, 0);
+    char* base = buf.data();
+    // Move forward until the address is exactly 1 byte past a 16-byte boundary.
+    std::size_t off = 0;
+    while ((reinterpret_cast<std::uintptr_t>(base + off) & 0xF) != 1) {
+        ++off;
+    }
+    return base + off;
+}
+
+TEST(UnalignedInt128Test, LargeIntLiteralFromUnalignedBuffer) {
+    std::vector<char> buf;
+    char* p = misaligned_slot(buf, sizeof(__int128));
+    ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+    // A large positive value (just below 2^126) that uses both 64-bit halves.
+    __int128 expected = (static_cast<__int128>(0x3FFFFFFFFFFFFFFFLL) << 64) |
+                        static_cast<__int128>(0xFEEDFACECAFEBEEFULL);
+    std::memcpy(p, &expected, sizeof(expected));
+
+    TExprNode node;
+    Status st = create_texpr_literal_node<TYPE_LARGEINT>(p, &node);
+    ASSERT_TRUE(st.ok()) << st;
+    ASSERT_TRUE(node.__isset.large_int_literal);
+    EXPECT_EQ(node.large_int_literal.value, 
LargeIntValue::to_string(expected));
+}
+
+TEST(UnalignedInt128Test, Decimal128ILiteralFromUnalignedBuffer) {
+    std::vector<char> buf;
+    char* p = misaligned_slot(buf, sizeof(__int128));
+    ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+    // Decimal(20, 4) value: 1234567890123456.7891
+    __int128 raw = static_cast<__int128>(1234567890123456789LL) * 10 + 1;
+    std::memcpy(p, &raw, sizeof(raw));
+
+    TExprNode node;
+    Status st = create_texpr_literal_node<TYPE_DECIMAL128I>(p, &node, 
/*precision=*/20,
+                                                            /*scale=*/4);
+    ASSERT_TRUE(st.ok()) << st;
+    ASSERT_TRUE(node.__isset.decimal_literal);
+    // Sanity: the formatted value must contain "1234567890123456".
+    EXPECT_NE(node.decimal_literal.value.find("1234567890123456"), 
std::string::npos)
+            << node.decimal_literal.value;
+}
+
+TEST(UnalignedInt128Test, DecimalV2LiteralFromUnalignedBuffer) {
+    std::vector<char> buf;
+    char* p = misaligned_slot(buf, sizeof(DecimalV2Value));
+    ASSERT_NE(reinterpret_cast<std::uintptr_t>(p) % alignof(__int128), 0u);
+
+    DecimalV2Value src;
+    // 12345.6789 * 1e9 (DecimalV2 internal scale = 9).
+    src.set_value(static_cast<__int128>(12345678900000LL));
+    std::memcpy(p, &src, sizeof(src));
+
+    TExprNode node;
+    Status st = create_texpr_literal_node<TYPE_DECIMALV2>(p, &node, 
/*precision=*/27,
+                                                          /*scale=*/9);
+    ASSERT_TRUE(st.ok()) << st;
+    ASSERT_TRUE(node.__isset.decimal_literal);
+    EXPECT_EQ(node.decimal_literal.value, src.to_string());
+}
+
+} // namespace doris
diff --git 
a/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out 
b/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out
new file mode 100644
index 00000000000..66c56c6a8d1
--- /dev/null
+++ b/regression-test/data/datatype_p0/largeint/test_int128_unaligned_access.out
@@ -0,0 +1,42 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !largeint_select --
+1      170141183460469231731687303715884105727
+2      -170141183460469231731687303715884105728
+3      0
+4      1
+5      \N
+
+-- !decimal128_select --
+1      12345678901234567890.1234567890
+2      -12345678901234567890.1234567890
+3      0E-10
+4      1E-10
+5      \N
+
+-- !decimalv2_select --
+1      1234567890.123456789
+2      -1234567890.123456789
+3      0E-9
+4      1E-9
+5      \N
+
+-- !largeint_groupby --
+\N     1
+-170141183460469231731687303715884105728       1
+0      1
+1      1
+170141183460469231731687303715884105727        1
+
+-- !decimal128_groupby --
+\N     1
+-12345678901234567890.1234567890       1
+0E-10  1
+12345678901234567890.1234567890        1
+1E-10  1
+
+-- !largeint_to_json --
+1      170141183460469231731687303715884105727
+2      -170141183460469231731687303715884105728
+3      0
+4      1
+
diff --git 
a/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
 
b/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
new file mode 100644
index 00000000000..2ddca0a92fc
--- /dev/null
+++ 
b/regression-test/suites/datatype_p0/largeint/test_int128_unaligned_access.groovy
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression test for the unaligned __int128 dereference fix. Exercises the
+// real-runtime paths that previously dereferenced a __int128* through a
+// non 16-byte aligned StringRef::data / Slice::data pointer:
+//
+//   * cast LARGEINT column -> JSON: DataTypeNumberSerDe<TYPE_LARGEINT>::
+//     write_one_cell_to_jsonb
+//   * LARGEINT / DECIMALV2 / DECIMAL128 round-trip through a table and
+//     group-by: column-data-to-string paths used by vexpr literal
+//     construction at runtime (e.g. runtime filter min/max push), and
+//     by meta_tool.
+//
+// Under UBSan -fsanitize=alignment or on strict-alignment platforms
+// (e.g. aarch64) the previous code could SIGBUS / abort.
+//
+// Note: FoldConstantExecutor::_get_result is not exercised here because
+// it is unreachable under the current Nereids planner with default
+// be_exec_version (>= 4); the new BE-side fold path serializes the
+// result via DataTypeSerDe::write_column_to_pb which does not contain an
+// unaligned __int128 dereference. _get_result itself is left unchanged by
+// this commit. The BE unit test (vexpr_unaligned_int128_test.cpp)
+// covers the create_texpr_literal_node LARGEINT / DECIMAL128I /
+// DECIMALV2 branches directly from a deliberately misaligned buffer.
+suite("test_int128_unaligned_access") {
+    sql "set enable_sql_cache=false;"
+
+    // Storage path: round-trip largeint / decimal128 / decimalv2 through
+    // a table to exercise the column-data-to-string code paths used by
+    // vexpr literal construction and meta_tool at runtime.
+    sql "drop table if exists test_int128_unaligned"
+    sql """
+        CREATE TABLE test_int128_unaligned (
+            id INT NOT NULL,
+            v_largeint LARGEINT NULL,
+            v_decimal128 DECIMALV3(38, 10) NULL,
+            v_decimalv2 DECIMALV2(27, 9) NULL
+        ) DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES("replication_num" = "1")
+    """
+
+    sql """
+        INSERT INTO test_int128_unaligned VALUES
+            (1, 170141183460469231731687303715884105727,
+                12345678901234567890.1234567890,
+                1234567890.123456789),
+            (2, -170141183460469231731687303715884105728,
+                -12345678901234567890.1234567890,
+                -1234567890.123456789),
+            (3, 0, 0, 0),
+            (4, 1, 0.0000000001, 0.000000001),
+            (5, NULL, NULL, NULL)
+    """
+
+    order_qt_largeint_select "select id, v_largeint from test_int128_unaligned"
+    order_qt_decimal128_select "select id, v_decimal128 from 
test_int128_unaligned"
+    order_qt_decimalv2_select "select id, v_decimalv2 from 
test_int128_unaligned"
+
+    // Aggregation + group by exercises hash-table key serialization where
+    // largeint values are packed into non-aligned byte buffers.
+    order_qt_largeint_groupby """
+        select v_largeint, count(*) from test_int128_unaligned group by 
v_largeint
+    """
+    order_qt_decimal128_groupby """
+        select v_decimal128, count(*) from test_int128_unaligned group by 
v_decimal128
+    """
+
+    // JSON serialization path: DataTypeNumberSerDe<TYPE_LARGEINT>::
+    // write_one_cell_to_jsonb reads __int128 from StringRef::data.
+    order_qt_largeint_to_json """
+        select id, cast(v_largeint as JSON) from test_int128_unaligned where 
v_largeint is not null
+    """
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to