This is an automated email from the ASF dual-hosted git repository.

Mryange pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4483daf9f03 [fix](be) Clean up aggregate states and use Doris hash containers (#63174)
4483daf9f03 is described below

commit 4483daf9f03b93b9ef4fbb168d86219749ca0181
Author: Mryange <[email protected]>
AuthorDate: Tue May 19 14:38:08 2026 +0800

    [fix](be) Clean up aggregate states and use Doris hash containers (#63174)
    
    ### What problem does this PR solve?
    
    Issue Number: N/A
    
    Problem Summary:
    
    Aggregate batch deserialization creates aggregate states with placement
    new before deserializing or merging serialized input. If deserialization
    or merge throws after `create()` succeeds, the previous cleanup only
    destroyed states from earlier rows and skipped the current
    already-created state. This can leak resources owned by aggregate state
    objects, such as hash sets or bitmap internals.
    
    Root cause: the exception cleanup destroyed only states from previous
    rows. If the current row's state was created successfully and
    deserialization failed afterward, that current state was excluded from
    cleanup.
    
    This PR tracks the number of successfully created aggregate states and
    destroys exactly that range on exception. It preserves the
    successful-path ownership model: `deserialize_vec()` leaves created
    states to its caller, while merge helpers still release temporary rhs
    states with `destroy_vec()` after successful merge.
    
    This PR also switches aggregate-local `phmap::flat_hash_map` and
    `phmap::flat_hash_set` usages to Doris wrapper aliases so they use
    Doris' default equality and allocator definitions consistently.
---
 be/src/exprs/aggregate/aggregate_function.h        |  54 ++++---
 .../exprs/aggregate/aggregate_function_collect.h   |   4 +-
 .../exprs/aggregate/aggregate_function_distinct.h  |   8 +-
 be/src/exprs/aggregate/aggregate_function_map.h    |   2 +-
 be/src/exprs/aggregate/aggregate_function_map_v2.h |   2 +-
 .../aggregate_function_exception_test.cpp          | 162 +++++++++++++++++++++
 6 files changed, 200 insertions(+), 32 deletions(-)

diff --git a/be/src/exprs/aggregate/aggregate_function.h b/be/src/exprs/aggregate/aggregate_function.h
index d11ea17d9b8..6a0c364b7cb 100644
--- a/be/src/exprs/aggregate/aggregate_function.h
+++ b/be/src/exprs/aggregate/aggregate_function.h
@@ -481,19 +481,21 @@ public:
                          size_t num_rows) const override {
         const Derived* derived = assert_cast<const Derived*>(this);
         const auto size_of_data = derived->size_of_data();
-        for (size_t i = 0; i != num_rows; ++i) {
-            try {
+        size_t created_count = 0;
+        try {
+            for (size_t i = 0; i != num_rows; ++i) {
                 auto place = places + size_of_data * i;
                 VectorBufferReader buffer_reader(column->get_data_at(i));
                 derived->create(place);
+                ++created_count;
                 derived->deserialize(place, buffer_reader, arena);
-            } catch (...) {
-                for (int j = 0; j < i; ++j) {
-                    auto place = places + size_of_data * j;
-                    derived->destroy(place);
-                }
-                throw;
             }
+        } catch (...) {
+            for (size_t j = 0; j < created_count; ++j) {
+                auto place = places + size_of_data * j;
+                derived->destroy(place);
+            }
+            throw;
         }
     }
 
@@ -504,19 +506,21 @@ public:
         const auto size_of_data = derived->size_of_data();
         const auto* column_string = assert_cast<const ColumnString*>(column);
 
-        for (size_t i = 0; i != num_rows; ++i) {
-            try {
+        size_t created_count = 0;
+        try {
+            for (size_t i = 0; i != num_rows; ++i) {
                 auto rhs_place = rhs + size_of_data * i;
                 VectorBufferReader 
buffer_reader(column_string->get_data_at(i));
                 derived->create(rhs_place);
+                ++created_count;
                 derived->deserialize_and_merge(places[i] + offset, rhs_place, 
buffer_reader, arena);
-            } catch (...) {
-                for (int j = 0; j < i; ++j) {
-                    auto place = rhs + size_of_data * j;
-                    derived->destroy(place);
-                }
-                throw;
             }
+        } catch (...) {
+            for (size_t j = 0; j < created_count; ++j) {
+                auto place = rhs + size_of_data * j;
+                derived->destroy(place);
+            }
+            throw;
         }
 
         derived->destroy_vec(rhs, num_rows);
@@ -528,22 +532,24 @@ public:
         const auto* derived = assert_cast<const Derived*>(this);
         const auto size_of_data = derived->size_of_data();
         const auto* column_string = assert_cast<const ColumnString*>(column);
-        for (size_t i = 0; i != num_rows; ++i) {
-            try {
+        size_t created_count = 0;
+        try {
+            for (size_t i = 0; i != num_rows; ++i) {
                 auto rhs_place = rhs + size_of_data * i;
                 VectorBufferReader 
buffer_reader(column_string->get_data_at(i));
                 derived->create(rhs_place);
+                ++created_count;
                 if (places[i]) {
                     derived->deserialize_and_merge(places[i] + offset, 
rhs_place, buffer_reader,
                                                    arena);
                 }
-            } catch (...) {
-                for (int j = 0; j < i; ++j) {
-                    auto place = rhs + size_of_data * j;
-                    derived->destroy(place);
-                }
-                throw;
             }
+        } catch (...) {
+            for (size_t j = 0; j < created_count; ++j) {
+                auto place = rhs + size_of_data * j;
+                derived->destroy(place);
+            }
+            throw;
         }
         derived->destroy_vec(rhs, num_rows);
     }
diff --git a/be/src/exprs/aggregate/aggregate_function_collect.h b/be/src/exprs/aggregate/aggregate_function_collect.h
index cd787627ef9..7eef3cff85e 100644
--- a/be/src/exprs/aggregate/aggregate_function_collect.h
+++ b/be/src/exprs/aggregate/aggregate_function_collect.h
@@ -49,7 +49,7 @@ struct AggregateFunctionCollectSetData {
     using ElementType = typename PrimitiveTypeTraits<T>::CppType;
     using ColVecType = typename PrimitiveTypeTraits<T>::ColumnType;
     using SelfType = AggregateFunctionCollectSetData;
-    using Set = phmap::flat_hash_set<ElementType>;
+    using Set = doris::flat_hash_set<ElementType>;
     Set data_set;
     Int64 max_size = -1;
 
@@ -118,7 +118,7 @@ struct AggregateFunctionCollectSetData<T, HasLimit> {
     using ElementType = StringRef;
     using ColVecType = ColumnString;
     using SelfType = AggregateFunctionCollectSetData<T, HasLimit>;
-    using Set = phmap::flat_hash_set<ElementType>;
+    using Set = doris::flat_hash_set<ElementType>;
     Set data_set;
     Int64 max_size = -1;
 
diff --git a/be/src/exprs/aggregate/aggregate_function_distinct.h b/be/src/exprs/aggregate/aggregate_function_distinct.h
index 060817188b2..c1dfb6bae5d 100644
--- a/be/src/exprs/aggregate/aggregate_function_distinct.h
+++ b/be/src/exprs/aggregate/aggregate_function_distinct.h
@@ -51,8 +51,8 @@ template <PrimitiveType T, bool stable>
 struct AggregateFunctionDistinctSingleNumericData {
     /// When creating, the hash table must be small.
     using Container = std::conditional_t<
-            stable, phmap::flat_hash_map<typename 
PrimitiveTypeTraits<T>::CppType, uint32_t>,
-            phmap::flat_hash_set<typename PrimitiveTypeTraits<T>::CppType>>;
+            stable, doris::flat_hash_map<typename 
PrimitiveTypeTraits<T>::CppType, uint32_t>,
+            doris::flat_hash_set<typename PrimitiveTypeTraits<T>::CppType>>;
     using Self = AggregateFunctionDistinctSingleNumericData<T, stable>;
     Container data;
 
@@ -125,8 +125,8 @@ struct AggregateFunctionDistinctSingleNumericData {
 template <bool stable>
 struct AggregateFunctionDistinctGenericData {
     /// When creating, the hash table must be small.
-    using Container = std::conditional_t<stable, 
phmap::flat_hash_map<StringRef, uint32_t>,
-                                         phmap::flat_hash_set<StringRef, 
StringRefHash>>;
+    using Container = std::conditional_t<stable, 
doris::flat_hash_map<StringRef, uint32_t>,
+                                         doris::flat_hash_set<StringRef, 
StringRefHash>>;
     using Self = AggregateFunctionDistinctGenericData;
     Container data;
 
diff --git a/be/src/exprs/aggregate/aggregate_function_map.h b/be/src/exprs/aggregate/aggregate_function_map.h
index 6c6c752fc65..f8a0b402d48 100644
--- a/be/src/exprs/aggregate/aggregate_function_map.h
+++ b/be/src/exprs/aggregate/aggregate_function_map.h
@@ -34,7 +34,7 @@ namespace doris {
 template <PrimitiveType K>
 struct AggregateFunctionMapAggData {
     using KeyType = typename PrimitiveTypeTraits<K>::CppType;
-    using Map = phmap::flat_hash_map<StringRef, int64_t>;
+    using Map = doris::flat_hash_map<StringRef, int64_t>;
 
     AggregateFunctionMapAggData() { throw 
Exception(Status::FatalError("__builtin_unreachable")); }
 
diff --git a/be/src/exprs/aggregate/aggregate_function_map_v2.h b/be/src/exprs/aggregate/aggregate_function_map_v2.h
index f335453f0c1..d1f33cab826 100644
--- a/be/src/exprs/aggregate/aggregate_function_map_v2.h
+++ b/be/src/exprs/aggregate/aggregate_function_map_v2.h
@@ -32,7 +32,7 @@
 namespace doris {
 
 struct AggregateFunctionMapAggDataV2 {
-    using Map = phmap::flat_hash_map<Field, int64_t>;
+    using Map = doris::flat_hash_map<Field, int64_t>;
 
     AggregateFunctionMapAggDataV2() {
         throw Exception(Status::FatalError("__builtin_unreachable"));
diff --git a/be/test/exprs/aggregate/aggregate_function_exception_test.cpp b/be/test/exprs/aggregate/aggregate_function_exception_test.cpp
new file mode 100644
index 00000000000..21ee64dba4a
--- /dev/null
+++ b/be/test/exprs/aggregate/aggregate_function_exception_test.cpp
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "core/arena.h"
+#include "exprs/aggregate/aggregate_function.h"
+
+namespace doris {
+
+struct TrackingAggregateState {
+    TrackingAggregateState() { ++construct_count; }
+    ~TrackingAggregateState() { ++destroy_count; }
+
+    static void reset_counters() {
+        construct_count = 0;
+        destroy_count = 0;
+    }
+
+    static int construct_count;
+    static int destroy_count;
+};
+
+int TrackingAggregateState::construct_count = 0;
+int TrackingAggregateState::destroy_count = 0;
+
+class ThrowOnDeserializeAggregateFunction final
+        : public IAggregateFunctionDataHelper<TrackingAggregateState,
+                                              
ThrowOnDeserializeAggregateFunction> {
+public:
+    ThrowOnDeserializeAggregateFunction()
+            : IAggregateFunctionDataHelper<TrackingAggregateState,
+                                           
ThrowOnDeserializeAggregateFunction>(
+                      DataTypes {std::make_shared<DataTypeString>()}) {}
+
+    String get_name() const override { return "throw_on_deserialize"; }
+
+    DataTypePtr get_return_type() const override { return 
std::make_shared<DataTypeString>(); }
+
+    void add(AggregateDataPtr, const IColumn**, ssize_t, Arena&) const 
override {}
+
+    void merge(AggregateDataPtr, ConstAggregateDataPtr, Arena&) const override 
{}
+
+    void serialize(ConstAggregateDataPtr, BufferWritable& buf) const override {
+        String payload;
+        buf.write_binary(payload);
+    }
+
+    void deserialize(AggregateDataPtr, BufferReadable& buf, Arena&) const 
override {
+        String payload;
+        buf.read_binary(payload);
+        if (payload == "throw") {
+            throw Exception(ErrorCode::INTERNAL_ERROR, "mock deserialize 
failure");
+        }
+    }
+
+    void insert_result_into(ConstAggregateDataPtr, IColumn&) const override {}
+};
+
+class AggregateFunctionExceptionTest : public testing::Test {
+protected:
+    void SetUp() override { TrackingAggregateState::reset_counters(); }
+
+    MutableColumnPtr make_column(std::initializer_list<String> payloads) {
+        auto column = ColumnString::create();
+        VectorBufferWriter writer(*column);
+        for (const auto& payload : payloads) {
+            writer.write_binary(payload);
+            writer.commit();
+        }
+        return column;
+    }
+
+    ThrowOnDeserializeAggregateFunction function;
+    Arena arena;
+};
+
+TEST_F(AggregateFunctionExceptionTest, 
DeserializeVecDestroysCurrentStateOnFailure) {
+    auto column = make_column({"ok", "throw"});
+    std::vector<char> states(function.size_of_data() * 2);
+
+    bool thrown = false;
+    try {
+        function.deserialize_vec(states.data(), 
static_cast<ColumnString*>(column.get()), arena, 2);
+    } catch (const Exception&) {
+        thrown = true;
+    }
+
+    EXPECT_TRUE(thrown);
+    if (!thrown) {
+        function.destroy_vec(states.data(), 2);
+    }
+    EXPECT_EQ(TrackingAggregateState::construct_count, 2);
+    EXPECT_EQ(TrackingAggregateState::destroy_count, 2);
+}
+
+TEST_F(AggregateFunctionExceptionTest, 
DeserializeAndMergeVecDestroysRhsStateOnFailure) {
+    auto column = make_column({"throw"});
+    std::vector<char> place_storage(function.size_of_data());
+    std::vector<char> rhs_storage(function.size_of_data());
+    auto* place = place_storage.data();
+    function.create(place);
+
+    std::array<AggregateDataPtr, 1> places {place};
+    const auto destroy_count_before_call = 
TrackingAggregateState::destroy_count;
+    bool thrown = false;
+    try {
+        function.deserialize_and_merge_vec(places.data(), 0, 
rhs_storage.data(), column.get(),
+                                           arena, 1);
+    } catch (const Exception&) {
+        thrown = true;
+    }
+
+    EXPECT_TRUE(thrown);
+    EXPECT_EQ(TrackingAggregateState::destroy_count - 
destroy_count_before_call, 1);
+
+    function.destroy(place);
+    EXPECT_EQ(TrackingAggregateState::construct_count, 
TrackingAggregateState::destroy_count);
+}
+
+TEST_F(AggregateFunctionExceptionTest,
+       DeserializeAndMergeVecSelectedDestroysAllCreatedRhsStatesOnFailure) {
+    auto column = make_column({"skip", "throw"});
+    std::vector<char> place_storage(function.size_of_data());
+    std::vector<char> rhs_storage(function.size_of_data() * 2);
+    auto* place = place_storage.data();
+    function.create(place);
+
+    std::array<AggregateDataPtr, 2> places {nullptr, place};
+    const auto destroy_count_before_call = 
TrackingAggregateState::destroy_count;
+    bool thrown = false;
+    try {
+        function.deserialize_and_merge_vec_selected(places.data(), 0, 
rhs_storage.data(),
+                                                    column.get(), arena, 2);
+    } catch (const Exception&) {
+        thrown = true;
+    }
+
+    EXPECT_TRUE(thrown);
+    EXPECT_EQ(TrackingAggregateState::destroy_count - 
destroy_count_before_call, 2);
+
+    function.destroy(place);
+    EXPECT_EQ(TrackingAggregateState::construct_count, 
TrackingAggregateState::destroy_count);
+}
+
+} // namespace doris
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to