This is an automated email from the ASF dual-hosted git repository.
zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 5d97b29034a [improvement](be) Optimize count on nullable column
(#64166)
5d97b29034a is described below
commit 5d97b29034ab07f32ccff6f8c9d14efdba3190ec
Author: zclllyybb <[email protected]>
AuthorDate: Mon Jun 8 11:52:56 2026 +0800
[improvement](be) Optimize count on nullable column (#64166)
Count aggregation without GROUP BY reaches
AggFnEvaluator::execute_single_add(), which calls
add_batch_single_place(). AggregateFunctionCount and
AggregateFunctionCountNotNullUnary previously inherited the row-by-row
helper there, so count(*) and count(nullable_expr) paid per-row
add/is_null_at costs even when all rows were aggregated into one state.
This patch adds batch implementations: count(*) increments the state
once by batch_size, while unary count(nullable_expr) checks the nullable
null map once and fast-paths the no-NULL case to count += batch_size.
When NULLs exist it uses simd::count_zero_num() over the null map to
count non-NULL rows. The existing class name AggregateFunctionCountNotNullUnary
is kept because SQL count(expr) counts non-NULL values, not NULL values.
Performance:
Tested with the following SQL:
```sql
select count(nullable(number)) from numbers("number"="1000000000");
select count(nullable(if(number >= 0, null, number))) from
numbers("number"="1000000000");
select count(nullable(if(number % 2 = 0, number, null))) from
numbers("number"="1000000000");
```
Results:
```
Scenario     before median / mean    after median / mean    median diff
━━━━━━━━━━━  ━━━━━━━━━━━━━━━━━━━━━━  ━━━━━━━━━━━━━━━━━━━━━  ━━━━━━━━━━━━━
non NULL     645 / 648.6 ms          555 / 556.4 ms         -14.0%
───────────  ──────────────────────  ─────────────────────  ─────────────
all NULL     1541 / 1539.6 ms        1448 / 1450.6 ms       -6.0%
───────────  ──────────────────────  ─────────────────────  ─────────────
half NULL    4256 / 4261.2 ms        4192 / 4232.2 ms       -1.5%
```
---
be/src/exprs/aggregate/aggregate_function_count.h | 23 +++++++++++++++++++++--
be/test/exprs/aggregate/agg_count_test.cpp | 21 +++++++++++++++++++++
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/be/src/exprs/aggregate/aggregate_function_count.h
b/be/src/exprs/aggregate/aggregate_function_count.h
index bc6e57251c0..2ae2692ccb5 100644
--- a/be/src/exprs/aggregate/aggregate_function_count.h
+++ b/be/src/exprs/aggregate/aggregate_function_count.h
@@ -37,6 +37,7 @@
#include "core/data_type/data_type_number.h"
#include "core/types.h"
#include "exprs/aggregate/aggregate_function.h"
+#include "util/simd/bits.h"
namespace doris {
class Arena;
@@ -67,6 +68,11 @@ public:
++data(place).count;
}
+ void add_batch_single_place(size_t batch_size, AggregateDataPtr place,
const IColumn**,
+ Arena&) const override {
+ data(place).count += batch_size;
+ }
+
void reset(AggregateDataPtr place) const override {
AggregateFunctionCount::data(place).count = 0;
}
@@ -180,8 +186,7 @@ public:
}
};
-// TODO: Maybe AggregateFunctionCountNotNullUnary should be a subclass of
AggregateFunctionCount
-// Simply count number of not-NULL values.
+// Used for unary count(nullable_expr). SQL count(expr) counts non-NULL values.
class AggregateFunctionCountNotNullUnary final
: public IAggregateFunctionDataHelper<AggregateFunctionCountData,
AggregateFunctionCountNotNullUnary> {
@@ -202,6 +207,20 @@ public:
.is_null_at(row_num);
}
+ void add_batch_single_place(size_t batch_size, AggregateDataPtr place,
const IColumn** columns,
+ Arena&) const override {
+ const auto& nullable_column =
+ assert_cast<const ColumnNullable&,
TypeCheckOnRelease::DISABLE>(*columns[0]);
+ const auto& null_map = nullable_column.get_null_map_data();
+ DCHECK_LE(batch_size, null_map.size());
+ if (!nullable_column.has_null(0, batch_size)) {
+ data(place).count += batch_size;
+ return;
+ }
+ data(place).count +=
+ simd::count_zero_num(reinterpret_cast<const
int8_t*>(null_map.data()), batch_size);
+ }
+
void reset(AggregateDataPtr place) const override { data(place).count = 0;
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
diff --git a/be/test/exprs/aggregate/agg_count_test.cpp
b/be/test/exprs/aggregate/agg_count_test.cpp
index 7b86cf5324f..21162c97022 100644
--- a/be/test/exprs/aggregate/agg_count_test.cpp
+++ b/be/test/exprs/aggregate/agg_count_test.cpp
@@ -17,6 +17,7 @@
#include <gtest/gtest.h>
+#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "exprs/aggregate/agg_function_test.h"
@@ -31,4 +32,24 @@ TEST_F(AggregateFunctionCountTest, test_int64) {
execute(Block({ColumnHelper::create_column_with_name<DataTypeInt64>({1, 2,
3})}),
ColumnHelper::create_column_with_name<DataTypeInt64>({3}));
}
+
+TEST_F(AggregateFunctionCountTest, test_nullable_int64_without_null) {
+ create_agg("count", false,
+
{std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt64>())},
+ std::make_shared<DataTypeInt64>());
+
+
execute(Block({ColumnHelper::create_nullable_column_with_name<DataTypeInt64>({1,
2, 3, 4},
+
{0, 0, 0, 0})}),
+ ColumnHelper::create_column_with_name<DataTypeInt64>({4}));
+}
+
+TEST_F(AggregateFunctionCountTest, test_nullable_int64_with_null) {
+ create_agg("count", false,
+
{std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt64>())},
+ std::make_shared<DataTypeInt64>());
+
+
execute(Block({ColumnHelper::create_nullable_column_with_name<DataTypeInt64>({1,
2, 3, 4, 5},
+
{0, 1, 0, 1, 0})}),
+ ColumnHelper::create_column_with_name<DataTypeInt64>({3}));
+}
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]