Copilot commented on code in PR #61303:
URL: https://github.com/apache/doris/pull/61303#discussion_r2929336869
##########
be/src/pipeline/exec/streaming_aggregation_operator.cpp:
##########
@@ -66,20 +66,32 @@ struct StreamingHtMinReductionEntry {
// of the machine that we're running on.
static constexpr StreamingHtMinReductionEntry STREAMING_HT_MIN_REDUCTION[] = {
// Expand up to L2 cache always.
- {0, 0.0},
+ {.min_ht_mem = 0, .streaming_ht_min_reduction = 0.0},
// Expand into L3 cache if we look like we're getting some reduction.
// At present, The L2 cache is generally 1024k or more
- {1024 * 1024, 1.1},
+ {.min_ht_mem = 256 * 1024, .streaming_ht_min_reduction = 1.1},
Review Comment:
The comments state the L2 cache threshold is “generally 1024k or more”, but
the code now switches at `256 * 1024`. Either the comment should be updated to
match the new threshold rationale, or the threshold should be adjusted to
reflect the documented intent (same issue in both the normal and single-backend
tables).
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -365,8 +417,305 @@ struct MethodStringNoCache : public MethodBase<TData> {
key_columns[0]->reserve(num_rows);
key_columns[0]->insert_many_strings(input_keys.data(), num_rows);
}
+
+ const StringKeySubTableGroups& get_sub_table_groups() const { return
_sub_table_groups; }
+};
+
+/// Helper: detect whether HashMap is a nullable-wrapped StringHashMap.
+template <typename HashMap>
+struct IsNullableStringHashMap : std::false_type {};
+
+template <typename Mapped, typename Allocator>
+struct IsNullableStringHashMap<DataWithNullKey<StringHashMap<Mapped,
Allocator>>> : std::true_type {
};
+/// Helper: get the underlying StringHashTable from a hash table (handles
DataWithNullKey wrapper).
+template <typename HashMap>
+auto& get_string_hash_table(HashMap& data) {
+ return data;
+}
+
+/// Compile-time key conversion for each sub-table group.
+/// Groups 1-4 use to_string_key<T>(); groups 0 and 5 use StringRef directly.
+/// Returns the converted key for the given group.
+/// For groups 0 and 5, the key is returned as a non-const copy
(lazy_emplace_if_zero takes Key&).
+template <int GroupIdx>
+auto convert_key_for_submap(const StringRef& origin) {
+ if constexpr (GroupIdx == 0) {
+ return StringRef(origin); // copy — m0 needs non-const Key&
+ } else if constexpr (GroupIdx == 1) {
+ return to_string_key<StringKey2>(origin);
+ } else if constexpr (GroupIdx == 2) {
+ return to_string_key<StringKey4>(origin);
+ } else if constexpr (GroupIdx == 3) {
+ return to_string_key<StringKey8>(origin);
+ } else if constexpr (GroupIdx == 4) {
+ return to_string_key<StringKey16>(origin);
+ } else {
+ return StringRef(origin); // copy — ms uses StringRef as Key
+ }
+}
+
+/// Hash value to use for a given group. Group 0 (empty string) always uses
hash=0.
+template <int GroupIdx, typename HashValues>
+size_t hash_for_group(const HashValues& hash_values, uint32_t row) {
+ if constexpr (GroupIdx == 0) {
+ return 0;
+ } else {
+ return hash_values[row];
+ }
+}
+
+/// Whether prefetch is useful for a group. Group 0 (StringHashTableEmpty, at
most 1 element)
+/// does not benefit from prefetch.
+template <int GroupIdx>
+static constexpr bool group_needs_prefetch = (GroupIdx != 0);
+
+/// Process one sub-table group for emplace with result_handler.
+/// Handles nullable null-key check, prefetch, key conversion, and emplace.
+/// pre_handler(row) is called before each emplace, allowing callers to set
per-row state
+/// (e.g., current row index used inside creator lambdas).
+template <int GroupIdx, bool is_nullable, typename Submap, typename
HashMethodType, typename State,
+ typename HashMap, typename F, typename FF, typename PreHandler,
typename ResultHandler>
+void process_submap_emplace(Submap& submap, const uint32_t* indices, size_t
count,
+ HashMethodType& agg_method, State& state, HashMap&
hash_table,
+ F&& creator, FF&& creator_for_null_key,
PreHandler&& pre_handler,
+ ResultHandler&& result_handler) {
+ using Mapped = typename HashMethodType::Mapped;
+ for (size_t j = 0; j < count; j++) {
+ if constexpr (group_needs_prefetch<GroupIdx>) {
+ if (j + HASH_MAP_PREFETCH_DIST < count) {
+ submap.template prefetch<false>(
+ agg_method.hash_values[indices[j +
HASH_MAP_PREFETCH_DIST]]);
+ }
+ }
+ uint32_t row = indices[j];
+ pre_handler(row);
+ if constexpr (is_nullable) {
+ if (state.key_column->is_null_at(row)) {
+ bool has_null_key = hash_table.has_null_key_data();
+ hash_table.has_null_key_data() = true;
+ if (!has_null_key) {
+ std::forward<FF>(creator_for_null_key)(
+ hash_table.template get_null_key_data<Mapped>());
+ }
+ result_handler(row, hash_table.template
get_null_key_data<Mapped>());
+ continue;
+ }
+ }
+ auto origin = agg_method.keys[row];
+ auto converted_key = convert_key_for_submap<GroupIdx>(origin);
+ typename Submap::LookupResult result;
+ if constexpr (GroupIdx == 0 || GroupIdx == 5) {
+ // Groups 0,5: key and origin are the same StringRef
+ submap.lazy_emplace_with_origin(converted_key, converted_key,
result,
+
hash_for_group<GroupIdx>(agg_method.hash_values, row),
+ std::forward<F>(creator));
+ } else {
+ // Groups 1-4: converted_key differs from origin
+ submap.lazy_emplace_with_origin(converted_key, origin, result,
+
hash_for_group<GroupIdx>(agg_method.hash_values, row),
+ std::forward<F>(creator));
+ }
+ result_handler(row, result->get_second());
+ }
+}
+
+/// Process one sub-table group for emplace without result_handler (void
version).
+/// pre_handler(row) is called before each emplace.
+template <int GroupIdx, bool is_nullable, typename Submap, typename
HashMethodType, typename State,
+ typename HashMap, typename F, typename FF, typename PreHandler>
+void process_submap_emplace_void(Submap& submap, const uint32_t* indices,
size_t count,
+ HashMethodType& agg_method, State& state,
HashMap& hash_table,
+ F&& creator, FF&& creator_for_null_key,
PreHandler&& pre_handler) {
+ for (size_t j = 0; j < count; j++) {
+ if constexpr (group_needs_prefetch<GroupIdx>) {
+ if (j + HASH_MAP_PREFETCH_DIST < count) {
+ submap.template prefetch<false>(
+ agg_method.hash_values[indices[j +
HASH_MAP_PREFETCH_DIST]]);
+ }
+ }
+ uint32_t row = indices[j];
+ pre_handler(row);
+ if constexpr (is_nullable) {
+ if (state.key_column->is_null_at(row)) {
+ bool has_null_key = hash_table.has_null_key_data();
+ hash_table.has_null_key_data() = true;
+ if (!has_null_key) {
+ std::forward<FF>(creator_for_null_key)();
+ }
+ continue;
+ }
+ }
Review Comment:
In `process_submap_emplace_void`, the null-key path sets
`has_null_key_data()` but never initializes/accesses the null-key mapped
storage (unlike the non-void version which calls
`creator_for_null_key(get_null_key_data<Mapped>())`). If this helper is ever
instantiated for a nullable *map* (not a set), this can leave null-key mapped
state uninitialized and later reads may be incorrect. Consider aligning the
void-path behavior with the non-void version (e.g., initialize via
`get_null_key_data<Mapped>()` and invoke `creator_for_null_key` with mapped
when it’s invocable, otherwise fall back to the no-arg form).
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -112,27 +112,22 @@ struct MethodBaseInner {
}
template <bool read>
- ALWAYS_INLINE void prefetch(size_t i) {
+ void prefetch(size_t i) {
if (LIKELY(i + HASH_MAP_PREFETCH_DIST < hash_values.size())) {
hash_table->template prefetch<read>(keys[i +
HASH_MAP_PREFETCH_DIST],
hash_values[i +
HASH_MAP_PREFETCH_DIST]);
}
}
template <typename State>
- ALWAYS_INLINE auto find(State& state, size_t i) {
- if constexpr (!is_string_hash_map()) {
- prefetch<true>(i);
- }
+ auto find(State& state, size_t i) {
+ prefetch<true>(i);
return state.find_key_with_hash(*hash_table, i, keys[i],
hash_values[i]);
}
template <typename State, typename F, typename FF>
- ALWAYS_INLINE auto lazy_emplace(State& state, size_t i, F&& creator,
- FF&& creator_for_null_key) {
- if constexpr (!is_string_hash_map()) {
- prefetch<false>(i);
- }
+ auto lazy_emplace(State& state, size_t i, F&& creator, FF&&
creator_for_null_key) {
+ prefetch<false>(i);
return state.lazy_emplace_key(*hash_table, i, keys[i], hash_values[i],
creator,
creator_for_null_key);
}
Review Comment:
`ALWAYS_INLINE` was removed from `prefetch/find/lazy_emplace`, and the prior
`if constexpr (!is_string_hash_map())` gating is gone, so string hash-map paths
will now always prefetch and pay extra call overhead. This risks a regression
in the very hot hashing loop; consider restoring `ALWAYS_INLINE` and re-adding
the compile-time guard so prefetch is skipped where it previously was (or
justify/measure the new behavior explicitly).
##########
be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp:
##########
@@ -53,13 +53,25 @@ static constexpr StreamingHtMinReductionEntry
STREAMING_HT_MIN_REDUCTION[] = {
{.min_ht_mem = 16 * 1024 * 1024, .streaming_ht_min_reduction = 2.0},
};
+static constexpr StreamingHtMinReductionEntry
SINGLE_BE_STREAMING_HT_MIN_REDUCTION[] = {
+ // Expand up to L2 cache always.
+ {.min_ht_mem = 0, .streaming_ht_min_reduction = 0.0},
+ // Expand into L3 cache if we look like we're getting some reduction.
+ // At present, The L2 cache is generally 1024k or more
+ {.min_ht_mem = 256 * 1024, .streaming_ht_min_reduction = 5.0},
Review Comment:
Same inconsistency as in `streaming_aggregation_operator.cpp`: the comment
references ~1024k L2 cache sizing, but the threshold uses `256 * 1024`. Please
reconcile the comment and the chosen cutoff so future tuning doesn’t rely on
misleading documentation.
##########
be/src/pipeline/exec/streaming_aggregation_operator.cpp:
##########
@@ -66,20 +66,32 @@ struct StreamingHtMinReductionEntry {
// of the machine that we're running on.
static constexpr StreamingHtMinReductionEntry STREAMING_HT_MIN_REDUCTION[] = {
// Expand up to L2 cache always.
- {0, 0.0},
+ {.min_ht_mem = 0, .streaming_ht_min_reduction = 0.0},
// Expand into L3 cache if we look like we're getting some reduction.
// At present, The L2 cache is generally 1024k or more
- {1024 * 1024, 1.1},
+ {.min_ht_mem = 256 * 1024, .streaming_ht_min_reduction = 1.1},
// Expand into main memory if we're getting a significant reduction.
// The L3 cache is generally 16MB or more
- {16 * 1024 * 1024, 2.0},
+ {.min_ht_mem = 16 * 1024 * 1024, .streaming_ht_min_reduction = 2.0},
+};
+
+static constexpr StreamingHtMinReductionEntry
SINGLE_BE_STREAMING_HT_MIN_REDUCTION[] = {
+ // Expand up to L2 cache always.
+ {.min_ht_mem = 0, .streaming_ht_min_reduction = 0.0},
+ // Expand into L3 cache if we look like we're getting some reduction.
+ // At present, The L2 cache is generally 1024k or more
+ {.min_ht_mem = 256 * 1024, .streaming_ht_min_reduction = 5.0},
Review Comment:
The comments state the L2 cache threshold is “generally 1024k or more”, but
the code now switches at `256 * 1024`. Either the comment should be updated to
match the new threshold rationale, or the threshold should be adjusted to
reflect the documented intent (same issue in both the normal and single-backend
tables).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]