This is an automated email from the ASF dual-hosted git repository. iilyak pushed a commit to branch couch-stats-resource-tracker-http-api in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 39e238721f17960c057cbcb7efb993350ff88eda Author: ILYA Khlopotov <[email protected]> AuthorDate: Thu Jun 12 06:22:58 2025 -0700 Use sliding topK --- src/couch_stats/src/csrt_query.erl | 59 ++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/src/couch_stats/src/csrt_query.erl b/src/couch_stats/src/csrt_query.erl index c46345cd3..cb59f369e 100644 --- a/src/couch_stats/src/csrt_query.erl +++ b/src/couch_stats/src/csrt_query.erl @@ -157,18 +157,61 @@ group_by(KeyFun, ValFun, AggFun, Limit) -> {limit, Acc} end. -%% Sorts largest first -sorted(Map) when is_map(Map) -> - lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)). +%% +%% Auxiliary functions to calculate topK +%% -shortened(L) -> - lists:sublist(L, 10). +-record(topK, { + % we store ordered elements in ascending order + seq = [] :: list(pos_integer()), + % we rely on erlang sorting order where `number < atom` + min = infinite :: infinite | pos_integer(), + max = 0 :: pos_integer(), + size = 0 :: non_neg_integer(), + % capacity cannot be less than 1 + capacity = 1 :: pos_integer() +}). + +new_topK(K) when K >= 1 -> + #topK{capacity = K}. + +% when we are at capacity +% don't bother adding the value since it is less than what we already saw +update_topK(_Key, Value, #topK{size = S, capacity = S, min = Min} = Top) when Value < Min -> + Top#topK{min = Value}; +% when we are at capacity evict smallest value +update_topK(Key, Value, #topK{size = S, capacity = S, max = Max, seq = Seq} = Top) when Value > Max -> + % capacity cannot be less than 1, so we can avoid handling the case when Seq is empty + [_ | Truncated] = Seq, + Top#topK{max = Value, seq = lists:keysort(2, [{Key, Value} | Truncated])}; +% when we are at capacity and value is in between min and max evict smallest value +update_topK(Key, Value, #topK{size = S, capacity = S, seq = Seq} = Top) -> + % capacity cannot be less than 1, so we can avoid handling the case when Seq is empty + [_ | Truncated] = Seq, + Top#topK{seq = lists:keysort(2, [{Key, Value} | Truncated])}; +update_topK(Key, Value, #topK{size = S, min = Min, seq = Seq} = Top) when Value < Min -> + Top#topK{size = S + 1, min = Value, seq = lists:keysort(2, [{Key, Value} | Seq])}; +update_topK(Key, Value, #topK{size = S, max = Max, seq = Seq} = Top) when Value > Max -> + Top#topK{size = S + 1, max = Value, seq = lists:keysort(2, [{Key, Value} | Seq])}; +update_topK(Key, Value, #topK{size = S, seq = Seq} = Top) -> + Top#topK{size = S + 1, seq = lists:keysort(2, [{Key, Value} | Seq])}. + +get_topK(#topK{seq = S}) -> + lists:reverse(S). + +topK(Results, K) -> + TopK = maps:fold(fun update_topK/3, new_topK(K), Results), + get_topK(TopK). %% eg: sorted_by([username, dbname, mfa], ioq_calls) %% eg: sorted_by([dbname, mfa], doc_reads) -sorted_by(KeyFun) -> shortened(sorted(count_by(KeyFun))). -sorted_by(KeyFun, ValFun) -> shortened(sorted(group_by(KeyFun, ValFun))). -sorted_by(KeyFun, ValFun, AggFun) -> shortened(sorted(group_by(KeyFun, ValFun, AggFun))). +sorted_by(KeyFun) -> topK(count_by(KeyFun), 10). +sorted_by(KeyFun, ValFun) -> + {Result, Acc} = group_by(KeyFun, ValFun), + {Result, topK(Acc, 10)}. +sorted_by(KeyFun, ValFun, AggFun) -> + {Result, Acc} = group_by(KeyFun, ValFun, AggFun), + {Result, topK(Acc, 10)}. to_json_list(List) when is_list(List) -> lists:map(fun csrt_util:to_json/1, List).
