nickva commented on code in PR #5602:
URL: https://github.com/apache/couchdb/pull/5602#discussion_r2317648749


##########
src/couch_srt/src/couch_srt_query.erl:
##########
@@ -0,0 +1,985 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_srt_query).
+
+-feature(maybe_expr, enable).
+
+-include_lib("stdlib/include/ms_transform.hrl").
+-include_lib("couch_srt.hrl").
+
+%% aggregate query api
+-export([
+    active/0,
+    active/1,
+    active_coordinators/0,
+    active_coordinators/1,
+    active_workers/0,
+    active_workers/1,
+
+    all/0,
+    find_by_nonce/1,
+    find_by_pid/1,
+    find_by_pidref/1,
+    find_workers_by_pidref/1,
+
+    query_matcher/1,
+    query_matcher/2,
+    query_matcher_rows/1,
+    query_matcher_rows/2,
+
+    query/1,
+    from/1,
+    group_by/1,
+    group_by/2,
+    sort_by/1,
+    sort_by/2,
+    count_by/1,
+    options/1,
+    unlimited/0,
+    with_limit/1,
+
+    run/1,
+    unsafe_run/1
+]).
+
+-export_type([
+    query/0,
+    query_expression/0,
+    query_option/0
+]).
+
+-type aggregation_keys_fun() :: fun((Ele :: #rctx{}) -> aggregation_values() | 
aggregation_value()).
+-type value_key_fun() :: fun((Ele :: #rctx{}) -> aggregation_values() | 
aggregation_value()).
+-type count_key_fun() :: fun((A :: pos_integer(), B :: pos_integer()) -> 
pos_integer()).
+
+-record(selector, {
+    aggregation_keys = undefined ::
+        rctx_field()
+        | [rctx_field()]
+        | undefined,
+    value_key = undefined ::
+        rctx_field()
+        | undefined
+}).
+
+-record(unsafe_selector, {
+    aggregation_keys = undefined ::
+        aggregation_keys_fun()
+        | rctx_field()
+        | [rctx_field()]
+        | undefined,
+    value_key = undefined ::
+        value_key_fun()
+        | rctx_field()
+        | undefined
+}).
+
+-record(query_options, {
+    limit = undefined :: pos_integer() | unlimited | undefined,
+    is_safe = undefined :: boolean() | undefined
+}).
+
+-type aggregation() :: group_by | sort_by | count_by.
+
+-record(query, {
+    matcher = undefined :: matcher_name() | all | undefined,
+    selector = undefined :: #selector{} | #unsafe_selector{} | undefined,
+    limit = undefined :: pos_integer() | unlimited | undefined,
+    aggregation = undefined :: aggregation() | undefined,
+    is_safe = true :: boolean()
+}).
+
+-record(from, {
+    matcher = undefined :: matcher_name() | all | undefined,
+    is_safe = undefined :: boolean() | undefined
+}).
+
+-opaque query() :: #query{}.
+-opaque query_expression() ::
+    #from{}
+    | #query_options{}
+    | #selector{}
+    | #unsafe_selector{}
+    | query_option()
+    | {aggregation(), #selector{}}
+    | {aggregation(), #unsafe_selector{}}.
+-opaque query_option() ::
+    {limit, pos_integer() | unlimited | undefined}.
+
+%%
+%% Aggregate query API
+%%
+
+active() ->
+    active_int(all).
+
+active_coordinators() ->
+    active_int(coordinators).
+
+active_workers() ->
+    active_int(workers).
+
+%% active_json() or active(json)?
+active(json) ->
+    to_json_list(active_int(all)).
+
+active_coordinators(json) ->
+    to_json_list(active_int(coordinators)).
+
+active_workers(json) ->
+    to_json_list(active_int(workers)).
+
+active_int(coordinators) ->
+    select_by_type(coordinators);
+active_int(workers) ->
+    select_by_type(workers);
+active_int(all) ->
+    select_by_type(all).
+
+select_by_type(coordinators) ->
+    ets:select(?CSRT_ETS, ets:fun2ms(fun(#rctx{type = #coordinator{}} = R) -> 
R end));
+select_by_type(workers) ->
+    ets:select(?CSRT_ETS, ets:fun2ms(fun(#rctx{type = #rpc_worker{}} = R) -> R 
end));
+select_by_type(all) ->
+    ets:tab2list(?CSRT_ETS).
+
+find_by_nonce(Nonce) ->
+    couch_srt_server:match_resource(#rctx{nonce = Nonce}).
+
+find_by_pid(Pid) ->
+    couch_srt_server:match_resource(#rctx{pid_ref = {Pid, '_'}}).
+
+find_by_pidref(PidRef) ->
+    couch_srt_server:match_resource(#rctx{pid_ref = PidRef}).
+
+find_workers_by_pidref(PidRef) ->
+    couch_srt_server:match_resource(#rctx{type = #rpc_worker{from = PidRef}}).
+
+curry_field(Field) ->
+    fun(Ele) -> couch_srt_entry:value(Field, Ele) end.
+
+-spec group_by(Matcher, KeyFun, ValFun) ->
+    {ok, aggregation_result()} | {limit, aggregation_result()}
+when
+    Matcher :: matcher(),
+    KeyFun ::
+        aggregation_keys_fun()
+        | rctx_field()
+        | [rctx_field()],
+    ValFun ::
+        value_key_fun()
+        | rctx_field().
+group_by(Matcher, KeyFun, ValFun) ->
+    AggFun = fun erlang:'+'/2,
+    group_by(Matcher, KeyFun, ValFun, AggFun).
+
+-spec group_by(Matcher, KeyFun, ValFun, AggFun) ->
+    {ok, aggregation_result()} | {limit, aggregation_result()}
+when
+    Matcher :: matcher(),
+    KeyFun ::
+        aggregation_keys_fun()
+        | rctx_field()
+        | [rctx_field()],
+    ValFun ::
+        value_key_fun()
+        | rctx_field(),
+    AggFun ::
+        count_key_fun().
+group_by(Matcher, KeyFun, ValFun, AggFun) ->
+    group_by(Matcher, KeyFun, ValFun, AggFun, query_cardinality_limit()).
+
+-spec all() ->
+    matcher().
+
+all() ->
+    Spec = ets:fun2ms(fun(#rctx{} = R) -> R end),
+    {Spec, ets:match_spec_compile(Spec)}.
+
+%% eg: group_by(all(), username, docs_read).
+%% eg: ^^ or: group_by(all(), [username, docs_read], ioq_calls).
+%% eg: group_by(all(), [username, dbname, js_filter], docs_read).
+%% eg: group_by(all(), [username, dbname, js_filter], ioq_calls).
+%% eg: group_by(all(), [username, dbname, js_filter], get_kv_node).
+-spec group_by(Matcher, KeyFun, ValFun, AggFun, Limit) ->
+    {ok, aggregation_result()} | {limit, aggregation_result()}
+when
+    Matcher :: matcher(),
+    KeyFun ::
+        aggregation_keys_fun()
+        | rctx_field()
+        | [rctx_field()],
+    ValFun ::
+        value_key_fun()
+        | rctx_field(),
+    AggFun ::
+        count_key_fun(),
+    Limit :: pos_integer().
+
+group_by(Matcher, KeyL, ValFun, AggFun, Limit) when is_list(KeyL) ->
+    KeyFun = fun(Ele) -> list_to_tuple([couch_srt_entry:value(Key, Ele) || Key 
<- KeyL]) end,
+    group_by(Matcher, KeyFun, ValFun, AggFun, Limit);
+group_by(Matcher, Key, ValFun, AggFun, Limit) when is_atom(Key) ->
+    group_by(Matcher, curry_field(Key), ValFun, AggFun, Limit);
+group_by(Matcher, KeyFun, Val, AggFun, Limit) when is_atom(Val) ->
+    group_by(Matcher, KeyFun, curry_field(Val), AggFun, Limit);
+group_by(Matcher, KeyFun, ValFun, AggFun, Limit) ->
+    %% This is a space versus speed tradeoff. Both query modes only filter
+    %% through the table until `Limit` rows have been returned and both will
+    %% utilize the compiled match_specs to do the testing, but
+    %% `group_by_fold/5` will sequentially copy in every row and test it
+    %% locally against the compiled match_spec using `ets:foldl/3`, whereas
+    %% `group_by_select/5` does the filtering internally in the ETS NIF, by way
+    %% of passing the uncompiled match_spec to `ets:select/3`. The tradeoff
+    %% here is that `ets:select` will copy `Limit` full `#rctx{}` records into
+    %% this caller process, which we then aggregate over, as opposed to
+    %% `ets:foldl` only sequentially loading a singular `#rctx{}` and
+    %% extracting the relevant field value to aggregate on.
+    %%
+    %% The use of `query_by_fold` should only be needed if `Limit` is
+    %% drastically increased, and even then, the efficiencies gained here with
+    %% `query_by_fold` are short-lived until we can encode the values needed by
+    %% `ValFun` into the match_spec return fields, at which point it becomes
+    %% strictly worse.
+    %%
+    %% NOTE: This discrepancy of `ets:match_spec_run` taking a
+    %% `comp_match_spec()` vs `ets:select` taking a `match_spec()` is why our
+    %% CSRT `matcher()` type_spec funnels around both versions instead of just
+    %% a reference to the compiled spec stored by ETS internally.
+    case config:get_boolean(?CSRT, "use_query_fold", false) of

Review Comment:
   How large a difference is there between the two methods? Instead of
keeping both around, I wonder if there is a middle ground: querying in
smaller batches (e.g. limit = 500) and running them in a row until we reach
the user's `limit`. It just seems like a lot of complexity added for a
method that might be called once in a while by a single operator debugging
or reporting large IO usage, or investigating some performance issues.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscr...@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to