chewbranca commented on code in PR #5213: URL: https://github.com/apache/couchdb/pull/5213#discussion_r1814085042
########## src/couch_stats/src/couch_stats_resource_tracker.erl: ##########
@@ -0,0 +1,912 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+% http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_stats_resource_tracker).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3,
+    terminate/2
+]).
+
+%% PidRef API
+-export([
+    get_pid_ref/0,
+    set_pid_ref/1,
+    create_pid_ref/0,
+    close_pid_ref/0, close_pid_ref/1
+]).
+
+%% Context API
+-export([
+    create_resource/1,
+    create_context/5,
+    create_coordinator_context/2,
+    create_worker_context/3,
+    destroy_context/0, destroy_context/1,
+
+    get_resource/0, get_resource/1,
+
+    set_context_dbname/1, set_context_dbname/2,
+    set_context_handler_fun/1, set_context_handler_fun/2,
+    set_context_username/1, set_context_username/2
+]).
+
+%% stats collection api
+-export([
+    is_enabled/0,
+
+    inc/1, inc/2,
+    maybe_inc/2,
+    accumulate_delta/1,
+    make_delta/0,
+
+    ioq_called/0,
+
+    should_track/1
+]).
+
+%% aggregate query api
+-export([
+    active/0, active/1,
+    active_coordinators/0, active_coordinators/1,
+    active_workers/0, active_workers/1,
+
+    count_by/1,
+    group_by/2, group_by/3,
+    sorted/1,
+    sorted_by/1, sorted_by/2, sorted_by/3,
+
+    find_by_pid/1,
+    find_by_pidref/1,
+    find_by_nonce/1,
+    find_workers_by_pidref/1
+]).
+
+%% Process lifetime reporting api
+-export([
+    log_process_lifetime_report/1,
+    is_logging_enabled/0,
+    logging_enabled/0,
+    should_log/1, should_log/2,
+    tracker/1
+]).
+
+-include_lib("couch/include/couch_db.hrl").
+-include_lib("stdlib/include/ms_transform.hrl").
+
+%% Module pdict markers
+-define(DELTA_TA, csrt_delta_ta).
+-define(DELTA_TZ, csrt_delta_tz). %% T Zed instead of T0
+-define(PID_REF, csrt_pid_ref). %% track local ID
+-define(TRACKER_PID, csrt_tracker). %% tracker pid
+
+-define(MANGO_EVAL_MATCH, mango_eval_match).
+-define(DB_OPEN_DOC, docs_read).
+-define(DB_OPEN, db_open).
+-define(COUCH_SERVER_OPEN, db_open).
+-define(COUCH_BT_GET_KP_NODE, get_kp_node).
+-define(COUCH_BT_GET_KV_NODE, get_kv_node).
+-define(COUCH_BT_WRITE_KP_NODE, write_kp_node).
+-define(COUCH_BT_WRITE_KV_NODE, write_kv_node).
+-define(COUCH_JS_FILTER, js_filter).
+-define(COUCH_JS_FILTER_ERROR, js_filter_error).
+-define(COUCH_JS_FILTERED_DOCS, js_filtered_docs).
+-define(IOQ_CALLS, ioq_calls).
+-define(ROWS_READ, rows_read).
+
+%% TODO: overlap between this and couch btree fold invocations
+%% TODO: need some way to distinguish folds on views vs find vs all_docs
+-define(FRPC_CHANGES_ROW, changes_processed).
+-define(FRPC_CHANGES_RETURNED, changes_returned).
+
+-record(st, {}).
+
+-record(rctx, {
+    %% Metadata
+    started_at = tnow(),
+    updated_at = tnow(),
+    pid_ref,
+    mfa,
+    nonce,
+    from,
+    type = unknown, %% unknown/background/system/rpc/coordinator/fabric_rpc/etc_rpc/etc
+    dbname,
+    username,
+    path,
+
+    %% Stats counters
+    db_open = 0,
+    docs_read = 0,
+    rows_read = 0,
+    changes_processed = 0,
+    changes_returned = 0,
+    ioq_calls = 0,
+    io_bytes_read = 0,
+    io_bytes_written = 0,
+    js_evals = 0,
+    js_filter = 0,
+    js_filter_error = 0,
+    js_filtered_docs = 0,
+    mango_eval_match = 0,
+    %% TODO: switch record definitions to be macro based, eg:
+    %% ?COUCH_BT_GET_KP_NODE = 0,
+    get_kv_node = 0,
+    get_kp_node = 0,
+    write_kv_node = 0,
+    write_kp_node = 0
+}).
+
+%%
+%% Public API
+%%
+
+%%
+%% PidRef operations
+%%
+
+get_pid_ref() ->
+    get(?PID_REF).
+
+set_pid_ref(PidRef) ->
+    erlang:put(?PID_REF, PidRef),
+    PidRef.
+
+create_pid_ref() ->
+    case get_pid_ref() of
+        undefined ->
+            ok;
+        PidRef0 ->
+            %% TODO: what to do when it already exists?
+            throw({epidexist, PidRef0}),
+            close_pid_ref(PidRef0)
+    end,
+    PidRef = {self(), make_ref()},
+    set_pid_ref(PidRef),
+    PidRef.
+
+close_pid_ref() ->
+    close_pid_ref(get_pid_ref()).
+
+%%close_pid_ref(undefined) ->
+%%    undefined;
+close_pid_ref(_PidRef) ->
+    erase(?PID_REF).
+
+get_resource() ->
+    get_resource(get_pid_ref()).
+
+get_resource(undefined) ->
+    undefined;
+get_resource(PidRef) ->
+    catch case ets:lookup(?MODULE, PidRef) of
+        [#rctx{}=Rctx] ->
+            Rctx;
+        [] ->
+            undefined
+    end.
+
+%% monotonic time now in milliseconds
+tnow() ->
+    erlang:monotonic_time(millisecond).
+
+is_enabled() ->
+    config:get_boolean(?MODULE_STRING, "enabled", true).
+
+%%
+%% Aggregate query API
+%%
+
+active() -> active_int(all).
+active_coordinators() -> active_int(coordinators).
+active_workers() -> active_int(workers).
+
+%% active_json() or active(json)?
+active(json) -> to_json_list(active_int(all)).
+active_coordinators(json) -> to_json_list(active_int(coordinators)).
+active_workers(json) -> to_json_list(active_int(workers)).
+
+
+active_int(coordinators) ->
+    select_by_type(coordinators);
+active_int(workers) ->
+    select_by_type(workers);
+active_int(all) ->
+    select_by_type(all).
+
+
+select_by_type(coordinators) ->
+    ets:select(?MODULE, ets:fun2ms(fun(#rctx{type = {coordinator, _, _}} = R) -> R end));
+select_by_type(workers) ->
+    ets:select(?MODULE, ets:fun2ms(fun(#rctx{type = {worker, _, _}} = R) -> R end));
+select_by_type(all) ->
+    ets:tab2list(?MODULE).
+
+find_by_nonce(Nonce) ->
+    ets:match_object(?MODULE, ets:fun2ms(fun(#rctx{nonce = Nonce1} = R) when Nonce =:= Nonce1 -> R end)).
+
+find_by_pid(Pid) ->
+    [R || #rctx{} = R <- ets:match_object(?MODULE, #rctx{pid_ref={Pid, '_'}, _ = '_'})].
+
+find_by_pidref(PidRef) ->
+    [R || #rctx{} = R <- ets:match_object(?MODULE, #rctx{pid_ref=PidRef, _ = '_'})].
+
+find_workers_by_pidref(PidRef) ->
+    [R || #rctx{} = R <- ets:match_object(?MODULE, #rctx{from=PidRef, _ = '_'})].
+
+field(#rctx{pid_ref=Val}, pid_ref) -> Val;
+%% NOTE: Pros and cons to doing these convert functions here
+%% Ideally, this would be done later so as to prefer the core data structures
+%% as long as possible, but we currently need the output of this function to
+%% be jiffy:encode'able. The tricky bit is dynamically encoding the group_by
+%% structure provided by the caller of *_by aggregator functions below.
+%% For now, we just always return jiffy:encode'able data types.
+field(#rctx{mfa=Val}, mfa) -> convert_mfa(Val);
+field(#rctx{nonce=Val}, nonce) -> Val;
+field(#rctx{from=Val}, from) -> Val;
+field(#rctx{type=Val}, type) -> convert_type(Val);
+field(#rctx{dbname=Val}, dbname) -> Val;
+field(#rctx{username=Val}, username) -> Val;
+field(#rctx{path=Val}, path) -> Val;
+field(#rctx{db_open=Val}, db_open) -> Val;
+field(#rctx{docs_read=Val}, docs_read) -> Val;
+field(#rctx{rows_read=Val}, rows_read) -> Val;
+field(#rctx{changes_processed=Val}, changes_processed) -> Val;
+field(#rctx{changes_returned=Val}, changes_returned) -> Val;
+field(#rctx{ioq_calls=Val}, ioq_calls) -> Val;
+field(#rctx{io_bytes_read=Val}, io_bytes_read) -> Val;
+field(#rctx{io_bytes_written=Val}, io_bytes_written) -> Val;
+field(#rctx{js_evals=Val}, js_evals) -> Val;
+field(#rctx{js_filter=Val}, js_filter) -> Val;
+field(#rctx{js_filter_error=Val}, js_filter_error) -> Val;
+field(#rctx{js_filtered_docs=Val}, js_filtered_docs) -> Val;
+field(#rctx{mango_eval_match=Val}, mango_eval_match) -> Val;
+field(#rctx{get_kv_node=Val}, get_kv_node) -> Val;
+field(#rctx{get_kp_node=Val}, get_kp_node) -> Val.
+
+curry_field(Field) ->
+    fun(Ele) -> field(Ele, Field) end.
+
+count_by(KeyFun) ->
+    group_by(KeyFun, fun(_) -> 1 end).
+
+group_by(KeyFun, ValFun) ->
+    group_by(KeyFun, ValFun, fun erlang:'+'/2).
+
+%% eg: group_by(mfa, docs_read).
+%% eg: group_by(fun(#rctx{mfa=MFA,docs_read=DR}) -> {MFA, DR} end, ioq_calls).
+%% eg: ^^ or: group_by([mfa, docs_read], ioq_calls).
+%% eg: group_by([username, dbname, mfa], docs_read).
+%% eg: group_by([username, dbname, mfa], ioq_calls).
+%% eg: group_by([username, dbname, mfa], js_filters).
+group_by(KeyL, ValFun, AggFun) when is_list(KeyL) ->
+    KeyFun = fun(Ele) -> list_to_tuple([field(Ele, Key) || Key <- KeyL]) end,
+    group_by(KeyFun, ValFun, AggFun);
+group_by(Key, ValFun, AggFun) when is_atom(Key) ->
+    group_by(curry_field(Key), ValFun, AggFun);
+group_by(KeyFun, Val, AggFun) when is_atom(Val) ->
+    group_by(KeyFun, curry_field(Val), AggFun);
+group_by(KeyFun, ValFun, AggFun) ->
+    FoldFun = fun(Ele, Acc) ->
+        Key = KeyFun(Ele),
+        Val = ValFun(Ele),
+        CurrVal = maps:get(Key, Acc, 0),
+        NewVal = AggFun(CurrVal, Val),
+        %% TODO: should we skip here? how to make this optional?
+        case NewVal > 0 of
+            true ->
+                maps:put(Key, NewVal, Acc);
+            false ->
+                Acc
+        end
+    end,
+    ets:foldl(FoldFun, #{}, ?MODULE).
+
+%% Sorts largest first
+sorted(Map) when is_map(Map) ->
+    lists:sort(fun({_K1, A}, {_K2, B}) -> B < A end, maps:to_list(Map)).
+
+shortened(L) ->
+    lists:sublist(L, 10).
+
+%% eg: sorted_by([username, dbname, mfa], ioq_calls)
+%% eg: sorted_by([dbname, mfa], doc_reads)
+sorted_by(KeyFun) -> shortened(sorted(count_by(KeyFun))).
+sorted_by(KeyFun, ValFun) -> shortened(sorted(group_by(KeyFun, ValFun))).
+sorted_by(KeyFun, ValFun, AggFun) -> shortened(sorted(group_by(KeyFun, ValFun, AggFun))).
+
+%%
+%% Conversion API for outputting JSON
+%%
+
+convert_mfa(MFA) when is_list(MFA) ->
+    list_to_binary(MFA);
+convert_mfa({M0, F0, A0}) ->
+    M = atom_to_binary(M0),
+    F = atom_to_binary(F0),
+    A = integer_to_binary(A0),
+    <<M/binary, ":", F/binary, "/", A/binary>>;
+convert_mfa(null) ->
+    null;
+convert_mfa(undefined) ->
+    null.
+
+convert_type(Atom) when is_atom(Atom) ->
+    atom_to_binary(Atom);
+convert_type({coordinator, Verb0, Atom0}) when is_atom(Atom0) ->
+    Verb = atom_to_binary(Verb0),
+    Atom = atom_to_binary(Atom0),
+    <<"coordinator:", Verb/binary, ":", Atom/binary>>;
+convert_type({coordinator, Verb0, Path0}) ->
+    Verb = atom_to_binary(Verb0),
+    Path = list_to_binary(Path0),
+    <<"coordinator:", Verb/binary, ":", Path/binary>>;
+convert_type({worker, M0, F0}) ->
+    M = atom_to_binary(M0),
+    F = atom_to_binary(F0),
+    <<"worker:", M/binary, ":", F/binary>>;
+convert_type(null) ->
+    null;
+convert_type(undefined) ->
+    null.
+
+convert_path(undefined) ->
+    null;
+convert_path(Path) when is_binary(Path) ->
+    Path.
+
+convert_pidref({Parent0, ParentRef0}) ->
+    Parent = convert_pid(Parent0),
+    ParentRef = convert_ref(ParentRef0),
+    <<Parent/binary, ":", ParentRef/binary>>;
+convert_pidref(null) ->
+    null;
+convert_pidref(undefined) ->
+    null.
+
+convert_pid(Pid) when is_pid(Pid) ->
+    ?l2b(pid_to_list(Pid)).
+
+convert_ref(Ref) when is_reference(Ref) ->
+    ?l2b(ref_to_list(Ref)).
+
+to_json(#rctx{}=Rctx) ->
+    #rctx{
+        updated_at = TP,
+        started_at = TInit,
+        pid_ref = PidRef,
+        mfa = MFA,
+        nonce = Nonce,
+        from = From,
+        dbname = DbName,
+        path = Path,
+        username = UserName,
+        db_open = DbOpens,
+        docs_read = DocsRead,
+        rows_read = RowsRead,
+        js_filter = JSFilters,
+        js_filter_error = JSFilterErrors,
+        js_filtered_docs = JSFilteredDocss,
+        type = Type,
+        get_kp_node = GetKpNodes,
+        get_kv_node = GetKvNodes,
+        %%write_kp_node = WriteKpNodes,
+        %%write_kv_node = WriteKvNodes,
+        changes_returned = ChangesReturned,
+        ioq_calls = IoqCalls
+    } = Rctx,
+
+    #{
+        updated_at => TP,
+        started_at => TInit,
+        pid_ref => convert_pidref(PidRef),
+        mfa => convert_mfa(MFA),
+        nonce => Nonce,
+        from => convert_pidref(From),
+        dbname => DbName,
+        path => convert_path(Path),
+        username => UserName,
+        db_open => DbOpens,
+        docs_read => DocsRead,
+        js_filter => JSFilters,
+        js_filter_error => JSFilterErrors,
+        js_filtered_docs => JSFilteredDocss,
+        rows_read => RowsRead,
+        type => convert_type(Type),
+        get_kp_nodes => GetKpNodes,
+        get_kv_nodes => GetKvNodes,
+        %%write_kp_nodes => WriteKpNodes,
+        %%write_kv_nodes => WriteKvNodes,
+        changes_returned => ChangesReturned,
+        ioq_calls => IoqCalls
+    }.
+
+%%
+%% Context lifecycle API
+%%
+
+create_resource(#rctx{} = Rctx) ->
+    catch ets:insert(?MODULE, Rctx).
+
+create_worker_context(From, {M,F,_A} = MFA, Nonce) ->
+    case is_enabled() of
+        true ->
+            create_context(MFA, {worker, M, F}, null, From, Nonce);
+        false ->
+            false
+    end.
+
+create_coordinator_context(#httpd{} = Req, Path0) ->
+    case is_enabled() of
+        true ->
+            #httpd{
+                method = Verb,
+                nonce = Nonce
+                %%path_parts = Parts
+            } = Req,
+            %%Path = list_to_binary([$/ | io_lib:format("~p", [Parts])]),
+            Path = list_to_binary([$/ | Path0]),
+            Type = {coordinator, Verb, init},
+            create_context(null, Type, Path, null, Nonce);
+        false ->
+            false
+    end.
+
+create_context(MFA, Type, Path, From, Nonce) ->
+    PidRef = create_pid_ref(),
+    Rctx = #rctx{
+        from = From,
+        pid_ref = PidRef,
+        mfa = MFA,
+        nonce = Nonce,
+        path = Path,
+        type = Type
+    },
+    erlang:put(?DELTA_TZ, Rctx),
+    create_resource(Rctx),
+    track(Rctx),
+    PidRef.
+
+set_context_dbname(DbName) ->
+    set_context_dbname(DbName, get_pid_ref()).
+
+set_context_dbname(_, undefined) ->
+    ok;
+set_context_dbname(DbName, PidRef) ->
+    is_enabled() andalso update_element(PidRef, [{#rctx.dbname, DbName}]).
+
+set_context_handler_fun(Fun) when is_function(Fun) ->
+    set_context_handler_fun(Fun, get_pid_ref()).
+set_context_handler_fun(_, undefined) ->
+    ok;
+set_context_handler_fun(Fun, PidRef) when is_function(Fun) ->
+    case is_enabled() of
+        false ->
+            ok;
+        true ->
+            FunName = erlang:fun_to_list(Fun),
+            #rctx{type={coordinator, Verb, _}} = get_resource(),
+            Update = [{#rctx.type, {coordinator, Verb, FunName}}],
+            update_element(PidRef, Update)
+    end.
+
+set_context_username(null) ->
+    ok;
+set_context_username(undefined) ->
+    ok;
+set_context_username(User) ->
+    set_context_username(User, get_pid_ref()).
+
+set_context_username(null, _) ->
+    ok;
+set_context_username(_, undefined) ->
+    ok;
+set_context_username(#httpd{user_ctx = Ctx}, PidRef) ->
+    set_context_username(Ctx, PidRef);
+set_context_username(#user_ctx{name = Name}, PidRef) ->
+    set_context_username(Name, PidRef);
+set_context_username(UserName, PidRef) ->
+    is_enabled() andalso update_element(PidRef, [{#rctx.username, UserName}]).
+
+destroy_context() ->
+    destroy_context(get_pid_ref()).
+
+destroy_context(undefined) ->
+    ok;
+destroy_context({_, _} = PidRef) ->
+    stop_tracker(get_tracker()),
+    close_pid_ref(PidRef),
+    ok.
+
+%% Stat collection API
+
+inc(Key) ->
+    inc(Key, 1).
+
+%% TODO: inc(io_bytes_read, N) ->
+%% TODO: inc(io_bytes_written, N) ->
+%% TODO: inc(js_evals, N) ->
+inc(?DB_OPEN, N) ->
+    update_counter(#rctx.?DB_OPEN, N);
+inc(?ROWS_READ, N) ->
+    update_counter(#rctx.?ROWS_READ, N);
+inc(?FRPC_CHANGES_RETURNED, N) ->
+    update_counter(#rctx.?FRPC_CHANGES_RETURNED, N);
+inc(?IOQ_CALLS, N) ->
+    update_counter(#rctx.?IOQ_CALLS, N);
+inc(?COUCH_JS_FILTER, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTER, N);
+inc(?COUCH_JS_FILTER_ERROR, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTER_ERROR, N);
+inc(?COUCH_JS_FILTERED_DOCS, N) ->
+    update_counter(#rctx.?COUCH_JS_FILTERED_DOCS, N);
+inc(?MANGO_EVAL_MATCH, N) ->
+    update_counter(#rctx.?MANGO_EVAL_MATCH, N);
+inc(?DB_OPEN_DOC, N) ->
+    update_counter(#rctx.?DB_OPEN_DOC, N);
+inc(?FRPC_CHANGES_ROW, N) ->
+    update_counter(#rctx.?ROWS_READ, N); %% TODO: rework double use of rows_read
+inc(?COUCH_BT_GET_KP_NODE, N) ->
+    update_counter(#rctx.?COUCH_BT_GET_KP_NODE, N);
+inc(?COUCH_BT_GET_KV_NODE, N) ->
+    update_counter(#rctx.?COUCH_BT_GET_KV_NODE, N);
+inc(_, _) ->
+    %% inc needs to allow unknown types to pass for accumulate_update to handle
+    %% updates from nodes with newer data formats
+    0.
+
+maybe_inc([mango, evaluate_selector], Val) ->
+    inc(?MANGO_EVAL_MATCH, Val);
+maybe_inc([couchdb, database_reads], Val) ->
+    inc(?DB_OPEN_DOC, Val);
+maybe_inc([fabric_rpc, changes, processed], Val) ->
+    inc(?FRPC_CHANGES_ROW, Val);
+maybe_inc([fabric_rpc, changes, returned], Val) ->
+    inc(?FRPC_CHANGES_RETURNED, Val);
+maybe_inc([fabric_rpc, view, rows_read], Val) ->
+    inc(?ROWS_READ, Val);
+maybe_inc([couchdb, couch_server, open], Val) ->
+    inc(?DB_OPEN, Val);
+maybe_inc([couchdb, btree, get_node, kp_node], Val) ->
+    inc(?COUCH_BT_GET_KP_NODE, Val);
+maybe_inc([couchdb, btree, get_node, kv_node], Val) ->
+    inc(?COUCH_BT_GET_KV_NODE, Val);
+%% The write_node logic won't pickup writes as none of the RPC
+%% processes actually perform the write operation
+%% TODO: bubble up induced work from other processes
+maybe_inc([couchdb, btree, write_node, kp_node], Val) ->
+    inc(?COUCH_BT_WRITE_KP_NODE, Val);
+maybe_inc([couchdb, btree, write_node, kv_node], Val) ->
+    inc(?COUCH_BT_WRITE_KV_NODE, Val);
+maybe_inc([couchdb, query_server, js_filter_error], Val) ->
+    inc(?COUCH_JS_FILTER_ERROR, Val);
+maybe_inc([couchdb, query_server, js_filter], Val) ->
+    inc(?COUCH_JS_FILTER, Val);
+maybe_inc([couchdb, query_server, js_filtered_docs], Val) ->
+    inc(?COUCH_JS_FILTERED_DOCS, Val);
+maybe_inc(_Metric, _Val) ->
+    %%io:format("SKIPPING MAYBE_INC METRIC[~p]: ~p~n", [Val, Metric]),
+    0.
+
+%% TODO: update stats_descriptions.cfg for relevant apps
+should_track([fabric_rpc, all_docs, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, changes, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, changes, processed]) ->
+    is_enabled();
+should_track([fabric_rpc, changes, returned]) ->
+    is_enabled();
+should_track([fabric_rpc, map_view, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, reduce_view, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, get_all_security, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, open_doc, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, update_docs, spawned]) ->
+    is_enabled();
+should_track([fabric_rpc, open_shard, spawned]) ->
+    is_enabled();
+should_track([mango_cursor, view, all_docs]) ->
+    is_enabled();
+should_track([mango_cursor, view, idx]) ->
+    is_enabled();
+should_track(_Metric) ->
+    %%io:format("SKIPPING METRIC: ~p~n", [Metric]),
+    false.
+
+ioq_called() ->
+    inc(ioq_calls).
+
+accumulate_delta(Delta) when is_map(Delta) ->
+    %% TODO: switch to creating a batch of updates to invoke a single
+    %% update_counter rather than sequentially invoking it for each field
+    is_enabled() andalso maps:foreach(fun inc/2, Delta);
+accumulate_delta(undefined) ->
+    ok.
+
+make_delta() ->
+    TA = case get(?DELTA_TA) of
+        undefined ->
+            %% Need to handle this better, can't just make a new T0 at T' as
+            %% the timestamps will be identical causing a divide by zero error.
+            %%
+            %% Realistically need to ensure that all invocations of database
+            %% operations set T0 appropriately. Perhaps it's possible to do
+            %% this in the couch_db:open chain, and then similarly, in
+            %% couch_server, and uhhhh... couch_file, and...
+            %%
+            %% I think we need some type of approach for establishing a T0 that
+            %% doesn't result in outrageous deltas. For now zero out the
+            %% microseconds field, or subtract a second on the off chance that
+            %% microseconds is zero. I'm not up to date on the latest Erlang
+            %% time libraries and don't remember how to easily get an
+            %% `os:timestamp()` out of now() - 100ms or some such.
+            %%
+            %% I think it's unavoidable that we'll have some codepaths that do
+            %% not properly instantiate the T0 at spawn resulting in needing to
+            %% do some type of "time warp" or ignoring the timing collection
+            %% entirely. Perhaps if we hoisted out the stats collection into
+            %% the primary flow of the database and funneled that through all
+            %% the function clauses we could then utilize Dialyzer to statically
+            %% analyze and assert all code paths that invoke database
+            %% operations have properly instantiated a T0 at the appropriate
+            %% start time such that we don't have to "fudge" deltas with a
+            %% missing start point, but we're a long ways from that happening
+            %% so I feel it necessary to address the NULL start time.
+
+            %% Track how often we fail to initiate T0 correctly
+            %% Perhaps somewhat naughty we're incrementing stats from within
+            %% couch_stats itself? Might need to handle this differently
+            %% TODO: determine appropriate course of action here
+            %% io:format("~n**********MISSING STARTING DELTA************~n~n", []),
+            couch_stats:increment_counter(
+                [couchdb, csrt, delta_missing_t0]),
+                %%[couch_stats_resource_tracker, delta_missing_t0]),
+
+            case erlang:get(?DELTA_TZ) of
+                undefined ->
+                    TA0 = make_delta_base(),
+                    %% TODO: handle missing deltas, otherwise divide by zero
+                    set_delta_a(TA0),
+                    TA0;
+                TA0 ->
+                    TA0
+            end;
+        #rctx{} = TA0 ->
+            TA0
+    end,
+    TB = get_resource(),
+    Delta = make_delta(TA, TB),
+    set_delta_a(TB),
+    Delta.
+
+make_delta(#rctx{}=TA, #rctx{}=TB) ->
+    Delta = #{
+        docs_read => TB#rctx.docs_read - TA#rctx.docs_read,
+        js_filter => TB#rctx.js_filter - TA#rctx.js_filter,
+        js_filter_error => TB#rctx.js_filter_error - TA#rctx.js_filter_error,
+        js_filtered_docs => TB#rctx.js_filtered_docs - TA#rctx.js_filtered_docs,
+        rows_read => TB#rctx.rows_read - TA#rctx.rows_read,
+        changes_returned => TB#rctx.changes_returned - TA#rctx.changes_returned,
+        get_kp_node => TB#rctx.get_kp_node - TA#rctx.get_kp_node,
+        get_kv_node => TB#rctx.get_kv_node - TA#rctx.get_kv_node,
+        db_open => TB#rctx.db_open - TA#rctx.db_open,
+        ioq_calls => TB#rctx.ioq_calls - TA#rctx.ioq_calls,
+        dt => TB#rctx.updated_at - TA#rctx.updated_at
+    },
+    %% TODO: reevaluate this decision
+    %% Only return non zero (and also positive) delta fields
+    maps:filter(fun(_K,V) -> V > 0 end, Delta);
+make_delta(_, #rctx{}) ->
+    #{error => missing_beg_rctx};
+make_delta(#rctx{}, _) ->
+    #{error => missing_fin_rctx}.
+
+%% TODO: what to do when PidRef=undefined?
+make_delta_base(PidRef) ->
+    %% TODO: extract user_ctx and db/shard from request
+    Now = tnow(),
+    #rctx{
+        pid_ref = PidRef,
+        %% TODO: confirm this subtraction works
+        started_at = Now - 100, %% give us 100ms rewind time for missing T0
+        updated_at = Now
+    }.
+
+make_delta_base() ->
+    make_delta_base(get_pid_ref()).
+
+set_delta_a(TA) ->
+    erlang:put(?DELTA_TA, TA).
+
+update_counter(Field, Count) ->
+    is_enabled() andalso update_counter(get_pid_ref(), Field, Count).
+
+update_counter(undefined, _Field, _Count) ->
+    ok;
+update_counter({_Pid,_Ref}=PidRef, Field, Count) ->
+    %% TODO: mem3 crashes without catch, why do we lose the stats table?
+    is_enabled() andalso catch ets:update_counter(?MODULE, PidRef, {Field, Count}, #rctx{pid_ref=PidRef}).
+
+update_element(undefined, _Update) ->
+    ok;
+update_element({_Pid,_Ref}=PidRef, Update) ->
+    %% TODO: should we take any action when the update fails?
+    is_enabled() andalso catch ets:update_element(?MODULE, PidRef, Update).
+
+%% Process lifetime logging api
+
+track(#rctx{pid_ref=PidRef}) ->
+    case get_tracker() of
+        undefined ->
+            Pid = spawn(?MODULE, tracker, [PidRef]),
+            put_tracker(Pid),
+            Pid;
+        Pid when is_pid(Pid) ->
+            Pid
+    end.
+
+tracker({Pid, _Ref}=PidRef) ->
+    MonRef = erlang:monitor(process, Pid),
+    receive
+        stop ->
+            %% TODO: do we need cleanup here?
+            log_process_lifetime_report(PidRef),
+            catch evict(PidRef),
+            demonitor(MonRef),
+            ok;
+        {'DOWN', MonRef, _Type, _0DPid, _Reason0} ->
+            destroy_context(PidRef),
+            %% TODO: should we pass reason to log_process_lifetime_report?
+            %% Reason = case Reason0 of
+            %%     {shutdown, Shutdown0} ->
+            %%         Shutdown = atom_to_binary(Shutdown0),
+            %%         <<"shutdown: ", Shutdown/binary>>;
+            %%     Reason0 ->
+            %%         Reason0
+            %% end,
+            %% TODO: should we send the induced work delta to the coordinator?
+            log_process_lifetime_report(PidRef),
+            catch evict(PidRef)
+    end.
+
+log_process_lifetime_report(PidRef) ->
+    case is_enabled() andalso is_logging_enabled() of
+        true ->
+            Rctx = get_resource(PidRef),
+            case should_log(Rctx) of
+                true ->
+                    couch_log:report("csrt-pid-usage-lifetime", to_json(Rctx));
+                _ ->
+                    ok
+            end;
+        false ->
+            ok
+    end.
+
+is_logging_enabled() ->

Review Comment:

Alright, I hacked together a `csrt_logger` module in 17021155d that provides two mechanisms for dynamically enabling logging, one simpler and one more complex. There are no compiler warnings, but I haven't actually run it yet or anything wild like that.

The first dynamic logging approach utilizes @iilyak's idea for dedicated type records, initially focusing on the `#worker{}` type here https://github.com/apache/couchdb/commit/17021155db6ae0495e015d7571469a36c426b894#diff-ae4228aa19a3865ec48888021914ba2db0cf64dca17138497c8db885222f4699R77-R90, eg `#rpc_worker{mod=fabric_rpc, func=map_view}`. We then translate that to a key string like `worker.fabric_rpc.map_view` so we can do `config:get_boolean("csrt.should_log", Key, true)` as a direct toggle switch for the core default rpc operations we want to log. We could use a similar mechanism to dynamically enable logging for additional rpc endpoints we don't enable by default, eg dreyfus_rpc or mem3_rpc.

We generate that mapping in a gen_server so that we can do direct atom comparisons on module/function names rather than repeatedly doing string conversions and comparisons; the strings only change at configuration initialization/update time and are not used in the should-log checks. We can then check directly whether that module's worker type is tracked: https://github.com/apache/couchdb/commit/17021155db6ae0495e015d7571469a36c426b894#diff-ae4228aa19a3865ec48888021914ba2db0cf64dca17138497c8db885222f4699R220-R221
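To make that first mechanism concrete, here's roughly what the config-keyed toggle looks like. This is an illustrative sketch, not the exact code in the commit: the `#rpc_worker{}` record shape is assumed from 17021155d, and `worker_key/1` / `should_log_worker/1` are made-up names here.

```erlang
%% Illustrative sketch: #rpc_worker{} shape assumed from the commit;
%% worker_key/1 and should_log_worker/1 are hypothetical helper names.
-record(rpc_worker, {mod, func}).

%% Build the "worker.Mod.Func" config key; intended to run only at config
%% initialization/update time so the hot path stays on atom comparisons.
worker_key(#rpc_worker{mod = M, func = F}) ->
    "worker." ++ atom_to_list(M) ++ "." ++ atom_to_list(F).

%% Direct toggle switch, eg "csrt.should_log" / "worker.fabric_rpc.map_view"
should_log_worker(#rpc_worker{} = Worker) ->
    config:get_boolean("csrt.should_log", worker_key(Worker), true).
```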
The next mechanism, and the core idea of this experimental hack in the commit, is to create a way to dynamically register match spec funs to evaluate against the stream of reports. The idea here is to use `ets:match_spec_compile` to generate a compiled match_spec we can store a reference to in the gen_server; then we can dynamically enable filtered logging by declaring and registering match_specs.

For example, here's our matcher registration handling https://github.com/apache/couchdb/commit/17021155db6ae0495e015d7571469a36c426b894#diff-ae4228aa19a3865ec48888021914ba2db0cf64dca17138497c8db885222f4699R171-R174 along with the compilation https://github.com/apache/couchdb/commit/17021155db6ae0495e015d7571469a36c426b894#diff-ae4228aa19a3865ec48888021914ba2db0cf64dca17138497c8db885222f4699R209-R218, and then we can determine whether we should log a particular `#rctx{}` by checking if it matches any of the registered filters here https://github.com/apache/couchdb/commit/17021155db6ae0495e015d7571469a36c426b894#diff-ae4228aa19a3865ec48888021914ba2db0cf64dca17138497c8db885222f4699R133-R142

For example, here's a function to dynamically generate a match spec parameterized by username:

```erlang
(node1@127.0.0.1)4> F = fun(User) -> ets:fun2ms(fun(#rctx{username=User0}=R) when User =:= User0 -> R end) end.
#Fun<erl_eval.42.3316493>
(node1@127.0.0.1)5> F(bar).
[{#rctx{started_at = '_',updated_at = '_',pid_ref = '_',
        mfa = '_',nonce = '_',from = '_',type = '_',dbname = '_',
        username = '$1',path = '_',db_open = '_',docs_read = '_',
        rows_read = '_',changes_processed = '_',
        changes_returned = '_',ioq_calls = '_',io_bytes_read = '_',
        io_bytes_written = '_',js_evals = '_',js_filter = '_',
        js_filtered_docs = '_',mango_eval_match = '_',
        get_kv_node = '_',get_kp_node = '_',write_kv_node = '_',
        write_kp_node = '_'},
  [{'=:=',{const,bar},'$1'}],
  ['$_']}]
```

Then we can compile that match spec, store the reference in the gen_server, and check whether any of the compiled matchers match the given `#rctx{}` record. This opens up considerable options for dynamic filtered logging of a subset of the csrt report stream. If we adopt a model like this, I think it would be great to extend it out to the main logging systems as well; for instance, we could dynamically enable debug logs for a particular database shard.

I think it might be trickier to dynamically define relations, eg `username=Foo and docs_opened>100`, but it _is_ easy to predefine a series of those types of functions for common use cases like filter on user and doc magnitude, or user and db shard plus magnitude, or whatever (see the sketch below). Perhaps there are some other clever tricks we can do here too. It's a little tricky having to define the match record within the macro expansion call, but as demonstrated above we can dynamically generate parameterized match specs that are then compiled and shareable, so I think there's a lot of potential with this current approach.
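To round that out, here's a rough sketch of how the registered matchers could hang together. `ets:match_spec_compile/1` and `ets:match_spec_run/2` are standard OTP; `register_matcher/3`, `should_log_rctx/2`, and `user_docs_matcher/2` are hypothetical names for illustration, not the code in the commit, and the `#rctx{}` record from couch_stats_resource_tracker is assumed to be in scope.

```erlang
%% Needed for ets:fun2ms/1 below.
-include_lib("stdlib/include/ms_transform.hrl").

%% Hypothetical: the gen_server state carries a map of compiled matchers.
register_matcher(Name, MSpec, Matchers) ->
    Matchers#{Name => ets:match_spec_compile(MSpec)}.

%% Log the #rctx{} if any registered matcher selects it;
%% ets:match_spec_run/2 returns the (possibly empty) list of matches.
should_log_rctx(Rctx, Matchers) ->
    lists:any(
        fun(CompMS) -> [] =/= ets:match_spec_run([Rctx], CompMS) end,
        maps:values(Matchers)
    ).

%% A predefined, parameterized matcher for a common case:
%% a given user plus a docs_read threshold.
user_docs_matcher(User, Threshold) ->
    ets:fun2ms(fun(#rctx{username = U, docs_read = D} = R)
                   when U =:= User, D > Threshold -> R end).
```

So something like `register_matcher(heavy_user, user_docs_matcher(<<"foo">>, 100), Matchers)` would turn on filtered logging for just that slice of the report stream.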