This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch improve-scanner-performance in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 00277d5fc4a9d39e88b2f855f2adb515765b1dc9 Author: Nick Vatamaniuc <vatam...@gmail.com> AuthorDate: Fri Aug 1 13:12:36 2025 -0400 Improve scanner performance * If the plugin doesn't need to inspect ddocs at the cluster level, avoid the expensive `fabric:all_docs()` call, both the finder and the conflict checker benefit from this one. * Let the plugins examine the `#full_doc_info{}` structure directly in the new optional `doc_fdi` callback. Previously, the conflict checker had to re-read the FDI just to look for conflict even though the plugin already read the FDI. --- src/couch_scanner/src/couch_scanner_plugin.erl | 59 +++++++++++++++++----- .../src/couch_scanner_plugin_conflict_finder.erl | 10 ++-- .../src/couch_scanner_plugin_find.erl | 6 --- .../test/eunit/couch_scanner_test.erl | 9 ++-- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/src/couch_scanner/src/couch_scanner_plugin.erl b/src/couch_scanner/src/couch_scanner_plugin.erl index 04f394c12..9ff3a2505 100644 --- a/src/couch_scanner/src/couch_scanner_plugin.erl +++ b/src/couch_scanner/src/couch_scanner_plugin.erl @@ -119,11 +119,16 @@ -callback db_opened(St :: term(), Db :: term()) -> {ok, St :: term()}. -% Optional. If doc is not defined, then ddoc_id default action is {skip, St}. -% If it is defined, the default action is {ok, St}. +% Optional. If doc and doc_fdi are not defined, then doc_id default +% action is {skip, St}. If it is defined, the default action is {ok, St}. -callback doc_id(St :: term(), DocId :: binary(), Db :: term()) -> {ok | skip | stop, St1 :: term()}. +% Optional. If doc is not defined, then doc_fdi default action is {stop, St}. +% If it is defined, the default action is {ok, St}. +-callback doc_fdi(St :: term(), FDI :: #full_doc_info{}, Db :: term()) -> + {ok | stop, St1 :: term()}. + % Optional. -callback doc(St :: term(), Db :: term(), #doc{}) -> {ok | stop, St1 :: term()}. @@ -139,6 +144,7 @@ shards/2, db_opened/2, doc_id/3, + doc_fdi/3, doc/3, db_closing/2 ]). @@ -153,6 +159,7 @@ {shards, 2, fun default_shards/3}, {db_opened, 2, fun default_db_opened/3}, {doc_id, 3, fun default_doc_id/3}, + {doc_fdi, 3, fun default_doc_fdi/3}, {doc, 3, fun default_doc/3}, {db_closing, 2, fun default_db_closing/3} ]). @@ -382,7 +389,7 @@ scan_docs_fold(#full_doc_info{id = Id} = FDI, #st{} = St) -> {Go, PSt1} = DocIdCbk(PSt, Id, Db), St1 = St#st{pst = PSt1}, case Go of - ok -> scan_doc(FDI, St1); + ok -> scan_fdi(FDI, St1); skip -> {ok, St1}; stop -> {stop, St1} end; @@ -390,6 +397,16 @@ scan_docs_fold(#full_doc_info{id = Id} = FDI, #st{} = St) -> {ok, St} end. +scan_fdi(#full_doc_info{} = FDI, #st{} = St) -> + #st{db = Db, callbacks = Cbks, pst = PSt} = St, + #{doc_fdi := FDICbk} = Cbks, + {Go, PSt1} = FDICbk(PSt, FDI, Db), + St1 = St#st{pst = PSt1}, + case Go of + ok -> scan_doc(FDI, St1); + stop -> {stop, St1} + end. + scan_doc(#full_doc_info{} = FDI, #st{} = St) -> #st{db = Db, callbacks = Cbks, pst = PSt} = St, St1 = rate_limit(St, doc), @@ -575,6 +592,7 @@ default_shards(Mod, _F, _A) when is_atom(Mod) -> case is_exported(Mod, db_opened, 2) orelse is_exported(Mod, doc_id, 3) orelse + is_exported(Mod, doc_fdi, 3) orelse is_exported(Mod, doc, 3) orelse is_exported(Mod, db_closing, 2) of @@ -586,11 +604,17 @@ default_db_opened(Mod, _F, _A) when is_atom(Mod) -> fun(St, _Db) -> {ok, St} end. default_doc_id(Mod, _F, _A) when is_atom(Mod) -> - case is_exported(Mod, doc, 3) of + case is_exported(Mod, doc, 3) orelse is_exported(Mod, doc_fdi, 3) of true -> fun(St, _DocId, _Db) -> {ok, St} end; false -> fun(St, _DocId, _Db) -> {skip, St} end end. +default_doc_fdi(Mod, _F, _A) when is_atom(Mod) -> + case is_exported(Mod, doc, 3) of + true -> fun(St, _FDI, _Db) -> {ok, St} end; + false -> fun(St, _FDI, _Db) -> {stop, St} end + end. + default_doc(Mod, _F, _A) when is_atom(Mod) -> fun(St, _Db, _Doc) -> {ok, St} end. @@ -624,16 +648,23 @@ shards_by_range(Shards) -> % Design doc fetching helper -fold_ddocs(Fun, #st{dbname = DbName} = Acc) -> - QArgs = #mrargs{ - include_docs = true, - extra = [{namespace, <<"_design">>}] - }, - try - {ok, Acc1} = fabric:all_docs(DbName, [?ADMIN_CTX], Fun, Acc, QArgs), - Acc1 - catch - error:database_does_not_exist -> +fold_ddocs(Fun, #st{dbname = DbName, mod = Mod} = Acc) -> + case is_exported(Mod, ddoc, 3) of + true -> + QArgs = #mrargs{ + include_docs = true, + extra = [{namespace, <<"_design">>}] + }, + try + {ok, Acc1} = fabric:all_docs(DbName, [?ADMIN_CTX], Fun, Acc, QArgs), + Acc1 + catch + error:database_does_not_exist -> + Acc + end; + false -> + % If the plugin doesn't export the ddoc callback, don't bother calling + % fabric:all_docs, as it's expensive Acc end. diff --git a/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl b/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl index 5f8f17527..4d57cbd91 100644 --- a/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl +++ b/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl @@ -19,7 +19,7 @@ complete/1, checkpoint/1, db/2, - doc_id/3 + doc_fdi/3 ]). -include_lib("couch_scanner/include/couch_scanner_plugin.hrl"). @@ -76,12 +76,10 @@ checkpoint(#st{sid = SId, opts = Opts}) -> db(#st{} = St, _DbName) -> {ok, St}. -doc_id(#st{} = St, <<?DESIGN_DOC_PREFIX, _/binary>>, _Db) -> - {skip, St}; -doc_id(#st{} = St, DocId, Db) -> - {ok, #doc_info{revs = Revs}} = couch_db:get_doc_info(Db, DocId), +doc_fdi(#st{} = St, #full_doc_info{} = FDI, Db) -> + #doc_info{revs = Revs} = couch_doc:to_doc_info(FDI), DbName = mem3:dbname(couch_db:name(Db)), - {ok, check(St, DbName, DocId, Revs)}. + {ok, check(St, DbName, FDI#full_doc_info.id, Revs)}. % Private diff --git a/src/couch_scanner/src/couch_scanner_plugin_find.erl b/src/couch_scanner/src/couch_scanner_plugin_find.erl index af5c8a550..9b3a162d9 100644 --- a/src/couch_scanner/src/couch_scanner_plugin_find.erl +++ b/src/couch_scanner/src/couch_scanner_plugin_find.erl @@ -23,7 +23,6 @@ complete/1, checkpoint/1, db/2, - ddoc/3, shards/2, db_opened/2, doc_id/3, @@ -77,11 +76,6 @@ db(#st{} = St, DbName) -> report_match(DbName, Pats, Meta), {ok, St}. -ddoc(#st{} = St, _DbName, #doc{} = _DDoc) -> - % We'll check doc bodies during the shard scan - % so no need to keep inspecting ddocs - {stop, St}. - shards(#st{sid = SId} = St, Shards) -> case debug() of true -> ?DEBUG(" ~p shards", [length(Shards)], #{sid => SId}); diff --git a/src/couch_scanner/test/eunit/couch_scanner_test.erl b/src/couch_scanner/test/eunit/couch_scanner_test.erl index 6f7f3d8e2..b609bd69c 100644 --- a/src/couch_scanner/test/eunit/couch_scanner_test.erl +++ b/src/couch_scanner/test/eunit/couch_scanner_test.erl @@ -49,6 +49,7 @@ couch_scanner_test_() -> setup() -> {module, _} = code:ensure_loaded(?FIND_PLUGIN), meck:new(?FIND_PLUGIN, [passthrough]), + meck:new(fabric, [passthrough]), meck:new(couch_scanner_server, [passthrough]), meck:new(couch_scanner_util, [passthrough]), Ctx = test_util:start_couch([fabric, couch_scanner]), @@ -141,8 +142,6 @@ t_run_through_all_callbacks_basic({_, {DbName1, DbName2, _}}) -> ?assertEqual(2, num_calls(checkpoint, 1)), ?assertEqual(1, num_calls(db, ['_', DbName1])), ?assertEqual(1, num_calls(db, ['_', DbName2])), - ?assertEqual(1, num_calls(ddoc, ['_', DbName1, '_'])), - ?assertEqual(1, num_calls(ddoc, ['_', DbName2, '_'])), ?assert(num_calls(shards, 2) >= 2), DbOpenedCount = num_calls(db_opened, 2), ?assert(DbOpenedCount >= 4), @@ -161,10 +160,14 @@ t_find_reporting_works(_) -> config:set(Plugin ++ ".regexes", "foo14", "foo(1|4)", false), config:set(Plugin ++ ".regexes", "baz", "baz", false), meck:reset(couch_scanner_server), + meck:reset(fabric), config:set("couch_scanner_plugins", Plugin, "true", false), wait_exit(10000), % doc2 should have a baz and doc1 and doc4 matches foo14 - ?assertEqual(3, log_calls(warning)). + ?assertEqual(3, log_calls(warning)), + % check that we didn't call fabric:all_docs fetching design docs + % as we don't need to for this plugin + ?assertEqual(0, meck:num_calls(fabric, all_docs, 5)). t_ddoc_features_works({_, {_, DbName2, _}}) -> % Run the "ddoc_features" plugin