This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch improve-scanner-performance
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 00277d5fc4a9d39e88b2f855f2adb515765b1dc9
Author: Nick Vatamaniuc <vatam...@gmail.com>
AuthorDate: Fri Aug 1 13:12:36 2025 -0400

    Improve scanner performance
    
     * If the plugin doesn't need to inspect ddocs at the cluster level, avoid 
the
     expensive `fabric:all_docs()` call, both the finder and the conflict 
checker
     benefit from this one.
    
     * Let the plugins examine the `#full_doc_info{}` structure directly in the 
new
     optional `doc_fdi` callback. Previously, the conflict checker had to 
re-read
     the FDI just to look for conflict even though the plugin already read the 
FDI.
---
 src/couch_scanner/src/couch_scanner_plugin.erl     | 59 +++++++++++++++++-----
 .../src/couch_scanner_plugin_conflict_finder.erl   | 10 ++--
 .../src/couch_scanner_plugin_find.erl              |  6 ---
 .../test/eunit/couch_scanner_test.erl              |  9 ++--
 4 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/src/couch_scanner/src/couch_scanner_plugin.erl 
b/src/couch_scanner/src/couch_scanner_plugin.erl
index 04f394c12..9ff3a2505 100644
--- a/src/couch_scanner/src/couch_scanner_plugin.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin.erl
@@ -119,11 +119,16 @@
 -callback db_opened(St :: term(), Db :: term()) ->
     {ok, St :: term()}.
 
-% Optional. If doc is not defined, then ddoc_id default action is {skip, St}.
-% If it is defined, the default action is {ok, St}.
+% Optional. If doc and doc_fdi are not defined, then doc_id default
+% action is {skip, St}. If it is defined, the default action is {ok, St}.
 -callback doc_id(St :: term(), DocId :: binary(), Db :: term()) ->
     {ok | skip | stop, St1 :: term()}.
 
+% Optional. If doc is not defined, then doc_fdi default action is {stop, St}.
+% If it is defined, the default action is {ok, St}.
+-callback doc_fdi(St :: term(), FDI :: #full_doc_info{}, Db :: term()) ->
+    {ok | stop, St1 :: term()}.
+
 % Optional.
 -callback doc(St :: term(), Db :: term(), #doc{}) ->
     {ok | stop, St1 :: term()}.
@@ -139,6 +144,7 @@
     shards/2,
     db_opened/2,
     doc_id/3,
+    doc_fdi/3,
     doc/3,
     db_closing/2
 ]).
@@ -153,6 +159,7 @@
     {shards, 2, fun default_shards/3},
     {db_opened, 2, fun default_db_opened/3},
     {doc_id, 3, fun default_doc_id/3},
+    {doc_fdi, 3, fun default_doc_fdi/3},
     {doc, 3, fun default_doc/3},
     {db_closing, 2, fun default_db_closing/3}
 ]).
@@ -382,7 +389,7 @@ scan_docs_fold(#full_doc_info{id = Id} = FDI, #st{} = St) ->
             {Go, PSt1} = DocIdCbk(PSt, Id, Db),
             St1 = St#st{pst = PSt1},
             case Go of
-                ok -> scan_doc(FDI, St1);
+                ok -> scan_fdi(FDI, St1);
                 skip -> {ok, St1};
                 stop -> {stop, St1}
             end;
@@ -390,6 +397,16 @@ scan_docs_fold(#full_doc_info{id = Id} = FDI, #st{} = St) 
->
             {ok, St}
     end.
 
+scan_fdi(#full_doc_info{} = FDI, #st{} = St) ->
+    #st{db = Db, callbacks = Cbks, pst = PSt} = St,
+    #{doc_fdi := FDICbk} = Cbks,
+    {Go, PSt1} = FDICbk(PSt, FDI, Db),
+    St1 = St#st{pst = PSt1},
+    case Go of
+        ok -> scan_doc(FDI, St1);
+        stop -> {stop, St1}
+    end.
+
 scan_doc(#full_doc_info{} = FDI, #st{} = St) ->
     #st{db = Db, callbacks = Cbks, pst = PSt} = St,
     St1 = rate_limit(St, doc),
@@ -575,6 +592,7 @@ default_shards(Mod, _F, _A) when is_atom(Mod) ->
     case
         is_exported(Mod, db_opened, 2) orelse
             is_exported(Mod, doc_id, 3) orelse
+            is_exported(Mod, doc_fdi, 3) orelse
             is_exported(Mod, doc, 3) orelse
             is_exported(Mod, db_closing, 2)
     of
@@ -586,11 +604,17 @@ default_db_opened(Mod, _F, _A) when is_atom(Mod) ->
     fun(St, _Db) -> {ok, St} end.
 
 default_doc_id(Mod, _F, _A) when is_atom(Mod) ->
-    case is_exported(Mod, doc, 3) of
+    case is_exported(Mod, doc, 3) orelse is_exported(Mod, doc_fdi, 3) of
         true -> fun(St, _DocId, _Db) -> {ok, St} end;
         false -> fun(St, _DocId, _Db) -> {skip, St} end
     end.
 
+default_doc_fdi(Mod, _F, _A) when is_atom(Mod) ->
+    case is_exported(Mod, doc, 3) of
+        true -> fun(St, _FDI, _Db) -> {ok, St} end;
+        false -> fun(St, _FDI, _Db) -> {stop, St} end
+    end.
+
 default_doc(Mod, _F, _A) when is_atom(Mod) ->
     fun(St, _Db, _Doc) -> {ok, St} end.
 
@@ -624,16 +648,23 @@ shards_by_range(Shards) ->
 
 % Design doc fetching helper
 
-fold_ddocs(Fun, #st{dbname = DbName} = Acc) ->
-    QArgs = #mrargs{
-        include_docs = true,
-        extra = [{namespace, <<"_design">>}]
-    },
-    try
-        {ok, Acc1} = fabric:all_docs(DbName, [?ADMIN_CTX], Fun, Acc, QArgs),
-        Acc1
-    catch
-        error:database_does_not_exist ->
+fold_ddocs(Fun, #st{dbname = DbName, mod = Mod} = Acc) ->
+    case is_exported(Mod, ddoc, 3) of
+        true ->
+            QArgs = #mrargs{
+                include_docs = true,
+                extra = [{namespace, <<"_design">>}]
+            },
+            try
+                {ok, Acc1} = fabric:all_docs(DbName, [?ADMIN_CTX], Fun, Acc, 
QArgs),
+                Acc1
+            catch
+                error:database_does_not_exist ->
+                    Acc
+            end;
+        false ->
+            % If the plugin doesn't export the ddoc callback, don't bother 
calling
+            % fabric:all_docs, as it's expensive
             Acc
     end.
 
diff --git a/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl 
b/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl
index 5f8f17527..4d57cbd91 100644
--- a/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin_conflict_finder.erl
@@ -19,7 +19,7 @@
     complete/1,
     checkpoint/1,
     db/2,
-    doc_id/3
+    doc_fdi/3
 ]).
 
 -include_lib("couch_scanner/include/couch_scanner_plugin.hrl").
@@ -76,12 +76,10 @@ checkpoint(#st{sid = SId, opts = Opts}) ->
 db(#st{} = St, _DbName) ->
     {ok, St}.
 
-doc_id(#st{} = St, <<?DESIGN_DOC_PREFIX, _/binary>>, _Db) ->
-    {skip, St};
-doc_id(#st{} = St, DocId, Db) ->
-    {ok, #doc_info{revs = Revs}} = couch_db:get_doc_info(Db, DocId),
+doc_fdi(#st{} = St, #full_doc_info{} = FDI, Db) ->
+    #doc_info{revs = Revs} = couch_doc:to_doc_info(FDI),
     DbName = mem3:dbname(couch_db:name(Db)),
-    {ok, check(St, DbName, DocId, Revs)}.
+    {ok, check(St, DbName, FDI#full_doc_info.id, Revs)}.
 
 % Private
 
diff --git a/src/couch_scanner/src/couch_scanner_plugin_find.erl 
b/src/couch_scanner/src/couch_scanner_plugin_find.erl
index af5c8a550..9b3a162d9 100644
--- a/src/couch_scanner/src/couch_scanner_plugin_find.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin_find.erl
@@ -23,7 +23,6 @@
     complete/1,
     checkpoint/1,
     db/2,
-    ddoc/3,
     shards/2,
     db_opened/2,
     doc_id/3,
@@ -77,11 +76,6 @@ db(#st{} = St, DbName) ->
     report_match(DbName, Pats, Meta),
     {ok, St}.
 
-ddoc(#st{} = St, _DbName, #doc{} = _DDoc) ->
-    % We'll check doc bodies during the shard scan
-    % so no need to keep inspecting ddocs
-    {stop, St}.
-
 shards(#st{sid = SId} = St, Shards) ->
     case debug() of
         true -> ?DEBUG(" ~p shards", [length(Shards)], #{sid => SId});
diff --git a/src/couch_scanner/test/eunit/couch_scanner_test.erl 
b/src/couch_scanner/test/eunit/couch_scanner_test.erl
index 6f7f3d8e2..b609bd69c 100644
--- a/src/couch_scanner/test/eunit/couch_scanner_test.erl
+++ b/src/couch_scanner/test/eunit/couch_scanner_test.erl
@@ -49,6 +49,7 @@ couch_scanner_test_() ->
 setup() ->
     {module, _} = code:ensure_loaded(?FIND_PLUGIN),
     meck:new(?FIND_PLUGIN, [passthrough]),
+    meck:new(fabric, [passthrough]),
     meck:new(couch_scanner_server, [passthrough]),
     meck:new(couch_scanner_util, [passthrough]),
     Ctx = test_util:start_couch([fabric, couch_scanner]),
@@ -141,8 +142,6 @@ t_run_through_all_callbacks_basic({_, {DbName1, DbName2, 
_}}) ->
     ?assertEqual(2, num_calls(checkpoint, 1)),
     ?assertEqual(1, num_calls(db, ['_', DbName1])),
     ?assertEqual(1, num_calls(db, ['_', DbName2])),
-    ?assertEqual(1, num_calls(ddoc, ['_', DbName1, '_'])),
-    ?assertEqual(1, num_calls(ddoc, ['_', DbName2, '_'])),
     ?assert(num_calls(shards, 2) >= 2),
     DbOpenedCount = num_calls(db_opened, 2),
     ?assert(DbOpenedCount >= 4),
@@ -161,10 +160,14 @@ t_find_reporting_works(_) ->
     config:set(Plugin ++ ".regexes", "foo14", "foo(1|4)", false),
     config:set(Plugin ++ ".regexes", "baz", "baz", false),
     meck:reset(couch_scanner_server),
+    meck:reset(fabric),
     config:set("couch_scanner_plugins", Plugin, "true", false),
     wait_exit(10000),
     % doc2 should have a baz and doc1 and doc4 matches foo14
-    ?assertEqual(3, log_calls(warning)).
+    ?assertEqual(3, log_calls(warning)),
+    % check that we didn't call fabric:all_docs fetching design docs
+    % as we don't need to for this plugin
+    ?assertEqual(0, meck:num_calls(fabric, all_docs, 5)).
 
 t_ddoc_features_works({_, {_, DbName2, _}}) ->
     % Run the "ddoc_features" plugin

Reply via email to