This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch dry-run-mode-for-auto-purge
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit c9979d977994fe30b65a164f9132e2ebedfc1049
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Thu Jan 22 14:34:31 2026 -0500

    Implement dry-run for auto-purge plugin

    Add a dry-run mode for the auto-purge plugin. Users can enable it and
    schedule the plugin to run to see how many deleted documents would
    have been purged for each db shard range.

    Users may adjust the ttl or the plugin schedule (to run more or less
    often) and get an idea of how long it would take to scan over all the
    data on the cluster.
---
 rel/overlay/etc/default.ini                        |  5 ++++
 src/couch/src/couch_auto_purge_plugin.erl          | 33 ++++++++++++++++++----
 .../test/eunit/couch_auto_purge_plugin_tests.erl   | 17 ++++++++++-
 src/docs/src/config/scanner.rst                    |  7 +++++
 4 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index b6ab9d3ba..0fc4f9ef2 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -1223,6 +1223,11 @@ url = {{nouveau_url}}
 ; Defaults to undefined, which disables auto purging.
 ;deleted_document_ttl =
 
+; When set to "true" the plugin does everything (scanning, revision processing,
+; etc) but skips the purge step. For each db file it will also log a warning
+; with the count of revisions it would have purged for that db shard range.
+;dry_run = false
+
 [nouveau_index_upgrader]
 ; Common scanner scheduling settings
 ;after = restart
diff --git a/src/couch/src/couch_auto_purge_plugin.erl b/src/couch/src/couch_auto_purge_plugin.erl
index e63516d30..bd6b3b74f 100644
--- a/src/couch/src/couch_auto_purge_plugin.erl
+++ b/src/couch/src/couch_auto_purge_plugin.erl
@@ -34,7 +34,10 @@ start(ScanId, #{}) ->
             skip;
         false ->
             St = init_config(ScanId),
-            ?INFO("Starting.", [], St),
+            case dry_run() of
+                false -> ?INFO("Starting.", [], St);
+                true -> ?WARN("Starting.", [], St)
+            end,
             {ok, St}
     end.
 
@@ -45,12 +48,18 @@ resume(ScanId, #{}) ->
             skip;
         false ->
             St = init_config(ScanId),
-            ?INFO("Resuming.", [], St),
+            case dry_run() of
+                false -> ?INFO("Resuming.", [], St);
+                true -> ?WARN("Resuming.", [], St)
+            end,
             {ok, St}
     end.
 
 complete(St) ->
-    ?INFO("Completed", [], St),
+    case dry_run() of
+        false -> ?INFO("Completed.", [], St);
+        true -> ?WARN("Completed.", [], St)
+    end,
     {ok, #{}}.
 
 checkpoint(St) ->
@@ -84,7 +93,14 @@ db_opened(#{} = St, Db) ->
 
 db_closing(#{} = St, Db) ->
     St1 = #{count := Count} = flush_queue(St, Db),
-    ?INFO("purged ~B deleted documents from ~s", [Count, couch_db:name(Db)], meta(St1)),
+    LogMsg = "purged ~B deleted documents from ~s",
+    LogArgs = [Count, couch_db:name(Db)],
+    LogMeta = meta(St1),
+    % In a dry run log at a higher level if anything would have been purged.
+    case dry_run() andalso Count > 0 of
+        false -> ?INFO(LogMsg, LogArgs, LogMeta);
+        true -> ?WARN(LogMsg, LogArgs, LogMeta)
+    end,
     {ok, St1}.
 
 doc_fdi(#{} = St, #full_doc_info{deleted = true} = FDI, Db) ->
@@ -121,7 +137,11 @@ flush_queue(#{queue := []} = St, _Db) ->
 flush_queue(#{queue := IdRevs} = St, Db) ->
     DbName = mem3:dbname(couch_db:name(Db)),
     N = mem3:n(DbName),
-    PurgeFun = fun() -> fabric:purge_docs(DbName, IdRevs, [?ADMIN_CTX, {w, N}]) end,
+    PurgeFun =
+        case dry_run() of
+            false -> fun() -> fabric:purge_docs(DbName, IdRevs, [?ADMIN_CTX, {w, N}]) end;
+            true -> fun() -> {ok, [{ok, Revs} || {_Id, Revs} <- IdRevs]} end
+        end,
     Timeout = fabric_util:request_timeout(),
     try fabric_util:isolate(PurgeFun, Timeout) of
         {Health, Results} when Health == ok; Health == accepted ->
@@ -232,5 +252,8 @@ min_batch_size() ->
 max_batch_size() ->
     erlang:max(min_batch_size(), config:get_integer(atom_to_list(?MODULE), "max_batch_size", 500)).
 
+dry_run() ->
+    config:get_boolean(atom_to_list(?MODULE), "dry_run", false).
+
 dead_nodes() ->
     [] =/= (mem3:nodes() -- mem3_util:live_nodes()).
diff --git a/src/couch/test/eunit/couch_auto_purge_plugin_tests.erl b/src/couch/test/eunit/couch_auto_purge_plugin_tests.erl
index a42ef52c5..5c55c4f85 100644
--- a/src/couch/test/eunit/couch_auto_purge_plugin_tests.erl
+++ b/src/couch/test/eunit/couch_auto_purge_plugin_tests.erl
@@ -29,7 +29,8 @@ couch_quickjs_scanner_plugin_test_() ->
             ?TDEF_FE(t_min_batch_size_1, 10),
             ?TDEF_FE(t_min_batch_size_2, 10),
             ?TDEF_FE(t_max_batch_size_1, 10),
-            ?TDEF_FE(t_max_batch_size_2, 10)
+            ?TDEF_FE(t_max_batch_size_2, 10),
+            ?TDEF_FE(t_dry_run, 10)
         ]
     }.
 
@@ -42,6 +43,7 @@ setup() ->
     DbName = ?tempdb(),
     ok = fabric:create_db(DbName, [{q, "2"}, {n, "1"}]),
     config:set(atom_to_list(?PLUGIN), "max_batch_items", "1", false),
+    config:set(atom_to_list(?PLUGIN), "dry_run", "false", false),
     reset_stats(),
     {Ctx, DbName}.
 
@@ -87,6 +89,19 @@ t_auto_purge_after_db_ttl({_, DbName}) ->
     ?assertEqual(0, doc_del_count(DbName)),
     ok.
 
+t_dry_run({_, DbName}) ->
+    config:set(atom_to_list(?PLUGIN), "dry_run", "true", false),
+    config:set(atom_to_list(?PLUGIN), "deleted_document_ttl", "-3_hour", false),
+    ok = add_doc(DbName, <<"doc1">>, #{<<"_deleted">> => true}),
+    ?assertEqual(1, doc_del_count(DbName)),
+    meck:reset(couch_scanner_server),
+    meck:reset(?PLUGIN),
+    config:set("couch_scanner_plugins", atom_to_list(?PLUGIN), "true", false),
+    wait_exit(10000),
+    % didn't actually purge
+    ?assertEqual(1, doc_del_count(DbName)),
+    ok.
+
 t_min_batch_size_1({_, DbName}) ->
     meck:new(fabric, [passthrough]),
     config:set_integer(atom_to_list(?PLUGIN), "min_batch_size", 5),
diff --git a/src/docs/src/config/scanner.rst b/src/docs/src/config/scanner.rst
index 9c56891d2..d0dc4b082 100644
--- a/src/docs/src/config/scanner.rst
+++ b/src/docs/src/config/scanner.rst
@@ -263,3 +263,10 @@ settings in their ``[{plugin}]`` section.
         The database may override this setting with the :ref:`api/db/auto_purge`
         endpoint. If neither is set, the plugin will not purge deleted
         documents.
+
+    .. config:option:: dry_run
+
+        When set to ``true`` the plugin does everything (scanning, revision
+        processing, etc) but skips the purge step. For each db file it will
+        also log a warning with the count of revisions it would have purged for
+        that db shard range.
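
For anyone trying this branch locally, a minimal sketch of a local.ini
fragment that would exercise the new mode. The section names come straight
from the diff (atom_to_list(?MODULE) is "couch_auto_purge_plugin", and the
test enables the plugin via the "couch_scanner_plugins" section); the TTL
value format is an assumption inferred from the "-3_hour" value used in the
eunit test:

    ; local.ini -- hypothetical dry-run setup, not part of this commit
    [couch_auto_purge_plugin]
    ; scan and process revisions as usual, but skip the actual purge
    dry_run = true
    ; TTL format inferred from the test's "-3_hour"; pick a real value
    deleted_document_ttl = 1_hour

    [couch_scanner_plugins]
    ; schedule the plugin with the scanner, as t_dry_run does above
    couch_auto_purge_plugin = true

With this in place, db_closing/2 emits the "purged ~B deleted documents
from ~s" line at warning level whenever the count is non-zero, so grepping
the logs for that message gives the per shard range totals without any
data being purged.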
