(couchdb) 01/01: Fix bulk_get error handling

vatamane Wed, 03 Dec 2025 23:23:27 -0800

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-bulk-get-maintenance-mode
in repository https://gitbox.apache.org/repos/asf/couchdb.git


commit 1111d3b15d2d4940dfe353f58412d6147d194c29
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Thu Dec 4 01:55:40 2025 -0500

    Fix bulk_get error handling
    
     * Previously, responses could end with an error sooner than expected if 
some
     of the results were of type `not_found`. That could happen because 
`not_found`
     responses do not increment the `rcnt` counter, and `success_possible/1`
     function relied on the `wcnt + rcnt > 0` check. Update the function to 
return
     `true` if there are any workers left (`wcnt > 0`) or there is at least one
     response for each request.
    
     * Maintenance mode errors were not masked and could easily leak to the 
users.
     That happened because we returned the last error at the moment we detect 
that
     we cannot make progress. If that last error was maintenance mode error, 
we'd
     emit that. To fix it, keep track of all the errors we got so far, and if 
there
     any other errors besides maintenance mode, then return those error.
---
 src/fabric/src/fabric_open_revs.erl | 52 ++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/fabric/src/fabric_open_revs.erl 
b/src/fabric/src/fabric_open_revs.erl
index b4f95df8a..30ab7c74b 100644
--- a/src/fabric/src/fabric_open_revs.erl
+++ b/src/fabric/src/fabric_open_revs.erl
@@ -30,7 +30,8 @@
     r,
     args,
     reqs,
-    workers
+    workers,
+    errors = []
 }).
 
 go(_DbName, [], _Options) ->
@@ -72,6 +73,9 @@ handle_message({rexi_DOWN, _, {_, NodeRef}, _}, _Worker, 
#st{} = St) ->
     Reqs1 = maps:fold(FoldFun, Reqs, DeadWorkers),
     Error = {error, {nodedown, <<"progress not possible">>}},
     handle_error(Error, St#st{workers = Workers1, reqs = Reqs1});
+handle_message({rexi_EXIT, {maintenance_mode, _Node}}, Worker, #st{} = St) ->
+    % Remove the node to make it easier to de-duplicate later
+    handle_message(maintenance_mode, Worker, St);
 handle_message({rexi_EXIT, Reason}, Worker, #st{} = St) ->
     handle_message(Reason, Worker, St);
 handle_message({error, Reason}, Worker, #st{} = St) ->
@@ -112,7 +116,7 @@ responses_fold({ArgRef, NewResp}, #{} = Reqs) ->
         }
     }.
 
-handle_error(Error, #st{workers = Workers, reqs = Reqs} = St) ->
+handle_error(Error, #st{workers = Workers, errors = Errors, reqs = Reqs} = St) 
->
     case success_possible(Reqs) of
         true ->
             case have_viable_workers(Workers) of
@@ -124,7 +128,8 @@ handle_error(Error, #st{workers = Workers, reqs = Reqs} = 
St) ->
             end;
         false ->
             stop_workers(Workers),
-            {error, Error}
+            % We may have multiple errors but need to pick one, so pick the 
first
+            {error, hd(merge_errors(Errors, Error))}
     end.
 
 % De-duplicate identical responses as we go along
@@ -142,6 +147,17 @@ sort_key({ok, #doc{id = Id, revs = Revs, deleted = 
Deleted}}) ->
 sort_key(NotFound) ->
     NotFound.
 
+% We're trying to hide maintenance mode if possible. So if there are
+% non-maintenance mode errors, such as timeouts, etc, we remove the maintenance
+% mode from the list, otherwise, we keep it.
+%
+merge_errors(Errors, Error) ->
+    Errors1 = lists:uniq([Error | Errors]),
+    case Errors1 of
+        [maintenance_mode] -> [maintenance_mode];
+        [_ | _] -> lists:delete(maintenance_mode, Errors1)
+    end.
+
 % Build a #{ArgRef => #req{}} map. ArgRef references the initial {{Id, Revs},
 % Opts} tuples and the #req{} is a record keeping track of response for just
 % that {Id, Revs} pair.
@@ -206,11 +222,17 @@ have_viable_workers(#{} = Workers) ->
     map_size(Workers) > 0.
 
 % We can still return success if we either have some waiting workers, or at
-% least one response already for each {Id, Revs} pair.
+% least one response already for each {Id, Revs} pair. We don't simply check
+% for W + R > 0 but check that responses have any entries, as not_found entries
+% won't bump the R values.
 %
 success_possible(#{} = Reqs) ->
-    Fun = fun(_, #req{wcnt = W, rcnt = R}, Acc) -> Acc andalso W + R > 0 end,
-    maps:fold(Fun, true, Reqs).
+    maps:fold(fun success_possible_fold/3, true, Reqs).
+
+success_possible_fold(_Key, #req{}, _Acc = false) ->
+    false;
+success_possible_fold(_Key, #req{wcnt = W, responses = Resps}, _Acc) ->
+    W > 0 orelse Resps =/= [].
 
 r_met(#{} = Reqs, ExpectedR) ->
     Fun = fun(_, #req{rcnt = R}, Acc) -> min(R, Acc) end,
@@ -316,6 +338,8 @@ open_revs_quorum_test_() ->
                 ?TDEF_FE(t_finish_quorum),
                 ?TDEF_FE(t_no_quorum_on_different_responses),
                 ?TDEF_FE(t_no_quorum_on_not_found),
+                ?TDEF_FE(t_not_found_and_maintenance_mode),
+                ?TDEF_FE(t_all_maintenance_mode),
                 ?TDEF_FE(t_done_on_third),
                 ?TDEF_FE(t_all_different_responses),
                 ?TDEF_FE(t_ancestors_merge_correctly),
@@ -400,6 +424,22 @@ t_no_quorum_on_not_found(_) ->
     Res2 = handle_message([[foo2(), bar1()]], W3, S2),
     ?assertEqual({stop, [[foo2(), bar1()]]}, Res2).
 
+t_not_found_and_maintenance_mode(_) ->
+    S0 = #st{workers = Workers0} = st0(),
+    [W1, W2, W3] = lists:sort(maps:keys(Workers0)),
+    {ok, S1} = handle_message([[bazNF()]], W1, S0),
+    {ok, S2} = handle_message({error, timeout}, W2, S1),
+    Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2),
+    ?assertEqual({stop, [[bazNF()]]}, Res).
+
+t_all_maintenance_mode(_) ->
+    S0 = #st{workers = Workers0} = st0(),
+    [W1, W2, W3] = lists:sort(maps:keys(Workers0)),
+    {ok, S1} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W1, S0),
+    {ok, S2} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W2, S1),
+    Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2),
+    ?assertEqual({error, maintenance_mode}, Res).
+
 t_done_on_third(_) ->
     S0 = #st{workers = Workers0} = st0(),
     [W1, W2, W3] = lists:sort(maps:keys(Workers0)),

(couchdb) 01/01: Fix bulk_get error handling

Reply via email to