This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch fix-bulk-get-maintenance-mode in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 1111d3b15d2d4940dfe353f58412d6147d194c29 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Thu Dec 4 01:55:40 2025 -0500 Fix bulk_get error handling * Previously, responses could end with an error sooner than expected if some of the results were of type `not_found`. That could happen because `not_found` responses do not increment the `rcnt` counter, and `success_possible/1` function relied on the `wcnt + rcnt > 0` check. Update the function to return `true` if there are any workers left (`wcnt > 0`) or there is at least one response for each request. * Maintenance mode errors were not masked and could easily leak to the users. That happened because we returned the last error at the moment we detect that we cannot make progress. If that last error was maintenance mode error, we'd emit that. To fix it, keep track of all the errors we got so far, and if there any other errors besides maintenance mode, then return those error. --- src/fabric/src/fabric_open_revs.erl | 52 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/fabric/src/fabric_open_revs.erl b/src/fabric/src/fabric_open_revs.erl index b4f95df8a..30ab7c74b 100644 --- a/src/fabric/src/fabric_open_revs.erl +++ b/src/fabric/src/fabric_open_revs.erl @@ -30,7 +30,8 @@ r, args, reqs, - workers + workers, + errors = [] }). go(_DbName, [], _Options) -> @@ -72,6 +73,9 @@ handle_message({rexi_DOWN, _, {_, NodeRef}, _}, _Worker, #st{} = St) -> Reqs1 = maps:fold(FoldFun, Reqs, DeadWorkers), Error = {error, {nodedown, <<"progress not possible">>}}, handle_error(Error, St#st{workers = Workers1, reqs = Reqs1}); +handle_message({rexi_EXIT, {maintenance_mode, _Node}}, Worker, #st{} = St) -> + % Remove the node to make it easier to de-duplicate later + handle_message(maintenance_mode, Worker, St); handle_message({rexi_EXIT, Reason}, Worker, #st{} = St) -> handle_message(Reason, Worker, St); handle_message({error, Reason}, Worker, #st{} = St) -> @@ -112,7 +116,7 @@ responses_fold({ArgRef, NewResp}, #{} = Reqs) -> } }. -handle_error(Error, #st{workers = Workers, reqs = Reqs} = St) -> +handle_error(Error, #st{workers = Workers, errors = Errors, reqs = Reqs} = St) -> case success_possible(Reqs) of true -> case have_viable_workers(Workers) of @@ -124,7 +128,8 @@ handle_error(Error, #st{workers = Workers, reqs = Reqs} = St) -> end; false -> stop_workers(Workers), - {error, Error} + % We may have multiple errors but need to pick one, so pick the first + {error, hd(merge_errors(Errors, Error))} end. % De-duplicate identical responses as we go along @@ -142,6 +147,17 @@ sort_key({ok, #doc{id = Id, revs = Revs, deleted = Deleted}}) -> sort_key(NotFound) -> NotFound. +% We're trying to hide maintenance mode if possible. So if there are +% non-maintenance mode errors, such as timeouts, etc, we remove the maintenance +% mode from the list, otherwise, we keep it. +% +merge_errors(Errors, Error) -> + Errors1 = lists:uniq([Error | Errors]), + case Errors1 of + [maintenance_mode] -> [maintenance_mode]; + [_ | _] -> lists:delete(maintenance_mode, Errors1) + end. + % Build a #{ArgRef => #req{}} map. ArgRef references the initial {{Id, Revs}, % Opts} tuples and the #req{} is a record keeping track of response for just % that {Id, Revs} pair. @@ -206,11 +222,17 @@ have_viable_workers(#{} = Workers) -> map_size(Workers) > 0. % We can still return success if we either have some waiting workers, or at -% least one response already for each {Id, Revs} pair. +% least one response already for each {Id, Revs} pair. We don't simply check +% for W + R > 0 but check that responses have any entries, as not_found entries +% won't bump the R values. % success_possible(#{} = Reqs) -> - Fun = fun(_, #req{wcnt = W, rcnt = R}, Acc) -> Acc andalso W + R > 0 end, - maps:fold(Fun, true, Reqs). + maps:fold(fun success_possible_fold/3, true, Reqs). + +success_possible_fold(_Key, #req{}, _Acc = false) -> + false; +success_possible_fold(_Key, #req{wcnt = W, responses = Resps}, _Acc) -> + W > 0 orelse Resps =/= []. r_met(#{} = Reqs, ExpectedR) -> Fun = fun(_, #req{rcnt = R}, Acc) -> min(R, Acc) end, @@ -316,6 +338,8 @@ open_revs_quorum_test_() -> ?TDEF_FE(t_finish_quorum), ?TDEF_FE(t_no_quorum_on_different_responses), ?TDEF_FE(t_no_quorum_on_not_found), + ?TDEF_FE(t_not_found_and_maintenance_mode), + ?TDEF_FE(t_all_maintenance_mode), ?TDEF_FE(t_done_on_third), ?TDEF_FE(t_all_different_responses), ?TDEF_FE(t_ancestors_merge_correctly), @@ -400,6 +424,22 @@ t_no_quorum_on_not_found(_) -> Res2 = handle_message([[foo2(), bar1()]], W3, S2), ?assertEqual({stop, [[foo2(), bar1()]]}, Res2). +t_not_found_and_maintenance_mode(_) -> + S0 = #st{workers = Workers0} = st0(), + [W1, W2, W3] = lists:sort(maps:keys(Workers0)), + {ok, S1} = handle_message([[bazNF()]], W1, S0), + {ok, S2} = handle_message({error, timeout}, W2, S1), + Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2), + ?assertEqual({stop, [[bazNF()]]}, Res). + +t_all_maintenance_mode(_) -> + S0 = #st{workers = Workers0} = st0(), + [W1, W2, W3] = lists:sort(maps:keys(Workers0)), + {ok, S1} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W1, S0), + {ok, S2} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W2, S1), + Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2), + ?assertEqual({error, maintenance_mode}, Res). + t_done_on_third(_) -> S0 = #st{workers = Workers0} = st0(), [W1, W2, W3] = lists:sort(maps:keys(Workers0)),
