This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch tolerate-mmode-betteron-partition-info
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 155af858cf1566383af5ed3d42b16512fe0d8a5a
Author:     Nick Vatamaniuc <[email protected]>
AuthorDate: Tue Jan 13 01:53:01 2026 -0500

    Tolerate maintenance mode and a node down in partition info calls
    
    If we get a node down plus a node in maintenance mode, and we just
    figured out we can't make progress, try to return any existing results
    if we got them, instead of always bailing with the last error (an mm
    or nodedown).
---
 src/fabric/src/fabric_db_partition_info.erl | 58 ++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 9 deletions(-)

diff --git a/src/fabric/src/fabric_db_partition_info.erl b/src/fabric/src/fabric_db_partition_info.erl
index 2b4685fd2..824ccce56 100644
--- a/src/fabric/src/fabric_db_partition_info.erl
+++ b/src/fabric/src/fabric_db_partition_info.erl
@@ -61,30 +61,33 @@ handle_message({rexi_DOWN, _, {_, NodeRef}, _}, _Shard, #acc{} = Acc) ->
         error ->
             {error, {nodedown, <<"progress not possible">>}}
     end;
-handle_message({rexi_EXIT, Reason}, Shard, #acc{} = Acc) ->
-    #acc{counters = Counters, ring_opts = RingOpts} = Acc,
+handle_message({rexi_EXIT, Reason}, #shard{dbname = Name} = Shard, #acc{} = Acc) ->
+    #acc{counters = Counters, ring_opts = RingOpts, replies = Replies} = Acc,
     NewCounters = fabric_dict:erase(Shard, Counters),
     case fabric_ring:is_progress_possible(NewCounters, RingOpts) of
         true ->
             {ok, Acc#acc{counters = NewCounters}};
         false ->
-            {error, Reason}
+            case Replies of
+                [_ | _] -> {stop, format_response(Name, Replies)};
+                _ -> {error, Reason}
+            end
     end;
 handle_message({ok, Info}, #shard{dbname = Name} = Shard, #acc{} = Acc) ->
     #acc{counters = Counters, replies = Replies} = Acc,
     Replies1 = [Info | Replies],
     Counters1 = fabric_dict:erase(Shard, Counters),
     case fabric_dict:size(Counters1) =:= 0 of
-        true ->
-            [FirstInfo | RestInfos] = Replies1,
-            PartitionInfo = get_max_partition_size(FirstInfo, RestInfos),
-            {stop, [{db_name, Name} | format_partition(PartitionInfo)]};
-        false ->
-            {ok, Acc#acc{counters = Counters1, replies = Replies1}}
+        true -> {stop, format_response(Name, Replies1)};
+        false -> {ok, Acc#acc{counters = Counters1, replies = Replies1}}
     end;
 handle_message(_, _, #acc{} = Acc) ->
     {ok, Acc}.
 
+format_response(DbName, [FirstInfo | RestInfos]) ->
+    PartitionInfo = get_max_partition_size(FirstInfo, RestInfos),
+    [{db_name, DbName} | format_partition(PartitionInfo)].
+
 get_max_partition_size(Max, []) ->
     Max;
 get_max_partition_size(MaxInfo, [NextInfo | Rest]) ->
@@ -139,6 +142,43 @@ worker_exit_test() ->
 
     ?assertEqual({error, bam}, handle_message({rexi_EXIT, bam}, S2, Acc2)).
 
+worker_down_and_mm_test() ->
+    [S1, S2, S3] = [
+        mk_shard("n1", [0, 4]),
+        mk_shard("n2", [0, 8]),
+        mk_shard("n3", [0, 4])
+    ],
+    Acc1 = #acc{
+        counters = fabric_dict:init([S1, S2, S3], nil),
+        ring_opts = [{any, [S1, S2, S3]}],
+        replies = []
+    },
+
+    N1 = S1#shard.node,
+    {ok, Acc2} = handle_message({rexi_DOWN, nil, {nil, N1}, nil}, nil, Acc1),
+
+    Info = [
+        {partition, <<"xx">>},
+        {doc_count, 0},
+        {doc_del_count, 0},
+        {sizes, [{active, 0}, {external, 0}]}
+    ],
+
+    {ok, Acc3} = handle_message({ok, Info}, S2, Acc2),
+
+    N3 = S3#shard.node,
+    {stop, Res} = handle_message({rexi_EXIT, {maintenance_mode, N3}}, S3, Acc3),
+    ?assertMatch(
+        [
+            {db_name, _},
+            {sizes, {[{active, 0}, {external, 0}]}},
+            {partition, <<"xx">>},
+            {doc_count, 0},
+            {doc_del_count, 0}
+        ],
+        Res
+    ).
+
 mk_shard(Name, Range) ->
     Node = list_to_atom(Name),
     BName = list_to_binary(Name),
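
A short note on the change, for readers of this list: the new behavior in the
{rexi_EXIT, Reason} clause comes down to one decision. Once the ring can no
longer make progress, prefer whatever worker replies were already collected
over the last worker error. The following is a minimal standalone Erlang
sketch of that decision only; the module name and best_effort_response/2 are
illustrative and are not part of this patch:

    -module(partial_info_sketch).
    -export([best_effort_response/2]).

    %% If at least one worker already replied, keep the partial results;
    %% otherwise surface the last error, e.g. {maintenance_mode, Node}
    %% or a nodedown reason.
    best_effort_response([_ | _] = Replies, _LastError) ->
        {stop, Replies};
    best_effort_response([], LastError) ->
        {error, LastError}.

In the actual patch the {stop, ...} branch goes through format_response/2,
which uses the existing get_max_partition_size/2 to pick the largest reply
before formatting, so callers of the partition info endpoint still get a
usable response when one node is down and another is in maintenance mode.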
