This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch optimize-doc-updates
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 737a95290c0cf3335e48eb28f9354bcc96294944
Author: Paul J. Davis <[email protected]>
AuthorDate: Wed Nov 15 12:03:28 2017 -0600

    Automatically repair revision trees
---
 src/couch/src/couch_db_updater.erl |  9 ++----
 src/couch/src/couch_key_tree.erl   | 62 ++++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/couch/src/couch_db_updater.erl 
b/src/couch/src/couch_db_updater.erl
index f437426..bcddbe0 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -876,11 +876,8 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, 
Limit) ->
     Stemmed = couch_key_tree:stem(Tree, Limit),
     Info#full_doc_info{rev_tree = Stemmed}.
 
-full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
-    lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) ->
-        Stemmed = couch_key_tree:full_stem(Tree, Limit),
-        FDI#full_doc_info{rev_tree=Stemmed}
-    end, DocInfos).
+stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
+    lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos).
 
 update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) ->
     #db{
@@ -1128,7 +1125,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, 
Retry) ->
         }
     end, NewInfos0),
 
-    NewInfos = full_stem_full_doc_infos(Db, NewInfos1),
+    NewInfos = stem_full_doc_infos(Db, NewInfos1),
     RemoveSeqs =
     case Retry of
     nil ->
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index d18ecbc..14c136f 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -63,8 +63,7 @@ multi_merge/2,
 merge/3,
 merge/2,
 remove_leafs/2,
-stem/2,
-full_stem/2
+stem/2
 ]).
 
 -include_lib("couch/include/couch_db.hrl").
@@ -480,53 +479,64 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | 
RestTree]) ->
 
 
 stem(Trees, Limit) ->
-    lists:sort(lists:flatmap(fun(Tree) ->
-        stem_tree(Tree, Limit)
-    end, Trees)).
+    try
+        {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
+            {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
+            {NewSeen, NewBranches ++ TreeAcc}
+        end, {sets:new(), []}, Trees),
+        lists:sort(Branches)
+    catch throw:dupe_keys ->
+        repair_tree(Trees, Limit)
+    end.
 
 
-stem_tree({Depth, Child}, Limit) ->
-    case stem_tree(Depth, Child, Limit) of
-        {_, NewChild, NewBranches} ->
-            [{Depth, NewChild} | NewBranches];
-        {_, NewBranches} ->
-            NewBranches
+stem_tree({Depth, Child}, Limit, Seen) ->
+    case stem_tree(Depth, Child, Limit, Seen) of
+        {NewSeen, _, NewChild, NewBranches} ->
+            {NewSeen, [{Depth, NewChild} | NewBranches]};
+        {NewSeen, _, NewBranches} ->
+            {NewSeen, NewBranches}
     end.
 
 
-stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) ->
-    {Limit - 1, Leaf, []};
+stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit, Seen) ->
+    {Seen, Limit - 1, Leaf, []};
 
-stem_tree(Depth, {Key, Val, Children}, Limit) ->
-    FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) ->
-        case stem_tree(Depth + 1, Child, Limit) of
-            {LimitPos, NewChild, NewBranches} ->
+stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
+    Seen1 = case sets:is_element(Key, Seen0) of
+        true -> throw(dupe_keys);
+        false -> sets:add_element(Key, Seen0)
+    end,
+    FinalAcc = lists:foldl(fun(Child, Acc) ->
+        {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
+        case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
+            {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
                 NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
                 NewChildAcc = [NewChild | ChildAcc],
                 NewBranchAcc = NewBranches ++ BranchAcc,
-                {NewLimitPosAcc, NewChildAcc, NewBranchAcc};
-            {LimitPos, NewBranches} ->
+                {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+            {NewSeenAcc, LimitPos, NewBranches} ->
                 NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
                 NewBranchAcc = NewBranches ++ BranchAcc,
-                {NewLimitPosAcc, ChildAcc, NewBranchAcc}
+                {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
         end
-    end, {-1, [], []}, Children),
-    {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+    end, {Seen1, -1, [], []}, Children),
+    {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
     case FinalLimitPos of
         N when N > 0, length(FinalChildren) > 0 ->
             FinalNode = {Key, Val, lists:reverse(FinalChildren)},
-            {FinalLimitPos - 1, FinalNode, FinalBranches};
+            {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
         0 when length(FinalChildren) > 0 ->
             NewBranches = lists:map(fun(Child) ->
                 {Depth + 1, Child}
             end, lists:reverse(FinalChildren)),
-            {-1, NewBranches ++ FinalBranches};
+            {FinalSeen, -1, NewBranches ++ FinalBranches};
         N when N < 0, length(FinalChildren) == 0 ->
-            {FinalLimitPos - 1, FinalBranches}
+            {FinalSeen, FinalLimitPos - 1, FinalBranches}
     end.
 
 
-full_stem(Trees, Limit) ->
+repair_tree(Trees, Limit) ->
     % flatten each branch in a tree into a tree path, sort by starting rev #
     Paths = lists:sort(lists:map(fun({Pos, Path}) ->
         StemmedPath = lists:sublist(Path, Limit),

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

Reply via email to