This is an automated email from the ASF dual-hosted git repository. davisp pushed a commit to branch optimize-doc-updates in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit ad22a32da046c9316a01bfb4f3e5cb6d299dabbd Author: Paul J. Davis <[email protected]> AuthorDate: Thu Nov 2 14:30:14 2017 -0500 Optimize couch_key_tree:stem/2 This changes the stemming algorithm to be a single pass O(N) operation rather than the existing O(N^2) operation. The old stemming algorithm works by taking the entire tree apart and then merging the resulting paths back into a single tree. The merging operation ends up causing the O(N^2) aspect because every branch has to be compared to every exisitng sub-tree. --- src/couch/src/couch_db_updater.erl | 9 +++++--- src/couch/src/couch_key_tree.erl | 44 +++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index bcddbe0..07ac0cf 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -876,8 +876,11 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) -> Stemmed = couch_key_tree:stem(Tree, Limit), Info#full_doc_info{rev_tree = Stemmed}. -stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> - lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos). +full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> + lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) -> + Stemmed = couch_key_tree:full_stem(Tree, Limit), + FDI#full_doc_info{rev_tree=Stemmed} + DocInfos). update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) -> #db{ @@ -1125,7 +1128,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> } end, NewInfos0), - NewInfos = stem_full_doc_infos(Db, NewInfos1), + NewInfos = full_stem_full_doc_infos(Db, NewInfos1), RemoveSeqs = case Retry of nil -> diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl index bc4076a..e509e89 100644 --- a/src/couch/src/couch_key_tree.erl +++ b/src/couch/src/couch_key_tree.erl @@ -62,7 +62,8 @@ mapfold/3, merge/3, merge/2, remove_leafs/2, -stem/2 +stem/2, +full_stem/2 ]). -include_lib("couch/include/couch_db.hrl"). @@ -470,6 +471,47 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) -> stem(Trees, Limit) -> + lists:map(fun(Tree) -> + stem_tree(Tree, Limit) + end, Trees). + + +stem_tree({Depth, Children}, Limit) -> + lists:map(fun(Child) -> + stem_tree(Depth, Child, Limit) + end, Children). + + +stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) -> + {Limit - 1, Leaf, []}; + +stem_tree(Depth, {Key, Val, Children}, Limit) -> + FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) -> + case stem_tree(Depth + 1, Child, Limit) of + {LimitPos, NewChild, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewChildAcc = [NewChild | ChildAcc], + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, NewChildAcc, NewBranchAcc}; + {LimitPos, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, ChildAcc, NewBranchAcc} + end + end, {-1, [], []}, Children), + {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc, + NewChildren = lists:sort(FinalChildren) + case {FinalLimitPos, NewChildren} of + {N, _} when N > 0, length(NewChildren) > 0 -> + {FinalLimiPos - 1, {Key, Value, NewChildren}, FinalBranches}; + {0, _} when length(NewChildren) > 0 -> + {-1, [{Depth, NewChildren} | FinalBranches]}; + {N, []} when N < 0 -> + {FinalLimitPos - 1, FinalBranches} + end. + + +full_stem(Trees, Limit) -> % flatten each branch in a tree into a tree path, sort by starting rev # Paths = lists:sort(lists:map(fun({Pos, Path}) -> StemmedPath = lists:sublist(Path, Limit), -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
