The part that I didn't quite get was the following > But what happens is at the point we calculate the seq to remove from > docinfo_by_seq_tree, we calculate the wrong value. Thus when the update > continues we remove an update_seq that doesn't exist in the tree and insert > our new seq.
Do we end up selecting the sequence from the non-leaf revision? Best, Adam On Friday, September 2, 2011 at 6:29 AM, Dave Cottlehuber wrote: > Paul, > > Love your work mate! Outstanding commit msg, I'm working through it (slowly). > > A+ > D > > On 2 September 2011 16:31, <[email protected] (mailto:[email protected])> > wrote: > > Author: davisp > > Date: Fri Sep 2 04:31:10 2011 > > New Revision: 1164350 > > > > URL: http://svn.apache.org/viewvc?rev=1164350&view=rev > > Log: > > Fix introduction of duplicates into _changes feed > > > > When a document is updated the new update_seq is assigned as part of the > > rev_tree merging in couch_db_updater:merge_rev_trees/7 based on the > > condition of whether the new rev_tree is equal to the old tree. This > > equality is done as a simple Erlang term comparison. If the trees are > > not equal a new update_seq is assigned to the #full_doc_info{} record > > that is stored in fulldocinfo_by_id_btree. > > > > During replication it is possible that a document update merges into the > > rev_tree internally without creating a leaf. If the terminal node of the > > replicated path happens to land on a node with a value of ?REV_MISSING > > the new document information will be preferred and replace the > > ?REV_MISSING value. > > > > This preference ends up causing the rev_tree comparison to evaluate to > > false which ends up giving this document a new update_seq. Up until this > > point everything is fine. We write a bit of extra data (that will be > > cleared during compaction) because of a previous bug where we decided to > > be cautious and avoid losing data due to a broken rev_tree merging > > aglorithm. It is also important to note that this is the place were we > > calculate the update_seq to remove from the docinfo_by_seq_tree. > > > > After this point we get back to couch_db_udpater:update_docs_int/5 where > > we eventually call couch_db_updater:new_index_entries/3 which creates > > the new entries for the fulldocinfo_by_id_tree and docinfo_by_seq_btree. > > At this point we end up creating a #doc_info{} record based on the > > leaves in the rev_tree. As you recall, the update that caused the new > > update_seq was not a leaf, at this point we create a #doc_info{} record > > with an incorrect high_seq member pointing to the update_seq we are > > about to remove from the docinfo_by_seq_tree (remember we calculated the > > seq to remove before we consulted the leaves). > > > > The end result is that we remove the same update_seq we insert. This > > sets us up for the real bug. The next time we go to update this document > > the same procedure is applied. But what happens is at the point we > > calculate the seq to remove from docinfo_by_seq_tree, we calculate the > > wrong value. Thus when the update continues we remove an update_seq that > > doesn't exist in the tree and insert our new seq. But, the seq from the > > previous update is still in the tree. Thus, our docinfo_by_seq_tree now > > contains two entries pointing at the same document. > > > > At this point, we run into the observed behavior of this bug that ends > > up causing duplicate entries in views which then ends up throwing errors > > when the view is compaction. These errors are also particularly nasty > > because they bubble up the the couch_view gen_server which crashes and > > spiders out crashing every couch_view_group process. That part probably > > isn't important though. > > > > There's a simple test include with the patch to illustrate the behavior > > and maintain an assertion that it stays fixed. > > > > Fixes COUCHDB-1265 > > > > > > Modified: > > couchdb/trunk/share/www/script/test/recreate_doc.js > > couchdb/trunk/src/couchdb/couch_doc.erl > > couchdb/trunk/src/couchdb/couch_key_tree.erl > > > > Modified: couchdb/trunk/share/www/script/test/recreate_doc.js > > URL: > > http://svn.apache.org/viewvc/couchdb/trunk/share/www/script/test/recreate_doc.js?rev=1164350&r1=1164349&r2=1164350&view=diff > > ============================================================================== > > --- couchdb/trunk/share/www/script/test/recreate_doc.js (original) > > +++ couchdb/trunk/share/www/script/test/recreate_doc.js Fri Sep 2 04:31:10 > > 2011 > > @@ -77,4 +77,45 @@ couchTests.recreate_doc = function(debug > > } catch (e) { > > T(e.error == "conflict"); > > } > > + > > + db.deleteDb(); > > + db.createDb(); > > + > > + // COUCHDB-1265 > > + // Resuscitate an unavailable old revision and make sure that it > > + // doesn't introduce duplicates into the _changes feed. > > + > > + var doc = {_id: "bar", count: 0}; > > + T(db.save(doc).ok); > > + var ghost = {_id: "bar", _rev: doc._rev, count: doc.count}; > > + for(var i = 0; i < 2; i++) { > > + doc.count += 1; > > + T(db.save(doc).ok); > > + } > > + > > + // Compact so that the old revision to be resuscitated will be > > + // in the rev_tree as ?REV_MISSING > > + db.compact(); > > + while(db.info (http://db.info)().compact_running) {} > > + > > + // Saving the ghost here puts it back in the rev_tree in such > > + // a way as to create a new update_seq but without changing a > > + // leaf revision. This would cause the #full_doc_info{} and > > + // #doc_info{} records to diverge in their idea of what the > > + // doc's update_seq is and end up introducing a duplicate in > > + // the _changes feed the next time this doc is updated. > > + T(db.save(ghost, {new_edits: false}).ok); > > + > > + // The duplicate would have been introduce here becuase the #doc_info{} > > + // would not have been removed correctly. > > + T(db.save(doc).ok); > > + > > + // And finally assert that there are no duplicates in _changes. > > + var req = CouchDB.request("GET", "/test_suite_db/_changes"); > > + var resp = JSON.parse(req.responseText); > > + var docids = {}; > > + for(var i = 0; i < resp.results.length; i++) { > > + T(docids[resp.results[i].id] === undefined, "Duplicates in _changes > > feed."); > > + docids[resp.results[i].id] = true; > > + } > > }; > > > > Modified: couchdb/trunk/src/couchdb/couch_doc.erl > > URL: > > http://svn.apache.org/viewvc/couchdb/trunk/src/couchdb/couch_doc.erl?rev=1164350&r1=1164349&r2=1164350&view=diff > > ============================================================================== > > --- couchdb/trunk/src/couchdb/couch_doc.erl (original) > > +++ couchdb/trunk/src/couchdb/couch_doc.erl Fri Sep 2 04:31:10 2011 > > @@ -319,10 +319,19 @@ to_doc_info(FullDocInfo) -> > > {DocInfo, _Path} = to_doc_info_path(FullDocInfo), > > DocInfo. > > > > -max_seq([], Max) -> > > - Max; > > -max_seq([#rev_info{seq=Seq}|Rest], Max) -> > > - max_seq(Rest, if Max > Seq -> Max; true -> Seq end). > > +max_seq(Tree) -> > > + FoldFun = fun({_Pos, _Key}, Value, _Type, MaxOldSeq) -> > > + case Value of > > + {_Deleted, _DiskPos, OldTreeSeq} -> > > + % Older versions didn't track data sizes. > > + erlang:max(MaxOldSeq, OldTreeSeq); > > + {_Deleted, _DiskPos, OldTreeSeq, _Size} -> > > + erlang:max(MaxOldSeq, OldTreeSeq); > > + _ -> > > + MaxOldSeq > > + end > > + end, > > + couch_key_tree:fold(FoldFun, 0, Tree). > > > > to_doc_info_path(#full_doc_info{id=Id,rev_tree=Tree}) -> > > RevInfosAndPath = [ > > @@ -342,7 +351,7 @@ to_doc_info_path(#full_doc_info{id=Id,re > > end, RevInfosAndPath), > > [{_RevInfo, WinPath}|_] = SortedRevInfosAndPath, > > RevInfos = [RevInfo || {RevInfo, _Path} <- SortedRevInfosAndPath], > > - {#doc_info{id=Id, high_seq=max_seq(RevInfos, 0), revs=RevInfos}, WinPath}. > > + {#doc_info{id=Id, high_seq=max_seq(Tree), revs=RevInfos}, WinPath}. > > > > > > > > > > Modified: couchdb/trunk/src/couchdb/couch_key_tree.erl > > URL: > > http://svn.apache.org/viewvc/couchdb/trunk/src/couchdb/couch_key_tree.erl?rev=1164350&r1=1164349&r2=1164350&view=diff > > ============================================================================== > > --- couchdb/trunk/src/couchdb/couch_key_tree.erl (original) > > +++ couchdb/trunk/src/couchdb/couch_key_tree.erl Fri Sep 2 04:31:10 2011 > > @@ -49,7 +49,7 @@ > > > > -export([merge/3, find_missing/2, get_key_leafs/2, get_full_key_paths/2, > > get/2]). > > -export([get_all_leafs/1, count_leafs/1, remove_leafs/2, > > get_all_leafs_full/1, stem/2]). > > --export([map/2, mapfold/3, map_leafs/2]). > > +-export([map/2, mapfold/3, map_leafs/2, fold/3]). > > > > -include("couch_db.hrl"). > > > > @@ -320,6 +320,21 @@ count_leafs_simple([{_Key, _Value, SubTr > > count_leafs_simple(SubTree) + count_leafs_simple(RestTree). > > > > > > +fold(_Fun, Acc, []) -> > > + Acc; > > +fold(Fun, Acc0, [{Pos, Tree}|Rest]) -> > > + Acc1 = fold_simple(Fun, Acc0, Pos, [Tree]), > > + fold(Fun, Acc1, Rest). > > + > > +fold_simple(_Fun, Acc, _Pos, []) -> > > + Acc; > > +fold_simple(Fun, Acc0, Pos, [{Key, Value, SubTree} | RestTree]) -> > > + Type = if SubTree == [] -> leaf; true -> branch end, > > + Acc1 = Fun({Pos, Key}, Value, Type, Acc0), > > + Acc2 = fold_simple(Fun, Acc1, Pos+1, SubTree), > > + fold_simple(Fun, Acc2, Pos, RestTree). > > + > > + > > map(_Fun, []) -> > > []; > > map(Fun, [{Pos, Tree}|Rest]) ->
