Just for the record, there was a problem with the patch: it broke the
continuation of compactions that fail for whatever reason. Here is the
corrected patch:
$ cat compact_deleted.patch
--- couchdb-0.11.0/src/couchdb/couch_db.erl 2010-03-04 02:17:44.000000000 -0300
+++ couchdb-0.11.0.new/src/couchdb/couch_db.erl 2010-05-04 17:18:54.000000000 -0300
@@ -323,7 +323,7 @@
get_revs_limit(#db{revs_limit=Limit}) ->
Limit.
-set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit > 0 ->
+set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit >= 0 ->
check_is_admin(Db),
gen_server:call(Pid, {set_revs_limit, Limit}, infinity);
set_revs_limit(_Db, _Limit) ->
--- couchdb-0.11.0/src/couchdb/couch_key_tree.erl 2009-11-21 10:43:43.000000000 -0300
+++ couchdb-0.11.0.new/src/couchdb/couch_key_tree.erl 2010-05-04 17:40:57.000000000 -0300
@@ -314,7 +314,7 @@
% flatten each branch in a tree into a tree path
Paths = get_all_leafs_full(Trees),
- Paths2 = [{Pos, lists:sublist(Path, Limit)} || {Pos, Path} <- Paths],
+ Paths2 = [{Pos, lists:sublist(Path, lists:max([Limit, 1]))} || {Pos, Path} <- Paths],
% convert paths back to trees
lists:foldl(
--- couchdb-0.11.0/src/couchdb/couch_db_updater.erl 2010-02-22 12:20:53.000000000 -0300
+++ couchdb-0.11.0.new/src/couchdb/couch_db_updater.erl 2010-05-05 09:19:50.000000000 -0300
@@ -736,9 +736,16 @@
end, Tree).
-copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) ->
- Ids = [Id || #doc_info{id=Id} <- InfoBySeq],
- LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids),
+copy_docs(#db{revs_limit=Limit}=Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) ->
+ if Limit > 0 ->
+ Ids = [Id || #doc_info{id=Id} <- InfoBySeq],
+ LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids);
+ true ->
+ AllIds = [Id || #doc_info{id=Id} <- InfoBySeq],
+ BaseResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, AllIds),
+ LookupResults = [Filtered || {ok, #full_doc_info{deleted=false}}=Filtered <- BaseResults],
+ Ids = [Id || {ok, #full_doc_info{id=Id}} <- LookupResults]
+ end,
% write out the attachments
NewFullDocInfos0 = lists:map(
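
The fix is the {ok, ...} wrapper in the last added line: couch_btree:lookup/2
returns its results as {ok, #full_doc_info{}} tuples, so the old comprehension
pattern never matched and Ids always came out empty, which broke the cleanup
copy_docs does when a failed compaction is continued (the Retry path).
A minimal shell sketch, with plain tuples standing in for the record, shows
the difference:

1> Results = [{ok, {full_doc_info, <<"a">>}}, {ok, {full_doc_info, <<"b">>}}].
[{ok,{full_doc_info,<<"a">>}},{ok,{full_doc_info,<<"b">>}}]
2> [Id || {full_doc_info, Id} <- Results].       % old pattern: never matches
[]
3> [Id || {ok, {full_doc_info, Id}} <- Results]. % fixed pattern
[<<"a">>,<<"b">>]
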
Regards,
Mike
On Wed, 2010-05-05 at 10:42 -0300, Mike Leddy wrote:
> Hi,
>
> You are welcome.
>
> I had imagined trying some sort of database swapping but I never liked
> the idea of switching where the app looks to the shadow/replacement and
> doing it correctly while always available and replicating.
>
> I decided to bite the bullet and went for the my desired solution. It
> may be completely against the grain of good practice but I decided to
> patch couchdb to be able to purge deleted data on compaction.
>
> Please note I am an Erlang/Couchdb newbie.
>
> First it seemed reasonable to use revs_limit to trigger deleted document
> removal as I also want to clear out older revisions as well. I decided
> that a revs_limit = 0 might be appropriate.
>
> Patching couch_db.erl to allow revs_limit = 0 gave me:
>
> --- couchdb-0.11.0/src/couchdb/couch_db.erl 2010-03-04 02:17:44.000000000 -0300
> +++ couchdb-0.11.0.new/src/couchdb/couch_db.erl 2010-05-04 17:18:54.000000000 -0300
> @@ -323,7 +323,7 @@
> get_revs_limit(#db{revs_limit=Limit}) ->
> Limit.
>
> -set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit > 0 ->
> +set_revs_limit(#db{update_pid=Pid}=Db, Limit) when Limit >= 0 ->
> check_is_admin(Db),
> gen_server:call(Pid, {set_revs_limit, Limit}, infinity);
> set_revs_limit(_Db, _Limit) ->
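>
> With the guard relaxed to Limit >= 0, a zero limit is now accepted (this
> should return {"ok":true} rather than an error):
>
> curl -X PUT 'localhost:5984/db/_revs_limit' -d '0'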
>
> Making sure that revs_limit = 0 doesn't wipe out the whole database:
>
> --- couchdb-0.11.0/src/couchdb/couch_key_tree.erl 2009-11-21 10:43:43.000000000 -0300
> +++ couchdb-0.11.0.new/src/couchdb/couch_key_tree.erl 2010-05-04 17:40:57.000000000 -0300
> @@ -314,7 +314,7 @@
> % flatten each branch in a tree into a tree path
> Paths = get_all_leafs_full(Trees),
>
> - Paths2 = [{Pos, lists:sublist(Path, Limit)} || {Pos, Path} <- Paths],
> + Paths2 = [{Pos, lists:sublist(Path, lists:max([Limit, 1]))} || {Pos, Path} <- Paths],
>
> % convert paths back to trees
> lists:foldl(
>
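> Without the clamp, Limit = 0 would truncate every path to the empty list
> and drop every document; lists:max([Limit, 1]) keeps at least the leaf
> revision. A quick shell check of that behaviour:
>
> 1> lists:sublist([r3, r2, r1], 0).
> []
> 2> lists:sublist([r3, r2, r1], lists:max([0, 1])).
> [r3]
>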
> Now the trickier part: choosing a good place to filter out the deleted
> docs. After several failed attempts I chose this:
>
> --- couchdb-0.11.0/src/couchdb/couch_db_updater.erl 2010-02-22
> 12:20:53.000000000 -0300
> +++ couchdb-0.11.0.new/src/couchdb/couch_db_updater.erl 2010-05-05
> 09:19:50.000000000 -0300
> @@ -736,9 +736,16 @@
> end, Tree).
>
>
> -copy_docs(Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) ->
> - Ids = [Id || #doc_info{id=Id} <- InfoBySeq],
> - LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids),
> +copy_docs(#db{revs_limit=Limit}=Db, #db{fd=DestFd}=NewDb, InfoBySeq, Retry) ->
> + if Limit > 0 ->
> + Ids = [Id || #doc_info{id=Id} <- InfoBySeq],
> + LookupResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, Ids);
> + true ->
> + AllIds = [Id || #doc_info{id=Id} <- InfoBySeq],
> + BaseResults = couch_btree:lookup(Db#db.fulldocinfo_by_id_btree, AllIds),
> + LookupResults = [Filtered || {ok, #full_doc_info{deleted=false}}=Filtered <- BaseResults],
> + Ids = [Id || #full_doc_info{id=Id} <- LookupResults]
> + end,
>
> % write out the attachments
> NewFullDocInfos0 = lists:map(
>
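> When Limit > 0 the lookup is unchanged; with Limit = 0 the results are
> filtered so that docs whose #full_doc_info{} has deleted=true are never
> copied into the new file. A toy version of that filter, with {Id, Deleted}
> tuples standing in for the records:
>
> 1> Base = [{ok, {<<"a">>, false}}, {ok, {<<"b">>, true}}].
> [{ok,{<<"a">>,false}},{ok,{<<"b">>,true}}]
> 2> [F || {ok, {_Id, false}}=F <- Base].
> [{ok,{<<"a">>,false}}]
>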
> Now I can simply:
>
> curl -X PUT 'localhost:5984/db/_revs_limit' -d '0'
>
> and then:
>
> curl -X POST 'localhost:5984/db/_compact'
>
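> If it worked, the deleted stubs should no longer show up in the changes
> feed afterwards (the same query I used above to list them):
>
> curl 'localhost:5984/db/_changes?limit=5&since=0'
>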
> Hopefully I haven't broken anything - I am still running some tests, but
> it looks like I am able to do what I need and stay 100% available without
> altering my application.
>
> I am not suggesting that anyone use this as-is, but it might prompt
> someone to incorporate similar functionality into couchdb.
>
> Best regards,
>
> Mike
>
>
> On Tue, 2010-05-04 at 09:09 -0700, J Chris Anderson wrote:
> > On May 3, 2010, at 8:56 AM, Mike Leddy wrote:
> >
> > > Hi,
> > >
> > > I am currently on couchdb 0.11.0 using official debian packages with
> > > erlang 1:13.b.4-dfsg-4 and I am having problems purging old documents.
> > >
> > > My database is constantly receiving new data and old data (more than
> > > six weeks) is being deleted. I have been running like this for several
> > > months and the overhead of old deleted document 'stubs' is becoming
> > > relevant in day to day operations such as new replications, database
> > > compaction etc.
> > >
> > > I decided that it would be best to purge the old deleted documents
> > > so that the database would compact better and only contain relevant,
> > > i.e. recent, data.
> > >
> > > [What I would really like would be a compact that does not include
> > > documents that match a filter function, then I could do this on each
> > > node independently.]
> > >
> > > Unfortunately I am encountering problems purging the documents. I wrote
> > > a script to process all the documents via _changes and purge the old
> > > documents but I keep hitting documents that cannot be purged.
> > >
> >
> > Thanks for the bug report. One way I've seen people accomplish this use
> > case without purging is to store documents in a new database each week
> > and then throw out the old database files.
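> >
> > For instance (these database names are only illustrative):
> >
> > curl -X PUT 'localhost:5984/iris-2010w18'     # create this week's database
> > curl -X DELETE 'localhost:5984/iris-2010w10'  # drop an expired week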
> >
> > Purging is really designed for removing secret data that was accidentally
> > saved, more than for reclaiming space. Hopefully this will be easy to fix.
> >
> > Chris
> >
> > > Here is the start of my changes feed:
> > >
> > > curl 'localhost:5984/iris/_changes?limit=5&since=0'
> > > {"results":[
> > > {"seq":2,"id":"_design/admin","changes":[{"rev":"1-ea95c1898a2c779d664c1d1b71a24f33"}]},
> > > {"seq":22435808,"id":"1259540160F2016","changes":[{"rev":"2-7dcfd742f74c79286c3f3093595a83df"}],"deleted":true},
> > > {"seq":22435809,"id":"1259540640F2016","changes":[{"rev":"2-6bd122eb9f83c0838bc9875a1b73abaf"}],"deleted":true},
> > > {"seq":22435810,"id":"1259616780F2443","changes":[{"rev":"2-53e2311f5de7058fbfd55979816d3efc"}],"deleted":true},
> > > {"seq":22435811,"id":"1259616784F2443","changes":[{"rev":"2-caaff4cd1290f7807c2bcfeb6edc39e0"}],"deleted":true}
> > > ],
> > > "last_seq":22435811}
> > >
> > > This is a compacted copy of my main production database which is
> > > already on seq 106280009.
> > >
> > > When I try to purge I get a badarity error:
> > >
> > > curl -X POST 'localhost:5984/iris/_purge' -d
> > > '{"1259540160F2016":["2-7dcfd742f74c79286c3f3093595a83df"]}'
> > > {"error":"{{badarity,{#Fun<couch_db_updater.25.101160745>,\n
> > > [{2,<<124,230,79,165,16,199,208,127,32,211,160,223,180,12,3,28>>},\n
> > > {true,4185,19290621}]}},\n [{couch_key_tree,map_leafs_simple,3},\n
> > > {couch_key_tree,map_leafs_simple,3},\n {couch_key_tree,map_leafs,2},\n
> > > {couch_db_updater,'-handle_call/3-fun-2-',2},\n {lists,mapfoldl,3},\n
> > > {couch_db_updater,handle_call,3},\n {gen_server,handle_msg,5},\n
> > > {proc_lib,init_p_do_apply,3}]}","reason":"{gen_server,call,\n
> > > [<0.28323.1>,\n {purge_docs,[{<<\"1259540160F2016\">>,\n
> > > [{2,\n
> > > <<125,207,215,66,247,76,121,40,108,63,48,\n
> > > 147,89,90,131,223>>}]}]}]}"}
> > >
> > > This is what appears in my server log:
> > >
> > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28323.1>] ** Generic server
> > > <0.28323.1> terminating
> > > ** Last message in was {purge_docs,[{<<"1259540160F2016">>,
> > > [{2,
> > >
> > > <<125,207,215,66,247,76,121,40,108,
> > > 63,48,147,89,90,131,223>>}]}]}
> > > ** When Server state == {db,<0.28322.1>,<0.28323.1>,nil,
> > >
> > > <<"1272901503155514">>,<0.28320.1>,<0.28324.1>,
> > > {db_header,5,106014165,0,
> > > {59608213341,{24516895,36227125}},
> > > {59608203141,60744020},
> > > {59608273696,[]},
> > > 0,nil,nil,1000},
> > > 106014165,
> > > {btree,<0.28320.1>,
> > > {59608213341,{24516895,36227125}},
> > > #Fun<couch_db_updater.7.132302543>,
> > > #Fun<couch_db_updater.8.107957134>,
> > > #Fun<couch_btree.5.124754102>,
> > > #Fun<couch_db_updater.9.46112288>},
> > > {btree,<0.28320.1>,
> > > {59608203141,60744020},
> > > #Fun<couch_db_updater.10.19027664>,
> > > #Fun<couch_db_updater.11.35033879>,
> > > #Fun<couch_btree.5.124754102>,
> > > #Fun<couch_db_updater.12.56344865>},
> > > {btree,<0.28320.1>,
> > > {59608273696,[]},
> > > #Fun<couch_btree.0.83553141>,
> > > #Fun<couch_btree.1.30790806>,
> > > #Fun<couch_btree.2.124754102>,nil},
> > > 106014165,<<"iris">>,
> > >
> > > "/var/lib/couchdb/0.11.0/iris.couch",[],[],nil,
> > > {user_ctx,null,[],undefined},
> > > nil,1000,
> > > [before_header,after_header,on_file_open]}
> > > ** Reason for termination ==
> > > ** {{badarity,{#Fun<couch_db_updater.25.101160745>,
> > > [{2,
> > >
> > > <<124,230,79,165,16,199,208,127,32,211,160,223,180,12,3,28>>},
> > > {true,4185,19290621}]}},
> > > [{couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs,2},
> > > {couch_db_updater,'-handle_call/3-fun-2-',2},
> > > {lists,mapfoldl,3},
> > > {couch_db_updater,handle_call,3},
> > > {gen_server,handle_msg,5},
> > > {proc_lib,init_p_do_apply,3}]}
> > >
> > >
> > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28323.1>]
> > > {error_report,<0.31.0>,
> > > {<0.28323.1>,crash_report,
> > > [[{initial_call,{couch_db_updater,init,['Argument__1']}},
> > > {pid,<0.28323.1>},
> > > {registered_name,[]},
> > > {error_info,
> > > {exit,
> > > {{badarity,
> > > {#Fun<couch_db_updater.25.101160745>,
> > > [{2,
> > >
> > > <<124,230,79,165,16,199,208,127,32,211,160,223,180,12,
> > > 3,28>>},
> > > {true,4185,19290621}]}},
> > > [{couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs,2},
> > > {couch_db_updater,'-handle_call/3-fun-2-',2},
> > > {lists,mapfoldl,3},
> > > {couch_db_updater,handle_call,3},
> > > {gen_server,handle_msg,5},
> > > {proc_lib,init_p_do_apply,3}]},
> > >
> > > [{gen_server,terminate,6},{proc_lib,init_p_do_apply,3}]}},
> > > {ancestors,
> > >
> > > [<0.28322.1>,couch_server,couch_primary_services,couch_server_sup,
> > > <0.32.0>]},
> > > {messages,[]},
> > > {links,[<0.28322.1>]},
> > > {dictionary,[]},
> > > {trap_exit,false},
> > > {status,running},
> > > {heap_size,4181},
> > > {stack_size,24},
> > > {reductions,42618}],
> > > []]}}
> > >
> > > [Mon, 03 May 2010 15:45:03 GMT] [error] [<0.28291.1>] Uncaught error in
> > > HTTP request: {exit,
> > > {{{badarity,
> > >
> > > {#Fun<couch_db_updater.25.101160745>,
> > > [{2,
> > >
> > > <<124,230,79,165,16,199,208,127,32,211,
> > > 160,223,180,12,3,28>>},
> > > {true,4185,19290621}]}},
> > > [{couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs_simple,3},
> > > {couch_key_tree,map_leafs,2},
> > > {couch_db_updater,
> > > '-handle_call/3-fun-2-',2},
> > > {lists,mapfoldl,3},
> > > {couch_db_updater,handle_call,3},
> > > {gen_server,handle_msg,5},
> > > {proc_lib,init_p_do_apply,3}]},
> > > {gen_server,call,
> > > [<0.28323.1>,
> > > {purge_docs,
> > > [{<<"1259540160F2016">>,
> > > [{2,
> > > <<125,207,215,66,247,76,121,
> > > 40,108,63,48,147,89,90,131,
> > > 223>>}]}]}]}}}
> > >
> > > [Mon, 03 May 2010 15:45:03 GMT] [info] [<0.28291.1>] Stacktrace:
> > > [{gen_server,call,2},
> > > {couch_httpd_db,db_req,2},
> > > {couch_httpd_db,do_db_req,2},
> > > {couch_httpd,handle_request_int,5},
> > > {mochiweb_http,headers,5},
> > > {proc_lib,init_p_do_apply,3}]
> > >
> > > [Mon, 03 May 2010 15:45:03 GMT] [info] [<0.28291.1>] 127.0.0.1 - -
> > > 'POST' /iris/_purge 500
> > >
> > > Any suggestions would be greatly appreciated.
> > >
> > > Thanks,
> > >
> > > Mike