This is not a rant but a selfish wish :) I wish all commit messages were as explanatory as the one below. As I am not really deep into the Erlang code but would love to understand more, a good way for me to learn is to read all the commit messages. But mostly we have one-liners. Reading the code is OK, but with an explanatory message it is clearer.
Maybe you can keep this in mind when you write your next commit message, to help me and others :) Thanks a lot. Cheers, Andy ---------- Forwarded message ---------- From: <[email protected]> Date: 10 February 2014 22:54 Subject: couch commit: updated refs/heads/2001-feature-external-size to d9d11a9 To: [email protected] Updated Branches: refs/heads/2001-feature-external-size [created] d9d11a943 Implement database external size calculations This patch adds calculations to show the "external size" of a database which is roughly a measure of how much disk space it would take to store the contents of the database in flat files. It is used to calculate rough compression ratios for capacity planning. COUCHDB-2001 Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/d9d11a94 Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/d9d11a94 Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/d9d11a94 Branch: refs/heads/2001-feature-external-size Commit: d9d11a943e2815897ee5b5b3d1b1eef4b417c1fa Parents: 09c6556 Author: Paul J. Davis <[email protected]> Authored: Mon Feb 10 14:03:47 2014 -0600 Committer: Paul J. 
Davis <[email protected]> Committed: Mon Feb 10 15:32:21 2014 -0600 ---------------------------------------------------------------------- include/couch_db.hrl | 5 +- src/couch_btree.erl | 10 +- src/couch_compress.erl | 10 ++ src/couch_db.erl | 56 +++++----- src/couch_db_updater.erl | 237 ++++++++++++++++++++++++++---------------- 5 files changed, 196 insertions(+), 122 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/include/couch_db.hrl ---------------------------------------------------------------------- diff --git a/include/couch_db.hrl b/include/couch_db.hrl index 2c015df..2ce5ebe 100644 --- a/include/couch_db.hrl +++ b/include/couch_db.hrl @@ -65,7 +65,7 @@ update_seq = 0, deleted = false, rev_tree = [], - leafs_size = 0 + sizes = {0, 0} }). -record(httpd, @@ -251,6 +251,7 @@ deleted, ptr, seq, - size = nil + sizes = {0, 0}, + atts = [] }). http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_btree.erl ---------------------------------------------------------------------- diff --git a/src/couch_btree.erl b/src/couch_btree.erl index 9caceb8..ac5681d 100644 --- a/src/couch_btree.erl +++ b/src/couch_btree.erl @@ -61,8 +61,8 @@ final_reduce(#btree{reduce=Reduce}, Val) -> final_reduce(Reduce, Val); final_reduce(Reduce, {[], []}) -> Reduce(reduce, []); -final_reduce(_Bt, {[], [Red]}) -> - Red; +final_reduce(Reduce, {[], [Red]}) -> + Reduce(rereduce, [Red]); final_reduce(Reduce, {[], Reductions}) -> Reduce(rereduce, Reductions); final_reduce(Reduce, {KVs, Reductions}) -> @@ -92,14 +92,14 @@ fold_reduce(#btree{root=Root}=Bt, Fun, Acc, Options) -> full_reduce(#btree{root=nil,reduce=Reduce}) -> {ok, Reduce(reduce, [])}; -full_reduce(#btree{root=Root}) -> - {ok, element(2, Root)}. +full_reduce(#btree{root=Root, reduce=Reduce}) -> + {ok, Reduce(rereduce, [element(2, Root)])}. 
size(#btree{root = nil}) -> 0; size(#btree{root = {_P, _Red}}) -> % pre 1.2 format - nil; + undefined; size(#btree{root = {_P, _Red, Size}}) -> Size. http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_compress.erl ---------------------------------------------------------------------- diff --git a/src/couch_compress.erl b/src/couch_compress.erl index 6b47a7a..49876d8 100644 --- a/src/couch_compress.erl +++ b/src/couch_compress.erl @@ -14,6 +14,7 @@ -export([compress/2, decompress/1, is_compressed/2]). -export([get_compression_method/0]). +-export([uncompressed_length/1]). -include_lib("couch/include/couch_db.hrl"). @@ -82,3 +83,12 @@ is_compressed(<<?TERM_PREFIX, _/binary>>, Method) -> is_compressed(Term, _Method) when not is_binary(Term) -> false. + +uncompressed_length(<<?SNAPPY_PREFIX, _/binary>> = Bin) -> + snappy:uncompressed_length(Bin); +uncompressed_length(<<?COMPRESSED_TERM_PREFIX, _/binary>> = Bin) -> + <<131, 80, Size:4/big-unsigned-integer-unit:8, _/binary>> = Bin, + Size; +uncompressed_length(<<?TERM_PREFIX, _/binary>> = Bin) -> + size(Bin). 
+ http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db.erl ---------------------------------------------------------------------- diff --git a/src/couch_db.erl b/src/couch_db.erl index 32a0049..6432e8e 100644 --- a/src/couch_db.erl +++ b/src/couch_db.erl @@ -302,44 +302,48 @@ get_db_info(Db) -> name=Name, instance_start_time=StartTime, committed_update_seq=CommittedUpdateSeq, - id_tree = IdBtree, - seq_tree = SeqBtree, - local_tree = LocalBtree + id_tree = IdBtree } = Db, - {ok, Size} = couch_file:bytes(Fd), - {ok, DbReduction} = couch_btree:full_reduce(IdBtree), + {ok, FileSize} = couch_file:bytes(Fd), + {ok, {Count, DelCount, Sizes}} = couch_btree:full_reduce(IdBtree), + {ActiveSize0, ExternalSize} = Sizes, + ActiveSize = active_size(Db, ActiveSize0), InfoList = [ {db_name, Name}, - {doc_count, element(1, DbReduction)}, - {doc_del_count, element(2, DbReduction)}, + {doc_count, Count}, + {doc_del_count, DelCount}, {update_seq, SeqNum}, {purge_seq, couch_db:get_purge_seq(Db)}, {compact_running, Compactor/=nil}, - {disk_size, Size}, - {data_size, db_data_size(DbReduction, [SeqBtree, IdBtree, LocalBtree])}, + {disk_size, FileSize}, + {data_size, ActiveSize}, + {sizes, {[ + {file, FileSize}, + {active, ActiveSize}, + {external, ExternalSize} + ]}}, {instance_start_time, StartTime}, {disk_format_version, DiskVersion}, {committed_update_seq, CommittedUpdateSeq} ], {ok, InfoList}. -db_data_size({_Count, _DelCount}, _Trees) -> - % pre 1.2 format, upgraded on compaction - null; -db_data_size({_Count, _DelCount, nil}, _Trees) -> - null; -db_data_size({_Count, _DelCount, DocAndAttsSize}, Trees) -> - sum_tree_sizes(DocAndAttsSize, Trees). - -sum_tree_sizes(Acc, []) -> - Acc; -sum_tree_sizes(Acc, [T | Rest]) -> - case couch_btree:size(T) of - nil -> - null; - Sz -> - sum_tree_sizes(Acc + Sz, Rest) - end. 
+active_size(#db{}=Db, DocActiveSize) -> + Trees = [ + Db#db.id_tree, + Db#db.seq_tree, + Db#db.local_tree + ], + lists:foldl(fun(T, Acc) -> + case couch_btree:size(T) of + _ when Acc == null -> + null; + undefined -> + null; + Size -> + Acc + Size + end + end, DocActiveSize, Trees). get_design_docs(#db{name = <<"shards/", _:18/binary, DbName/binary>>}) -> {_, Ref} = spawn_monitor(fun() -> exit(fabric:design_docs(DbName)) end), http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/d9d11a94/src/couch_db_updater.erl ---------------------------------------------------------------------- diff --git a/src/couch_db_updater.erl b/src/couch_db_updater.erl index 649826a..eb75177 100644 --- a/src/couch_db_updater.erl +++ b/src/couch_db_updater.erl @@ -342,40 +342,60 @@ collect_updates(GroupedDocsAcc, ClientsAcc, MergeConflicts, FullCommit) -> end. rev_tree(DiskTree) -> - couch_key_tree:mapfold(fun - (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, leaf, _Acc) -> + couch_key_tree:map(fun + (_RevId, {Del, Ptr, Seq}) -> % pre 1.2 format, will be upgraded on compaction - {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq}, nil}; - (_RevId, {IsDeleted, BodyPointer, UpdateSeq}, branch, Acc) -> - {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq}, Acc}; - (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, leaf, Acc) -> - Acc2 = sum_leaf_sizes(Acc, Size), - {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq, size=Size}, Acc2}; - (_RevId, {IsDeleted, BodyPointer, UpdateSeq, Size}, branch, Acc) -> - {#leaf{deleted=?i2b(IsDeleted), ptr=BodyPointer, seq=UpdateSeq, size=Size}, Acc}; - (_RevId, ?REV_MISSING, _Type, Acc) -> - {?REV_MISSING, Acc} - end, 0, DiskTree). 
+ #leaf{deleted=?i2b(Del), ptr=Ptr, seq=Seq}; + (_RevId, {Del, Ptr, Seq, Size}) -> + % Pre-bigcouch format, will be upgraded on compaction + #leaf{ + deleted = ?i2b(Del), + ptr = Ptr, + seq = Seq, + sizes = {Size, 0}, + atts = [] + }; + (_RevId, {Del, Ptr, Seq, Sizes, Atts}) -> + #leaf{ + deleted = ?i2b(Del), + ptr = Ptr, + seq = Seq, + sizes = Sizes, + atts = Atts + }; + (_RevId, ?REV_MISSING) -> + ?REV_MISSING + end, DiskTree). disk_tree(RevTree) -> couch_key_tree:map(fun (_RevId, ?REV_MISSING) -> ?REV_MISSING; - (_RevId, #leaf{deleted=IsDeleted, ptr=BodyPointer, seq=UpdateSeq, size=Size}) -> - {?b2i(IsDeleted), BodyPointer, UpdateSeq, Size} + (_RevId, #leaf{}=Leaf) -> + #leaf{ + deleted = Del, + ptr = Ptr, + seq = Seq, + sizes = Sizes, + atts = Atts + } = Leaf, + {?b2i(Del), Ptr, Seq, upgrade_sizes(Sizes), Atts} end, RevTree). +upgrade_sizes({_, _} = Sizes) -> + Sizes; +upgrade_sizes(S) when is_integer(S) -> + {S, 0}. + btree_by_seq_split(#full_doc_info{id=Id, update_seq=Seq, deleted=Del, rev_tree=T}) -> {Seq, {Id, ?b2i(Del), disk_tree(T)}}. btree_by_seq_join(Seq, {Id, Del, DiskTree}) when is_integer(Del) -> - {RevTree, LeafsSize} = rev_tree(DiskTree), #full_doc_info{ id = Id, update_seq = Seq, deleted = ?i2b(Del), - rev_tree = RevTree, - leafs_size = LeafsSize + rev_tree = rev_tree(DiskTree) }; btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) -> % Older versions stored #doc_info records in the seq_tree. @@ -389,49 +409,59 @@ btree_by_seq_join(KeySeq, {Id, RevInfos, DeletedRevInfos}) -> [#rev_info{rev=Rev,seq=Seq,deleted=true,body_sp = Bp} || {Rev, Seq, Bp} <- DeletedRevInfos]}. -btree_by_id_split(#full_doc_info{id=Id, update_seq=Seq, - deleted=Deleted, rev_tree=Tree}) -> - {Id, {Seq, ?b2i(Deleted), disk_tree(Tree)}}. +btree_by_id_split(#full_doc_info{}=Info) -> + #full_doc_info{ + id = Id, + update_seq = Seq, + deleted = Del, + sizes = Sizes, + rev_tree = Tree + } = Info, + {Id, {Seq, ?b2i(Del), upgrade_sizes(Sizes), disk_tree(Tree)}}. 
btree_by_id_join(Id, {HighSeq, Deleted, DiskTree}) -> - {Tree, LeafsSize} = rev_tree(DiskTree), + % Upgrade from pre-BigCouch disk format + ActiveSize = couch_key_tree:fold(fun + (_RevId, {_Del, _Ptr, _Seq}, _, Acc) -> + Acc; + (_RevId, {_Del, _Ptr, _Seq, Size}, _, Acc) -> + Acc + Size; + (_RevId, {_Del, _Ptr, _Seq, Sizes, _Atts}, _, Acc) -> + {Active, _} = Sizes, + Active + Acc; + (_RevId, ?REV_MISSING, _, Acc) -> + Acc + end, 0, DiskTree), + btree_by_id_join(Id, {HighSeq, Deleted, {ActiveSize, 0}, DiskTree}); + +btree_by_id_join(Id, {HighSeq, Deleted, Sizes, DiskTree}) -> #full_doc_info{ id = Id, update_seq = HighSeq, deleted = ?i2b(Deleted), - rev_tree = Tree, - leafs_size = LeafsSize + sizes = Sizes, + rev_tree = rev_tree(DiskTree) }. btree_by_id_reduce(reduce, FullDocInfos) -> - lists:foldl( - fun(Info, {NotDeleted, Deleted, Size}) -> - Size2 = sum_leaf_sizes(Size, Info#full_doc_info.leafs_size), - case Info#full_doc_info.deleted of - true -> - {NotDeleted, Deleted + 1, Size2}; - false -> - {NotDeleted + 1, Deleted, Size2} - end - end, - {0, 0, 0}, FullDocInfos); -btree_by_id_reduce(rereduce, Reds) -> - lists:foldl( - fun({NotDeleted, Deleted}, {AccNotDeleted, AccDeleted, _AccSize}) -> - % pre 1.2 format, will be upgraded on compaction - {AccNotDeleted + NotDeleted, AccDeleted + Deleted, nil}; - ({NotDeleted, Deleted, Size}, {AccNotDeleted, AccDeleted, AccSize}) -> - AccSize2 = sum_leaf_sizes(AccSize, Size), - {AccNotDeleted + NotDeleted, AccDeleted + Deleted, AccSize2} - end, - {0, 0, 0}, Reds). - -sum_leaf_sizes(nil, _) -> - nil; -sum_leaf_sizes(_, nil) -> - nil; -sum_leaf_sizes(Size1, Size2) -> - Size1 + Size2. 
+ lists:foldl(fun + (#full_doc_info{deleted=false, sizes=Sizes}, {NotDel, Del, SAcc}) -> + {NotDel + 1, Del, reduce_sizes(Sizes, SAcc)}; + (#full_doc_info{deleted=true, sizes=Sizes}, {NotDel, Del, SAcc}) -> + {NotDel, Del + 1, reduce_sizes(Sizes, SAcc)} + end, {0, 0, {0, 0}}, FullDocInfos); +btree_by_id_reduce(rereduce, Reductions) -> + lists:foldl(fun + ({NotDel, Del}, {NDAcc, DAcc, SAcc}) -> + {NotDel + NDAcc, Del + DAcc, SAcc}; + ({NotDel, Del, Sizes}, {NDAcc, DAcc, SAcc}) -> + {NotDel + NDAcc, Del + DAcc, reduce_sizes(Sizes, SAcc)} + end, {0, 0, {0, 0}}, Reductions). + +reduce_sizes({A1, E1}, {A2, E2}) -> + {A1 + A2, E1 + E2}; +reduce_sizes(S, {_, _} = Acc) when is_integer(S) -> + reduce_sizes({S, 0}, Acc). btree_by_seq_reduce(reduce, DocInfos) -> % count the number of documents @@ -549,10 +579,15 @@ flush_trees(_Db, [], AccFlushedTrees) -> flush_trees(#db{fd = Fd} = Db, [InfoUnflushed | RestUnflushed], AccFlushed) -> #full_doc_info{update_seq=UpdateSeq, rev_tree=Unflushed} = InfoUnflushed, - {Flushed, LeafsSize} = couch_key_tree:mapfold( + {Flushed, FinalAcc} = couch_key_tree:mapfold( fun(_Rev, Value, Type, Acc) -> case Value of - #doc{deleted = IsDeleted, body = {summary, Summary, AttsFd}} -> + #doc{} = Doc -> + #doc{ + deleted = IsDeleted, + body = {summary, Summary, AttsFd}, + atts = Atts + } = Doc, % this node value is actually an unwritten document summary, % write to disk. % make sure the Fd in the written bins is the same Fd we are @@ -571,31 +606,44 @@ flush_trees(#db{fd = Fd} = Db, " changed. 
Possibly retrying.", []), throw(retry) end, - {ok, NewSummaryPointer, SummarySize} = - couch_file:append_raw_chunk(Fd, Summary), - TotalSize = lists:foldl( - fun(#att{att_len = L}, A) -> A + L end, - SummarySize, Value#doc.atts), - NewValue = #leaf{deleted=IsDeleted, ptr=NewSummaryPointer, - seq=UpdateSeq, size=TotalSize}, - case Type of - leaf -> - {NewValue, Acc + TotalSize}; - branch -> - {NewValue, Acc} - end; - {_, _, _, LeafSize} when Type =:= leaf, LeafSize =/= nil -> - {Value, Acc + LeafSize}; - _ -> + AttsInfo = lists:usort([ + {P, L} || #att{data = {_, P}, att_len = L} <- Atts + ]), + [_, _, SummaryBin] = Summary, + ExternalSize = couch_compress:uncompressed_length(SummaryBin), + {ok, NewPtr, ActiveSize} + = couch_file:append_raw_chunk(Fd, Summary), + Leaf = #leaf{ + deleted = IsDeleted, + ptr = NewPtr, + seq = UpdateSeq, + sizes = {ActiveSize, ExternalSize}, + atts = AttsInfo + }, + {Leaf, add_sizes(Type, Leaf, Acc)}; + #leaf{} = Leaf -> + {Value, add_sizes(Type, Leaf, Acc)}; + ?REV_MISSING -> {Value, Acc} end - end, 0, Unflushed), + end, {0, 0, []}, Unflushed), + {FinalAS, FinalES, FinalAtts} = FinalAcc, + TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts), InfoFlushed = InfoUnflushed#full_doc_info{ rev_tree = Flushed, - leafs_size = LeafsSize + sizes = {FinalAS + TotalAttSize, FinalES + TotalAttSize} }, flush_trees(Db, RestUnflushed, [InfoFlushed | AccFlushed]). +add_sizes(branch, _, Acc) -> + Acc; +add_sizes(leaf, #leaf{sizes=Sizes, atts=AttSizes}, Acc) -> + {ActiveSize, ExternalSize} = upgrade_sizes(Sizes), + {ASAcc, ESAcc, AttsAcc} = Acc, + NewASAcc = ActiveSize + ASAcc, + NewESAcc = ExternalSize + ESAcc, + NewAttsAcc = lists:umerge(AttSizes, AttsAcc), + {NewASAcc, NewESAcc, NewAttsAcc}. 
send_result(Client, Ref, NewResult) -> % used to send a result to the client @@ -896,23 +944,34 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> A =< B end, merge_lookups(MixedInfos, LookupResults)), - NewInfos1 = lists:map( - fun(#full_doc_info{rev_tree=RevTree}=Info) -> - Info#full_doc_info{rev_tree=couch_key_tree:map( - fun(_, _, branch) -> - ?REV_MISSING; - (_Rev, #leaf{ptr=Sp}=Leaf, leaf) -> - {_Body, AttsInfo} = Summary = copy_doc_attachments( - Db, Sp, DestFd), - SummaryChunk = make_doc_summary(NewDb, Summary), - {ok, Pos, SummarySize} = couch_file:append_raw_chunk( - DestFd, SummaryChunk), - TotalLeafSize = lists:foldl( - fun({_, _, _, AttLen, _, _, _, _}, S) -> S + AttLen end, - SummarySize, AttsInfo), - Leaf#leaf{ptr=Pos, size=TotalLeafSize} - end, RevTree)} - end, NewInfos0), + NewInfos1 = lists:map(fun(Info) -> + {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun + (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) -> + {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd), + Summary = make_doc_summary(NewDb, {Body, AttInfos}), + [_, _, SummaryBin] = Summary, + ExternalSize = couch_compress:uncompressed_length(SummaryBin), + {ok, Pos, ActiveSize} + = couch_file:append_raw_chunk(DestFd, Summary), + AttSizes = [{element(3, A), element(4, A)} || A <- AttInfos], + NewLeaf = Leaf#leaf{ + ptr = Pos, + sizes = {ActiveSize, ExternalSize}, + atts = lists:usort(AttSizes) + }, + {NewLeaf, add_sizes(leaf, NewLeaf, SizesAcc)}; + (_Rev, _Value, branch, SizesAcc) -> + {?REV_MISSING, SizesAcc} + end, {0, 0, []}, Info#full_doc_info.rev_tree), + {FinalAS, FinalES, FinalAtts} = FinalAcc, + TotalAttSize = lists:foldl(fun({_, S}, A) -> S + A end, 0, FinalAtts), + NewActiveSize = FinalAS + TotalAttSize, + NewExternalSize = FinalES + TotalAttSize, + Info#full_doc_info{ + rev_tree = NewRevTree, + sizes = {NewActiveSize, NewExternalSize} + } + end, NewInfos0), NewInfos = stem_full_doc_infos(Db, NewInfos1), RemoveSeqs = -- Andy Wenk Hamburg - Germany RockIt! 
http://www.couchdb-buch.de http://www.pg-praxisbuch.de GPG fingerprint: C044 8322 9E12 1483 4FEC 9452 B65D 6BE3 9ED3 9588 https://people.apache.org/keys/committer/andywenk.asc
