This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch btree-cache
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit c40e12990c72ced6dd9ae8f1df610d1e15a4a173
Author: Nick Vatamaniuc <vatam...@gmail.com>
AuthorDate: Mon Aug 18 18:30:29 2025 -0400

    BTree engine term cache
    
    Cache the top b-tree nodes and header terms. Items are inserted into the
    cache on first use and their usage counters are bumped on every subsequent
    access. Entries that are not accessed frequently enough are evicted.
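
    Each cache entry is a `{Key, UsageCounter, Term}` tuple in an ets table. A
    minimal sketch of the lookup path (simplified from the new
    couch_bt_engine_cache module; stats and shard selection omitted):

    ```
    % On a hit, bump the usage counter so the entry survives the next decay
    % pass; on a miss, return undefined so the caller reads the term from
    % disk and inserts it itself.
    lookup(Tid, Key) ->
        case ets:lookup_element(Tid, Key, 3, undefined) of
            undefined ->
                undefined;
            Term ->
                catch ets:update_counter(Tid, Key, 1),
                Term
        end.
    ```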
    
    In order to get or update any kv leaf nodes in the b-trees, we always have to
    read the top few kp nodes closest to the root. Even with parallel preads and
    the data being in the page cache, the cost of the system calls, marshaling
    the terms, and going through the Erlang IO system can add up.
    
    The cache is limited in size. The size defaults to 64 MiB and is configurable
    by the user via the `[bt_engine_cache] max_size` setting. If the cache is
    full, no more entries are added until some space frees up.
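
    Since `ets:info(Tid, memory)` reports sizes in words rather than bytes, the
    configured byte limit is turned into a per-shard word budget when the tables
    are created. A sketch of the arithmetic (function names here are
    illustrative; the real logic lives in new/0 and insert/2 of
    couch_bt_engine_cache):

    ```
    % E.g. 64 MiB on a 64-bit VM with 12 schedulers:
    % 67108864 / 8 / 12 ~= 699051 words per shard table.
    per_shard_limit(MaxSizeBytes) ->
        Shards = erlang:system_info(schedulers),
        round(MaxSizeBytes / erlang:system_info(wordsize) / Shards).

    % New entries are only accepted while the shard table is under its budget.
    maybe_insert(Tid, MaxWords, Key, Term) ->
        case ets:info(Tid, memory) < MaxWords of
            true -> ets:insert_new(Tid, {Key, 1, Term});
            false -> false
        end.
    ```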
    
    Since multiple processes access the data concurrently, the ets tables are
    sharded by the number of schedulers, not unlike how we shard our
    couch_server processes. Each ets table has an associated cleaner process
    which evicts unused entries. Cleaners traverse the table entries and "decay"
    the usage counters exponentially by repeatedly shifting the value to the
    right with the `bsr` operator. Entries whose usage counter reaches 0 are
    then removed.
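
    For illustration, a self-contained sketch of one decay-and-sweep pass. The
    module and table names are hypothetical, but the match specs mirror decay/1
    and clear/1 in the new couch_bt_engine_cache module:

    ```
    -module(decay_demo).
    -export([run/0]).

    run() ->
        Tid = ets:new(demo, [public]),
        % {Key, UsageCounter, Term} mirrors the cache entry layout.
        ets:insert(Tid, [{a, 9, term_a}, {b, 3, term_b}, {c, 1, term_c}]),
        decay(Tid),        % a -> 9 bsr 2 = 2, b -> 3 bsr 2 = 0, c -> 0
        clear(Tid),        % drops b and c, keeps a
        ets:tab2list(Tid). % [{a, 2, term_a}]

    % Shift every usage counter right by 2 bits, i.e. divide it by 4.
    decay(Tid) ->
        Head = {'$1', '$2', '$3'},
        Result = [{{'$1', {'bsr', '$2', 2}, '$3'}}],
        ets:select_replace(Tid, [{Head, [], Result}]).

    % Delete the entries whose counter has decayed to 0.
    clear(Tid) ->
        Head = {'_', '$1', '_'},
        Match = [{'=<', '$1', 0}],
        Result = [true],
        ets:select_delete(Tid, [{Head, Match, Result}]).
    ```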
    
    In order to only insert the top nodes into the cache, the couch_btree
    module's lookup and streaming functions were updated with a `depth`
    parameter. That way we avoid thrashing the cache with nodes which are less
    likely to be reused: if the depth is greater than 2 or the node is a
    kv_node, the cache is skipped altogether.
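
    The gating boils down to the new get_node/3 clauses in couch_btree (shown in
    full in the diff below): only kp nodes within cache_depth of the root are
    ever looked up in, or inserted into, the cache.

    ```
    get_node(#btree{fd = Fd, cache_depth = Max}, NodePos, Depth) when Depth =< Max ->
        case couch_bt_engine_cache:lookup({Fd, NodePos}) of
            undefined ->
                {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
                case NodeType of
                    % Only kp_nodes get cached; kv leaf nodes never do.
                    kp_node -> couch_bt_engine_cache:insert({Fd, NodePos}, NodeList);
                    kv_node -> ok
                end,
                {NodeType, NodeList};
            NodeList ->
                % Cache hit: only kp_node lists are ever stored.
                {kp_node, NodeList}
        end;
    get_node(#btree{fd = Fd}, NodePos, _Depth) ->
        % Anything deeper always reads straight from the file.
        {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
        {NodeType, NodeList}.
    ```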
    
    To get an idea of how to tune the cache size for various workloads, there are
    two new metrics to count hits and misses:
    
    ```
    % http $DB/_node/_local/_stats/couchdb/bt_engine_cache
    
    {
        "hits": {
            "desc": "number of bt_engine cache hits",
            "type": "counter",
            "value": 233310
        },
        "misses": {
            "desc": "number of bt_engine cache misses",
            "type": "counter",
            "value": 7386
        }
    }
    ```
    
    Those are the metrics after running `fabric_bench:go(#{q=>8, doc_size=>small,
    docs=>100000})`, which works out to a hit rate of roughly 97%.
    
    A quick and dirty comparison with main:
    
    ```
    > fabric_bench:go(#{q=>8, doc_size=>small, docs=>100000}).
     *** Parameters
     * batch_size       : 1000
     * doc_size         : small
     * docs             : 100000
     * individual_docs  : 1000
     * n                : default
     * q                : 8
    
     *** Environment
     * Nodes        : 1
     * Bench ver.   : 1
     * N            : 1
     * Q            : 8
     * OS           : unix/darwin
     * Couch ver.   : 3.5.0-d16fc1f
     * Couch git sha: d16fc1f
     * VM details   : Erlang/OTP 26 [erts-14.2.5.11] [source] [64-bit] 
[smp:12:12] [ds:12:12:16] [async-threads:1]
    
     *** Inserting 100000 docs
     * Add 100000 docs, ok:100/accepted:0     (Hz):   22000
     * Get random doc 100000X                 (Hz):    3800
     * All docs                               (Hz):  130000
     * All docs w/ include_docs               (Hz):   48000
     * Changes                                (Hz):   16000
     * Single doc updates 1000X               (Hz):     530
     * Time to run all benchmarks            (sec):      40
    ```
    
    With the btree cache:
    
    ```
    fabric_bench:go(#{q=>8, doc_size=>small, docs=>100000}).
     *** Parameters
     * batch_size       : 1000
     * doc_size         : small
     * docs             : 100000
     * individual_docs  : 1000
     * n                : default
     * q                : 8
    
     *** Environment
     * Nodes        : 1
     * Bench ver.   : 1
     * N            : 1
     * Q            : 8
     * OS           : unix/darwin
     * Couch ver.   : 3.5.0-d16fc1f
     * Couch git sha: d16fc1f
     * VM details   : Erlang/OTP 26 [erts-14.2.5.11] [source] [64-bit] 
[smp:12:12] [ds:12:12:16] [async-threads:1]
    
     *** Inserting 100000 docs
     * Add 100000 docs, ok:100/accepted:0     (Hz):   24000
     * Get random doc 100000X                 (Hz):    4400
     * All docs                               (Hz):  140000
     * All docs w/ include_docs               (Hz):   49000
     * Changes                                (Hz):   29000
     * Single doc updates 1000X               (Hz):     680
     * Time to run all benchmarks            (sec):      33
    ```
    
    The idea to use a depth parameter and an ets table came from Paul J.
    Davis (@davisp).
---
 rel/overlay/etc/default.ini             |   5 +
 src/couch/priv/stats_descriptions.cfg   |   8 +
 src/couch/src/couch_bt_engine.erl       |  29 +++-
 src/couch/src/couch_bt_engine_cache.erl | 139 +++++++++++++++
 src/couch/src/couch_btree.erl           | 299 +++++++++++++++++++++-----------
 src/couch/src/couch_primary_sup.erl     |   3 +-
 6 files changed, 374 insertions(+), 109 deletions(-)

diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index c2ea4c7ca..070c082d8 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -130,6 +130,11 @@ view_index_dir = {{view_index_dir}}
 ; downgrade or even open the databases in question.
 ;prohibit_downgrade = true
 
+[bt_engine_cache]
+; Memory used for btree engine cache. This is a cache for top two levels of
+; database btrees (id tree, seq tree). Value is in bytes.
+;max_size = 67108864
+
 [purge]
 ; Allowed maximum number of documents in one purge request
 ;max_document_id_number = 100
diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index 6a7120f87..04b68e9f3 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -474,3 +474,11 @@
     {type, counter},
     {desc, <<"number of mango selector evaluations">>}
 ]}.
+{[couchdb, bt_engine_cache, hits], [
+    {type, counter},
+    {desc, <<"number of bt_engine cache hits">>}
+]}.
+{[couchdb, bt_engine_cache, misses], [
+    {type, counter},
+    {desc, <<"number of bt_engine cache misses">>}
+]}.
diff --git a/src/couch/src/couch_bt_engine.erl b/src/couch/src/couch_bt_engine.erl
index 180c99e36..69c451a33 100644
--- a/src/couch/src/couch_bt_engine.erl
+++ b/src/couch/src/couch_bt_engine.erl
@@ -828,7 +828,8 @@ init_state(FilePath, Fd, Header0, Options) ->
         {split, fun ?MODULE:id_tree_split/1},
         {join, fun ?MODULE:id_tree_join/2},
         {reduce, fun ?MODULE:id_tree_reduce/2},
-        {compression, Compression}
+        {compression, Compression},
+        {cache_depth, 2}
     ]),
 
     SeqTreeState = couch_bt_engine_header:seq_tree_state(Header),
@@ -836,28 +837,32 @@ init_state(FilePath, Fd, Header0, Options) ->
         {split, fun ?MODULE:seq_tree_split/1},
         {join, fun ?MODULE:seq_tree_join/2},
         {reduce, fun ?MODULE:seq_tree_reduce/2},
-        {compression, Compression}
+        {compression, Compression},
+        {cache_depth, 2}
     ]),
 
     LocalTreeState = couch_bt_engine_header:local_tree_state(Header),
     {ok, LocalTree} = couch_btree:open(LocalTreeState, Fd, [
         {split, fun ?MODULE:local_tree_split/1},
         {join, fun ?MODULE:local_tree_join/2},
-        {compression, Compression}
+        {compression, Compression},
+        {cache_depth, 2}
     ]),
 
     PurgeTreeState = couch_bt_engine_header:purge_tree_state(Header),
     {ok, PurgeTree} = couch_btree:open(PurgeTreeState, Fd, [
         {split, fun ?MODULE:purge_tree_split/1},
         {join, fun ?MODULE:purge_tree_join/2},
-        {reduce, fun ?MODULE:purge_tree_reduce/2}
+        {reduce, fun ?MODULE:purge_tree_reduce/2},
+        {cache_depth, 2}
     ]),
 
     PurgeSeqTreeState = couch_bt_engine_header:purge_seq_tree_state(Header),
     {ok, PurgeSeqTree} = couch_btree:open(PurgeSeqTreeState, Fd, [
         {split, fun ?MODULE:purge_seq_tree_split/1},
         {join, fun ?MODULE:purge_seq_tree_join/2},
-        {reduce, fun ?MODULE:purge_tree_reduce/2}
+        {reduce, fun ?MODULE:purge_tree_reduce/2},
+        {cache_depth, 2}
     ]),
 
     ok = couch_file:set_db_pid(Fd, self()),
@@ -1195,8 +1200,14 @@ get_header_term(#st{header = Header} = St, Key, Default) when is_atom(Key) ->
         undefined ->
             Default;
         Pointer when is_integer(Pointer) ->
-            {ok, Term} = couch_file:pread_term(St#st.fd, Pointer),
-            Term
+            case couch_bt_engine_cache:lookup({St#st.fd, Pointer}) of
+                undefined ->
+                    {ok, Term} = couch_file:pread_term(St#st.fd, Pointer),
+                    couch_bt_engine_cache:insert({St#st.fd, Pointer}, Term),
+                    Term;
+                Term ->
+                    Term
+            end
     end.
 
 set_header_term(#st{} = St, Key, Term) when is_atom(Key) ->
@@ -1209,4 +1220,6 @@ set_header_term(#st{} = St, Key, Term) when is_atom(Key) ->
 set_header_term(Fd, Header, Key, Term, Compression) when is_atom(Key) ->
     TermOpts = [{compression, Compression}],
     {ok, Ptr, _} = couch_file:append_term(Fd, Term, TermOpts),
-    couch_bt_engine_header:set(Header, Key, Ptr).
+    Result = couch_bt_engine_header:set(Header, Key, Ptr),
+    couch_bt_engine_cache:insert({Fd, Ptr}, Term),
+    Result.
diff --git a/src/couch/src/couch_bt_engine_cache.erl b/src/couch/src/couch_bt_engine_cache.erl
new file mode 100644
index 000000000..d2729c2a0
--- /dev/null
+++ b/src/couch/src/couch_bt_engine_cache.erl
@@ -0,0 +1,139 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_bt_engine_cache).
+
+-export([
+    create_tables/0,
+    sup_children/0,
+    start_link/1,
+    insert/2,
+    lookup/1,
+    init/1
+]).
+
+-define(DEFAULT_SIZE, 64 * 1024 * 1024).
+-define(INTERVAL_MSEC, 2000).
+-define(PTERM_KEY, {?MODULE, caches}).
+
+-record(cache, {tid, max_size}).
+
+% Main API functions
+
+insert(Key, Term) ->
+    case get_cache(Key) of
+        #cache{tid = Tid, max_size = MaxSize} ->
+            case ets:info(Tid, memory) < MaxSize of
+                true -> ets:insert_new(Tid, {Key, 1, Term});
+                false -> false
+            end;
+        undefined ->
+            false
+    end.
+
+lookup(Key) ->
+    case get_cache(Key) of
+        #cache{tid = Tid} ->
+            case ets:lookup_element(Tid, Key, 3, undefined) of
+                undefined ->
+                    couch_stats:increment_counter([couchdb, bt_engine_cache, misses]),
+                    undefined;
+                Term ->
+                    bump_usage(Tid, Key, 1),
+                    couch_stats:increment_counter([couchdb, bt_engine_cache, hits]),
+                    Term
+            end;
+        undefined ->
+            undefined
+    end.
+
+% Supervisor helper functions
+
+create_tables() ->
+    BtCaches = [new() || _ <- lists:seq(1, shard_count())],
+    persistent_term:put(?PTERM_KEY, list_to_tuple(BtCaches)).
+
+sup_children() ->
+    [sup_child(I) || I <- lists:seq(1, shard_count())].
+
+sup_child(N) ->
+    Name = list_to_atom("couch_bt_engine_cache_" ++ integer_to_list(N)),
+    #{id => Name, start => {?MODULE, start_link, [N]}, shutdown => brutal_kill}.
+
+% Process start
+
+start_link(N) when is_integer(N) ->
+    {ok, proc_lib:spawn_link(?MODULE, init, [N])}.
+
+init(N) ->
+    Caches = persistent_term:get(?PTERM_KEY),
+    Cache = #cache{tid = Tid} = element(N, Caches),
+    ets:delete_all_objects(Tid),
+    loop(Cache).
+
+loop(#cache{tid = Tid, max_size = MaxSize} = Cache) ->
+    wait(),
+    decay(Tid),
+    %% If we're only 1/4 full, keep the 0 entries
+    case ets:info(Tid, memory) > MaxSize div 4 of
+        true -> clear(Tid);
+        false -> ok
+    end,
+    loop(Cache).
+
+% Private helper functions
+
+new() ->
+    Opts = [public, {write_concurrency, true}, {read_concurrency, true}],
+    Max = round(max_size() / erlang:system_info(wordsize) / shard_count()),
+    #cache{tid = ets:new(?MODULE, Opts), max_size = Max}.
+
+get_cache(Term) ->
+    case persistent_term:get(?PTERM_KEY, undefined) of
+        Tuple when is_tuple(Tuple) ->
+            Index = erlang:phash2(Term, tuple_size(Tuple)),
+            #cache{} = element(1 + Index, Tuple);
+        undefined ->
+            undefined
+    end.
+
+bump_usage(Tid, Key, Val) ->
+    try
+        ets:update_counter(Tid, Key, Val)
+    catch
+        error:badarg ->
+            ok
+    end.
+
+% Divide by 4 with a bsr 2 so in about 2-3 sec we expect the entry to have been
+% accessed at least 4 times to survive.
+%
+decay(Tid) ->
+    Head = {'$1', '$2', '$3'},
+    Result = [{{'$1', {'bsr', '$2', 2}, '$3'}}],
+    ets:select_replace(Tid, [{Head, [], Result}]).
+
+clear(Tid) ->
+    Head = {'_', '$1', '_'},
+    Match = [{'=<', '$1', 0}],
+    Result = [true],
+    ets:select_delete(Tid, [{Head, Match, Result}]).
+
+shard_count() ->
+    erlang:system_info(schedulers).
+
+max_size() ->
+    config:get_integer("bt_engine_cache", "max_size", ?DEFAULT_SIZE).
+
+wait() ->
+    Jitter = rand:uniform(max(1, round(?INTERVAL_MSEC * 0.5))),
+    timer:sleep(?INTERVAL_MSEC + Jitter).
diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl
index 788b5f120..fbdc63ccf 100644
--- a/src/couch/src/couch_btree.erl
+++ b/src/couch/src/couch_btree.erl
@@ -27,7 +27,8 @@
     assemble_kv,
     less,
     reduce = nil,
-    compression = ?DEFAULT_COMPRESSION
+    compression = ?DEFAULT_COMPRESSION,
+    cache_depth = 0
 }).
 
 -define(FILL_RATIO, 0.5).
@@ -62,7 +63,9 @@ set_options(Bt, [{less, Less} | Rest]) ->
 set_options(Bt, [{reduce, Reduce} | Rest]) ->
     set_options(Bt#btree{reduce = Reduce}, Rest);
 set_options(Bt, [{compression, Comp} | Rest]) ->
-    set_options(Bt#btree{compression = Comp}, Rest).
+    set_options(Bt#btree{compression = Comp}, Rest);
+set_options(Bt, [{cache_depth, Depth} | Rest]) when is_integer(Depth) ->
+    set_options(Bt#btree{cache_depth = min(3, Depth)}, Rest).
 
 open(State, Fd, Options) ->
     {ok, set_options(#btree{root = State, fd = Fd}, Options)}.
@@ -111,7 +114,8 @@ fold_reduce(#btree{root = Root} = Bt, Fun, Acc, Options) ->
                 [],
                 KeyGroupFun,
                 Fun,
-                Acc
+                Acc,
+                0
             ),
         if
             GroupedKey2 == undefined ->
@@ -258,7 +262,8 @@ fold(#btree{root = Root} = Bt, Fun, Acc, Options) ->
                     InRange,
                     Dir,
                     convert_fun_arity(Fun),
-                    Acc
+                    Acc,
+                    0
                 );
             StartKey ->
                 stream_node(
@@ -269,7 +274,8 @@ fold(#btree{root = Root} = Bt, Fun, Acc, Options) ->
                     InRange,
                     Dir,
                     convert_fun_arity(Fun),
-                    Acc
+                    Acc,
+                    0
                 )
         end,
     case Result of
@@ -307,7 +313,7 @@ query_modify(Bt, LookupKeys, InsertValues, RemoveKeys) ->
             end
         end,
     Actions = lists:sort(SortFun, lists:append([InsertActions, RemoveActions, FetchActions])),
-    {ok, KeyPointers, QueryResults} = modify_node(Bt, Root, Actions, []),
+    {ok, KeyPointers, QueryResults} = modify_node(Bt, Root, Actions, [], 0),
     {ok, NewRoot} = complete_root(Bt, KeyPointers),
     {ok, QueryResults, Bt#btree{root = NewRoot}}.
 
@@ -323,64 +329,87 @@ lookup(#btree{root = Root, less = Less} = Bt, Keys) ->
             undefined -> lists:sort(Keys);
             _ -> lists:sort(Less, Keys)
         end,
-    {ok, SortedResults} = lookup(Bt, Root, SortedKeys),
+    {ok, SortedResults} = lookup(Bt, Root, SortedKeys, 0),
     % We want to return the results in the same order as the keys were input
     % but we may have changed the order when we sorted. So we need to put the
     % order back into the results.
     couch_util:reorder_results(Keys, SortedResults).
 
-lookup(_Bt, nil, Keys) ->
+lookup(_Bt, nil, Keys, _Depth) ->
     {ok, [{Key, not_found} || Key <- Keys]};
-lookup(Bt, Node, Keys) ->
+lookup(Bt, Node, Keys, Depth0) ->
+    Depth = Depth0 + 1,
     Pointer = element(1, Node),
-    {NodeType, NodeList} = get_node(Bt, Pointer),
+    {NodeType, NodeList} = get_node(Bt, Pointer, Depth),
     case NodeType of
         kp_node ->
-            lookup_kpnode(Bt, list_to_tuple(NodeList), 1, Keys, []);
+            lookup_kpnode(Bt, list_to_tuple(NodeList), 1, Keys, [], Depth);
         kv_node ->
-            lookup_kvnode(Bt, list_to_tuple(NodeList), 1, Keys, [])
+            lookup_kvnode(Bt, list_to_tuple(NodeList), 1, Keys, [], Depth)
     end.
 
-lookup_kpnode(_Bt, _NodeTuple, _LowerBound, [], Output) ->
+lookup_kpnode(_Bt, _NodeTuple, _LowerBound, [], Output, _Depth) ->
     {ok, lists:reverse(Output)};
-lookup_kpnode(_Bt, NodeTuple, LowerBound, Keys, Output) when tuple_size(NodeTuple) < LowerBound ->
+lookup_kpnode(_Bt, NodeTuple, LowerBound, Keys, Output, _Depth) when
+    tuple_size(NodeTuple) < LowerBound
+->
     {ok, lists:reverse(Output, [{Key, not_found} || Key <- Keys])};
-lookup_kpnode(Bt, NodeTuple, LowerBound, [FirstLookupKey | _] = LookupKeys, Output) ->
+lookup_kpnode(Bt, NodeTuple, LowerBound, [FirstLookupKey | _] = LookupKeys, Output, Depth) ->
     N = find_first_gteq(Bt, NodeTuple, LowerBound, tuple_size(NodeTuple), FirstLookupKey),
     {Key, PointerInfo} = element(N, NodeTuple),
     SplitFun = fun(LookupKey) -> not less(Bt, Key, LookupKey) end,
     case lists:splitwith(SplitFun, LookupKeys) of
         {[], GreaterQueries} ->
-            lookup_kpnode(Bt, NodeTuple, N + 1, GreaterQueries, Output);
+            lookup_kpnode(Bt, NodeTuple, N + 1, GreaterQueries, Output, Depth);
         {LessEqQueries, GreaterQueries} ->
-            {ok, Results} = lookup(Bt, PointerInfo, LessEqQueries),
-            lookup_kpnode(Bt, NodeTuple, N + 1, GreaterQueries, lists:reverse(Results, Output))
+            {ok, Results} = lookup(Bt, PointerInfo, LessEqQueries, Depth),
+            lookup_kpnode(
+                Bt, NodeTuple, N + 1, GreaterQueries, lists:reverse(Results, Output), Depth
+            )
     end.
 
-lookup_kvnode(_Bt, _NodeTuple, _LowerBound, [], Output) ->
+lookup_kvnode(_Bt, _NodeTuple, _LowerBound, [], Output, _Depth) ->
     {ok, lists:reverse(Output)};
-lookup_kvnode(_Bt, NodeTuple, LowerBound, Keys, Output) when tuple_size(NodeTuple) < LowerBound ->
+lookup_kvnode(_Bt, NodeTuple, LowerBound, Keys, Output, _Depth) when
+    tuple_size(NodeTuple) < LowerBound
+->
     % keys not found
     {ok, lists:reverse(Output, [{Key, not_found} || Key <- Keys])};
-lookup_kvnode(Bt, NodeTuple, LowerBound, [LookupKey | RestLookupKeys], Output) ->
+lookup_kvnode(Bt, NodeTuple, LowerBound, [LookupKey | RestLookupKeys], Output, Depth) ->
     N = find_first_gteq(Bt, NodeTuple, LowerBound, tuple_size(NodeTuple), LookupKey),
     {Key, Value} = element(N, NodeTuple),
     case less(Bt, LookupKey, Key) of
         true ->
             % LookupKey is less than Key
-            lookup_kvnode(Bt, NodeTuple, N, RestLookupKeys, [{LookupKey, not_found} | Output]);
+            lookup_kvnode(
+                Bt, NodeTuple, N, RestLookupKeys, [{LookupKey, not_found} | Output], Depth
+            );
         false ->
             case less(Bt, Key, LookupKey) of
                 true ->
                     % LookupKey is greater than Key
-                    lookup_kvnode(Bt, NodeTuple, N + 1, RestLookupKeys, [
-                        {LookupKey, not_found} | Output
-                    ]);
+                    lookup_kvnode(
+                        Bt,
+                        NodeTuple,
+                        N + 1,
+                        RestLookupKeys,
+                        [
+                            {LookupKey, not_found} | Output
+                        ],
+                        Depth
+                    );
                 false ->
                     % LookupKey is equal to Key
-                    lookup_kvnode(Bt, NodeTuple, N, RestLookupKeys, [
-                        {LookupKey, {ok, assemble(Bt, LookupKey, Value)}} | Output
-                    ])
+                    lookup_kvnode(
+                        Bt,
+                        NodeTuple,
+                        N,
+                        RestLookupKeys,
+                        [
+                            {LookupKey, {ok, assemble(Bt, LookupKey, Value)}} | Output
+                        ],
+                        Depth
+                    )
             end
     end.
 
@@ -436,21 +465,22 @@ get_chunk_size() ->
             1279
     end.
 
-modify_node(Bt, RootPointerInfo, Actions, QueryOutput) ->
+modify_node(Bt, RootPointerInfo, Actions, QueryOutput, Depth0) ->
+    Depth = Depth0 + 1,
     {NodeType, NodeList} =
         case RootPointerInfo of
             nil ->
                 {kv_node, []};
             _Tuple ->
                 Pointer = element(1, RootPointerInfo),
-                get_node(Bt, Pointer)
+                get_node(Bt, Pointer, Depth)
         end,
     NodeTuple = list_to_tuple(NodeList),
 
     {ok, NewNodeList, QueryOutput2} =
         case NodeType of
-            kp_node -> modify_kpnode(Bt, NodeTuple, 1, Actions, [], QueryOutput);
-            kv_node -> modify_kvnode(Bt, NodeTuple, 1, Actions, [], QueryOutput)
+            kp_node -> modify_kpnode(Bt, NodeTuple, 1, Actions, [], QueryOutput, Depth);
+            kv_node -> modify_kvnode(Bt, NodeTuple, 1, Actions, [], QueryOutput, Depth)
         end,
     case NewNodeList of
         % no nodes remain
@@ -492,7 +522,19 @@ reduce_tree_size(kp_node, _NodeSize, [{_K, {_P, _Red, nil}} | _]) ->
 reduce_tree_size(kp_node, NodeSize, [{_K, {_P, _Red, Sz}} | NodeList]) ->
     reduce_tree_size(kp_node, NodeSize + Sz, NodeList).
 
-get_node(#btree{fd = Fd}, NodePos) ->
+get_node(#btree{fd = Fd, cache_depth = Max}, NodePos, Depth) when Depth =< Max ->
+    case couch_bt_engine_cache:lookup({Fd, NodePos}) of
+        undefined ->
+            {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
+            case NodeType of
+                kp_node -> couch_bt_engine_cache:insert({Fd, NodePos}, NodeList);
+                kv_node -> ok
+            end,
+            {NodeType, NodeList};
+        NodeList ->
+            {kp_node, NodeList}
+    end;
+get_node(#btree{fd = Fd}, NodePos, _Depth) ->
     {ok, {NodeType, NodeList}} = couch_file:pread_term(Fd, NodePos),
     {NodeType, NodeList}.
 
@@ -568,9 +610,9 @@ old_list_is_prefix([KV | Rest1], [KV | Rest2], Acc) ->
 old_list_is_prefix(_OldList, _NewList, _Acc) ->
     false.
 
-modify_kpnode(Bt, {}, _LowerBound, Actions, [], QueryOutput) ->
-    modify_node(Bt, nil, Actions, QueryOutput);
-modify_kpnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput) ->
+modify_kpnode(Bt, {}, _LowerBound, Actions, [], QueryOutput, Depth) ->
+    modify_node(Bt, nil, Actions, QueryOutput, Depth);
+modify_kpnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput, _Depth) ->
     {ok,
         lists:reverse(
             ResultNode,
@@ -588,7 +630,8 @@ modify_kpnode(
     LowerBound,
     [{_, FirstActionKey, _} | _] = Actions,
     ResultNode,
-    QueryOutput
+    QueryOutput,
+    Depth
 ) ->
     Sz = tuple_size(NodeTuple),
     N = find_first_gteq(Bt, NodeTuple, LowerBound, Sz, FirstActionKey),
@@ -597,7 +640,7 @@ modify_kpnode(
             % perform remaining actions on last node
             {_, PointerInfo} = element(Sz, NodeTuple),
             {ok, ChildKPs, QueryOutput2} =
-                modify_node(Bt, PointerInfo, Actions, QueryOutput),
+                modify_node(Bt, PointerInfo, Actions, QueryOutput, Depth),
             NodeList = lists:reverse(
                 ResultNode,
                 bounded_tuple_to_list(
@@ -615,7 +658,7 @@ modify_kpnode(
             end,
             {LessEqQueries, GreaterQueries} = lists:splitwith(SplitFun, Actions),
             {ok, ChildKPs, QueryOutput2} =
-                modify_node(Bt, PointerInfo, LessEqQueries, QueryOutput),
+                modify_node(Bt, PointerInfo, LessEqQueries, QueryOutput, Depth),
             ResultNode2 = lists:reverse(
                 ChildKPs,
                 bounded_tuple_to_revlist(
@@ -625,7 +668,7 @@ modify_kpnode(
                     ResultNode
                 )
             ),
-            modify_kpnode(Bt, NodeTuple, N + 1, GreaterQueries, ResultNode2, QueryOutput2)
+            modify_kpnode(Bt, NodeTuple, N + 1, GreaterQueries, ResultNode2, QueryOutput2, Depth)
     end.
 
 bounded_tuple_to_revlist(_Tuple, Start, End, Tail) when Start > End ->
@@ -653,7 +696,7 @@ find_first_gteq(Bt, Tuple, Start, End, Key) ->
             find_first_gteq(Bt, Tuple, Start, Mid, Key)
     end.
 
-modify_kvnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput) ->
+modify_kvnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput, _Depth) ->
     {ok,
         lists:reverse(
             ResultNode, bounded_tuple_to_list(NodeTuple, LowerBound, tuple_size(NodeTuple), [])
@@ -665,7 +708,8 @@ modify_kvnode(
     LowerBound,
     [{ActionType, ActionKey, ActionValue} | RestActions],
     ResultNode,
-    QueryOutput
+    QueryOutput,
+    Depth
 ) when LowerBound > tuple_size(NodeTuple) ->
     case ActionType of
         insert ->
@@ -675,16 +719,25 @@ modify_kvnode(
                 LowerBound,
                 RestActions,
                 [{ActionKey, ActionValue} | ResultNode],
-                QueryOutput
+                QueryOutput,
+                Depth
             );
         remove ->
             % just drop the action
-            modify_kvnode(Bt, NodeTuple, LowerBound, RestActions, ResultNode, QueryOutput);
+            modify_kvnode(Bt, NodeTuple, LowerBound, RestActions, ResultNode, QueryOutput, Depth);
         fetch ->
             % the key/value must not exist in the tree
-            modify_kvnode(Bt, NodeTuple, LowerBound, RestActions, ResultNode, [
-                {not_found, {ActionKey, nil}} | QueryOutput
-            ])
+            modify_kvnode(
+                Bt,
+                NodeTuple,
+                LowerBound,
+                RestActions,
+                ResultNode,
+                [
+                    {not_found, {ActionKey, nil}} | QueryOutput
+                ],
+                Depth
+            )
     end;
 modify_kvnode(
     Bt,
@@ -692,7 +745,8 @@ modify_kvnode(
     LowerBound,
     [{ActionType, ActionKey, ActionValue} | RestActions],
     AccNode,
-    QueryOutput
+    QueryOutput,
+    Depth
 ) ->
     N = find_first_gteq(Bt, NodeTuple, LowerBound, tuple_size(NodeTuple), ActionKey),
     {Key, Value} = element(N, NodeTuple),
@@ -708,16 +762,25 @@ modify_kvnode(
                         N,
                         RestActions,
                         [{ActionKey, ActionValue} | ResultNode],
-                        QueryOutput
+                        QueryOutput,
+                        Depth
                     );
                 remove ->
                     % ActionKey is less than the Key, just drop the action
-                    modify_kvnode(Bt, NodeTuple, N, RestActions, ResultNode, QueryOutput);
+                    modify_kvnode(Bt, NodeTuple, N, RestActions, ResultNode, QueryOutput, Depth);
                 fetch ->
                    % ActionKey is less than the Key, the key/value must not exist in the tree
-                    modify_kvnode(Bt, NodeTuple, N, RestActions, ResultNode, [
-                        {not_found, {ActionKey, nil}} | QueryOutput
-                    ])
+                    modify_kvnode(
+                        Bt,
+                        NodeTuple,
+                        N,
+                        RestActions,
+                        ResultNode,
+                        [
+                            {not_found, {ActionKey, nil}} | QueryOutput
+                        ],
+                        Depth
+                    )
             end;
         false ->
             % ActionKey and Key are maybe equal.
@@ -731,18 +794,27 @@ modify_kvnode(
                                 N + 1,
                                 RestActions,
                                 [{ActionKey, ActionValue} | ResultNode],
-                                QueryOutput
+                                QueryOutput,
+                                Depth
                             );
                         remove ->
                             modify_kvnode(
-                                Bt, NodeTuple, N + 1, RestActions, ResultNode, QueryOutput
+                                Bt, NodeTuple, N + 1, RestActions, ResultNode, QueryOutput, Depth
                             );
                         fetch ->
                            % ActionKey is equal to the Key, insert into the QueryOuput, but re-process the node
                             % since an identical action key can follow it.
-                            modify_kvnode(Bt, NodeTuple, N, RestActions, ResultNode, [
-                                {ok, assemble(Bt, Key, Value)} | QueryOutput
-                            ])
+                            modify_kvnode(
+                                Bt,
+                                NodeTuple,
+                                N,
+                                RestActions,
+                                ResultNode,
+                                [
+                                    {ok, assemble(Bt, Key, Value)} | QueryOutput
+                                ],
+                                Depth
+                            )
                     end;
                 true ->
                     modify_kvnode(
@@ -751,7 +823,8 @@ modify_kvnode(
                         N + 1,
                         [{ActionType, ActionKey, ActionValue} | RestActions],
                         [{Key, Value} | ResultNode],
-                        QueryOutput
+                        QueryOutput,
+                        Depth
                     )
             end
     end.
@@ -767,7 +840,8 @@ reduce_stream_node(
     GroupedRedsAcc,
     _KeyGroupFun,
     _Fun,
-    Acc
+    Acc,
+    _Depth
 ) ->
     {ok, Acc, GroupedRedsAcc, GroupedKVsAcc, GroupedKey};
 reduce_stream_node(
@@ -781,10 +855,12 @@ reduce_stream_node(
     GroupedRedsAcc,
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth0
 ) ->
+    Depth = Depth0 + 1,
     P = element(1, Node),
-    case get_node(Bt, P) of
+    case get_node(Bt, P, Depth) of
         {kp_node, NodeList} ->
             NodeList2 = adjust_dir(Dir, NodeList),
             reduce_stream_kp_node(
@@ -798,7 +874,8 @@ reduce_stream_node(
                 GroupedRedsAcc,
                 KeyGroupFun,
                 Fun,
-                Acc
+                Acc,
+                Depth
             );
         {kv_node, KVs} ->
             KVs2 = adjust_dir(Dir, KVs),
@@ -813,7 +890,8 @@ reduce_stream_node(
                 GroupedRedsAcc,
                 KeyGroupFun,
                 Fun,
-                Acc
+                Acc,
+                Depth
             )
     end.
 
@@ -828,7 +906,8 @@ reduce_stream_kv_node(
     GroupedRedsAcc,
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth
 ) ->
     GTEKeyStartKVs =
         case KeyStart of
@@ -855,7 +934,8 @@ reduce_stream_kv_node(
         GroupedRedsAcc,
         KeyGroupFun,
         Fun,
-        Acc
+        Acc,
+        Depth
     ).
 
 reduce_stream_kv_node2(
@@ -866,7 +946,8 @@ reduce_stream_kv_node2(
     GroupedRedsAcc,
     _KeyGroupFun,
     _Fun,
-    Acc
+    Acc,
+    _Depth
 ) ->
     {ok, Acc, GroupedRedsAcc, GroupedKVsAcc, GroupedKey};
 reduce_stream_kv_node2(
@@ -877,7 +958,8 @@ reduce_stream_kv_node2(
     GroupedRedsAcc,
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth
 ) ->
     case GroupedKey of
         undefined ->
@@ -889,7 +971,8 @@ reduce_stream_kv_node2(
                 [],
                 KeyGroupFun,
                 Fun,
-                Acc
+                Acc,
+                Depth
             );
         _ ->
             case KeyGroupFun(GroupedKey, Key) of
@@ -902,7 +985,8 @@ reduce_stream_kv_node2(
                         GroupedRedsAcc,
                         KeyGroupFun,
                         Fun,
-                        Acc
+                        Acc,
+                        Depth
                     );
                 false ->
                    case Fun(GroupedKey, {GroupedKVsAcc, GroupedRedsAcc}, Acc) of
@@ -915,7 +999,8 @@ reduce_stream_kv_node2(
                                 [],
                                 KeyGroupFun,
                                 Fun,
-                                Acc2
+                                Acc2,
+                                Depth
                             );
                         {stop, Acc2} ->
                             throw({stop, Acc2})
@@ -934,7 +1019,8 @@ reduce_stream_kp_node(
     GroupedRedsAcc,
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth
 ) ->
     Nodes =
         case KeyStart of
@@ -977,7 +1063,8 @@ reduce_stream_kp_node(
         GroupedRedsAcc,
         KeyGroupFun,
         Fun,
-        Acc
+        Acc,
+        Depth
     ).
 
 reduce_stream_kp_node2(
@@ -991,7 +1078,8 @@ reduce_stream_kp_node2(
     [],
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth
 ) ->
     {ok, Acc2, GroupedRedsAcc2, GroupedKVsAcc2, GroupedKey2} =
         reduce_stream_node(
@@ -1005,7 +1093,8 @@ reduce_stream_kp_node2(
             [],
             KeyGroupFun,
             Fun,
-            Acc
+            Acc,
+            Depth
         ),
     reduce_stream_kp_node2(
         Bt,
@@ -1018,7 +1107,8 @@ reduce_stream_kp_node2(
         GroupedRedsAcc2,
         KeyGroupFun,
         Fun,
-        Acc2
+        Acc2,
+        Depth
     );
 reduce_stream_kp_node2(
     Bt,
@@ -1031,7 +1121,8 @@ reduce_stream_kp_node2(
     GroupedRedsAcc,
     KeyGroupFun,
     Fun,
-    Acc
+    Acc,
+    Depth
 ) ->
     {Grouped0, Ungrouped0} = lists:splitwith(
         fun({Key, _}) ->
@@ -1062,7 +1153,8 @@ reduce_stream_kp_node2(
                     GroupedReds ++ GroupedRedsAcc,
                     KeyGroupFun,
                     Fun,
-                    Acc
+                    Acc,
+                    Depth
                 ),
             reduce_stream_kp_node2(
                 Bt,
@@ -1075,7 +1167,8 @@ reduce_stream_kp_node2(
                 GroupedRedsAcc2,
                 KeyGroupFun,
                 Fun,
-                Acc2
+                Acc2,
+                Depth
             );
         [] ->
             {ok, Acc, GroupedReds ++ GroupedRedsAcc, GroupedKVsAcc, GroupedKey}
@@ -1086,40 +1179,46 @@ adjust_dir(fwd, List) ->
 adjust_dir(rev, List) ->
     lists:reverse(List).
 
-stream_node(Bt, Reds, Node, StartKey, InRange, Dir, Fun, Acc) ->
+stream_node(Bt, Reds, Node, StartKey, InRange, Dir, Fun, Acc, Depth0) ->
+    Depth = Depth0 + 1,
     Pointer = element(1, Node),
-    {NodeType, NodeList} = get_node(Bt, Pointer),
+    {NodeType, NodeList} = get_node(Bt, Pointer, Depth),
     case NodeType of
         kp_node ->
-            stream_kp_node(Bt, Reds, adjust_dir(Dir, NodeList), StartKey, InRange, Dir, Fun, Acc);
+            stream_kp_node(
+                Bt, Reds, adjust_dir(Dir, NodeList), StartKey, InRange, Dir, Fun, Acc, Depth
+            );
         kv_node ->
-            stream_kv_node(Bt, Reds, adjust_dir(Dir, NodeList), StartKey, InRange, Dir, Fun, Acc)
+            stream_kv_node(
+                Bt, Reds, adjust_dir(Dir, NodeList), StartKey, InRange, Dir, Fun, Acc, Depth
+            )
     end.
 
-stream_node(Bt, Reds, Node, InRange, Dir, Fun, Acc) ->
+stream_node(Bt, Reds, Node, InRange, Dir, Fun, Acc, Depth0) ->
+    Depth = Depth0 + 1,
     Pointer = element(1, Node),
-    {NodeType, NodeList} = get_node(Bt, Pointer),
+    {NodeType, NodeList} = get_node(Bt, Pointer, Depth),
     case NodeType of
         kp_node ->
-            stream_kp_node(Bt, Reds, adjust_dir(Dir, NodeList), InRange, Dir, Fun, Acc);
+            stream_kp_node(Bt, Reds, adjust_dir(Dir, NodeList), InRange, Dir, Fun, Acc, Depth);
         kv_node ->
-            stream_kv_node2(Bt, Reds, [], adjust_dir(Dir, NodeList), InRange, Dir, Fun, Acc)
+            stream_kv_node2(Bt, Reds, [], adjust_dir(Dir, NodeList), InRange, Dir, Fun, Acc, Depth)
     end.
 
-stream_kp_node(_Bt, _Reds, [], _InRange, _Dir, _Fun, Acc) ->
+stream_kp_node(_Bt, _Reds, [], _InRange, _Dir, _Fun, Acc, _Depth) ->
     {ok, Acc};
-stream_kp_node(Bt, Reds, [{Key, Node} | Rest], InRange, Dir, Fun, Acc) ->
+stream_kp_node(Bt, Reds, [{Key, Node} | Rest], InRange, Dir, Fun, Acc, Depth) ->
     Red = element(2, Node),
     case Fun(traverse, Key, Red, Acc) of
         {ok, Acc2} ->
-            case stream_node(Bt, Reds, Node, InRange, Dir, Fun, Acc2) of
+            case stream_node(Bt, Reds, Node, InRange, Dir, Fun, Acc2, Depth) of
                 {ok, Acc3} ->
-                    stream_kp_node(Bt, [Red | Reds], Rest, InRange, Dir, Fun, Acc3);
+                    stream_kp_node(Bt, [Red | Reds], Rest, InRange, Dir, Fun, Acc3, Depth);
                 {stop, LastReds, Acc3} ->
                     {stop, LastReds, Acc3}
             end;
         {skip, Acc2} ->
-            stream_kp_node(Bt, [Red | Reds], Rest, InRange, Dir, Fun, Acc2);
+            stream_kp_node(Bt, [Red | Reds], Rest, InRange, Dir, Fun, Acc2, Depth);
         {stop, Acc2} ->
             {stop, Reds, Acc2}
     end.
@@ -1134,7 +1233,7 @@ drop_nodes(Bt, Reds, StartKey, [{NodeKey, Node} | RestKPs]) ->
             {Reds, [{NodeKey, Node} | RestKPs]}
     end.
 
-stream_kp_node(Bt, Reds, KPs, StartKey, InRange, Dir, Fun, Acc) ->
+stream_kp_node(Bt, Reds, KPs, StartKey, InRange, Dir, Fun, Acc, Depth) ->
     {NewReds, NodesToStream} =
         case Dir of
             fwd ->
@@ -1157,16 +1256,16 @@ stream_kp_node(Bt, Reds, KPs, StartKey, InRange, Dir, Fun, Acc) ->
         [] ->
             {ok, Acc};
         [{_Key, Node} | Rest] ->
-            case stream_node(Bt, NewReds, Node, StartKey, InRange, Dir, Fun, Acc) of
+            case stream_node(Bt, NewReds, Node, StartKey, InRange, Dir, Fun, Acc, Depth) of
                 {ok, Acc2} ->
                     Red = element(2, Node),
-                    stream_kp_node(Bt, [Red | NewReds], Rest, InRange, Dir, Fun, Acc2);
+                    stream_kp_node(Bt, [Red | NewReds], Rest, InRange, Dir, Fun, Acc2, Depth);
                 {stop, LastReds, Acc2} ->
                     {stop, LastReds, Acc2}
             end
     end.
 
-stream_kv_node(Bt, Reds, KVs, StartKey, InRange, Dir, Fun, Acc) ->
+stream_kv_node(Bt, Reds, KVs, StartKey, InRange, Dir, Fun, Acc, Depth) ->
     DropFun =
         case Dir of
             fwd ->
@@ -1176,11 +1275,11 @@ stream_kv_node(Bt, Reds, KVs, StartKey, InRange, Dir, Fun, Acc) ->
         end,
     {LTKVs, GTEKVs} = lists:splitwith(DropFun, KVs),
     AssembleLTKVs = [assemble(Bt, K, V) || {K, V} <- LTKVs],
-    stream_kv_node2(Bt, Reds, AssembleLTKVs, GTEKVs, InRange, Dir, Fun, Acc).
+    stream_kv_node2(Bt, Reds, AssembleLTKVs, GTEKVs, InRange, Dir, Fun, Acc, Depth).
 
-stream_kv_node2(_Bt, _Reds, _PrevKVs, [], _InRange, _Dir, _Fun, Acc) ->
+stream_kv_node2(_Bt, _Reds, _PrevKVs, [], _InRange, _Dir, _Fun, Acc, _Depth) ->
     {ok, Acc};
-stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc) ->
+stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc, Depth) ->
     case InRange(K) of
         false ->
             {stop, {PrevKVs, Reds}, Acc};
@@ -1189,7 +1288,7 @@ stream_kv_node2(Bt, Reds, PrevKVs, [{K, V} | RestKVs], InRange, Dir, Fun, Acc) -
             case Fun(visit, AssembledKV, {PrevKVs, Reds}, Acc) of
                 {ok, Acc2} ->
                     stream_kv_node2(
-                        Bt, Reds, [AssembledKV | PrevKVs], RestKVs, InRange, Dir, Fun, Acc2
+                        Bt, Reds, [AssembledKV | PrevKVs], RestKVs, InRange, Dir, Fun, Acc2, Depth
                     );
                 {stop, Acc2} ->
                     {stop, {PrevKVs, Reds}, Acc2}
diff --git a/src/couch/src/couch_primary_sup.erl b/src/couch/src/couch_primary_sup.erl
index 32ee282d6..93a1d2112 100644
--- a/src/couch/src/couch_primary_sup.erl
+++ b/src/couch/src/couch_primary_sup.erl
@@ -18,6 +18,7 @@ start_link() ->
     supervisor:start_link({local, couch_primary_services}, ?MODULE, []).
 
 init([]) ->
+    ok = couch_bt_engine_cache:create_tables(),
     Children =
         [
             {couch_task_status, {couch_task_status, start_link, []}, permanent, brutal_kill, worker,
@@ -45,7 +46,7 @@ init([]) ->
                     ]
                 ]},
                 permanent, 5000, worker, [ets_lru]}
-        ] ++ couch_servers(),
+        ] ++ couch_bt_engine_cache:sup_children() ++ couch_servers(),
     {ok, {{one_for_one, 10, 3600}, Children}}.
 
 couch_servers() ->

