nickva commented on code in PR #5603:
URL: https://github.com/apache/couchdb/pull/5603#discussion_r2314863873


##########
src/couch/src/couch_time_seq.erl:
##########
@@ -0,0 +1,302 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+% This module implements exponentially decaying time intervals which map to
+% database update sequences. The idea is be able to quickly determine the set
+% of changes which occurred in a rough time interval. The closer to the last
+% update time -- the higher the accuracy. For instance, going back a few days,
+% it's only possible to target individual days. Then weeks, then months, then
+% going back years can target only years.
+%
+% An example of the shape of the data structure might be:
+%
+%  +---------+  +---------+  +--------+
+%  | seq=986 |  | seq=891 |  | seq=19 |
+%  +---------+  +---------+  +--------+
+%            ^            ^           ^
+%            |            |           |
+%          t=42          t=40        t=37
+%
+% The head, on the left, points to the newest (most recently updated) time bin.
+% In this example it started at t=42. The last t=37 bin is the oldest time bin.
+%
+% If we're looking for sequences starting before t=41, we'd pick seq=891 and
+% run the changes since=891. If we're looking for a sequence starting before
+% t=37, we'd start with since=0. The main idea here is that we'd rather error
+% on the side of returning too many rows than not enough. This is how our
+% clustered changes feeds behave during rewinds already, so it's really new
+% behavior.
+%
+% The number of bins is always less than or equal to ?MAX_BIN_COUNT. During
+% updates, when we're forced to move to a new bin, the bins are rolled-up to
+% make some room before adding the latest update.
+%
+% During the roll-up, multiple old bins might end up merging into a single with
+% a larger width. For example the above bins might end up in a single bin. In
+% the example above the old two bins might merge and the bins now might look
+% like:
+%
+%  +---------+  +---------+
+%  | seq=986 |  | seq=891 |
+%  +---------+  +---------+
+%            ^            ^
+%            |            |
+%          t=42          t=37
+%
+% If we're now looking for sequences staring before t=40, we'd pick seq=891.
+%
+% The roll-up algorithm can be summarized as:
+%
+%   - As a preparatory step: tag all the bins with their size, and reverse them
+%     to put them in chronological order (oldest first).
+%
+%   - For each interval in ?INTERVALS:
+%
+%     * For each bin in the list of bins before the (now - next interval) time:
+%
+%        + If durations of two adjacent bins are =< interval, merge the newer
+%          one into the older one.
+%
+%     * If we merged any bins for the given interval, return.
+%
+%     * If we didn't merge any bins, continue with a longer interval
+%
+%     * Once we get to just one interval remaining, and didn't succeed merging
+%       anything using the interval schedule in ?INTERVALS, simply merge half
+%       the bins the oldest part of the interval. This is guaranteed to make
+%       more room.
+%
+
+-module(couch_time_seq).
+
+-export([
+    new/0,
+    new/1,
+    since/2,
+    update/2,
+    update/3,
+    histogram/2
+]).
+
+-export_type([
+    time_seq/0
+]).
+
+% How bin count = 60 was picked:
+%
+%  - With the ?INTERVALS schedule defined below ran 1 update per hour for 1M
+%    updates starting at year 3000 and ending at year 3114 and obtained:
+%      * Less than 10 years of spacing between years at the start: 3000, 3009, 
3018 ...
+%      * Ten individual latest days: 3114-01-20 -> 3114-01-30
+%      * Seven individual latest months: 3113-07 -> 3114-01
+%  - Uncompressed term_to_binary(TSeq) = 920B
+%  - RAM flat size erts_debug:flat_size(TSeq) * erlang:system_info(wordsize) = 
2KB
+%
+-define(MAX_BIN_COUNT, 60).
+
+-define(H, 3600).
+-define(D, ?H * 24).
+-define(M, ?D * 30).
+-define(Y, ?D * 365).
+%% erlfmt-ignore
+-define(INTERVALS, [
+    ?H * 3, ?H * 6, ?H * 12,
+    ?D, ?D * 10,
+    ?M, ?M * 6,
+    ?Y, ?Y * 2, ?Y * 4, ?Y * 8, ?Y * 16
+]).
+
+% Version number so we could upgrade the data structure in the future.
+%
+-define(VER, 1).
+
+% Users can set a minimum time boundary to avoid errors with broken clocks.
+% Sometimes embedded systems get booted into 1970 and then get their time from
+% NTP. Updates are ignored if we can tell they are obviously wrong. Use some
+% recent time as a default min time value:
+%
+%   1754006400 = 2025-08-01T00:00:00Z
+%
+-define(DEFAULT_MIN_TIME, 1754006400).
+
+-type unix_time() :: pos_integer().
+-type bin_size() :: pos_integer().
+-type update_seq() :: non_neg_integer() | now.
+-type bin() :: {unix_time(), bin_size(), update_seq()}.
+-type time_seq() :: #{v := pos_integer(), bins := [bin()]}.
+
+-spec new() -> time_seq().
+new() ->
+    #{v => ?VER, bins => []}.
+
+-spec new(time_seq()) -> time_seq().
+new(#{v := ?VER, bins := Bins} = Ctx) when is_list(Bins) ->
+    % Future upgrade clauses would go here. When upgrading make sure to add a
+    % clause to check/1 to return true for the older version as well. But the
+    % upgarde itself will happen in new/1.
+    Ctx.
+
+-spec update(time_seq(), update_seq()) -> time_seq().
+update(#{v := ?VER} = Ctx, Seq) when is_integer(Seq), Seq >= 0 ->
+    update(Ctx, now_unix_sec(), Seq).

Review Comment:
   I had both an `update/2` and `update/3`, tests use update/3 with their own 
time. But it is a bit confusing I agree, so I'll update it to have just 
`update/3` and let the `couch_db_updater` pass in the time. 
   
   But it may be neater to have couch_db_updater grab the time from 
`couch_time_seq:timestamp()` in case we wanted to change the resolution, plug 
in another time source or something like that in the future.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscr...@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to