nickva commented on code in PR #5603:
URL: https://github.com/apache/couchdb/pull/5603#discussion_r2223956888


##########
src/couch/src/couch_time_seq.erl:
##########
@@ -0,0 +1,216 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+% This module implements exponentially decaying time intervals which map to
+% database update sequences. The idea is to be able to quickly determine the
+% set of changes which occurred in a rough time interval. The closer to the
+% last update time, the higher the accuracy. For instance, going back a few
+% days it's only possible to target individual days, then weeks, then months,
+% and going back years it's only possible to target individual years.
+%
+% An example of the shape of the data structure might be:
+%
+%  +---------+  +---------+     +--------+
+%  | seq=986 |  | seq=891 | ... | seq=19 |
+%  +---------+  +---------+     +--------+
+%            ^            ^              ^
+%            |            |              |
+%          t=42         t=40           t=37
+%
+% The head, on the left, points to the newest (most recently updated) time bin.
+% In this example it started at t=42. The last t=37 bin is the oldest time bin.
+%
+% If we're looking for sequences starting before t=41, we'd pick seq=891 and
+% run the changes feed with since=891. If we're looking for a sequence
+% starting before t=37, we'd start with since=0. The main idea here is that
+% we'd rather err on the side of returning too many rows than not enough.
+%
+% The data structure stays at a fixed size (maximum length = 50 bins): on
+% updates, when we're forced to move to a new hour (new bin), the bins are
+% rolled up into a new set of 50 bins, starting with the latest update time.
+% During the roll-up, multiple old bins might end up fitting into a single new
+% bin with a larger width. For example, all of the bins above might collapse
+% into a single bin, so the data structure would then look like:
+%
+%  +---------+
+%  | seq=986 |
+%  +---------+
+%            ^
+%            |
+%          t=42
+%
+% If we're now looking for sequences starting before t=42, we'd pick seq=0. In
+% exchange for the loss of accuracy we get a fixed-size data structure with at
+% most 50 entries and only a few hundred bytes in size when serialized.
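+%
+% A minimal usage sketch (the values are made up, SomeEarlierTime is a
+% placeholder, and the time units passed to since/2 are assumed to match the
+% bin times stored on update):
+%
+%   TSeq0 = couch_time_seq:new(),
+%   TSeq1 = couch_time_seq:update(TSeq0, 891),
+%   Since = couch_time_seq:since(TSeq1, SomeEarlierTime),
+%   % Since may then be used in a _changes?since=... request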
+
+-module(couch_time_seq).
+
+-export([
+    new/0,
+    new/1,
+    check/1,
+    since/2,
+    update/2,
+    update/3,
+    histogram/1,
+    bin_count/0
+]).
+
+% Bin widths definition:
+%  * Base unit is hours
+%  * Total bin count is 50. This is the max length of the data structure as
+%    well.
+%  * Maximum time is 20 years with decreased granularity further back in time.
+%
+-define(D, 24).
+-define(M, 24 * 30).
+-define(Y, 24 * 365).
+%% erlfmt-ignore
+-define(BINS, {                             % Total
+    1, 1, 1, 1, 4, 4, 4, 8,                 % 01D
+    8, 8, 8,                                % 02D
+    12, 12,                                 % 03D
+    ?D, ?D, ?D, ?D, ?D, ?D, ?D,             % 10D
+    ?D*5, ?D*5, ?D*5, ?D*5,                 % 01M
+    ?D*10, ?D*10, ?D*10,                    % 02M
+    ?M, ?M, ?M, ?M, ?M, ?M, ?M, ?M, ?M, ?M, % 01Y
+    ?M*6, ?M*6, ?M*6, ?M*6,                 % 03Y
+    ?Y, ?Y, ?Y, ?Y, ?Y, ?Y, ?Y,             % 10Y
+    ?Y*5, ?Y*5                              % 20Y
+}).
+
+% Version number so we could upgrade the data structure in the future.
+%
+-define(VER, 1).
+
+% The user can set a minimum time boundary to avoid errors from broken clocks.
+% Sometimes embedded systems get booted into 1970 and then get their time from
+% NTP. Updates are ignored if we can tell they are obviously wrong. Use some
+% recent time as a default min time value:
+%
+%   1752724800 = 2025-07-17T04:00:00Z
+%
+-define(DEFAULT_MIN_TIME, 1752724800).
+
+new() ->
+    #{v => ?VER, bins => []}.
+
+new(#{v := ?VER, bins := Bins} = Ctx) when is_list(Bins) ->
+    % Future upgrade clauses to next version could go here
+    Ctx.
+
+check(#{v := ?VER, bins := Bins}) when is_list(Bins) ->
+    true;
+check(_) ->
+    false.
+
+update(#{v := ?VER} = Ctx, Seq) when is_integer(Seq), Seq >= 0 ->
+    update(Ctx, os:system_time(second), Seq).
+
+update(#{v := ?VER} = Ctx, Time, Seq) ->
+    #{bins := Bins} = Ctx,
+    case Time >= min_time() of
+        true -> Ctx#{bins := update_bins(Bins, hour(Time), Seq)};
+        false -> Ctx
+    end.
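+
+% As an illustration, assuming min_time() falls back to ?DEFAULT_MIN_TIME,
+% update(Ctx, 0, Seq) (a clock stuck at 1970) would be ignored and Ctx
+% returned unchanged.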
+
+% Return the highest known sequence that comes before the given time. If the
+% time falls on or before the oldest bin, return 0. The result may be used as
+% the sequence in a regular _changes?since=... call.
+%
+since(#{v := ?VER} = Ctx, Time) when is_integer(Time) ->
+    #{bins := Bins} = Ctx,
+    case lists:dropwhile(fun({T, _}) -> Time =< T end, Bins) of
+        [] -> 0;
+        [{_, Seq} | _] -> Seq
+    end.
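+
+% Using the example bins from the module header ([{42, 986}, {40, 891}, ...,
+% {37, 19}]), since(Ctx, 41) would skip the {42, 986} bin and return 891,
+% while since(Ctx, 37) would skip all the bins and return 0.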
+
+% Return a histogram of formatted times (RFC3339) and the number of sequence
+% updates which happened in each bin. The result is intended to be emitted as
+% JSON and might look like:
+%
+%   [["2025-01-02T03:04:00Z", 42], ["2025-01-02T01:01:00Z", 1], ...]
+%
+histogram(#{v := ?VER, bins := Bins}) ->
+    [[rfc3339(T), S] || {T, S} <- seq_histogram(Bins)].
+
+% Return the maximum size of the data structure. At any point in time the
+% actual number of bins may be lower than this but it won't be higher.
+%
+bin_count() ->

Review Comment:
   Thanks again for the suggestion. Updated in a fixup.


