This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch optimize-uuids-generation
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit b718a9881472f489a4e1e4fd0f2b7314e745bb8e
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Mon Oct 13 16:24:05 2025 -0400

    Optimize UUID generation and use UUID v7 by default
    
    The UUID v7 along with the `random` UUID types do not require going through 
the
    gen_server as they do not need to keep any state. This should make the 
system
    more robust and prevent clients from overwhelming the node with concurrent
    requests to the `/_uuid` endpoint.
    
    Add a plain hex-encoded version of UUID v7 as `uuid_v7_hex` and use that as 
the
    new default. The `hex` version is used according to the guidance in:
    
    https://datatracker.ietf.org/doc/rfc9562/
    
    ```
       For many applications, such as databases, storing UUIDs as text is
       unnecessarily verbose, requiring 288 bits to represent 128-bit UUID
       values.  Thus, where feasible, UUIDs SHOULD be stored within database
       applications as the underlying 128-bit binary value.
    ```
    
    Ideally we'd like to use just the 128 bit binary, however since we're
    transmitting json we'll encode it as base 16. This way its shape is also
    exactly compatible (same length and characters) with the previous 
`sequentail`
    default.
    
    In the future more compact encodings may be added: base32, base58 etc.
    
    Users can still generate properly formatted UUID v7 strings (with dashes) by
    using the `uuid_v7` algorithm. That will produce 36 byte character stirngs
    which may look like:
    
    ```
        [
           "0199df37-5929-7032-b402-c3e61fbbf88f",
           "0199df37-5929-7851-9492-e566433ced7f",
           ...
        ]
    ```
    
    While at it, expand the test suite and add more coverage and also ensure to
    test the sizes of the returned values.
---
 src/couch/src/couch_uuids.erl              | 83 +++++++++++++++++-------------
 src/couch/test/eunit/couch_uuids_tests.erl | 62 ++++++++++++++++------
 src/docs/src/config/misc.rst               | 62 ++++++++++++++--------
 3 files changed, 134 insertions(+), 73 deletions(-)

diff --git a/src/couch/src/couch_uuids.erl b/src/couch/src/couch_uuids.erl
index f9c11398d..251035e30 100644
--- a/src/couch/src/couch_uuids.erl
+++ b/src/couch/src/couch_uuids.erl
@@ -17,7 +17,7 @@
 
 -export([start/0, stop/0]).
 -export([new/0, random/0]).
--export([v7_hex/0, v7_bin/0]).
+-export([v7_bin/0]).
 -export([init/1]).
 -export([handle_call/3, handle_cast/2, handle_info/2]).
 
@@ -33,7 +33,13 @@ stop() ->
     gen_server:cast(?MODULE, stop).
 
 new() ->
-    gen_server:call(?MODULE, create).
+    % Some algorithms can bypass the gen_server
+    case config_algorithm() of
+        "random" -> random();
+        "uuid_v7" -> v7();
+        "uuid_v7_hex" -> v7_hex();
+        _ -> gen_server:call(?MODULE, create)
+    end.
 
 random() ->
     couch_util:to_hex_bin(crypto:strong_rand_bytes(16)).
@@ -42,16 +48,12 @@ init([]) ->
     ok = config:listen_for_changes(?MODULE, nil),
     {ok, state()}.
 
-handle_call(create, _From, random) ->
-    {reply, random(), random};
-handle_call(create, _From, uuid_v7) ->
-    {reply, v7_hex(), uuid_v7};
 handle_call(create, _From, {utc_random, ClockSeq}) ->
     {UtcRandom, NewClockSeq} = utc_random(ClockSeq),
     {reply, UtcRandom, {utc_random, NewClockSeq}};
 handle_call(create, _From, {utc_id, UtcIdSuffix, ClockSeq}) ->
-    Now = os:timestamp(),
-    {UtcId, NewClockSeq} = utc_suffix(UtcIdSuffix, ClockSeq, Now),
+    OsMicros = micros_since_epoch(),
+    {UtcId, NewClockSeq} = utc_suffix(UtcIdSuffix, ClockSeq, OsMicros),
     {reply, UtcId, {utc_id, UtcIdSuffix, NewClockSeq}};
 handle_call(create, _From, {sequential, Pref, Seq}) ->
     Result = ?l2b(Pref ++ io_lib:format("~6.16.0b", [Seq])),
@@ -110,9 +112,10 @@ v7_bin() ->
     <<MSec:48, 7:4, RandA:12, 2:2, RandB:62>>.
 
 v7_hex() ->
-    <<A:8/binary, B:4/binary, C:4/binary, D:4/binary, E:12/binary>> = 
couch_util:to_hex_bin(
-        v7_bin()
-    ),
+    couch_util:to_hex_bin(v7_bin()).
+
+v7() ->
+    <<A:8/binary, B:4/binary, C:4/binary, D:4/binary, E:12/binary>> = v7_hex(),
     <<A/binary, "-", B/binary, "-", C/binary, "-", D/binary, "-", E/binary>>.
 
 new_prefix() ->
@@ -122,37 +125,35 @@ inc() ->
     rand:uniform(16#ffd).
 
 state() ->
-    AlgoStr = config:get("uuids", "algorithm", "sequential"),
+    AlgoStr = config_algorithm(),
     case couch_util:to_existing_atom(AlgoStr) of
         random ->
             random;
         utc_random ->
-            ClockSeq = micros_since_epoch(os:timestamp()),
+            ClockSeq = micros_since_epoch(),
             {utc_random, ClockSeq};
         utc_id ->
-            ClockSeq = micros_since_epoch(os:timestamp()),
+            ClockSeq = micros_since_epoch(),
             UtcIdSuffix = config:get("uuids", "utc_id_suffix", ""),
             {utc_id, UtcIdSuffix, ClockSeq};
         sequential ->
             {sequential, new_prefix(), inc()};
         uuid_v7 ->
             uuid_v7;
+        uuid_v7_hex ->
+            uuid_v7_hex;
         Unknown ->
             throw({unknown_uuid_algorithm, Unknown})
     end.
 
-micros_since_epoch({_, _, Micro} = Now) ->
-    Nowish = calendar:now_to_universal_time(Now),
-    Nowsecs = calendar:datetime_to_gregorian_seconds(Nowish),
-    Then = calendar:datetime_to_gregorian_seconds({{1970, 1, 1}, {0, 0, 0}}),
-    (Nowsecs - Then) * 1000000 + Micro.
+micros_since_epoch() ->
+    os:system_time(microsecond).
 
 utc_random(ClockSeq) ->
     Suffix = couch_util:to_hex(crypto:strong_rand_bytes(9)),
-    utc_suffix(Suffix, ClockSeq, os:timestamp()).
+    utc_suffix(Suffix, ClockSeq, micros_since_epoch()).
 
-utc_suffix(Suffix, ClockSeq, Now) ->
-    OsMicros = micros_since_epoch(Now),
+utc_suffix(Suffix, ClockSeq, OsMicros) when is_integer(OsMicros) ->
     NewClockSeq =
         if
             OsMicros =< ClockSeq ->
@@ -165,6 +166,9 @@ utc_suffix(Suffix, ClockSeq, Now) ->
     Prefix = io_lib:format("~14.16.0b", [NewClockSeq]),
     {list_to_binary(Prefix ++ Suffix), NewClockSeq}.
 
+config_algorithm() ->
+    config:get("uuids", "algorithm", "sequential").
+
 -ifdef(TEST).
 
 -include_lib("eunit/include/eunit.hrl").
@@ -172,43 +176,48 @@ utc_suffix(Suffix, ClockSeq, Now) ->
 utc_id_time_does_not_advance_test() ->
     % Timestamp didn't advance but local clock sequence should and new UUIds
     % should be generated
-    Now = {0, 1, 2},
-    ClockSeq0 = micros_since_epoch({3, 4, 5}),
-    {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, Now),
+    ClockSeq0 = 345,
+    {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, 12),
     ?assert(is_binary(UtcId0)),
     ?assertEqual(ClockSeq0 + 1, ClockSeq1),
-    {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, Now),
+    {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, ClockSeq0),
     ?assertNotEqual(UtcId0, UtcId1),
     ?assertEqual(ClockSeq1 + 1, ClockSeq2).
 
 utc_id_time_advanced_test() ->
     % Timestamp advanced, a new UUID generated and also the last clock sequence
     % is updated to that timestamp.
-    Now0 = {0, 1, 2},
-    ClockSeq0 = micros_since_epoch({3, 4, 5}),
-    {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, Now0),
+    ClockSeq0 = 345,
+    {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, 12),
     ?assert(is_binary(UtcId0)),
     ?assertEqual(ClockSeq0 + 1, ClockSeq1),
-    Now1 = {9, 9, 9},
-    {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, Now1),
+    ClockSeq2 = 999,
+    {UtcId1, ClockSeq3} = utc_suffix("", ClockSeq1, ClockSeq2),
     ?assert(is_binary(UtcId1)),
     ?assertNotEqual(UtcId0, UtcId1),
-    ?assertEqual(micros_since_epoch(Now1), ClockSeq2).
+    ?assertEqual(ClockSeq2, ClockSeq3).
 
 utc_random_test_time_does_not_advance_test() ->
-    {MSec, Sec, USec} = os:timestamp(),
-    Future = {MSec + 10, Sec, USec},
-    ClockSeqFuture = micros_since_epoch(Future),
+    OsMicros = os:system_time(microsecond),
+    ClockSeqFuture = OsMicros + 10_000_000,
     {UtcRandom, NextClockSeq} = utc_random(ClockSeqFuture),
     ?assert(is_binary(UtcRandom)),
     ?assertEqual(32, byte_size(UtcRandom)),
     ?assertEqual(ClockSeqFuture + 1, NextClockSeq).
 
 utc_random_test_time_advance_test() ->
-    ClockSeqPast = micros_since_epoch({1, 1, 1}),
+    ClockSeqPast = 111,
     {UtcRandom, NextClockSeq} = utc_random(ClockSeqPast),
     ?assert(is_binary(UtcRandom)),
     ?assertEqual(32, byte_size(UtcRandom)),
-    ?assert(NextClockSeq > micros_since_epoch({1000, 0, 0})).
+    ?assert(NextClockSeq > 1_000_000_000).
+
+uuid_v7_test() ->
+    ?assertEqual(36, byte_size(v7())),
+    ?assertEqual(32, byte_size(v7_hex())),
+    ?assertEqual(16, byte_size(v7_bin())),
+    Fun = fun(_, Acc) -> sets:add_element(v7_bin(), Acc) end,
+    Set = lists:foldl(Fun, couch_util:new_set(), lists:seq(1, 1000)),
+    ?assertEqual(1000, sets:size(Set)).
 
 -endif.
diff --git a/src/couch/test/eunit/couch_uuids_tests.erl 
b/src/couch/test/eunit/couch_uuids_tests.erl
index 3dde842cb..8543c396a 100644
--- a/src/couch/test/eunit/couch_uuids_tests.erl
+++ b/src/couch/test/eunit/couch_uuids_tests.erl
@@ -17,52 +17,78 @@
 -define(TIMEOUT, 20).
 
 setup_all() ->
-    test_util:start_applications([config, couch_log]),
+    test_util:start_applications([config, couch_stats, couch_log]),
     couch_uuids:start().
 
 teardown_all(_) ->
     couch_uuids:stop(),
-    test_util:stop_applications([config, couch_log]).
+    test_util:stop_applications([config, couch_stats, couch_log]).
 
 uuids_test_() ->
     {
         setup,
         fun setup_all/0,
         fun teardown_all/1,
-        [
-            {timeout, ?TIMEOUT, fun default_algorithm/0},
-            {timeout, ?TIMEOUT, fun sequential_algorithm/0},
-            {timeout, ?TIMEOUT, fun utc_algorithm/0},
-            {timeout, ?TIMEOUT, fun utc_id_suffix_algorithm/0}
-        ]
+        with([
+            ?TDEF(default_algorithm, ?TIMEOUT),
+            ?TDEF(sequential_algorithm, ?TIMEOUT),
+            ?TDEF(utc_algorithm, ?TIMEOUT),
+            ?TDEF(utc_id_suffix_algorithm, ?TIMEOUT),
+            ?TDEF(random_algorithm, ?TIMEOUT),
+            ?TDEF(uuid_v7_algorithm, ?TIMEOUT),
+            ?TDEF(uuid_v7_hex_algorithm, ?TIMEOUT)
+        ])
     }.
 
-default_algorithm() ->
+default_algorithm(_) ->
     config:delete("uuids", "algorithm", false),
     check_unique().
 
-sequential_algorithm() ->
+random_algorithm(_) ->
+    config:set("uuids", "algorithm", "random", false),
+    check_unique(),
+    check_size(32).
+
+sequential_algorithm(_) ->
     config:set("uuids", "algorithm", "sequential", false),
     check_unique(),
     check_increment_monotonically(),
-    check_rollover().
+    check_rollover(),
+    check_size(32).
 
-utc_algorithm() ->
+utc_algorithm(_) ->
     config:set("uuids", "algorithm", "utc_random", false),
     check_unique(),
-    check_increment_monotonically().
+    check_increment_monotonically(),
+    check_size(32).
 
-utc_id_suffix_algorithm() ->
+utc_id_suffix_algorithm(_) ->
     config:set("uuids", "algorithm", "utc_id", false),
     config:set("uuids", "utc_id_suffix", "bozo", false),
     check_unique(),
     check_increment_monotonically(),
-    check_preserve_suffix().
+    check_preserve_suffix(),
+    % 14 character time prefix + bozo
+    check_size(14 + 4).
+
+uuid_v7_algorithm(_) ->
+    config:set("uuids", "algorithm", "uuid_v7", false),
+    check_unique(),
+    check_size(36).
+
+uuid_v7_hex_algorithm(_) ->
+    config:set("uuids", "algorithm", "uuid_v7_hex", false),
+    check_unique(),
+    check_size(32).
 
 check_unique() ->
     %% this one may really runs for too long on slow hosts
     ?assert(test_unique(10000, [couch_uuids:new()])).
 
+check_size(Size) ->
+    %% this one may really runs for too long on slow hosts
+    ?assert(test_size(Size, 10000)).
+
 check_increment_monotonically() ->
     ?assert(couch_uuids:new() < couch_uuids:new()).
 
@@ -77,6 +103,12 @@ check_preserve_suffix() ->
     Suffix = get_suffix(UUID),
     ?assert(test_same_suffix(10000, Suffix)).
 
+test_size(_Size, 0) ->
+    true;
+test_size(Size, N) ->
+    ?assertEqual(Size, byte_size(couch_uuids:new())),
+    test_size(Size, N - 1).
+
 test_unique(0, _) ->
     true;
 test_unique(N, UUIDs) ->
diff --git a/src/docs/src/config/misc.rst b/src/docs/src/config/misc.rst
index 8373e0f98..e2ffeb082 100644
--- a/src/docs/src/config/misc.rst
+++ b/src/docs/src/config/misc.rst
@@ -66,7 +66,8 @@ UUIDs Configuration
     .. config:option:: algorithm :: Generation Algorithm
 
         .. versionchanged:: 1.3 Added ``utc_id`` algorithm.
-        .. versionchanged:: 3.6 Added ``uuid_v7`` algorithm.
+        .. versionchanged:: 3.6 Added ``uuid_v7`` and ``uuid_v7_hex`` 
algorithms.
+        .. versionchanged:: 3.6 ``uuid_v7_hex`` algorithm became the default.
 
         CouchDB provides various algorithms to generate the UUID values that
         are  used for document `_id`'s by default::
@@ -159,22 +160,41 @@ UUIDs Configuration
                   ]
               }
 
-        - ``uuid_v7``: UUID v7 string in hex.
+        - ``uuid_v7``: UUID v7 string.
 
           .. code-block:: javascript
 
                {
                    "uuids": [
-                       "0199d2456e7f7b0a9b7130f9a9db8bee",
-                       "0199d2456e7f72dda9f758fcc259c5fc",
-                       "0199d2456e7f751c80b461180f7c7717",
-                       "0199d2456e7f7c569b317d53367ca45a",
-                       "0199d2456e7f77bfbffe92682c9c8c69",
-                       "0199d2456e7f703ea97286f3d976343e",
-                       "0199d2456e7f7f729142ed3b2da9101f",
-                       "0199d2456e7f7723905c1f91f40d54f5",
-                       "0199d2456e7f7e40979c7e2e22ffeb6a",
-                       "0199d2456e7f7a42b43acfcc1e18eb84"
+                      "0199df37-5929-7032-b402-c3e61fbbf88f",
+                      "0199df37-5929-7851-9492-e566433ced7f",
+                      "0199df37-5929-7c5c-9ba3-26bb2f16a044",
+                      "0199df37-5929-7162-92c4-b06e63053849",
+                      "0199df37-5929-7880-af15-a69c3157df06",
+                      "0199df37-5929-72db-bc45-295ef3e1adc4",
+                      "0199df37-5929-7a3c-ad38-553cfead7950",
+                      "0199df37-5929-755a-8cbd-cdf681136120",
+                      "0199df37-5929-74b8-ad9b-8a46c1fbf992",
+                      "0199df37-5929-7216-953c-65af078590fc"
+                   ]
+               }
+
+        - ``uuid_v7_hex``: UUID v7 in hex.
+
+          .. code-block:: javascript
+
+               {
+                   "uuids": [
+                      "0199df3833cc79c89bdde9530efc4f0c",
+                      "0199df3833cc7694af52300fce26dc2f",
+                      "0199df3833cc78bf93c0a7b7f1c00d5c",
+                      "0199df3833cc740d84ee0932e8ff1df3",
+                      "0199df3833cc751bb45b27cfbcc3c753",
+                      "0199df3833cc7a7cbce5469d3f52ba83",
+                      "0199df3833cc7b3f8c3f517a3a84b649",
+                      "0199df3833cc74b9b3cacaa0ebfbc842",
+                      "0199df3833cc7f49ae722b96ba1dbb3a",
+                      "0199df3833cc7db39f01868033e1dbec"
                    ]
                }
 
@@ -182,17 +202,17 @@ UUIDs Configuration
             **Impact of UUID choices:** the choice of UUID has a significant
             impact on the layout of the B-tree, prior to compaction.
 
-            For example, using a sequential UUID algorithm while uploading a
-            large batch of documents will avoid the need to rewrite many
-            intermediate B-tree nodes. A random UUID algorithm may require
+            For example, using the UUID v7 or sequential algorithms while
+            uploading a large batch of documents will avoid the need to rewrite
+            many intermediate B-tree nodes. A random UUID algorithm may require
             rewriting intermediate nodes on a regular basis, resulting in
-            significantly decreased throughput and wasted disk space space due 
to
-            the append-only B-tree design.
+            significantly decreased throughput and wasted disk space space due
+            to the append-only B-tree design.
 
-            It is generally recommended to set your own UUIDs, or use the
-            sequential algorithm unless you have a specific need and take into
-            account the likely need for compaction to re-balance the B-tree and
-            reclaim wasted space.
+            It is generally recommended to set your own UUIDs, or use the UUID
+            v7 or sequential algorithms unless you have a specific need and 
take
+            into account the likely need for compaction to re-balance the
+            B-tree and reclaim wasted space.
 
     .. config:option:: utc_id_suffix :: UTC ID Suffix
 

Reply via email to