This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch optimize-uuids-generation in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit b718a9881472f489a4e1e4fd0f2b7314e745bb8e Author: Nick Vatamaniuc <[email protected]> AuthorDate: Mon Oct 13 16:24:05 2025 -0400 Optimize UUID generation and use UUID v7 by default The UUID v7 along with the `random` UUID types do not require going through the gen_server as they do not need to keep any state. This should make the system more robust and prevent clients from overwhelming the node with concurrent requests to the `/_uuid` endpoint. Add a plain hex-encoded version of UUID v7 as `uuid_v7_hex` and use that as the new default. The `hex` version is used according to the guidance in: https://datatracker.ietf.org/doc/rfc9562/ ``` For many applications, such as databases, storing UUIDs as text is unnecessarily verbose, requiring 288 bits to represent 128-bit UUID values. Thus, where feasible, UUIDs SHOULD be stored within database applications as the underlying 128-bit binary value. ``` Ideally we'd like to use just the 128 bit binary, however since we're transmitting json we'll encode it as base 16. This way its shape is also exactly compatible (same length and characters) with the previous `sequentail` default. In the future more compact encodings may be added: base32, base58 etc. Users can still generate properly formatted UUID v7 strings (with dashes) by using the `uuid_v7` algorithm. That will produce 36 byte character stirngs which may look like: ``` [ "0199df37-5929-7032-b402-c3e61fbbf88f", "0199df37-5929-7851-9492-e566433ced7f", ... ] ``` While at it, expand the test suite and add more coverage and also ensure to test the sizes of the returned values. --- src/couch/src/couch_uuids.erl | 83 +++++++++++++++++------------- src/couch/test/eunit/couch_uuids_tests.erl | 62 ++++++++++++++++------ src/docs/src/config/misc.rst | 62 ++++++++++++++-------- 3 files changed, 134 insertions(+), 73 deletions(-) diff --git a/src/couch/src/couch_uuids.erl b/src/couch/src/couch_uuids.erl index f9c11398d..251035e30 100644 --- a/src/couch/src/couch_uuids.erl +++ b/src/couch/src/couch_uuids.erl @@ -17,7 +17,7 @@ -export([start/0, stop/0]). -export([new/0, random/0]). --export([v7_hex/0, v7_bin/0]). +-export([v7_bin/0]). -export([init/1]). -export([handle_call/3, handle_cast/2, handle_info/2]). @@ -33,7 +33,13 @@ stop() -> gen_server:cast(?MODULE, stop). new() -> - gen_server:call(?MODULE, create). + % Some algorithms can bypass the gen_server + case config_algorithm() of + "random" -> random(); + "uuid_v7" -> v7(); + "uuid_v7_hex" -> v7_hex(); + _ -> gen_server:call(?MODULE, create) + end. random() -> couch_util:to_hex_bin(crypto:strong_rand_bytes(16)). @@ -42,16 +48,12 @@ init([]) -> ok = config:listen_for_changes(?MODULE, nil), {ok, state()}. -handle_call(create, _From, random) -> - {reply, random(), random}; -handle_call(create, _From, uuid_v7) -> - {reply, v7_hex(), uuid_v7}; handle_call(create, _From, {utc_random, ClockSeq}) -> {UtcRandom, NewClockSeq} = utc_random(ClockSeq), {reply, UtcRandom, {utc_random, NewClockSeq}}; handle_call(create, _From, {utc_id, UtcIdSuffix, ClockSeq}) -> - Now = os:timestamp(), - {UtcId, NewClockSeq} = utc_suffix(UtcIdSuffix, ClockSeq, Now), + OsMicros = micros_since_epoch(), + {UtcId, NewClockSeq} = utc_suffix(UtcIdSuffix, ClockSeq, OsMicros), {reply, UtcId, {utc_id, UtcIdSuffix, NewClockSeq}}; handle_call(create, _From, {sequential, Pref, Seq}) -> Result = ?l2b(Pref ++ io_lib:format("~6.16.0b", [Seq])), @@ -110,9 +112,10 @@ v7_bin() -> <<MSec:48, 7:4, RandA:12, 2:2, RandB:62>>. v7_hex() -> - <<A:8/binary, B:4/binary, C:4/binary, D:4/binary, E:12/binary>> = couch_util:to_hex_bin( - v7_bin() - ), + couch_util:to_hex_bin(v7_bin()). + +v7() -> + <<A:8/binary, B:4/binary, C:4/binary, D:4/binary, E:12/binary>> = v7_hex(), <<A/binary, "-", B/binary, "-", C/binary, "-", D/binary, "-", E/binary>>. new_prefix() -> @@ -122,37 +125,35 @@ inc() -> rand:uniform(16#ffd). state() -> - AlgoStr = config:get("uuids", "algorithm", "sequential"), + AlgoStr = config_algorithm(), case couch_util:to_existing_atom(AlgoStr) of random -> random; utc_random -> - ClockSeq = micros_since_epoch(os:timestamp()), + ClockSeq = micros_since_epoch(), {utc_random, ClockSeq}; utc_id -> - ClockSeq = micros_since_epoch(os:timestamp()), + ClockSeq = micros_since_epoch(), UtcIdSuffix = config:get("uuids", "utc_id_suffix", ""), {utc_id, UtcIdSuffix, ClockSeq}; sequential -> {sequential, new_prefix(), inc()}; uuid_v7 -> uuid_v7; + uuid_v7_hex -> + uuid_v7_hex; Unknown -> throw({unknown_uuid_algorithm, Unknown}) end. -micros_since_epoch({_, _, Micro} = Now) -> - Nowish = calendar:now_to_universal_time(Now), - Nowsecs = calendar:datetime_to_gregorian_seconds(Nowish), - Then = calendar:datetime_to_gregorian_seconds({{1970, 1, 1}, {0, 0, 0}}), - (Nowsecs - Then) * 1000000 + Micro. +micros_since_epoch() -> + os:system_time(microsecond). utc_random(ClockSeq) -> Suffix = couch_util:to_hex(crypto:strong_rand_bytes(9)), - utc_suffix(Suffix, ClockSeq, os:timestamp()). + utc_suffix(Suffix, ClockSeq, micros_since_epoch()). -utc_suffix(Suffix, ClockSeq, Now) -> - OsMicros = micros_since_epoch(Now), +utc_suffix(Suffix, ClockSeq, OsMicros) when is_integer(OsMicros) -> NewClockSeq = if OsMicros =< ClockSeq -> @@ -165,6 +166,9 @@ utc_suffix(Suffix, ClockSeq, Now) -> Prefix = io_lib:format("~14.16.0b", [NewClockSeq]), {list_to_binary(Prefix ++ Suffix), NewClockSeq}. +config_algorithm() -> + config:get("uuids", "algorithm", "sequential"). + -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -172,43 +176,48 @@ utc_suffix(Suffix, ClockSeq, Now) -> utc_id_time_does_not_advance_test() -> % Timestamp didn't advance but local clock sequence should and new UUIds % should be generated - Now = {0, 1, 2}, - ClockSeq0 = micros_since_epoch({3, 4, 5}), - {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, Now), + ClockSeq0 = 345, + {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, 12), ?assert(is_binary(UtcId0)), ?assertEqual(ClockSeq0 + 1, ClockSeq1), - {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, Now), + {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, ClockSeq0), ?assertNotEqual(UtcId0, UtcId1), ?assertEqual(ClockSeq1 + 1, ClockSeq2). utc_id_time_advanced_test() -> % Timestamp advanced, a new UUID generated and also the last clock sequence % is updated to that timestamp. - Now0 = {0, 1, 2}, - ClockSeq0 = micros_since_epoch({3, 4, 5}), - {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, Now0), + ClockSeq0 = 345, + {UtcId0, ClockSeq1} = utc_suffix("", ClockSeq0, 12), ?assert(is_binary(UtcId0)), ?assertEqual(ClockSeq0 + 1, ClockSeq1), - Now1 = {9, 9, 9}, - {UtcId1, ClockSeq2} = utc_suffix("", ClockSeq1, Now1), + ClockSeq2 = 999, + {UtcId1, ClockSeq3} = utc_suffix("", ClockSeq1, ClockSeq2), ?assert(is_binary(UtcId1)), ?assertNotEqual(UtcId0, UtcId1), - ?assertEqual(micros_since_epoch(Now1), ClockSeq2). + ?assertEqual(ClockSeq2, ClockSeq3). utc_random_test_time_does_not_advance_test() -> - {MSec, Sec, USec} = os:timestamp(), - Future = {MSec + 10, Sec, USec}, - ClockSeqFuture = micros_since_epoch(Future), + OsMicros = os:system_time(microsecond), + ClockSeqFuture = OsMicros + 10_000_000, {UtcRandom, NextClockSeq} = utc_random(ClockSeqFuture), ?assert(is_binary(UtcRandom)), ?assertEqual(32, byte_size(UtcRandom)), ?assertEqual(ClockSeqFuture + 1, NextClockSeq). utc_random_test_time_advance_test() -> - ClockSeqPast = micros_since_epoch({1, 1, 1}), + ClockSeqPast = 111, {UtcRandom, NextClockSeq} = utc_random(ClockSeqPast), ?assert(is_binary(UtcRandom)), ?assertEqual(32, byte_size(UtcRandom)), - ?assert(NextClockSeq > micros_since_epoch({1000, 0, 0})). + ?assert(NextClockSeq > 1_000_000_000). + +uuid_v7_test() -> + ?assertEqual(36, byte_size(v7())), + ?assertEqual(32, byte_size(v7_hex())), + ?assertEqual(16, byte_size(v7_bin())), + Fun = fun(_, Acc) -> sets:add_element(v7_bin(), Acc) end, + Set = lists:foldl(Fun, couch_util:new_set(), lists:seq(1, 1000)), + ?assertEqual(1000, sets:size(Set)). -endif. diff --git a/src/couch/test/eunit/couch_uuids_tests.erl b/src/couch/test/eunit/couch_uuids_tests.erl index 3dde842cb..8543c396a 100644 --- a/src/couch/test/eunit/couch_uuids_tests.erl +++ b/src/couch/test/eunit/couch_uuids_tests.erl @@ -17,52 +17,78 @@ -define(TIMEOUT, 20). setup_all() -> - test_util:start_applications([config, couch_log]), + test_util:start_applications([config, couch_stats, couch_log]), couch_uuids:start(). teardown_all(_) -> couch_uuids:stop(), - test_util:stop_applications([config, couch_log]). + test_util:stop_applications([config, couch_stats, couch_log]). uuids_test_() -> { setup, fun setup_all/0, fun teardown_all/1, - [ - {timeout, ?TIMEOUT, fun default_algorithm/0}, - {timeout, ?TIMEOUT, fun sequential_algorithm/0}, - {timeout, ?TIMEOUT, fun utc_algorithm/0}, - {timeout, ?TIMEOUT, fun utc_id_suffix_algorithm/0} - ] + with([ + ?TDEF(default_algorithm, ?TIMEOUT), + ?TDEF(sequential_algorithm, ?TIMEOUT), + ?TDEF(utc_algorithm, ?TIMEOUT), + ?TDEF(utc_id_suffix_algorithm, ?TIMEOUT), + ?TDEF(random_algorithm, ?TIMEOUT), + ?TDEF(uuid_v7_algorithm, ?TIMEOUT), + ?TDEF(uuid_v7_hex_algorithm, ?TIMEOUT) + ]) }. -default_algorithm() -> +default_algorithm(_) -> config:delete("uuids", "algorithm", false), check_unique(). -sequential_algorithm() -> +random_algorithm(_) -> + config:set("uuids", "algorithm", "random", false), + check_unique(), + check_size(32). + +sequential_algorithm(_) -> config:set("uuids", "algorithm", "sequential", false), check_unique(), check_increment_monotonically(), - check_rollover(). + check_rollover(), + check_size(32). -utc_algorithm() -> +utc_algorithm(_) -> config:set("uuids", "algorithm", "utc_random", false), check_unique(), - check_increment_monotonically(). + check_increment_monotonically(), + check_size(32). -utc_id_suffix_algorithm() -> +utc_id_suffix_algorithm(_) -> config:set("uuids", "algorithm", "utc_id", false), config:set("uuids", "utc_id_suffix", "bozo", false), check_unique(), check_increment_monotonically(), - check_preserve_suffix(). + check_preserve_suffix(), + % 14 character time prefix + bozo + check_size(14 + 4). + +uuid_v7_algorithm(_) -> + config:set("uuids", "algorithm", "uuid_v7", false), + check_unique(), + check_size(36). + +uuid_v7_hex_algorithm(_) -> + config:set("uuids", "algorithm", "uuid_v7_hex", false), + check_unique(), + check_size(32). check_unique() -> %% this one may really runs for too long on slow hosts ?assert(test_unique(10000, [couch_uuids:new()])). +check_size(Size) -> + %% this one may really runs for too long on slow hosts + ?assert(test_size(Size, 10000)). + check_increment_monotonically() -> ?assert(couch_uuids:new() < couch_uuids:new()). @@ -77,6 +103,12 @@ check_preserve_suffix() -> Suffix = get_suffix(UUID), ?assert(test_same_suffix(10000, Suffix)). +test_size(_Size, 0) -> + true; +test_size(Size, N) -> + ?assertEqual(Size, byte_size(couch_uuids:new())), + test_size(Size, N - 1). + test_unique(0, _) -> true; test_unique(N, UUIDs) -> diff --git a/src/docs/src/config/misc.rst b/src/docs/src/config/misc.rst index 8373e0f98..e2ffeb082 100644 --- a/src/docs/src/config/misc.rst +++ b/src/docs/src/config/misc.rst @@ -66,7 +66,8 @@ UUIDs Configuration .. config:option:: algorithm :: Generation Algorithm .. versionchanged:: 1.3 Added ``utc_id`` algorithm. - .. versionchanged:: 3.6 Added ``uuid_v7`` algorithm. + .. versionchanged:: 3.6 Added ``uuid_v7`` and ``uuid_v7_hex`` algorithms. + .. versionchanged:: 3.6 ``uuid_v7_hex`` algorithm became the default. CouchDB provides various algorithms to generate the UUID values that are used for document `_id`'s by default:: @@ -159,22 +160,41 @@ UUIDs Configuration ] } - - ``uuid_v7``: UUID v7 string in hex. + - ``uuid_v7``: UUID v7 string. .. code-block:: javascript { "uuids": [ - "0199d2456e7f7b0a9b7130f9a9db8bee", - "0199d2456e7f72dda9f758fcc259c5fc", - "0199d2456e7f751c80b461180f7c7717", - "0199d2456e7f7c569b317d53367ca45a", - "0199d2456e7f77bfbffe92682c9c8c69", - "0199d2456e7f703ea97286f3d976343e", - "0199d2456e7f7f729142ed3b2da9101f", - "0199d2456e7f7723905c1f91f40d54f5", - "0199d2456e7f7e40979c7e2e22ffeb6a", - "0199d2456e7f7a42b43acfcc1e18eb84" + "0199df37-5929-7032-b402-c3e61fbbf88f", + "0199df37-5929-7851-9492-e566433ced7f", + "0199df37-5929-7c5c-9ba3-26bb2f16a044", + "0199df37-5929-7162-92c4-b06e63053849", + "0199df37-5929-7880-af15-a69c3157df06", + "0199df37-5929-72db-bc45-295ef3e1adc4", + "0199df37-5929-7a3c-ad38-553cfead7950", + "0199df37-5929-755a-8cbd-cdf681136120", + "0199df37-5929-74b8-ad9b-8a46c1fbf992", + "0199df37-5929-7216-953c-65af078590fc" + ] + } + + - ``uuid_v7_hex``: UUID v7 in hex. + + .. code-block:: javascript + + { + "uuids": [ + "0199df3833cc79c89bdde9530efc4f0c", + "0199df3833cc7694af52300fce26dc2f", + "0199df3833cc78bf93c0a7b7f1c00d5c", + "0199df3833cc740d84ee0932e8ff1df3", + "0199df3833cc751bb45b27cfbcc3c753", + "0199df3833cc7a7cbce5469d3f52ba83", + "0199df3833cc7b3f8c3f517a3a84b649", + "0199df3833cc74b9b3cacaa0ebfbc842", + "0199df3833cc7f49ae722b96ba1dbb3a", + "0199df3833cc7db39f01868033e1dbec" ] } @@ -182,17 +202,17 @@ UUIDs Configuration **Impact of UUID choices:** the choice of UUID has a significant impact on the layout of the B-tree, prior to compaction. - For example, using a sequential UUID algorithm while uploading a - large batch of documents will avoid the need to rewrite many - intermediate B-tree nodes. A random UUID algorithm may require + For example, using the UUID v7 or sequential algorithms while + uploading a large batch of documents will avoid the need to rewrite + many intermediate B-tree nodes. A random UUID algorithm may require rewriting intermediate nodes on a regular basis, resulting in - significantly decreased throughput and wasted disk space space due to - the append-only B-tree design. + significantly decreased throughput and wasted disk space space due + to the append-only B-tree design. - It is generally recommended to set your own UUIDs, or use the - sequential algorithm unless you have a specific need and take into - account the likely need for compaction to re-balance the B-tree and - reclaim wasted space. + It is generally recommended to set your own UUIDs, or use the UUID + v7 or sequential algorithms unless you have a specific need and take + into account the likely need for compaction to re-balance the + B-tree and reclaim wasted space. .. config:option:: utc_id_suffix :: UTC ID Suffix
