Hello, Attached is a proof-of-concept implementation supporting '--random=seed=N' option for sort/shuf/shred, to enable reproducible (pseudo) random runs. It was discussed a while ago, here: http://lists.gnu.org/archive/html/coreutils/2013-11/msg00068.html
comments are welcomed, - Assaf
>From 0107e5a2597283bf572f0f79c74c2f315b97e36c Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 0/7] *** SUBJECT HERE *** *** BLURB HERE *** A. Gordon (7): gnulib: add randread source with fixed seed gnulib: add randint source with known seed shred: new option: --random-seed=N shuf: new option: --random-seed=N sort: new option: --random-seed=N tests: sort,shuf,shred: add --random-seed=N tests doc: mention --random-seed=N option doc/coreutils.texi | 36 ++++++++++++++++++++++++++++++++- gl/lib/randint.c | 6 ++++++ gl/lib/randint.h | 1 + gl/lib/randread.c | 31 ++++++++++++++++++++++++++++ gl/lib/randread.h | 1 + src/shred.c | 27 +++++++++++++++++++++++-- src/shuf.c | 31 ++++++++++++++++++++++++---- src/sort.c | 50 +++++++++++++++++++++++++++++++++++++++++++++- tests/misc/shred-passes.sh | 14 +++++++++++++ tests/misc/shuf.sh | 14 +++++++++++++ tests/misc/sort-rand.sh | 14 +++++++++++++ 11 files changed, 217 insertions(+), 8 deletions(-) -- 1.9.1 >From 9f197c17f4670b33ff5464c70fd371dd3ba15e78 Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 1/7] gnulib: add randread source with fixed seed * gl/lib/randread.{c,h}: randread_new_seed() - new function to allocate and initialize a 'struct randread_source *' with a user-supplied seed (thus, known initial (not so) random state). get_nonce_seed() - initialize 'struct randread_source' with seed value, instead of pid/ppid/time. --- gl/lib/randread.c | 31 +++++++++++++++++++++++++++++++ gl/lib/randread.h | 1 + 2 files changed, 32 insertions(+) diff --git a/gl/lib/randread.c b/gl/lib/randread.c index 5efbd4f..b996c9e 100644 --- a/gl/lib/randread.c +++ b/gl/lib/randread.c @@ -188,6 +188,21 @@ get_nonce (void *buffer, size_t bufsize, size_t bytes_bound) #endif } +/* Put a fixed, user-supplied value into BUFFER, with size BUFSIZE. */ +static void +get_nonce_seed (void *buffer, size_t bufsize, size_t bytes_bound, int seed) +{ + char *buf = buffer; + ssize_t seeded = 0; + + /* Unlike get_nonce(), reset the buffer to a known zero state, + to ensure reproducible seeding */ + memset (buf, 0, bufsize); + + /* TODO: fill more of the state buffer? or leave it as mostly zeros ?*/ + ISAAC_SEED (int, v = seed); +} + /* Create and initialize a random data source from NAME, or use a reasonable default source if NAME is null. BYTES_BOUND is an upper @@ -230,6 +245,22 @@ randread_new (char const *name, size_t bytes_bound) } } +/* Similar to randread_new(), create and initialize a random data source, + except use a user-supplied seed value. */ +struct randread_source *randread_new_seed (int seed, size_t bytes_bound) +{ + /* TODO: not fail on bytes_bound==0 with seed value? */ + if (bytes_bound == 0) + return NULL; + + struct randread_source *s = simple_new (NULL, NULL); + s->buf.isaac.buffered = 0; + get_nonce_seed (s->buf.isaac.state.m, sizeof s->buf.isaac.state.m, + bytes_bound, seed); + isaac_seed (&s->buf.isaac.state); + return s; +} + /* Set S's handler and its argument. HANDLER (HANDLER_ARG) is called when there is a read error or end of file from the random data diff --git a/gl/lib/randread.h b/gl/lib/randread.h index c6ffc79..2022dca 100644 --- a/gl/lib/randread.h +++ b/gl/lib/randread.h @@ -25,6 +25,7 @@ struct randread_source; struct randread_source *randread_new (char const *, size_t); +struct randread_source *randread_new_seed (int seed, size_t); void randread (struct randread_source *, void *, size_t); void randread_set_handler (struct randread_source *, void (*) (void const *)); void randread_set_handler_arg (struct randread_source *, void const *); -- 1.9.1 >From a45684503b08d05e83fe33d871652b30f3b83faf Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 2/7] gnulib: add randint source with known seed * gl/lib/randint.{c,h}: randint_all_new_seed(): new function to allocate and initialize a 'struct randint_source *' with a known user-supplied seed (thus, have a not so random initial state). --- gl/lib/randint.c | 6 ++++++ gl/lib/randint.h | 1 + 2 files changed, 7 insertions(+) diff --git a/gl/lib/randint.c b/gl/lib/randint.c index 240fd24..fae729b 100644 --- a/gl/lib/randint.c +++ b/gl/lib/randint.c @@ -87,6 +87,12 @@ randint_all_new (char const *name, size_t bytes_bound) return (source ? randint_new (source) : NULL); } +struct randint_source *randint_all_new_seed (int seed, size_t bytes_bound) +{ + struct randread_source *source = randread_new_seed (seed, bytes_bound); + return (source ? randint_new (source) : NULL); +} + /* Return the random data source of *S. */ struct randread_source * diff --git a/gl/lib/randint.h b/gl/lib/randint.h index d6728c2..1a6c635 100644 --- a/gl/lib/randint.h +++ b/gl/lib/randint.h @@ -34,6 +34,7 @@ struct randint_source; struct randint_source *randint_new (struct randread_source *); struct randint_source *randint_all_new (char const *, size_t); +struct randint_source *randint_all_new_seed (int, size_t); struct randread_source *randint_get_source (struct randint_source const *) _GL_ATTRIBUTE_PURE; randint randint_genmax (struct randint_source *, randint genmax); -- 1.9.1 >From dc16319ef24bf324ab03715ceb58ff5b8fd22407 Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 3/7] shred: new option: --random-seed=N * src/shred.c: usage(): mention new option. main(): handle new option. --- src/shred.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/shred.c b/src/shred.c index 543dcb0..e1718ff 100644 --- a/src/shred.c +++ b/src/shred.c @@ -90,6 +90,7 @@ #include "error.h" #include "fcntl--.h" #include "human.h" +#include "quote.h" #include "quotearg.h" /* For quotearg_colon */ #include "randint.h" #include "randread.h" @@ -141,7 +142,8 @@ struct Options non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - RANDOM_SOURCE_OPTION = CHAR_MAX + 1 + RANDOM_SOURCE_OPTION = CHAR_MAX + 1, + RANDOM_SEED_OPTION }; static struct option const long_opts[] = @@ -150,6 +152,7 @@ static struct option const long_opts[] = {"force", no_argument, NULL, 'f'}, {"iterations", required_argument, NULL, 'n'}, {"size", required_argument, NULL, 's'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"remove", optional_argument, NULL, 'u'}, {"verbose", no_argument, NULL, 'v'}, @@ -177,6 +180,7 @@ for even very expensive hardware probing to recover the data.\n\ printf (_("\ -f, --force change permissions to allow writing if necessary\n\ -n, --iterations=N overwrite N times instead of the default (%d)\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -s, --size=N shred this many bytes (suffixes like K, M, G accepted)\n\ "), DEFAULT_PASSES); @@ -1207,6 +1211,8 @@ main (int argc, char **argv) int c; int i; char const *random_source = NULL; + long random_seed = 0; + bool use_random_seed = false; initialize_main (&argc, &argv); set_program_name (argv[0]); @@ -1240,6 +1246,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 'u': if (optarg == NULL) flags.remove_file = remove_wipesync; @@ -1282,8 +1298,15 @@ main (int argc, char **argv) error (0, 0, _("missing file operand")); usage (EXIT_FAILURE); } + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } - randint_source = randint_all_new (random_source, SIZE_MAX); + randint_source = (use_random_seed) + ?randint_all_new_seed (random_seed, SIZE_MAX) + :randint_all_new (random_source, SIZE_MAX); if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); atexit (clear_random_data); -- 1.9.1 >From f41946b1639b2beb641b06e1944422ccef887998 Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 4/7] shuf: new option: --random-seed=N * src/shuf.c: usage(): mention new option. main(): handle new option. --- src/shuf.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 5a25e58..141779c 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -76,6 +76,7 @@ Write a random permutation of the input lines to standard output.\n\ -i, --input-range=LO-HI treat each number LO through HI as an input line\n\ -n, --head-count=COUNT output at most COUNT lines\n\ -o, --output=FILE write result to FILE instead of standard output\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -r, --repeat output lines can be repeated\n\ "), stdout); @@ -98,7 +99,8 @@ With no FILE, or when FILE is -, read standard input.\n\ non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - RANDOM_SOURCE_OPTION = CHAR_MAX + 1 + RANDOM_SOURCE_OPTION = CHAR_MAX + 1, + RANDOM_SEED_OPTION }; static struct option const long_opts[] = @@ -107,6 +109,7 @@ static struct option const long_opts[] = {"input-range", required_argument, NULL, 'i'}, {"head-count", required_argument, NULL, 'n'}, {"output", required_argument, NULL, 'o'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"repeat", no_argument, NULL, 'r'}, {"zero-terminated", no_argument, NULL, 'z'}, @@ -391,6 +394,8 @@ main (int argc, char **argv) size_t head_lines = SIZE_MAX; char const *outfile = NULL; char *random_source = NULL; + long random_seed = 0; + bool use_random_seed = false; char eolbyte = '\n'; char **input_lines = NULL; bool use_reservoir_sampling = false; @@ -476,6 +481,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 'r': repeat = true; break; @@ -504,6 +519,11 @@ main (int argc, char **argv) error (0, 0, _("extra operand %s"), quote (operand[!input_range])); usage (EXIT_FAILURE); } + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } /* Prepare input. */ if (echo) @@ -543,10 +563,13 @@ main (int argc, char **argv) if (! repeat) head_lines = MIN (head_lines, n_lines); - randint_source = randint_all_new (random_source, - (use_reservoir_sampling || repeat + const size_t bytes_bound = (use_reservoir_sampling || repeat ? SIZE_MAX - : randperm_bound (head_lines, n_lines))); + : randperm_bound (head_lines, n_lines)); + randint_source = (use_random_seed) + ?randint_all_new_seed (random_seed, bytes_bound) + :randint_all_new (random_source, bytes_bound); + if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); -- 1.9.1 >From 5c19703d04e2e2314ac85b9d5ce11ffcf26a7993 Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 5/7] sort: new option: --random-seed=N * src/sort.c: usage(): mention new option. main(): handle new option. random_md5_state_init_seed(): initialize the random data structure from a user-supplied seed value. --- src/sort.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/sort.c b/src/sort.c index 23d4565..3cff244 100644 --- a/src/sort.c +++ b/src/sort.c @@ -451,6 +451,7 @@ Ordering options:\n\ fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -r, --reverse reverse the result of comparisons\n\ "), stdout); @@ -547,6 +548,7 @@ enum FILES0_FROM_OPTION, NMERGE_OPTION, RANDOM_SOURCE_OPTION, + RANDOM_SEED_OPTION, SORT_OPTION, PARALLEL_OPTION }; @@ -571,6 +573,7 @@ static struct option const long_options[] = {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"sort", required_argument, NULL, SORT_OPTION}, {"output", required_argument, NULL, 'o'}, @@ -2063,6 +2066,28 @@ random_md5_state_init (char const *random_source) md5_process_bytes (buf, sizeof buf, &random_md5_state); } +/* Initialize the pseudo-random MD5 state with a user-supplied seed. */ + +static void +random_md5_state_init_seed (int seed) +{ + unsigned char buf[MD5_DIGEST_SIZE]; + struct randread_source *r = randread_new_seed (seed, sizeof buf); + if (! r) + { + error (0, 0, _("random seed initization error")); + exit (SORT_FAILURE); + } + randread (r, buf, sizeof buf); + if (randread_free (r) != 0) + { + error (0, 0, _("random seed cleanup error")); + exit (SORT_FAILURE); + } + md5_init_ctx (&random_md5_state); + md5_process_bytes (buf, sizeof buf, &random_md5_state); +} + /* This is like strxfrm, except it reports any error and exits. */ static size_t @@ -4180,6 +4205,8 @@ main (int argc, char **argv) bool mergeonly = false; char *random_source = NULL; bool need_random = false; + long random_seed = 0; + bool use_random_seed = false; size_t nthreads = 0; size_t nfiles = 0; bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); @@ -4483,6 +4510,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 's': stable = true; break; @@ -4650,6 +4687,12 @@ main (int argc, char **argv) check_ordering_compatibility (); + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } + if (debug) { if (checkonly || outfile) @@ -4672,7 +4715,12 @@ main (int argc, char **argv) reverse = gkey.reverse; if (need_random) - random_md5_state_init (random_source); + { + if (use_random_seed) + random_md5_state_init_seed (random_seed); + else + random_md5_state_init (random_source); + } if (temp_dir_count == 0) { -- 1.9.1 >From 33a9fa43cb92b2e24578c4965d45ecd81fc83b85 Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 6/7] tests: sort,shuf,shred: add --random-seed=N tests * tests/misc/shred-passes.sh, * tests/misc/shuf.sh, * tests/misc/sort-rand.sh: test new --random-seed=N option, ensure identical output is produced. --- tests/misc/shred-passes.sh | 14 ++++++++++++++ tests/misc/shuf.sh | 14 ++++++++++++++ tests/misc/sort-rand.sh | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/tests/misc/shred-passes.sh b/tests/misc/shred-passes.sh index 0fa63be..12e6630 100755 --- a/tests/misc/shred-passes.sh +++ b/tests/misc/shred-passes.sh @@ -47,5 +47,19 @@ shred -v -u f 2>out || fail=1 compare exp out || fail=1 +# should fail: mutually exclusive random options +{ shred --random-source=/dev/null --random-seed=5 f || test $? -ne 1 ; } && + { warn_ "shred --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ shred --random-seed=hello f || test $? -ne 1 ; } && + { warn_ "shred --random-seed=hello did not trigger an error"; fail=1 ; } + +# shred with fixed random seed should produce identical output +for i in 1 2 ; do + seq 100 > j$i || framework_failure_ + shred --random-seed=4200 j$i || framework_failure_ +done +compare_ j1 j2 \ + || { fail=1; echo "shred --random-seed=N produced non-identical output" ; } Exit $fail diff --git a/tests/misc/shuf.sh b/tests/misc/shuf.sh index c7db14f..ef8c230 100755 --- a/tests/misc/shuf.sh +++ b/tests/misc/shuf.sh @@ -166,4 +166,18 @@ printf "A\nB\nC\nD\nE\n" | shuf --rep -n0 > exp || framework_failure_ test \! -s exp || { fail=1; echo "--repeat,STDIN,-n0 produced bad output">&2 ; } +# should fail: mutually exclusive random options +{ shuf --random-source=/dev/null --random-seed=5 -e A || test $? -ne 1 ; } && + { warn_ "shuf --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ shuf --random-seed=hello -e A || test $? -ne 1 ; } && + { warn_ "shuf --random-seed=hello did not trigger an error"; fail=1 ; } + +# shuf with fixed random seed should produce identical output +for i in 1 2 ; do + shuf -n20 --random-seed=4242 < in > exp$i || framework_failure_ +done +compare_ exp1 exp2 \ + || { fail=1; echo "shuf --random-seed=N produced non-identical output" ; } + Exit $fail diff --git a/tests/misc/sort-rand.sh b/tests/misc/sort-rand.sh index 3fc5fef..4c2ddf1 100755 --- a/tests/misc/sort-rand.sh +++ b/tests/misc/sort-rand.sh @@ -49,4 +49,18 @@ if (locale --version) > /dev/null 2>&1; then { fail=1; echo "not a permutation with LC_ALL=$locale" 1>&2; } fi +# should fail: mutually exclusive random options +{ sort --random-source=/dev/null --random-seed=5 in || test $? -ne 1 ; } && + { warn_ "sort --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ sort --random-seed=hello in || test $? -ne 1 ; } && + { warn_ "sort --random-seed=hello did not trigger an error"; fail=1 ; } + +# sort with fixed random seed should produce identical output +for i in 1 2 ; do + sort --random-seed=4242 -R < in > exp$i || framework_failure_ +done +compare_ exp1 exp2 \ + || { fail=1; echo "sort --random-seed=N produced non-identical output" ; } + Exit $fail -- 1.9.1 >From 0107e5a2597283bf572f0f79c74c2f315b97e36c Mon Sep 17 00:00:00 2001 From: "A. Gordon" <[email protected]> Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 7/7] doc: mention --random-seed=N option * doc/coreutils.texi: add new section for --random-seed, and mention new option in sort,shuf,shred sections. --- doc/coreutils.texi | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 87fb3dc..577f20c 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -767,6 +767,7 @@ name. * Signal specifications:: Specifying signals using the --signal option. * Disambiguating names and IDs:: chgrp, chown, chroot, id: user and group syntax * Random sources:: --random-source, in some programs. +* Random seeds:: --random-seed, in some programs. * Target directory:: Specifying a target directory, in some programs. * Trailing slashes:: --strip-trailing-slashes, in some programs. * Traversing symlinks:: -H, -L, or -P, in some programs. @@ -1240,6 +1241,18 @@ operating system. To reproduce the results of an earlier invocation of a command, you can save some random data into a file and then use that file as the random source in earlier and later invocations of the command. +Alternatively, use @option{--random-seed=@var{N}}. + +@node Random seeds +@section User-supplied random seed + +@cindex random seeds + +The @command{shuf}, @command{shred}, and @command{sort} commands +sometimes need random data to do their work. The +@option{--random-seed=@var{N}} option forces a known random seed, +enable reproducible results with the random options (effectively, +producing non-random output). @node Target directory @section Target directory @@ -4551,7 +4564,7 @@ functions for different fields, you can invoke @command{sort} more than once. The choice of hash function is affected by the -@option{--random-source} option. +@option{--random-source} and @option{--random-seed} options. @end table @@ -4649,6 +4662,13 @@ On newer systems, @option{-o} cannot appear after an input file if scripts should specify @option{-o @var{output-file}} before any input files. +@item --random-seed=@var{N} +@opindex --random-seed +@cindex user-supplied random seed +Use numeric value @var{N} to initialize the random data source. +Enables reproducible output with random sort options (effectively +producing non-random output). @xref{Random seeds}. + @item --random-source=@var{file} @opindex --random-source @cindex random source for sorting @@ -5019,6 +5039,13 @@ Write output to @var{output-file} instead of standard output. @var{output-file}, so you can safely shuffle a file in place by using commands like @code{shuf -o F <F} and @code{cat F | shuf -o F}. +@item --random-seed=@var{N} +@opindex --random-seed +@cindex user-supplied random seed +Use numeric value @var{N} to initialize the random data source used +to determine which permutation to use. Enables reproducible output +(effectively producing non-random output). @xref{Random seeds}. + @item --random-source=@var{file} @opindex --random-source @cindex random source for shuffling @@ -9687,6 +9714,13 @@ overwrite. You can reduce this to save time, or increase it if you think it's appropriate. After 25 passes all of the internal overwrite patterns will have been used at least once. +@item --random-seed=@var{N} +@opindex --random-seed +@cindex user-supplied random seed +Use numeric value @var{N} to initialize the random data source used +to overwrite and to choose pass ordering (Effectively producing +non-random output). @xref{Random seeds}. + @item --random-source=@var{file} @opindex --random-source @cindex random source for shredding -- 1.9.1
