A user noted that the following command was slower than they
expected:

   busybox shuf -i "1500000000-$(date +%s)" -n 5

At time of writing the range contains 128 million values.  On my
system this takes 7.7s whereas 'shuf' from coreutils takes a
handful of milliseconds.

Optimise BusyBox 'shuf' for cases where -n is specified by stopping
shuffling once the required number of lines has been processed.
On my system the time for the example is reduced to 0.4s.

function                                             old     new   delta
shuf_main                                            520     540     +20
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 1/0 up/down: 20/0)               Total: 20 bytes

v2: Code shrink.  Since outlines <= numlines:
    - the loop in shuffle_lines() only needs to test the value of
      outlines;
    - shuffle_lines() can be called unconditionally.
    Update timing to allow for the 13 million seconds elapsed since v1.

Signed-off-by: Ron Yorston <[email protected]>
---
 coreutils/shuf.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/coreutils/shuf.c b/coreutils/shuf.c
index fdbd3e9b2..50dfa249d 100644
--- a/coreutils/shuf.c
+++ b/coreutils/shuf.c
@@ -39,8 +39,10 @@
 
 /*
  * Use the Fisher-Yates shuffle algorithm on an array of lines.
+ * If the required number of output lines is less than the total
+ * we can stop shuffling early.
  */
-static void shuffle_lines(char **lines, unsigned numlines)
+static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
 {
        unsigned i;
        unsigned r;
@@ -48,7 +50,7 @@ static void shuffle_lines(char **lines, unsigned numlines)
 
        srand(monotonic_us());
 
-       for (i = numlines-1; i > 0; i--) {
+       for (i = numlines-1; outlines > 0; i--, outlines--) {
                r = rand();
                /* RAND_MAX can be as small as 32767 */
                if (i > RAND_MAX)
@@ -67,7 +69,7 @@ int shuf_main(int argc, char **argv)
        char *opt_i_str, *opt_n_str, *opt_o_str;
        unsigned i;
        char **lines;
-       unsigned numlines;
+       unsigned numlines, outlines;
        char eol;
 
        opts = getopt32(argv, "^"
@@ -128,24 +130,23 @@ int shuf_main(int argc, char **argv)
                fclose_if_not_stdin(fp);
        }
 
-       if (numlines != 0)
-               shuffle_lines(lines, numlines);
+       outlines = numlines;
+       if (opts & OPT_n) {
+               outlines = xatou(opt_n_str);
+               if (outlines > numlines)
+                       outlines = numlines;
+       }
+
+       shuffle_lines(lines, numlines, outlines);
 
        if (opts & OPT_o)
                xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO);
 
-       if (opts & OPT_n) {
-               unsigned maxlines;
-               maxlines = xatou(opt_n_str);
-               if (numlines > maxlines)
-                       numlines = maxlines;
-       }
-
        eol = '\n';
        if (opts & OPT_z)
                eol = '\0';
 
-       for (i = 0; i < numlines; i++) {
+       for (i = numlines-outlines; i < numlines; i++) {
                if (opts & OPT_i)
                        printf("%u%c", (unsigned)(uintptr_t)lines[i], eol);
                else
-- 
2.31.1

_______________________________________________
busybox mailing list
[email protected]
http://lists.busybox.net/mailman/listinfo/busybox

Reply via email to