It just occurred to me that we could simplify from 3 to 2 loops,
while also making the code more adaptive to the input,
by simply determining the average line length per block.

I'll push the attached patch later.

thanks,
Pádraig.
From 00aedf3f3c1b60c4e734d042b91f687a19d9e1fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 23 Mar 2015 11:54:19 +0000
Subject: [PATCH] wc: use a more adaptive wc -l implementation

* src/wc.c (wc): Allow any block to select the count implementation,
rather than just using the first 10 lines.  This also simplifies
the code from 3 loops to 2.
---
 src/wc.c | 55 +++++++++++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/src/wc.c b/src/wc.c
index ceb48ed..7d030ae 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -265,7 +265,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
       /* Use a separate loop when counting only lines or lines and bytes --
          but not chars or words.  */
       bool long_lines = false;
-      bool check_len = true;
       while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
         {
           char *p = buf;
@@ -277,41 +276,37 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
               break;
             }
 
+          bytes += bytes_read;
+
           char *end = p + bytes_read;
+          uintmax_t plines = lines;
 
-          /* Avoid function call overhead for shorter lines.  */
-          if (check_len)
-            while (p != end)
-              {
+          if (! long_lines)
+            {
+              /* Avoid function call overhead for shorter lines.  */
+              while (p != end)
                 lines += *p++ == '\n';
-                /* If there are more than 150 chars in the first 10 lines,
-                   then use memchr, where system specific optimizations
-                   may outweigh function call overhead.
-                   FIXME: This line length was determined in 2015, on both
-                   x86_64 and ppc64, but it's worth re-evaluating in future with
-                   newer compilers, CPUs, or memchr() implementations etc.  */
-                if (lines <= 10)
-                  {
-                    if (p - buf > 150)
-                      {
-                        long_lines = true;
-                        break;
-                      }
-                  }
-              }
-          else if (! long_lines)
-            while (p != end)
-              lines += *p++ == '\n';
-
-          /* memchr is more efficient with longer lines.  */
-          while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
+            }
+          else
             {
-              ++p;
-              ++lines;
+              /* memchr is more efficient with longer lines.  */
+              while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
+                {
+                  ++p;
+                  ++lines;
+                }
             }
 
-          bytes += bytes_read;
-          check_len = false;
+          /* If the average line length in the block is > 15, then use
+             memchr for the next block, where system specific optimizations
+             may outweigh function call overhead.
+             FIXME: This line length was determined in 2015, on both
+             x86_64 and ppc64, but it's worth re-evaluating in future with
+             newer compilers, CPUs, or memchr() implementations etc.  */
+          if (lines == plines || (bytes_read / (lines - plines) > 15))
+            long_lines = true;
+          else
+            long_lines = false;
         }
     }
 #if MB_LEN_MAX > 1
-- 
2.1.0

Reply via email to