wc(1): accelerate word counting

Scott Cheloha Wed, 17 Nov 2021 06:38:37 -0800

In wc(1) we currently count words, both ASCII and multibyte, in a
getline(3) loop.


This makes sense in the multibyte case because stdio handles all the
nasty buffer resizing for us.  We avoid splitting a multibyte character
between two read(2) calls and the resulting code is simpler.

However, for ASCII input we don't have the split-character problem.
Using getline(3) doesn't really buy us anything.  We can count words
in a big buffer (as we do in the ASCII byte- and line-counting modes)
just fine.

Eliminating the getline(3) overhead improves performance.  The gains
are obvious when counting words in files with lots of lines.  Consider
this pathological input:

# Before
$ jot -b Z 100000000 > ~/Z-100M-lines
$ command time wc ~/Z-100M-lines
 100000000 100000000 200000000 /home/ssc/Z-100M-lines
        4.50 real         4.45 user         0.04 sys

# After
$ jot -b Z 100000000 > ~/Z-100M-lines
$ command time obj/wc ~/Z-100M-lines
 100000000 100000000 200000000 /home/ssc/Z-100M-lines
        0.48 real         0.44 user         0.04 sys

ok?

Index: wc.c
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.28
diff -u -p -r1.28 wc.c
--- wc.c        16 Nov 2021 23:34:24 -0000      1.28
+++ wc.c        17 Nov 2021 14:32:17 -0000
@@ -144,7 +144,7 @@ cnt(const char *path)
                fd = STDIN_FILENO;
        }
 
-       if (!doword && !multibyte) {
+       if (!multibyte) {
                if (bufsz < MAXBSIZE &&
                    (buf = realloc(buf, MAXBSIZE)) == NULL)
                        err(1, NULL);
@@ -153,7 +153,7 @@ cnt(const char *path)
                 * faster to get lines than to get words, since
                 * the word count requires some logic.
                 */
-               if (doline) {
+               if (doline && !doword) {
                        while ((len = read(fd, buf, MAXBSIZE)) > 0) {
                                charct += len;
                                for (C = buf; len--; ++C)
@@ -172,7 +172,7 @@ cnt(const char *path)
                 * a special device in case someone adds a new type
                 * of inode.
                 */
-               else if (dochar) {
+               else if (dochar && !doword) {
                        mode_t ifmt;
 
                        if (fstat(fd, &sbuf)) {
@@ -192,6 +192,26 @@ cnt(const char *path)
                                        }
                                }
                        }
+               } else {
+                       gotsp = 1;
+                       while ((len = read(fd, buf, MAXBSIZE)) > 0) {
+                               const char *end = buf + len;
+                               charct += len;
+                               for (C = buf; len--; ++C) {
+                                       if (isspace((unsigned char)*C)) {
+                                               gotsp = 1;
+                                               if (*C == '\n')
+                                                       ++linect;
+                                       } else if (gotsp) {
+                                               gotsp = 0;
+                                               ++wordct;
+                                       }
+                               }
+                       }
+                       if (len == -1) {
+                               warn("%s", file);
+                               rval = 1;
+                       }
                }
        } else {
                if (path == NULL)
@@ -211,38 +231,24 @@ cnt(const char *path)
                 */
                gotsp = 1;
                while ((len = getline(&buf, &bufsz, stream)) > 0) {
-                       if (multibyte) {
-                               const char *end = buf + len;
-                               for (C = buf; C < end; C += len) {
-                                       ++charct;
-                                       len = mbtowc(&wc, C, MB_CUR_MAX);
-                                       if (len == -1) {
-                                               mbtowc(NULL, NULL,
-                                                   MB_CUR_MAX);
-                                               len = 1;
-                                               wc = L'?';
-                                       } else if (len == 0)
-                                               len = 1;
-                                       if (iswspace(wc)) {
-                                               gotsp = 1;
-                                               if (wc == L'\n')
-                                                       ++linect;
-                                       } else if (gotsp) {
-                                               gotsp = 0;
-                                               ++wordct;
-                                       }
-                               }
-                       } else {
-                               charct += len;
-                               for (C = buf; len--; ++C) {
-                                       if (isspace((unsigned char)*C)) {
-                                               gotsp = 1;
-                                               if (*C == '\n')
-                                                       ++linect;
-                                       } else if (gotsp) {
-                                               gotsp = 0;
-                                               ++wordct;
-                                       }
+                       const char *end = buf + len;
+                       for (C = buf; C < end; C += len) {
+                               ++charct;
+                               len = mbtowc(&wc, C, MB_CUR_MAX);
+                               if (len == -1) {
+                                       mbtowc(NULL, NULL,
+                                           MB_CUR_MAX);
+                                       len = 1;
+                                       wc = L'?';
+                               } else if (len == 0)
+                                       len = 1;
+                               if (iswspace(wc)) {
+                                       gotsp = 1;
+                                       if (wc == L'\n')
+                                               ++linect;
+                               } else if (gotsp) {
+                                       gotsp = 0;
+                                       ++wordct;
                                }
                        }
                }

wc(1): accelerate word counting

Reply via email to