In wc(1) we currently count words, both ASCII and multibyte, in a
getline(3) loop.
This makes sense in the multibyte case because stdio handles all the
nasty buffer resizing for us. We avoid splitting a multibyte character
between two read(2) calls and the resulting code is simpler.
However, for ASCII input we don't have the split-character problem.
Using getline(3) doesn't really buy us anything. We can count words
in a big buffer (as we do in the ASCII byte- and line-counting modes)
just fine.
Eliminating the getline(3) overhead improves performance. The gains
are obvious when counting files with lots of lines. Consider this
pathological input:
# Before
$ jot -b Z 100000000 > ~/Z-100M-lines
$ command time wc ~/Z-100M-lines
100000000 100000000 200000000 /home/ssc/Z-100M-lines
4.50 real 4.45 user 0.04 sys
# After
$ jot -b Z 100000000 > ~/Z-100M-lines
$ command time obj/wc ~/Z-100M-lines
100000000 100000000 200000000 /home/ssc/Z-100M-lines
0.48 real 0.44 user 0.04 sys
ok?
Index: wc.c
===================================================================
RCS file: /cvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.28
diff -u -p -r1.28 wc.c
--- wc.c 16 Nov 2021 23:34:24 -0000 1.28
+++ wc.c 17 Nov 2021 14:32:17 -0000
@@ -144,7 +144,7 @@ cnt(const char *path)
fd = STDIN_FILENO;
}
- if (!doword && !multibyte) {
+ if (!multibyte) {
if (bufsz < MAXBSIZE &&
(buf = realloc(buf, MAXBSIZE)) == NULL)
err(1, NULL);
@@ -153,7 +153,7 @@ cnt(const char *path)
* faster to get lines than to get words, since
* the word count requires some logic.
*/
- if (doline) {
+ if (doline && !doword) {
while ((len = read(fd, buf, MAXBSIZE)) > 0) {
charct += len;
for (C = buf; len--; ++C)
@@ -172,7 +172,7 @@ cnt(const char *path)
* a special device in case someone adds a new type
* of inode.
*/
- else if (dochar) {
+ else if (dochar && !doword) {
mode_t ifmt;
if (fstat(fd, &sbuf)) {
@@ -192,6 +192,26 @@ cnt(const char *path)
}
}
}
+ } else {
+ gotsp = 1;
+ while ((len = read(fd, buf, MAXBSIZE)) > 0) {
+ const char *end = buf + len;
+ charct += len;
+ for (C = buf; len--; ++C) {
+ if (isspace((unsigned char)*C)) {
+ gotsp = 1;
+ if (*C == '\n')
+ ++linect;
+ } else if (gotsp) {
+ gotsp = 0;
+ ++wordct;
+ }
+ }
+ }
+ if (len == -1) {
+ warn("%s", file);
+ rval = 1;
+ }
}
} else {
if (path == NULL)
@@ -211,38 +231,24 @@ cnt(const char *path)
*/
gotsp = 1;
while ((len = getline(&buf, &bufsz, stream)) > 0) {
- if (multibyte) {
- const char *end = buf + len;
- for (C = buf; C < end; C += len) {
- ++charct;
- len = mbtowc(&wc, C, MB_CUR_MAX);
- if (len == -1) {
- mbtowc(NULL, NULL,
- MB_CUR_MAX);
- len = 1;
- wc = L'?';
- } else if (len == 0)
- len = 1;
- if (iswspace(wc)) {
- gotsp = 1;
- if (wc == L'\n')
- ++linect;
- } else if (gotsp) {
- gotsp = 0;
- ++wordct;
- }
- }
- } else {
- charct += len;
- for (C = buf; len--; ++C) {
- if (isspace((unsigned char)*C)) {
- gotsp = 1;
- if (*C == '\n')
- ++linect;
- } else if (gotsp) {
- gotsp = 0;
- ++wordct;
- }
+ const char *end = buf + len;
+ for (C = buf; C < end; C += len) {
+ ++charct;
+ len = mbtowc(&wc, C, MB_CUR_MAX);
+ if (len == -1) {
+ mbtowc(NULL, NULL,
+ MB_CUR_MAX);
+ len = 1;
+ wc = L'?';
+ } else if (len == 0)
+ len = 1;
+ if (iswspace(wc)) {
+ gotsp = 1;
+ if (wc == L'\n')
+ ++linect;
+ } else if (gotsp) {
+ gotsp = 0;
+ ++wordct;
}
}
}