Hi,
I did some tests and found out you can actually beat memchr with a simple loop.
Tests were done on an Intel Xeon E3-1231v3 (4*3.4GHz), on a 4GB file that was
already cached in memory. Benchmarking was done simply with the 'time' command.
I don't know how this code would run on other architectures, but I guess you
could put it in an #ifdef?
Coreutils 8.23 version, compiled with -O3: 507755520 /home/ztion/words
real 0m3.126s
user 0m2.699s
sys 0m0.429s
Improved version compiled with -O2: 507755520 /home/ztion/words
real 0m2.857s
user 0m2.461s
sys 0m0.396s
Improved version compiled with -O3: 507755520 /home/ztion/words
real 0m1.518s
user 0m1.157s
sys 0m0.361s
I studied the generated assembly and with -O3 gcc generates some fancy SSE
code, getting some nice speedups. memchr is also SSE optimized as far as I
know, so it's interesting that this is so much faster, twice as fast actually.
In case you don't like turning -O3 on for some reason (the default in coreutils
is -O2 I think), the best version I could put together for -O2 was this:
Improved version 2, compiled with -O2:
507755520 /home/ztion/words
real 0m2.206s
user 0m1.827s
sys 0m0.379s
Improved version:
--- /home/ztion/coreutils/core/coreutils-8.23/src/wc.c 2014-07-11 13:00:07.000000000 +0200
+++ wc.c 2015-03-15 09:01:38.141536166 +0100
@@ -259,11 +259,14 @@
}
else if (!count_chars && !count_complicated)
{
+ uintmax_t count_lines;
+
+ count_lines = 0;
/* Use a separate loop when counting only lines or lines and bytes --
but not chars or words. */
while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
{
- char *p = buf;
+ unsigned char *p = buf, *end;
if (bytes_read == SAFE_READ_ERROR)
{
@@ -272,13 +275,18 @@
break;
}
- while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
+ end = buf + bytes_read;
+ /* this is actually faster than memchr */
+ while (p != end)
{
+ count_lines += *p == '\n';
++p;
- ++lines;
}
+
bytes += bytes_read;
}
+ lines = count_lines;
+
}
#if MB_LEN_MAX > 1
# define SUPPORT_OLD_MBRTOWC 1
Improved version 2:
--- /home/ztion/coreutils/core/coreutils-8.23/src/wc.c 2014-07-11 13:00:07.000000000 +0200
+++ wc.c 2015-03-15 09:27:55.815459623 +0100
@@ -259,11 +259,15 @@
}
else if (!count_chars && !count_complicated)
{
+ uintmax_t count_lines;
+
+ count_lines = 0;
/* Use a separate loop when counting only lines or lines and bytes --
but not chars or words. */
while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
{
- char *p = buf;
+ unsigned char *p = buf, *end;
+ uint32_t temp_chars;
if (bytes_read == SAFE_READ_ERROR)
{
@@ -272,13 +276,37 @@
break;
}
- while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
+ end = buf + bytes_read;
+ while (p < end - 8)
+ {
+ temp_chars = (*((int32_t *)p));
+ count_lines += (temp_chars & 0xff) == '\n';
+ count_lines += ((temp_chars >> 8) & 0xff) == '\n';
+ count_lines += ((temp_chars >> 16) & 0xff) == '\n';
+ count_lines += ((temp_chars >> 24) & 0xff) == '\n';
+
+ p += 4;
+
+ temp_chars = (*((int32_t *)p));
+ count_lines += (temp_chars & 0xff) == '\n';
+ count_lines += ((temp_chars >> 8) & 0xff) == '\n';
+ count_lines += ((temp_chars >> 16) & 0xff) == '\n';
+ count_lines += ((temp_chars >> 24) & 0xff) == '\n';
+
+ p += 4;
+
+ }
+ /* do last bytes */
+ while (p != end)
{
- ++p;
- ++lines;
+ count_lines += *p == '\n';
+ p++;
}
+
bytes += bytes_read;
}
+ lines = count_lines;
+
}
#if MB_LEN_MAX > 1
# define SUPPORT_OLD_MBRTOWC 1
--
/Kristoffer Brånemyr