[Rd] "bug" and patch: quadratic running time for strsplit(..., fixed=TRUE) (PR#9902)

jbrzusto Fri, 07 Sep 2007 06:48:07 -0700

Full_Name: John Brzustowski
Version: R-devel-trunk, R-2.4.0
OS: linux, gcc 4.0.3
Submission from: (NULL) (206.248.157.184)



This isn't a bug, but an easily-remedied performance issue.

SYMPTOM

> for (i in 1000 * (1:20)) {
   y <- paste(rep("asdf", times=i), collapse=" ")
   t <- system.time(strsplit(y, " ", fixed=TRUE))
   cat(sprintf("i=%5d time=%5d msec\n",i, round(1000*t[1])))
}

i= 1000 time=    2 msec
i= 2000 time=    9 msec
i= 3000 time=   20 msec
i= 4000 time=   34 msec
i= 5000 time=   57 msec
i= 6000 time=   77 msec
i= 7000 time=  107 msec
i= 8000 time=  136 msec
i= 9000 time=  177 msec
i=10000 time=  230 msec
i=11000 time=  275 msec
i=12000 time=  308 msec
i=13000 time=  371 msec
i=14000 time=  446 msec
i=15000 time=  544 msec
i=16000 time=  639 msec
i=17000 time=  726 msec
i=18000 time=  864 msec
i=19000 time=  944 msec
i=20000 time= 1106 msec

DIAGNOSIS

strsplit() uses strlen() in the bounds check clause of a for(;;)
statement, which forces a full scan of the source string for each
character in the source string.  Unlike R's LENGTH() macro, strlen for
C strings is an expensive operation, and in this case (at least),
gcc 4.0.3's -O2 level optimizer is not able to recognize the call as a loop
invariant, despite the declaration "const char *buf".

REMEDIED BEHAVIOUR

i= 1000 time=    0 msec
i= 2000 time=    1 msec
i= 3000 time=    1 msec
i= 4000 time=    0 msec
i= 5000 time=    1 msec
i= 6000 time=    1 msec
i= 7000 time=    1 msec
i= 8000 time=    2 msec
i= 9000 time=    2 msec
i=10000 time=    2 msec
i=11000 time=    2 msec
i=12000 time=    2 msec
i=13000 time=    2 msec
i=14000 time=    2 msec
i=15000 time=    4 msec
i=16000 time=    3 msec
i=17000 time=    3 msec
i=18000 time=    4 msec
i=19000 time=    3 msec
i=20000 time=    4 msec

RELATED ISSUES

A simple search turns up other instances of this usage in R's source.
For completeness, I'm submitting patches for all of them, but have not
tested whether they in fact cause a detectable performance problem.
In the case of modules/X11/dataentry.c, the patch also fixes a presumably
ineffectual "bug".

$ grep -nR "for *([^;]*;[^;]*strlen *(" *

main/rlocale.c:137:             for (i = 0; i < strlen(lc_str) && i < 
sizeof(lc_str); i++)
main/printutils.c:486:          for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
main/sysutils.c:493:            for(j = 0; j < strlen(sub); j++) *outbuf++ = 
sub[j];
modules/X11/rotated.c:608:      for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:856:      for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1399:     for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1797:     for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2045:     for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2339:     for(i=0; i<strlen(text)-1; i++)
modules/X11/dataentry.c:1358:   for(i = 0; i < strlen(text); i++) *bufp++ =
text[i];


PATCHES (against trunk)
Only the first is required to fix strsplit().

Index: src/main/character.c
===================================================================
--- src/main/character.c        (revision 42792)
+++ src/main/character.c        (working copy)
@@ -357,7 +357,7 @@
     int i, j, len, tlen, ntok, slen;
     int extended_opt, cflags, fixed_opt, perl_opt;
     char *pt = NULL;
-    const char *buf, *split = "", *bufp, *laststart;
+    const char *buf, *split = "", *bufp, *laststart, *ebuf = NULL;
     regex_t reg;
     regmatch_t regmatch[1];
     pcre *re_pcre = NULL;
@@ -419,7 +419,8 @@
            if(fixed_opt) {
                /* This is UTF-8 safe since it compares whole strings */
                laststart = buf;
-               for(bufp = buf; bufp - buf < strlen(buf); bufp++) {
+               ebuf = buf + strlen(buf);
+               for(bufp = buf; bufp < ebuf; bufp++) {
                    if((slen == 1 && *bufp != *split) ||
                       (slen > 1 && strncmp(bufp, split, slen))) continue;
                    ntok++;
@@ -480,7 +481,7 @@
                    /* This is UTF-8 safe since it compares whole strings,
                       but it would be more efficient to skip along by chars.
                     */
-                   for(; bufp - buf < strlen(buf); bufp++) {
+                   for(; bufp < ebuf; bufp++) {
                        if((slen == 1 && *bufp != *split) ||
                           (slen > 1 && strncmp(bufp, split, slen))) continue;
                        if(slen) {
Index: src/main/rlocale.c
===================================================================
--- src/main/rlocale.c  (revision 42792)
+++ src/main/rlocale.c  (working copy)
@@ -127,14 +127,14 @@
 int Ri18n_wcwidth(wchar_t c)
 {
     char lc_str[128];
-    unsigned int i;
+    unsigned int i, j;
 
     static char *lc_cache = "";
     static int lc = 0;
 
     if (0 != strcmp(setlocale(LC_CTYPE, NULL), lc_cache)) {
        strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str));
-       for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
+       for (i = 0, j = strlen(lc_str); i < j && i < sizeof(lc_str); i++)
            lc_str[i] = toupper(lc_str[i]);
        for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t)); 
             i++) {
Index: src/main/printutils.c
===================================================================
--- src/main/printutils.c       (revision 42792)
+++ src/main/printutils.c       (working copy)
@@ -483,7 +483,8 @@
                        else
 #endif
                            snprintf(buf, 11, "\\u%04x", k);
-                       for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
+                       memcpy(q, buf, j = strlen(buf));
+                       q += j;
                        p += res;
                    }
                    i += (res - 1);
Index: src/main/sysutils.c
===================================================================
--- src/main/sysutils.c (revision 42792)
+++ src/main/sysutils.c (working copy)
@@ -490,8 +490,9 @@
                        R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
                        goto top_of_loop;
                    }
-                   for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
-                   outb -= strlen(sub);
+                   memcpy(outbuf, sub, j = strlen(sub)); 
+                   outbuf += j;
+                   outb -= j;
                }
                inbuf++; inb--;
                goto next_char;
Index: src/modules/X11/rotated.c
===================================================================
--- src/modules/X11/rotated.c   (revision 42792)
+++ src/modules/X11/rotated.c   (working copy)
@@ -605,7 +605,7 @@
 
     /* count number of sections in string */
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                nl++;
 
@@ -853,7 +853,7 @@
     /* count number of sections in string */
     item->nl=1;
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                item->nl++;
 
@@ -1396,7 +1396,7 @@
     /* count number of sections in string */
     nl=1;
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                nl++;
 
@@ -1794,7 +1794,7 @@
 
     /* count number of sections in string */
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                nl++;
 
@@ -2042,7 +2042,7 @@
     /* count number of sections in string */
     item->nl=1;
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                item->nl++;
 
@@ -2336,7 +2336,7 @@
     /* count number of sections in string */
     nl=1;
     if(align!=NONE)
-       for(i=0; i<strlen(text)-1; i++)
+       for(i=strlen(text)-2; i >= 0; i--)
            if(text[i]=='\n')
                nl++;
 
Index: src/modules/X11/dataentry.c
===================================================================
--- src/modules/X11/dataentry.c (revision 42792)
+++ src/modules/X11/dataentry.c (working copy)
@@ -1232,7 +1232,7 @@
    do as we get a char at a time */
 static void handlechar(DEstruct DE, char *text)
 {
-    int i, c = text[0];
+    int i, c = text[0], j;
 #ifdef USE_FONTSET
     wchar_t wcs[BOOSTED_BUF_SIZE];
 
@@ -1355,9 +1355,13 @@
        goto donehc;
     }
 
-    for(i = 0; i < strlen(text); i++) *bufp++ = text[i];
-    *(bufp+1) = '\0';
-    clength += strlen(text);
+    /* as originally written, this left an undefined byte at
+       the end of bufp, followed by a zero byte; luckily, the storage
+       pointed to by bufp had already been zeroed, so the undefined
+       byte was in fact zero.  */
+    strcpy(bufp, text);
+    bufp += (j = strlen(text));
+    clength += j;
     printstring(DE, buf, clength, DE->crow, DE->ccol, 1);
     return;

______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

[Rd] "bug" and patch: quadratic running time for strsplit(..., fixed=TRUE) (PR#9902)

Reply via email to