Hello tech@,

On 04/23/16 07:21, Jonathan Gray wrote on bugs@:
> $ cat foo.sh
> #!/bin/sh
> 
> $1 -r '
> s/[[:space:]]//g
> s/\<sf([0-9]+),/ISL_SFLOAT@\1,/g
> '
> $ cat foo.csv
> R32G32B32A32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32, sf32,     ,  
>    ,    , linear,
> $ ./foo.sh sed < foo.csv 
> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,sf32,ISL_SFLOAT@32,sf32,,,,linear,
> $ ./foo.sh gsed < foo.csv
> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,,,,linear,
> 
> Encountered with code that was recently added to Mesa:
> 
> https://cgit.freedesktop.org/mesa/mesa/tree/src/intel/isl/isl_format_layout_gen.bash
> 

After a long discussion with several other developers I came up with
the following diff. It fixes the bug and also passes both the
regression tests and a full bulk build (thanks to ajacoutot@).

I'd like to post it here for wider exposure. Please test to as much sed
scripts as you can find. Comments and OKs welcome.

Diff looks sane to jasper@

For those interested: The problem comes from the fact that the string
pointer increments to the end of the previous match and is then called
with the REG_NOTBOL. The REG_NOTBOL combined with a match at the begin
of the string causes our regex library to treat the word as not begin of
word.
The TRE implementation does the reverse and treats this case as if it
always is begin of word. This causes a similar bug under MacOS:
$ echo 'foo foofoo' | sed -E 's/\<foo/bar/g'
bar barbar

I've solved this problem by converting sed to use REG_STARTEND more
explicitly. Although this isn't a POSIX specified flag, it is already
used by sed and shouldn't be a problem.

The patch in engine.c from the library is because beginp is set in 
matcher to start, so this one always results to true.

I also tried to set up a testcase with gnu regex, but it turned out that
they don't set re_nsub after regcomp, so the sed compilation phase fails. 
After examining gnu sed's code I found that it uses an own internal API 
not specified by POSIX. 

Index: ./lib/libc/regex/engine.c
===================================================================
RCS file: /cvs/src/lib/libc/regex/engine.c,v
retrieving revision 1.19
diff -u -p -r1.19 engine.c
--- ./lib/libc/regex/engine.c   28 Dec 2015 23:01:22 -0000      1.19
+++ ./lib/libc/regex/engine.c   2 May 2016 08:50:20 -0000
@@ -674,7 +674,7 @@ fast(struct match *m, char *start, char 
        states fresh = m->fresh;
        states tmp = m->tmp;
        char *p = start;
-       int c = (start == m->beginp) ? OUT : *(start-1);
+       int c = (start == m->offp) ? OUT : *(start-1);
        int lastc;      /* previous c */
        int flagch;
        int i;
@@ -758,7 +758,7 @@ slow(struct match *m, char *start, char 
        states empty = m->empty;
        states tmp = m->tmp;
        char *p = start;
-       int c = (start == m->beginp) ? OUT : *(start-1);
+       int c = (start == m->offp) ? OUT : *(start-1);
        int lastc;      /* previous c */
        int flagch;
        int i;
Index: ./usr.bin/sed/process.c
===================================================================
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.27
diff -u -p -r1.27 process.c
--- ./usr.bin/sed/process.c     26 Oct 2015 14:08:47 -0000      1.27
+++ ./usr.bin/sed/process.c     2 May 2016 08:50:21 -0000
@@ -61,7 +61,8 @@ static SPACE HS, PS, SS;
 static inline int       applies(struct s_command *);
 static void             flush_appends(void);
 static void             lputs(char *);
-static inline int       regexec_e(regex_t *, const char *, int, int, size_t);
+static inline int       regexec_e(regex_t *, const char *, int, int, size_t,
+                            size_t);
 static void             regsub(SPACE *, char *, char *);
 static int              substitute(struct s_command *);
 
@@ -267,7 +268,7 @@ new:                if (!nflag && !pd)
  * (lastline, linenumber, ps).
  */
 #define        MATCH(a)                                                \
-       (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, psl) :       \
+       (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, 0, psl) :    \
            (a)->type == AT_LINE ? linenum == (a)->u.l : lastline()
 
 /*
@@ -335,6 +336,7 @@ substitute(struct s_command *cp)
        regex_t *re;
        regoff_t slen;
        int n, lastempty;
+       size_t le = 0;
        char *s;
 
        s = ps;
@@ -346,7 +348,7 @@ substitute(struct s_command *cp)
                            cp->u.s->maxbref);
                }
        }
-       if (!regexec_e(re, s, 0, 0, psl))
+       if (!regexec_e(re, s, 0, 0, 0, psl))
                return (0);
 
        SS.len = 0;                             /* Clean substitute space. */
@@ -356,28 +358,29 @@ substitute(struct s_command *cp)
 
        do {
                /* Copy the leading retained string. */
-               if (n <= 1 && match[0].rm_so)
-                       cspace(&SS, s, match[0].rm_so, APPEND);
+               if (n <= 1 && (match[0].rm_so - le))
+                       cspace(&SS, s, match[0].rm_so - le, APPEND);
 
                /* Skip zero-length matches right after other matches. */
-               if (lastempty || match[0].rm_so ||
+               if (lastempty || (match[0].rm_so - le) ||
                    match[0].rm_so != match[0].rm_eo) {
                        if (n <= 1) {
                                /* Want this match: append replacement. */
-                               regsub(&SS, s, cp->u.s->new);
+                               regsub(&SS, ps, cp->u.s->new);
                                if (n == 1)
                                        n = -1;
                        } else {
                                /* Want a later match: append original. */
-                               if (match[0].rm_eo)
-                                       cspace(&SS, s, match[0].rm_eo, APPEND);
+                               if (match[0].rm_eo - le)
+                                       cspace(&SS, s, match[0].rm_eo - le, 
APPEND);
                                n--;
                        }
                }
 
                /* Move past this match. */
-               s += match[0].rm_eo;
-               slen -= match[0].rm_eo;
+               s += (match[0].rm_eo - le);
+               slen -= (match[0].rm_eo - le);
+               le = match[0].rm_eo;
 
                /*
                 * After a zero-length match, advance one byte,
@@ -388,13 +391,15 @@ substitute(struct s_command *cp)
                                slen = -1;
                        else
                                slen--;
-                       if (*s != '\0')
+                       if (*s != '\0') {
                                cspace(&SS, s++, 1, APPEND);
+                               le++;
+                       }
                        lastempty = 1;
                } else
                        lastempty = 0;
 
-       } while (n >= 0 && slen >= 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
+       } while (n >= 0 && slen >= 0 && regexec_e(re, ps, 0, 0, le, psl));
 
        /* Did not find the requested number of matches. */
        if (n > 1)
@@ -509,7 +514,7 @@ lputs(char *s)
 
 static inline int
 regexec_e(regex_t *preg, const char *string, int eflags,
-    int nomatch, size_t slen)
+    int nomatch, size_t start, size_t stop)
 {
        int eval;
 
@@ -520,8 +525,8 @@ regexec_e(regex_t *preg, const char *str
                defpreg = preg;
 
        /* Set anchors */
-       match[0].rm_so = 0;
-       match[0].rm_eo = slen;
+       match[0].rm_so = start;
+       match[0].rm_eo = stop;
 
        eval = regexec(defpreg, string,
            nomatch ? 0 : maxnsub + 1, match, eflags | REG_STARTEND);

Reply via email to