Hello tech@,
On 04/23/16 07:21, Jonathan Gray wrote on bugs@:
> $ cat foo.sh
> #!/bin/sh
>
> $1 -r '
> s/[[:space:]]//g
> s/\<sf([0-9]+),/ISL_SFLOAT@\1,/g
> '
> $ cat foo.csv
> R32G32B32A32_FLOAT , 128, 1, 1, 1, sf32, sf32, sf32, sf32, ,
> , , linear,
> $ ./foo.sh sed < foo.csv
> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,sf32,ISL_SFLOAT@32,sf32,,,,linear,
> $ ./foo.sh gsed < foo.csv
> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,,,,linear,
>
> Encountered with code that was recently added to Mesa:
>
> https://cgit.freedesktop.org/mesa/mesa/tree/src/intel/isl/isl_format_layout_gen.bash
>
After a long discussion with several other developers I came up with
the following diff. It fixes the bug and also passes both the
regression tests and a full bulk build (thanks to ajacoutot@).
I'd like to post it here for wider exposure. Please test to as much sed
scripts as you can find. Comments and OKs welcome.
Diff looks sane to jasper@
For those interested: The problem comes from the fact that the string
pointer increments to the end of the previous match and is then called
with the REG_NOTBOL. The REG_NOTBOL combined with a match at the begin
of the string causes our regex library to treat the word as not begin of
word.
The TRE implementation does the reverse and treats this case as if it
always is begin of word. This causes a similar bug under MacOS:
$ echo 'foo foofoo' | sed -E 's/\<foo/bar/g'
bar barbar
I've solved this problem by converting sed to use REG_STARTEND more
explicitly. Although this isn't a POSIX specified flag, it is already
used by sed and shouldn't be a problem.
The patch in engine.c from the library is because beginp is set in
matcher to start, so this one always results to true.
I also tried to set up a testcase with gnu regex, but it turned out that
they don't set re_nsub after regcomp, so the sed compilation phase fails.
After examining gnu sed's code I found that it uses an own internal API
not specified by POSIX.
Index: ./lib/libc/regex/engine.c
===================================================================
RCS file: /cvs/src/lib/libc/regex/engine.c,v
retrieving revision 1.19
diff -u -p -r1.19 engine.c
--- ./lib/libc/regex/engine.c 28 Dec 2015 23:01:22 -0000 1.19
+++ ./lib/libc/regex/engine.c 2 May 2016 08:50:20 -0000
@@ -674,7 +674,7 @@ fast(struct match *m, char *start, char
states fresh = m->fresh;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
+ int c = (start == m->offp) ? OUT : *(start-1);
int lastc; /* previous c */
int flagch;
int i;
@@ -758,7 +758,7 @@ slow(struct match *m, char *start, char
states empty = m->empty;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
+ int c = (start == m->offp) ? OUT : *(start-1);
int lastc; /* previous c */
int flagch;
int i;
Index: ./usr.bin/sed/process.c
===================================================================
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.27
diff -u -p -r1.27 process.c
--- ./usr.bin/sed/process.c 26 Oct 2015 14:08:47 -0000 1.27
+++ ./usr.bin/sed/process.c 2 May 2016 08:50:21 -0000
@@ -61,7 +61,8 @@ static SPACE HS, PS, SS;
static inline int applies(struct s_command *);
static void flush_appends(void);
static void lputs(char *);
-static inline int regexec_e(regex_t *, const char *, int, int, size_t);
+static inline int regexec_e(regex_t *, const char *, int, int, size_t,
+ size_t);
static void regsub(SPACE *, char *, char *);
static int substitute(struct s_command *);
@@ -267,7 +268,7 @@ new: if (!nflag && !pd)
* (lastline, linenumber, ps).
*/
#define MATCH(a) \
- (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, psl) : \
+ (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, 0, psl) : \
(a)->type == AT_LINE ? linenum == (a)->u.l : lastline()
/*
@@ -335,6 +336,7 @@ substitute(struct s_command *cp)
regex_t *re;
regoff_t slen;
int n, lastempty;
+ size_t le = 0;
char *s;
s = ps;
@@ -346,7 +348,7 @@ substitute(struct s_command *cp)
cp->u.s->maxbref);
}
}
- if (!regexec_e(re, s, 0, 0, psl))
+ if (!regexec_e(re, s, 0, 0, 0, psl))
return (0);
SS.len = 0; /* Clean substitute space. */
@@ -356,28 +358,29 @@ substitute(struct s_command *cp)
do {
/* Copy the leading retained string. */
- if (n <= 1 && match[0].rm_so)
- cspace(&SS, s, match[0].rm_so, APPEND);
+ if (n <= 1 && (match[0].rm_so - le))
+ cspace(&SS, s, match[0].rm_so - le, APPEND);
/* Skip zero-length matches right after other matches. */
- if (lastempty || match[0].rm_so ||
+ if (lastempty || (match[0].rm_so - le) ||
match[0].rm_so != match[0].rm_eo) {
if (n <= 1) {
/* Want this match: append replacement. */
- regsub(&SS, s, cp->u.s->new);
+ regsub(&SS, ps, cp->u.s->new);
if (n == 1)
n = -1;
} else {
/* Want a later match: append original. */
- if (match[0].rm_eo)
- cspace(&SS, s, match[0].rm_eo, APPEND);
+ if (match[0].rm_eo - le)
+ cspace(&SS, s, match[0].rm_eo - le,
APPEND);
n--;
}
}
/* Move past this match. */
- s += match[0].rm_eo;
- slen -= match[0].rm_eo;
+ s += (match[0].rm_eo - le);
+ slen -= (match[0].rm_eo - le);
+ le = match[0].rm_eo;
/*
* After a zero-length match, advance one byte,
@@ -388,13 +391,15 @@ substitute(struct s_command *cp)
slen = -1;
else
slen--;
- if (*s != '\0')
+ if (*s != '\0') {
cspace(&SS, s++, 1, APPEND);
+ le++;
+ }
lastempty = 1;
} else
lastempty = 0;
- } while (n >= 0 && slen >= 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
+ } while (n >= 0 && slen >= 0 && regexec_e(re, ps, 0, 0, le, psl));
/* Did not find the requested number of matches. */
if (n > 1)
@@ -509,7 +514,7 @@ lputs(char *s)
static inline int
regexec_e(regex_t *preg, const char *string, int eflags,
- int nomatch, size_t slen)
+ int nomatch, size_t start, size_t stop)
{
int eval;
@@ -520,8 +525,8 @@ regexec_e(regex_t *preg, const char *str
defpreg = preg;
/* Set anchors */
- match[0].rm_so = 0;
- match[0].rm_eo = slen;
+ match[0].rm_so = start;
+ match[0].rm_eo = stop;
eval = regexec(defpreg, string,
nomatch ? 0 : maxnsub + 1, match, eflags | REG_STARTEND);