Replying to myself to provide further evidence that the patch
included below is right:

Ingo Schwarze wrote on Fri, Jul 22, 2011 at 02:42:03AM +0200:

> So, finally, here is the update of my patch to fix zero-length
> matches in sed(1).  Sorry for the ridiculous delay.
> 
> Changes with respect to the first version:
> 
>  - Removed the now unused local variable re_off (noticed by otto@).
>  - Make slen signed, or zero-length matches at EOF without a trailing
>    newline would cause it to underflow (suspected by otto@ after
>    looking at FreeBSD code, suspicion confirmed myself).
>  - Advancing after zero-length matches was not quite right:
>    A zero-length match before the final character of a file caused the
>    zero-length match after the final character to get lost if the file
>    ended without a terminating newline (noticed myself by changing
>    echo "$in" to echo -n "$in" in substitute.sh of our own test suite);
>    fixed by reworking the end of the do...while loop, which also
>    makes it easier to understand.
> 
> This now passes our own test suite both as it is and with -n,
> and otto@ checked that the remaining failures with respect to the
> FreeBSD test suite look unrelated to this particular patch.

Here is an extended version of the test suite demonstrating
correctness for input lacking the trailing newline character.

OK for the test suite and/or the code change?
  Ingo


Index: substitute.sh
===================================================================
RCS file: /cvs/src/regress/usr.bin/sed/substitute.sh,v
retrieving revision 1.1
diff -u -r1.1 substitute.sh
--- substitute.sh       23 Jun 2011 11:53:57 -0000      1.1
+++ substitute.sh       23 Jul 2011 20:23:33 -0000
@@ -1,9 +1,47 @@
 #!/bin/sh
+#
+#      $OpenBSD$
+#
+# Copyright (c) 2011 Ingo Schwarze <schwa...@openbsd.org>
+#
+# Permission to use, copy, modify, and distribute this test suite for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# Test suite for the sed(1) -E substitute command, checking
+# the handling of multiple zero-length matches in particular.
 
 # error counter
 err=0
 
-# test function; arguments are:
+# test function for one specific flag; arguments are:
+# input string
+# regular expression to replace
+# substitution flag
+# wanted output
+tf() {
+       in=$1
+       expr=$2
+       flag=$3
+       want=$4
+       out=`echo "$in" | sed -E "s/$expr/x/$flag"`
+       [ "X$out" = "X$want" ] || \
+               echo "$in/$expr/$flag/$want/$out ($((++err)))"
+       [ -z "$in" ] && return
+       out=`echo -n "$in" | sed -E "s/$expr/x/$flag"`
+       [ "X$out" = "X$want" ] || \
+               echo "-n:$in/$expr/$flag/$want/$out ($((++err)))"
+}
+
+# test function for various flags; arguments are:
 # input string
 # regular expression to replace
 # wanted output for /g (global substitution)
@@ -14,23 +52,19 @@
        expr=$2
        want=$3
        shift 3
-       out=`echo "$in" | sed -E "s/$expr/x/g"`
-       [ "X$out" = "X$want" ] || echo "$in/$expr/g/$want/$out ($((++err)))"
+       tf "$in" "$expr" g "$want"
 
        # substitution with specific index
        num=1
        while [ $# -gt 0 ]; do
                want=$1
                shift
-               out=`echo "$in" | sed -E "s/$expr/x/$num"`
-               [ "X$out" = "X$want" ] || \
-                       echo "$in/$expr/$num/$want/$out ($((++err)))"
+               tf "$in" "$expr" "$num" "$want"
                num=$((num+1))
        done
 
        # substitution with excessive index
-       out=`echo "$in" | sed -E "s/$expr/x/$num"`
-       [ "X$out" = "X$in" ] || echo "$in/$expr/$num/=/$out ($((++err)))"
+       tf "$in" "$expr" "$num" "$in"
 }
 
 t '' ^ x x

 ----- 8< ----- schnipp ----- >8 ----- 8< ----- schnapp ----- >8 -----

Index: process.c
===================================================================
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.15
diff -u -p -r1.15 process.c
--- process.c   27 Oct 2009 23:59:43 -0000      1.15
+++ process.c   22 Jul 2011 00:11:49 -0000
@@ -312,7 +312,7 @@ substitute(struct s_command *cp)
 {
        SPACE tspace;
        regex_t *re;
-       size_t re_off, slen;
+       regoff_t slen;
        int n, lastempty;
        char *s;
 
@@ -333,60 +333,54 @@ substitute(struct s_command *cp)
        n = cp->u.s->n;
        lastempty = 1;
 
-       switch (n) {
-       case 0:                                 /* Global */
-               do {
-                       if (lastempty || match[0].rm_so != match[0].rm_eo) {
-                               /* Locate start of replaced string. */
-                               re_off = match[0].rm_so;
-                               /* Copy leading retained string. */
-                               cspace(&SS, s, re_off, APPEND);
-                               /* Add in regular expression. */
-                               regsub(&SS, s, cp->u.s->new);
-                       }
+       do {
+               /* Copy the leading retained string. */
+               if (n <= 1 && match[0].rm_so)
+                       cspace(&SS, s, match[0].rm_so, APPEND);
 
-                       /* Move past this match. */
-                       if (match[0].rm_so != match[0].rm_eo) {
-                               s += match[0].rm_eo;
-                               slen -= match[0].rm_eo;
-                               lastempty = 0;
+               /* Skip zero-length matches right after other matches. */
+               if (lastempty || match[0].rm_so ||
+                   match[0].rm_so != match[0].rm_eo) {
+                       if (n <= 1) {
+                               /* Want this match: append replacement. */
+                               regsub(&SS, s, cp->u.s->new);
+                               if (n == 1)
+                                       n = -1;
                        } else {
-                               if (match[0].rm_so == 0)
-                                       cspace(&SS, s, match[0].rm_so + 1,
-                                           APPEND);
-                               else
-                                       cspace(&SS, s + match[0].rm_so, 1,
-                                           APPEND);
-                               s += match[0].rm_so + 1;
-                               slen -= match[0].rm_so + 1;
-                               lastempty = 1;
+                               /* Want a later match: append original. */
+                               if (match[0].rm_eo)
+                                       cspace(&SS, s, match[0].rm_eo, APPEND);
+                               n--;
                        }
-               } while (slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
-               /* Copy trailing retained string. */
-               if (slen > 0)
-                       cspace(&SS, s, slen, APPEND);
-               break;
-       default:                                /* Nth occurrence */
-               while (--n) {
-                       s += match[0].rm_eo;
-                       slen -= match[0].rm_eo;
-                       if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
-                               return (0);
                }
-               /* FALLTHROUGH */
-       case 1:                                 /* 1st occurrence */
-               /* Locate start of replaced string. */
-               re_off = match[0].rm_so + (s - ps);
-               /* Copy leading retained string. */
-               cspace(&SS, ps, re_off, APPEND);
-               /* Add in regular expression. */
-               regsub(&SS, s, cp->u.s->new);
-               /* Copy trailing retained string. */
+
+               /* Move past this match. */
                s += match[0].rm_eo;
                slen -= match[0].rm_eo;
+
+               /*
+                * After a zero-length match, advance one byte,
+                * and at the end of the line, terminate.
+                */
+               if (match[0].rm_so == match[0].rm_eo) {
+                       if (*s == '\n')
+                               slen = -1;
+                       else
+                               slen--;
+                       cspace(&SS, s++, 1, APPEND);
+                       lastempty = 1;
+               } else
+                       lastempty = 0;
+
+       } while (n >= 0 && slen >= 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
+
+       /* Did not find the requested number of matches. */
+       if (n > 1)
+               return (0);
+
+       /* Copy the trailing retained string. */
+       if (slen > 0)
                cspace(&SS, s, slen, APPEND);
-               break;
-       }
 
        /*
         * Swap the substitute space and the pattern space, and make sure

Reply via email to