Author: kevans
Date: Sat Dec  5 03:16:05 2020
New Revision: 368358
URL: https://svnweb.freebsd.org/changeset/base/368358

Log:
  libregex: implement \b and \B (word boundary, not word boundary)
  
  This is the last of the needed GNU expressions before we can unleash bsdgrep
  by default. \b is effectively a direction-agnostic equivalent of \< and \>
  (it matches at either the start or the end of a word), while \B matches at
  every position that is not a transition from non-word character -> word
  character or word character -> non-word character.

Modified:
  head/contrib/netbsd-tests/lib/libc/regex/data/meta.in
  head/lib/libc/regex/engine.c
  head/lib/libc/regex/regcomp.c
  head/lib/libc/regex/regex2.h
  head/lib/libregex/tests/gnuext.in

Modified: head/contrib/netbsd-tests/lib/libc/regex/data/meta.in
==============================================================================
--- head/contrib/netbsd-tests/lib/libc/regex/data/meta.in       Sat Dec  5 03:13:47 2020        (r368357)
+++ head/contrib/netbsd-tests/lib/libc/regex/data/meta.in       Sat Dec  5 03:16:05 2020        (r368358)
@@ -5,7 +5,7 @@ a\*c            &       a*c     a*c
 a\\b           &       a\b     a\b
 a\\\*b         &       a\*b    a\*b
 # Begin FreeBSD
-a\bc           &C      EESCAPE
+a\bc           &       abc
 # End FreeBSD
 a\             &C      EESCAPE
 a\\bc          &       a\bc    a\bc

Modified: head/lib/libc/regex/engine.c
==============================================================================
--- head/lib/libc/regex/engine.c        Sat Dec  5 03:13:47 2020        (r368357)
+++ head/lib/libc/regex/engine.c        Sat Dec  5 03:16:05 2020        (r368358)
@@ -118,6 +118,7 @@ static states step(struct re_guts *g, sopno start, sop
 #define        BOW     (BOL-4)
 #define        EOW     (BOL-5)
 #define        BADCHAR (BOL-6)
+#define        NWBND   (BOL-7)
 #define        NONCHAR(c)      ((c) <= OUT)
 /* sflags */
 #define        SBOS    0x0001
@@ -463,6 +464,8 @@ dissect(struct match *m,
                case OEOW:
                case OBOS:
                case OEOS:
+               case OWBND:
+               case ONWBND:
                        break;
                case OANY:
                case OANYOF:
@@ -691,6 +694,21 @@ backref(struct match *m,
                        else
                                return(NULL);
                        break;
+               case OWBND:
+                       if (ISBOW(m, sp) || ISEOW(m, sp))
+                               { /* yes */ }
+                       else
+                               return(NULL);
+                       break;
+               case ONWBND:
+                       if (((sp == m->beginp) && !ISWORD(*sp)) ||
+                           (sp == m->endp && !ISWORD(*(sp - 1))))
+                               { /* yes, beginning/end of subject */ }
+                       else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
+                               { /* yes, same word-ness on both sides */ }
+                       else
+                               return(NULL);
+                       break;
                case OBOW:
                        if (ISBOW(m, sp))
                                { /* yes */ }
@@ -916,6 +934,17 @@ walk(struct match *m, const char *start, const char *s
                        st = step(m->g, startst, stopst, st, flagch, st, sflags);
                        SP("sboweow", st, c);
                }
+               if (lastc != OUT && c != OUT &&
+                   ISWORD(lastc) == ISWORD(c)) {
+                       flagch = NWBND;
+               } else if ((lastc == OUT && !ISWORD(c)) ||
+                   (c == OUT && !ISWORD(lastc))) {
+                       flagch = NWBND;
+               }
+               if (flagch == NWBND) {
+                       st = step(m->g, startst, stopst, st, flagch, st, sflags);
+                       SP("snwbnd", st, c);
+               }
 
                /* are we done? */
                if (ISSET(st, stopst)) {
@@ -1016,6 +1045,14 @@ step(struct re_guts *g,
                case OEOW:
                        if (ch == EOW)
                                FWD(aft, bef, 1);
+                       break;
+               case OWBND:
+                       if (ch == BOW || ch == EOW)
+                               FWD(aft, bef, 1);
+                       break;
+               case ONWBND:
+                       if (ch == NWBND)
+                               FWD(aft, aft, 1);
                        break;
                case OANY:
                        if (!NONCHAR(ch))

Modified: head/lib/libc/regex/regcomp.c
==============================================================================
--- head/lib/libc/regex/regcomp.c       Sat Dec  5 03:13:47 2020        (r368357)
+++ head/lib/libc/regex/regcomp.c       Sat Dec  5 03:16:05 2020        (r368358)
@@ -486,6 +486,12 @@ p_ere_exp(struct parse *p, struct branchc *bc)
                        case '\'':
                                EMIT(OEOS, 0);
                                break;
+                       case 'B':
+                               EMIT(ONWBND, 0);
+                               break;
+                       case 'b':
+                               EMIT(OWBND, 0);
+                               break;
                        case 'W':
                        case 'w':
                        case 'S':
@@ -845,6 +851,12 @@ p_simp_re(struct parse *p, struct branchc *bc)
                        case BACKSL|'\'':
                                EMIT(OEOS, 0);
                                break;
+                       case BACKSL|'B':
+                               EMIT(ONWBND, 0);
+                               break;
+                       case BACKSL|'b':
+                               EMIT(OWBND, 0);
+                               break;
                        case BACKSL|'W':
                        case BACKSL|'w':
                        case BACKSL|'S':
@@ -1892,6 +1904,8 @@ findmust(struct parse *p, struct re_guts *g)
                case OEOL:
                case OBOS:
                case OEOS:
+               case OWBND:
+               case ONWBND:
                case O_QUEST:
                case O_CH:
                case OEND:
@@ -2043,6 +2057,8 @@ altoffset(sop *scan, int offset)
                        try++;
                case OBOW:
                case OEOW:
+               case OWBND:
+               case ONWBND:
                case OLPAREN:
                case ORPAREN:
                case OOR2:

Modified: head/lib/libc/regex/regex2.h
==============================================================================
--- head/lib/libc/regex/regex2.h        Sat Dec  5 03:13:47 2020        (r368357)
+++ head/lib/libc/regex/regex2.h        Sat Dec  5 03:16:05 2020        (r368358)
@@ -106,6 +106,8 @@ typedef unsigned long sopno;
 #define        OEOW    (20L<<OPSHIFT)  /* end word     -                       */
 #define        OBOS    (21L<<OPSHIFT)  /* begin subj.  -                       */
 #define        OEOS    (22L<<OPSHIFT)  /* end subj.    -                       */
+#define        OWBND   (23L<<OPSHIFT)  /* word bound   -                       */
+#define        ONWBND  (24L<<OPSHIFT)  /* not bound    -                       */
 
 /*
  * Structures for [] character-set representation.

Modified: head/lib/libregex/tests/gnuext.in
==============================================================================
--- head/lib/libregex/tests/gnuext.in   Sat Dec  5 03:13:47 2020        (r368357)
+++ head/lib/libregex/tests/gnuext.in   Sat Dec  5 03:16:05 2020        (r368358)
@@ -17,14 +17,12 @@ a\|b\|c     b       abc     a
 \s\+   b       aSNTb   SNT
 # Word boundaries (\b, \B, \<, \>, \`, \')
 # (is/not boundary, start/end word, start/end subject string)
-# Most of these are disabled for the moment, and will be re-enabled as
-# we become feature complete.
-#\babc\b       &       <abc>   abc
+\babc\b        &       <abc>   abc
 \<abc\> &      <abc>   abc
-#\Babc\B       &       abc
-#\B[abc]\B     &       <abc>   b
-#\B[abc]+      -       <abc>   bc
-#\B[abc]\+     b       <abc>   bc
+\Babc\B        &       abc
+\B[abc]\B      &       <abc>   b
+\B[abc]+       -       <abc>   bc
+\B[abc]\+      b       <abc>   bc
 \`abc  &       abc     abc
 abc\'  &       abc     abc
 \`abc\'        &       abc     abc
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to