Author: yuripv
Date: Fri Nov 23 15:49:18 2018
New Revision: 340835
URL: https://svnweb.freebsd.org/changeset/base/340835

Log:
  regexec: fix processing multibyte strings.
  
  Matcher function incorrectly assumed that moffset that we get from
  findmust is in bytes. Fix this by introducing a stepback function,
  taking short path if MB_CUR_MAX is 1, and going back byte-by-byte,
  checking if we have a legal character sequence otherwise.
  
  PR:           153502
  Reviewed by:  pfg, kevans
  Approved by:  kib (mentor, implicit)
  Differential revision:        https://reviews.freebsd.org/D18297

Added:
  head/lib/libc/tests/regex/multibyte.sh   (contents, props changed)
Modified:
  head/lib/libc/regex/engine.c
  head/lib/libc/tests/regex/Makefile

Modified: head/lib/libc/regex/engine.c
==============================================================================
--- head/lib/libc/regex/engine.c        Fri Nov 23 13:50:18 2018        (r340834)
+++ head/lib/libc/regex/engine.c        Fri Nov 23 15:49:18 2018        (r340835)
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
  */
 
 #ifdef SNAMES
+#define        stepback sstepback
 #define        matcher smatcher
 #define        walk    swalk
 #define        dissect sdissect
@@ -58,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #define        match   smat
 #endif
 #ifdef LNAMES
+#define        stepback lstepback
 #define        matcher lmatcher
 #define        walk    lwalk
 #define        dissect ldissect
@@ -68,6 +70,7 @@ __FBSDID("$FreeBSD$");
 #define        match   lmat
 #endif
 #ifdef MNAMES
+#define        stepback mstepback
 #define        matcher mmatcher
 #define        walk    mwalk
 #define        dissect mdissect
@@ -142,6 +145,39 @@ static const char *pchar(int ch);
 #endif
 
 /*
+ * Step back nchar characters from cur within the multibyte string that begins
+ * at start.  Returns the position reached, or NULL if none is produced.
+ */
+static const char *
+stepback(const char *start, const char *cur, int nchar)
+{
+       const char *ret;        /* candidate position, walks back from cur */
+       int wc, mbc;            /* characters left to step; byte length being tried */
+       mbstate_t mbs;
+       size_t clen;
+
+       if (MB_CUR_MAX == 1)    /* short path: characters are single bytes */
+               return ((cur - nchar) > start ? cur - nchar : NULL);    /* NULL unless strictly past start */
+
+       ret = cur;
+       for (wc = nchar; wc > 0; wc--) {        /* one iteration per character */
+               for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {       /* try the last 1..MB_CUR_MAX bytes */
+                       if ((ret - mbc) < start)
+                               return (NULL);  /* would go before start */
+                       memset(&mbs, 0, sizeof(mbs));   /* zeroed conversion state for every attempt */
+                       clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
+                       if (clen != (size_t)-1 && clen != (size_t)-2)   /* not an encoding error, not incomplete */
+                               break;
+               }
+               if (mbc > MB_CUR_MAX)
+                       return (NULL);  /* no length up to MB_CUR_MAX was accepted */
+               ret -= mbc;     /* move back over the accepted bytes */
+       }
+
+       return (ret);
+}
+
+/*
  - matcher - the actual matching engine
  == static int matcher(struct re_guts *g, const char *string, \
  ==    size_t nmatch, regmatch_t pmatch[], int eflags);
@@ -244,9 +280,14 @@ matcher(struct re_guts *g,
        ZAPSTATE(&m->mbs);
 
        /* Adjust start according to moffset, to speed things up */
-       if (dp != NULL && g->moffset > -1)
-               start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
+       if (dp != NULL && g->moffset > -1) {
+               const char *nstart;
 
+               nstart = stepback(start, dp, g->moffset);
+               if (nstart != NULL)
+                       start = nstart;
+       }
+
        SP("mloop", m->st, *start);
 
        /* this loop does only one repetition except for backrefs */
@@ -1083,6 +1124,7 @@ pchar(int ch)
 #endif
 #endif
 
+#undef stepback
 #undef matcher
 #undef walk
 #undef dissect

Modified: head/lib/libc/tests/regex/Makefile
==============================================================================
--- head/lib/libc/tests/regex/Makefile  Fri Nov 23 13:50:18 2018        (r340834)
+++ head/lib/libc/tests/regex/Makefile  Fri Nov 23 15:49:18 2018        (r340835)
@@ -2,6 +2,9 @@
 
 PACKAGE=       tests
 
+# local test cases
+ATF_TESTS_SH+= multibyte
+
 .include "Makefile.inc"
 .include "${.CURDIR:H}/Makefile.netbsd-tests"
 .include <bsd.test.mk>

Added: head/lib/libc/tests/regex/multibyte.sh
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/lib/libc/tests/regex/multibyte.sh      Fri Nov 23 15:49:18 2018        (r340835)
@@ -0,0 +1,35 @@
+# $FreeBSD$
+
+atf_test_case multibyte        # declares the case; multibyte_head and multibyte_body follow
+multibyte_head()
+{
+       atf_set "descr" "Check matching multibyte characters (PR153502)"
+}
+multibyte_body()
+{
+       export LC_CTYPE="C.UTF-8"       # the sed invocations below run with a UTF-8 LC_CTYPE
+
+       printf 'é' | atf_check -o "inline:é" \
+           sed -ne '/^.$/p'    # one multibyte character, anchored at both ends
+       printf 'éé' | atf_check -o "inline:éé" \
+           sed -ne '/^..$/p'   # two in a row, anchored at both ends
+       printf 'aéa' | atf_check -o "inline:aéa" \
+           sed -ne '/a.a/p'    # é between single-byte characters
+       printf 'aéa'| atf_check -o "inline:aéa" \
+           sed -ne '/a.*a/p'   # same input, repetition instead of a single .
+       printf 'aaéaa' | atf_check -o "inline:aaéaa" \
+           sed -ne '/aa.aa/p'  # two-character literals on both sides
+       printf 'aéaéa' | atf_check -o "inline:aéaéa" \
+           sed -ne '/a.a.a/p'  # two é, each matched by its own .
+       printf 'éa' | atf_check -o "inline:éa" \
+           sed -ne '/.a/p'     # é at the start of the input
+       printf 'aéaa' | atf_check -o "inline:aéaa" \
+           sed -ne '/a.aa/p'   # literal after . is longer than the one before it
+       printf 'éaé' | atf_check -o "inline:éaé" \
+           sed -ne '/.a./p'    # é at both ends of the input
+}
+
+atf_init_test_cases()
+{
+       atf_add_test_case multibyte     # registers the case declared above
+}
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to