git: d96ce6d00070 - stable/13 - regex: mixed sets are misidentified as singletons

Kyle Evans Wed, 25 Sep 2024 13:43:46 -0700

The branch stable/13 has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=d96ce6d000703f3f57d9214b741e16cc7741d77e


commit d96ce6d000703f3f57d9214b741e16cc7741d77e
Author:     Bill Sommerfeld <sommerf...@hamachi.org>
AuthorDate: 2023-12-21 03:46:14 +0000
Commit:     Kyle Evans <kev...@freebsd.org>
CommitDate: 2024-09-25 20:42:28 +0000

    regex: mixed sets are misidentified as singletons
    
    Fix "singleton" function used by regcomp() to turn character set matches
    into exact character matches if a character set has exactly one
    element.
    
    The underlying cset representation is complex; most critically it
    records "small" characters (codepoint less than either 128
    or 256 depending on locale) in a bit vector, and "wide" characters in
    a secondary array.
    
    Unfortunately the "singleton" function used to identify singleton sets
    treated a cset as a singleton if either the "small" or the "wide" sets
    had exactly one element (it would then ignore the other set).
    
    The easiest way to demonstrate this bug:
    
            $ export LANG=C.UTF-8
            $ echo 'a' | grep '[abà]'
    
    It should match (and print "a") but instead it doesn't match because the
    single accented character in the set is misinterpreted as a singleton.
    
    PR:             281710
    Reviewed by:    kevans, yuripv
    Obtained from:  illumos
    
    (cherry picked from commit 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
---
 lib/libc/regex/regcomp.c          | 25 ++++++++++++++++++-----
 lib/libc/tests/regex/multibyte.sh | 43 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index 42fa1b99e58e..2897052fa0f8 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -1592,17 +1592,32 @@ singleton(cset *cs)
 {
        wint_t i, s, n;
 
+       /* Exclude the complicated cases we don't want to deal with */
+       if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
+               return (OUT);
+
+       if (cs->nwides > 1)
+               return (OUT);
+
+       /* Count the number of characters present in the bitmap */
        for (i = n = 0; i < NC; i++)
                if (CHIN(cs, i)) {
                        n++;
                        s = i;
                }
-       if (n == 1)
-               return (s);
-       if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
-           cs->icase == 0)
+
+       if (n > 1)
+               return (OUT);
+
+       if (n == 1) {
+               if (cs->nwides == 0)
+                       return (s);
+               else
+                       return (OUT);
+       }
+       if (cs->nwides == 1)
                return (cs->wides[0]);
-       /* Don't bother handling the other cases. */
+
        return (OUT);
 }
 
diff --git a/lib/libc/tests/regex/multibyte.sh b/lib/libc/tests/regex/multibyte.sh
index a736352bf0a2..18323f500a2b 100755
--- a/lib/libc/tests/regex/multibyte.sh
+++ b/lib/libc/tests/regex/multibyte.sh
@@ -1,4 +1,3 @@
-
 atf_test_case bmpat
 bmpat_head()
 {
@@ -45,8 +44,50 @@ icase_body()
        echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
 }
 
+atf_test_case mbset cleanup
+mbset_head()
+{
+       atf_set "descr" "Check multibyte sets matching"
+}
+mbset_body()
+{
+       export LC_CTYPE="C.UTF-8"
+
+       # This involved an erroneously implemented optimization which reduces
+       # single-element sets to an exact match with a single codepoint.
+       # Match sets record small-codepoint characters in a bitmap and
+       # large-codepoint characters in an array; the optimization would falsely
+       # trigger if either the bitmap or the array was a singleton, ignoring
+       # the members of the other side of the set.
+       #
+       # To exercise this, we construct sets which have one member of one side
+       # and one or more of the other, and verify that all members can be
+       # found.
+       printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+       printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+       printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+       printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+       printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+       printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+       printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+       printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+       printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+       printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+       printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+}
+mbset_cleanup()
+{
+       rm -f mbset
+}
+
 atf_init_test_cases()
 {
        atf_add_test_case bmpat
        atf_add_test_case icase
+       atf_add_test_case mbset
 }

git: d96ce6d00070 - stable/13 - regex: mixed sets are misidentified as singletons

Reply via email to