Commit: patch 9.1.0011: regexp cannot match combining chars in collection

Christian Brabandt Thu, 04 Jan 2024 14:00:11 -0800

patch 9.1.0011: regexp cannot match combining chars in collection

Commit: 
https://github.com/vim/vim/commit/d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf
Author: Christian Brabandt <[email protected]>
Date:   Thu Jan 4 22:54:08 2024 +0100


    patch 9.1.0011: regexp cannot match combining chars in collection
    
    Problem:  regexp cannot match combining chars in collection
    Solution: Check for combining characters in regex collections for the
              NFA and BT Regex Engine
    
    Also, while at it, make debug mode work again.
    
    fixes #10286
    closes: #12871
    
    Signed-off-by: Christian Brabandt <[email protected]>

diff --git a/src/regexp.c b/src/regexp.c
index a64672856..c3bc4966c 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2686,7 +2686,10 @@ static regengine_T bt_regengine =
     bt_regcomp,
     bt_regfree,
     bt_regexec_nl,
-    bt_regexec_multi,
+    bt_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 #include "regexp_nfa.c"
@@ -2696,7 +2699,10 @@ static regengine_T nfa_regengine =
     nfa_regcomp,
     nfa_regfree,
     nfa_regexec_nl,
-    nfa_regexec_multi,
+    nfa_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 // Which regexp engine to use? Needed for vim_regcomp().
diff --git a/src/regexp.h b/src/regexp.h
index d6c8f48c7..1ff2e1b6e 100644
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -178,7 +178,9 @@ struct regengine
     int                (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
     // bt_regexec_mult or nfa_regexec_mult
     long       (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
-    //char_u   *expr;
+#ifdef DEBUG
+    char_u     *expr;
+#endif
 };
 
 // Flags used by vim_regsub() and vim_regsub_both()
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 522cf37e2..198946e0d 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3743,13 +3743,38 @@ regmatch(
 
          case ANYOF:
          case ANYBUT:
-           if (c == NUL)
-               status = RA_NOMATCH;
-           else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
-               status = RA_NOMATCH;
-           else
-               ADVANCE_REGINPUT();
-           break;
+           {
+               char_u  *q = OPERAND(scan);
+
+               if (c == NUL)
+                   status = RA_NOMATCH;
+               else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
+                   status = RA_NOMATCH;
+               else
+               {
+                   // Check following combining characters
+                   int len = 0;
+                   int i;
+
+                   if (enc_utf8)
+                       len = utfc_ptr2len(q) - utf_ptr2len(q);
+
+                   MB_CPTR_ADV(rex.input);
+                   MB_CPTR_ADV(q);
+
+                   if (!enc_utf8 || len == 0)
+                       break;
+
+                   for (i = 0; i < len; ++i)
+                       if (q[i] != rex.input[i])
+                       {
+                           status = RA_NOMATCH;
+                           break;
+                       }
+                   rex.input += len;
+               }
+               break;
+           }
 
          case MULTIBYTECODE:
            if (has_mbyte)
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index d724d527b..ff5434890 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ collection:
            endp = skip_anyof(p);
            if (*endp == ']')
            {
+               int plen;
                /*
                 * Try to reverse engineer character classes. For example,
                 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2033,13 +2034,43 @@ collection:
                        else
                        {
                            if (got_coll_char == TRUE && startc == 0)
+                           {
                                EMIT(0x0a);
+                               EMIT(NFA_CONCAT);
+                           }
                            else
+                           {
                                EMIT(startc);
-                           EMIT(NFA_CONCAT);
+                               if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+                               {
+                                   EMIT(NFA_CONCAT);
+                               }
+                           }
                        }
                    }
 
+                   if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+                   {
+                       int i = utf_ptr2len(regparse);
+
+                       c = utf_ptr2char(regparse + i);
+
+                       // Add composing characters
+                       for (;;)
+                       {
+                           if (c == 0)
+                               // \x00 is translated to \x0a, start at \x01.
+                               EMIT(1);
+                           else
+                               EMIT(c);
+                           EMIT(NFA_CONCAT);
+                           if ((i += utf_char2len(c)) >= plen)
+                               break;
+                           c = utf_ptr2char(regparse + i);
+                       }
+                       EMIT(NFA_COMPOSING);
+                       EMIT(NFA_CONCAT);
+                   }
                    MB_PTR_ADV(regparse);
                } // while (p < endp)
 
@@ -6418,6 +6449,84 @@ nfa_regmatch(
                result_if_matched = (t->state->c == NFA_START_COLL);
                for (;;)
                {
+                   if (state->c == NFA_COMPOSING)
+                   {
+                       int         mc = curc;
+                       int         len = 0;
+                       nfa_state_T *end;
+                       nfa_state_T *sta;
+                       int         cchars[MAX_MCO];
+                       int         ccount = 0;
+                       int         j;
+
+                       sta = t->state->out->out;
+                       len = 0;
+                       if (utf_iscomposing(sta->c))
+                       {
+                           // Only match composing character(s), ignore base
+                           // character.  Used for ".{composing}" and "{composing}"
+                           // (no preceding character).
+                           len += mb_char2len(mc);
+                       }
+                       if (rex.reg_icombine && len == 0)
+                       {
+                           // If \Z was present, then ignore composing characters.
+                           // When ignoring the base character this always matches.
+                           if (sta->c != curc)
+                               result = FAIL;
+                           else
+                               result = OK;
+                           while (sta->c != NFA_END_COMPOSING)
+                               sta = sta->out;
+                       }
+                       // Check base character matches first, unless ignored.
+                       else if (len > 0 || mc == sta->c)
+//                     if (len > 0 || mc == sta->c)
+                       {
+                           if (len == 0)
+                           {
+                               len += mb_char2len(mc);
+                               sta = sta->out;
+                           }
+
+                           // We don't care about the order of composing characters.
+                           // Get them into cchars[] first.
+                           while (len < clen)
+                           {
+                               mc = mb_ptr2char(rex.input + len);
+                               cchars[ccount++] = mc;
+                               len += mb_char2len(mc);
+                               if (ccount == MAX_MCO)
+                                   break;
+                           }
+
+                           // Check that each composing char in the pattern matches a
+                           // composing char in the text.  We do not check if all
+                           // composing chars are matched.
+                           result = OK;
+                           while (sta->c != NFA_END_COMPOSING)
+                           {
+                               for (j = 0; j < ccount; ++j)
+                                   if (cchars[j] == sta->c)
+                                       break;
+                               if (j == ccount)
+                               {
+                                   result = FAIL;
+                                   break;
+                               }
+                               sta = sta->out;
+                           }
+                       }
+                       else
+                           result = FAIL;
+
+                       if (t->state->out->out1->c == NFA_END_COMPOSING)
+                       {
+                           end = t->state->out->out1;
+                           ADD_STATE_IF_MATCH(end);
+                       }
+                       break;
+                   }
                    if (state->c == NFA_END_COLL)
                    {
                        result = !result_if_matched;
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index b591aedbb..6669dee57 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -575,5 +575,16 @@ func Test_match_too_complicated()
   set regexpengine=0
 endfunc
 
+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc
 
 " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c
index c31fbf635..d45181d9d 100644
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    11,
 /**/
     10,
 /**/

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/vim_dev/E1rLVky-00D6u0-8S%40256bit.org.

Commit: patch 9.1.0011: regexp cannot match combining chars in collection

Raspunde prin e-mail lui