All versions of PCRE2 that include PCRE2_MATCH_INVALID_UTF had a bug on
its JIT implementation that results in failure to match for the negative
perl classes, and seems to be easier to replicate when the matching
character is a multibyte one.

Disable that flag and use the original fallback instead.

Alternatively JIT could be disabled instead, but the option selected has
less of an impact on performance.

Carlo
>From 9194c8e9f9ca7315c2e8c25a7986d0690fb31d7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <care...@gmail.com>
Date: Thu, 20 Apr 2023 18:37:20 -0700
Subject: [PATCH] pcre: workaround bug affecting \W or \D

PCRE2 has a bug when using PCRE2_MATCH_INVALID_UTF that would
randomly fail to match patterns using \W or \D.

* NEWS: mention this
* src/pcre2search.c: not use the problematic flag in all broken
  versions of PCRE2
* tests: add new pcre2-utf-bug224 test
---
 NEWS                   |  5 +++++
 src/pcresearch.c       | 23 ++++++++++++++---------
 tests/Makefile.am      |  1 +
 tests/pcre-utf8-bug224 | 31 +++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 9 deletions(-)
 create mode 100755 tests/pcre-utf8-bug224

diff --git a/NEWS b/NEWS
index f16c576..8e371dc 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,11 @@ GNU grep NEWS                                    -*- outline 
-*-
   when running on 32-bit x86 and ARM hosts using glibc 2.34+.
   [bug introduced in grep 3.9]
 
+  grep no longer fails to match patterns with \D or \W when linked to
+  PCRE2 10.34 or newer.
+  [bug introduced in grep 3.8]
+
+
 ** Changes in behavior
 
   grep --version now prints a line describing the version of PCRE2 it uses.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1f82932..6ef0d2e 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -58,6 +58,9 @@ struct pcre_comp
   /* Table, indexed by ! (flag & PCRE2_NOTBOL), of whether the empty
      string matches when that flag is used.  */
   int empty_match[2];
+
+  /* Flags */
+  unsigned binary_safe:1;
 };
 
 /* Memory allocation functions for PCRE.  */
@@ -130,16 +133,11 @@ jit_exec (struct pcre_comp *pc, char const *subject, 
idx_t search_bytes,
     }
 }
 
-/* Return true if E is an error code for bad UTF-8, and if pcre2_match
-   could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF.  */
+/* Return true if E is an error code for bad UTF-8 */
 static bool
 bad_utf8_from_pcre2 (int e)
 {
-#ifdef PCRE2_MATCH_INVALID_UTF
-  return false;
-#else
   return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
-#endif
 }
 
 /* Compile the -P style PATTERN, containing SIZE bytes that are
@@ -157,6 +155,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, 
bool exact)
     = pcre2_general_context_create (private_malloc, private_free, NULL);
   pcre2_compile_context *ccontext = pcre2_compile_context_create (gcontext);
 
+  pc->binary_safe = false;
   if (localeinfo.multibyte)
     {
       uint32_t unicode;
@@ -181,8 +180,14 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, 
bool exact)
       flags |= PCRE2_NEVER_BACKSLASH_C;
 #endif
 #ifdef PCRE2_MATCH_INVALID_UTF
-      /* Consider invalid UTF-8 as a barrier, instead of error.  */
-      flags |= PCRE2_MATCH_INVALID_UTF;
+      /* workaround PCRE2 bug
+         https://github.com/PCRE2Project/pcre2/issues/224 */
+#if PCRE2_MAJOR == 10 && PCRE2_MINOR <= 42
+      pc->binary_safe = !strstr (pattern, "\\D") && !strstr (pattern, "\\W");
+      if (pc->binary_safe)
+        /* Consider invalid UTF-8 as a barrier, instead of error.  */
+        flags |= PCRE2_MATCH_INVALID_UTF;
+#endif
 #endif
     }
 
@@ -313,7 +318,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
 
           e = jit_exec (pc, subject, line_end - subject,
                         search_offset, options);
-          if (!bad_utf8_from_pcre2 (e))
+          if (pc->binary_safe || !bad_utf8_from_pcre2 (e))
             break;
 
           idx_t valid_bytes = pcre2_get_startchar (pc->data);
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7718f24..9b4422e 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -155,6 +155,7 @@ TESTS =                                             \
   pcre-jitstack                                        \
   pcre-o                                       \
   pcre-utf8                                    \
+  pcre-utf8-bug224                             \
   pcre-utf8-w                                  \
   pcre-w                                       \
   pcre-wx-backref                              \
diff --git a/tests/pcre-utf8-bug224 b/tests/pcre-utf8-bug224
new file mode 100755
index 0000000..739e7b5
--- /dev/null
+++ b/tests/pcre-utf8-bug224
@@ -0,0 +1,31 @@
+#!/bin/sh
+# Ensure \D and \W matches multibyte characters in UTF mode
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+echo . | grep -qP '(*UTF).' 2>/dev/null \
+  || skip_ 'PCRE unicode support is compiled out'
+
+fail=0
+
+# 'ñ' (U+00F1)
+printf '\302\221\n' > in || framework_failure_
+grep -P '\D' in > out || fail=1
+compare in out || fail=1
+
+# “𝄞” (U+1D11E)
+printf '\360\235\204\236\n' > in || framework_failure_
+grep -P '\W' in > out || fail=1
+compare in out || fail=1
+
+Exit $fail
-- 
2.39.2 (Apple Git-143)

Reply via email to