In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/e6a3850e182c1d286b5e83a9f9917b7f0ddc4178?hp=0cd39adf63bbc345adf2626353dca017ce526563>

- Log -----------------------------------------------------------------
commit e6a3850e182c1d286b5e83a9f9917b7f0ddc4178
Author: Karl Williamson <[email protected]>
Date:   Sun Sep 16 10:51:01 2012 -0600

    regexec.c: Avoid unnecessary calculation
    
    When matching an EXACT node and the target string and the pattern differ
    in utf8ness, the code prior to this patch calculated each code point from
    the utf8 version in order to do the EXACT comparision with the non-utf8
    version.  But it is unnecessary to do this full calculation.  Code
    points above Latin1 cannot possibly match a non-UTF8 string; there is no
    need to know precisely which code point it is in order to know that it
    won't match.  Similarly, invariant code points can be checked directly;
    and the Latin1 variants can be downgraded for comparison by a simple
    macro.

M       regexec.c

commit 7b4252f49b5cd844c918f7f2cb81e064d4377ba9
Author: Karl Williamson <[email protected]>
Date:   Sun Sep 16 10:58:26 2012 -0600

    utf8.h: Add macro to test if UTF8 code point isn't Latin1

M       utf8.h
M       utfebcdic.h
-----------------------------------------------------------------------

Summary of changes:
 regexec.c   |   59 ++++++++++++++++++++++++++++++++++++++++++-----------------
 utf8.h      |    1 +
 utfebcdic.h |    1 +
 3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/regexec.c b/regexec.c
index 322e596..f207cda 100644
--- a/regexec.c
+++ b/regexec.c
@@ -3756,31 +3756,56 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char 
*startpos, regnode *prog)
                const char * const e = s + ln;
 
                if (utf8_target) {
-                   /* The target is utf8, the pattern is not utf8. */
+                    /* The target is utf8, the pattern is not utf8.
+                     * Above-Latin1 code points can't match the pattern;
+                     * invariants match exactly, and the other Latin1 ones need
+                     * to be downgraded to a single byte in order to do the
+                     * comparison.  (If we could be confident that the target
+                     * is not malformed, this could be refactored to have fewer
+                     * tests by just assuming that if the first bytes match, it
+                     * is an invariant, but there are tests in the test suite
+                     * dealing with (??{...}) which violate this) */
                    while (s < e) {
-                       STRLEN ulen;
                        if (l >= PL_regeol)
                             sayNO;
-                       if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
-                                           uniflags))
-                            sayNO;
-                       l += ulen;
-                       s ++;
+                        if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
+                            sayNO;
+                        }
+                        if (UTF8_IS_INVARIANT(*(U8*)l)) {
+                           if (*l != *s) {
+                                sayNO;
+                            }
+                            l++;
+                        }
+                        else {
+                            if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) 
{
+                                sayNO;
+                            }
+                            l += 2;
+                        }
+                       s++;
                    }
                }
                else {
                    /* The target is not utf8, the pattern is utf8. */
                    while (s < e) {
-                       STRLEN ulen;
-                       if (l >= PL_regeol)
-                           sayNO;
-                       if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
-                                          uniflags))
-                           sayNO;
-                       s += ulen;
-                       l ++;
+                        if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
+                        {
+                            sayNO;
+                        }
+                        if (UTF8_IS_INVARIANT(*(U8*)s)) {
+                           if (*s != *l) {
+                                sayNO;
+                            }
+                            s++;
+                        }
+                        else {
+                            if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) 
{
+                                sayNO;
+                            }
+                            s += 2;
+                        }
+                       l++;
                    }
                }
                locinput = l;
diff --git a/utf8.h b/utf8.h
index bf8251a..30537a8 100644
--- a/utf8.h
+++ b/utf8.h
@@ -170,6 +170,7 @@ Perl's extended UTF-8 means we can have start bytes up to 
FF.
 
 /* Masking with 0xfe allows low bit to be 0 or 1; thus this matches 0xc[23] */
 #define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfe) == 0xc2)
+#define UTF8_IS_ABOVE_LATIN1(c)        ((U8)(c) >= 0xc4)
 
 #define UTF_START_MARK(len) (((len) >  7) ? 0xFF : (0xFE << (7-(len))))
 
diff --git a/utfebcdic.h b/utfebcdic.h
index 3eba83d..1662103 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -570,6 +570,7 @@ END_EXTERN_C
 #define UTF8_IS_CONTINUATION(c)                ((NATIVE_TO_UTF(c) & 0xE0) == 
0xA0)
 #define UTF8_IS_CONTINUED(c)           (NATIVE_TO_UTF(c) >= 0xA0)
 #define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && 
NATIVE_TO_UTF(c) <= 0xC7)
+#define UTF8_IS_ABOVE_LATIN1(c)        (NATIVE_TO_I8(c) >= 0xC8)
 
 #define UTF_START_MARK(len) (((len) >  7) ? 0xFF : ((U8)(0xFE << (7-(len)))))
 #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2)))

--
Perl5 Master Repository

Reply via email to