In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/2b9519f04ca8ec89b5551bc43e2f67de293a9679?hp=52e18d8c188e949821d841ea625161e5c0430fa4>

- Log -----------------------------------------------------------------
commit 2b9519f04ca8ec89b5551bc43e2f67de293a9679
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 22 12:55:31 2018 -0700

    Use dfa to speed up translating UTF-8 into code point
    
    This dfa is available from the internet has the reputation of being the
    fastest general translator.  This commit changes to use it at the
    beginning of our translator, modifying it slightly to accept surrogates
    and all 4-byte Perl-extended.  If necessary, it drops down into our
    translator to handle errors and warnings and Perl extended.
    
    It shows some improvement over our base translation:
    
    Key:
        Ir   Instruction read
        Dr   Data read
        Dw   Data write
        COND conditional branches
        IND  indirect branches
        _m   branch predict miss
        -    indeterminate percentage (e.g. 1/0)
    
    The numbers represent raw counts per loop iteration.
    
    unicode::utf8n_to_uvchr_0x007f
    ord(X)
    
           blead   dfa Ratio %
           ----- ----- -------
        Ir 359.0 359.0   100.0
        Dr 111.0 111.0   100.0
        Dw  64.0  64.0   100.0
      COND  42.0  42.0   100.0
       IND   5.0   5.0   100.0
    
    COND_m   2.0   0.0     Inf
     IND_m   5.0   5.0   100.0
    
    unicode::utf8n_to_uvchr_0x07ff
    ord(X)
    
           blead   dfa Ratio %
           ----- ----- -------
        Ir 478.0 467.0   102.4
        Dr 132.0 133.0    99.2
        Dw  79.0  78.0   101.3
      COND  63.0  57.0   110.5
       IND   5.0   5.0   100.0
    
    COND_m   1.0   0.0     Inf
     IND_m   5.0   5.0   100.0
    
    unicode::utf8n_to_uvchr_0xfffd
    ord(X)
    
           blead   dfa Ratio %
           ----- ----- -------
        Ir 494.0 486.0   101.6
        Dr 134.0 136.0    98.5
        Dw  79.0  78.0   101.3
      COND  67.0  61.0   109.8
       IND   5.0   5.0   100.0
    
    COND_m   2.0   0.0     Inf
     IND_m   5.0   5.0   100.0
    
    unicode::utf8n_to_uvchr_0x1fffd
    ord(X)
    
           blead   dfa Ratio %
           ----- ----- -------
        Ir 508.0 505.0   100.6
        Dr 135.0 139.0    97.1
        Dw  79.0  78.0   101.3
      COND  70.0  65.0   107.7
       IND   5.0   5.0   100.0
    
    COND_m   2.0   1.0   200.0
     IND_m   5.0   5.0   100.0
    
    unicode::utf8n_to_uvchr_0x10fffd
    ord(X)
    
           blead   dfa Ratio %
           ----- ----- -------
        Ir 508.0 505.0   100.6
        Dr 135.0 139.0    97.1
        Dw  79.0  78.0   101.3
      COND  70.0  65.0   107.7
       IND   5.0   5.0   100.0
    
    COND_m   2.0   1.0   200.0
     IND_m   5.0   5.0   100.0
    
    Each code point represents an extra byte required in its UTF-8
    representation from the previous one.

-----------------------------------------------------------------------

Summary of changes:
 utf8.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 149 insertions(+), 8 deletions(-)

diff --git a/utf8.c b/utf8.c
index 34e47f3389..21664d5cf8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1159,6 +1159,102 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
 }
 
+/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
+ * which requires this copyright notice */
+
+/* Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#if 0
+static U8 utf8d_C9[] = {
+  /* The first part of the table maps bytes to character classes that
+   * to reduce the size of the transition table and create bitmasks. */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
+
+  /* The second part is a transition table that maps a combination
+   * of a state of the automaton and a character class to a state. */
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12
+};
+
+#endif
+
+#ifndef EBCDIC
+
+/* This is a version of the above table customized for Perl that doesn't
+ * exclude surrogates and accepts start bytes up through F7 (representing
+ * 2**21 - 1). */
+static U8 dfa_tab_for_perl[] = {
+    /* The first part of the table maps bytes to character classes to reduce
+     * the size of the transition table and create bitmasks. */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
+  10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
+
+  /* The second part is a transition table that maps a combination
+   * of a state of the automaton and a character class to a state. */
+   0,12,24,36,96,12,12,12,12,12,48,72, 
12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 
12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
+  12,12,12,12,12,12,12,24,12,12,12,12, 
12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
+  12,12,12,12,12,12,12,36,12,36,12,12, 
12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
+  12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
+
+ /* The customization was to repurpose the surrogates type '4' to instead be
+  * for start bytes F1-F7.  Types 5 and 6 are now unused, and their entries in
+  * the transition part of the table are set to 12, so are illegal.
+  *
+  * To do higher code points would require expansion and some rearrangement of
+  * the table.  The type '1' entries for continuation bytes 80-8f would have to
+  * be split into several types, because they aren't treated uniformly for
+  * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
+  * 80-81.  We start needing to worry about overflow if FE is included.
+  * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
+  * from the web site that these are used to right shift).  FE would
+  * necessarily be type 7; and FF, type 8.  And new states would have to be
+  * created for F8 and FC (and FE and FF if used), so quite a bit of work would
+  * be involved.
+  *
+  * XXX Better would be to customize the table so that the noncharacters are
+  * excluded.  This again is non trivial, but doing so would simplify the code
+  * that uses this, and might make it small enough to make it inlinable */
+};
+
+#endif
+
 /*
 
 =for apidoc utf8n_to_uvchr_error
@@ -1350,11 +1446,10 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
                                AV ** msgs)
 {
     const U8 * const s0 = s;
-    U8 * send = NULL;           /* (initialized to silence compilers' wrong
-                                   warning) */
+    const U8 * send = s0 + curlen;
     U32 possible_problems = 0;  /* A bit is set here for each potential problem
                                    found as we go along */
-    UV uv = *s;
+    UV uv;
     STRLEN expectlen   = 0;     /* How long should this sequence be?
                                    (initialized to silence compilers' wrong
                                    warning) */
@@ -1370,6 +1465,8 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
                                             routine; see [perl #130921] */
     UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
 
+    UV state = 0;
+
     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
 
     if (errors) {
@@ -1424,10 +1521,54 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
     }
 
     /* An invariant is trivially well-formed */
-    if (UTF8_IS_INVARIANT(uv)) {
-       return uv;
+    if (UTF8_IS_INVARIANT(*s0)) {
+       return *s0;
+    }
+
+#ifndef EBCDIC
+
+    /* Measurements show that this dfa is somewhat faster than the regular code
+     * below, so use it first, dropping down for the non-normal cases. */
+
+#  define PERL_UTF8_DECODE_REJECT 12
+
+    while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
+        UV type = dfa_tab_for_perl[*s];
+
+        if (state != 0) {
+            uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
+            state = dfa_tab_for_perl[256 + state + type];
+        }
+        else {
+            uv = (0xff >> type) & (*s);
+            state = dfa_tab_for_perl[256 + type];
+        }
+
+        if (state == 0) {
+
+            /* If this could be a code point that the flags don't allow (the 
first
+            * surrogate is the first such possible one), delve further, but we 
already
+            * have calculated 'uv' */
+            if (  (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                           |UTF8_WARN_ILLEGAL_INTERCHANGE))
+                && uv >= UNICODE_SURROGATE_FIRST)
+            {
+                curlen = s + 1 - s0;
+                goto got_uv;
+            }
+
+            return uv;
+        }
+
+        s++;
     }
 
+    /* Here, is some sort of failure.  Use the full mechanism */
+
+    uv = *s0;
+
+#endif
+
     /* A continuation character can't start a valid sequence */
     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
        possible_problems |= UTF8_GOT_CONTINUATION;
@@ -1448,14 +1589,12 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
 
     /* Setup the loop end point, making sure to not look past the end of the
      * input string, and flag it as too short if the size isn't big enough. */
-    send = (U8*) s0;
     if (UNLIKELY(curlen < expectlen)) {
         possible_problems |= UTF8_GOT_SHORT;
         avail_len = curlen;
-        send += curlen;
     }
     else {
-        send += expectlen;
+        send = (U8*) s0 + expectlen;
     }
 
     /* Now, loop through the remaining bytes in the character's sequence,
@@ -1549,6 +1688,8 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
         }
     }
 
+  got_uv:
+
     /* Here, we have found all the possible problems, except for when the input
      * is for a problematic code point not allowed by the input parameters. */
 

-- 
Perl5 Master Repository

Reply via email to