In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/14b9bbbc5b192cd6b85fd4ab2aff99bb6471fb0c?hp=8b78e93627afc5201875e37a37a7b27874b874c5>

- Log -----------------------------------------------------------------
commit 14b9bbbc5b192cd6b85fd4ab2aff99bb6471fb0c
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 13:07:33 2017 -0700

    pp_pack.c: Remove no longer relevant comment

M       pp_pack.c

commit df3377142c0886d6a189c225c7ceb29f6c3da6f2
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 13:02:46 2017 -0700

    pp_pack.c: Remove needless branch
    
    This function only sets *retlen to 0 if the input length is 0.  In all
    but one case, the function was not called with that input.  In that
    one case, I changed the code to avoid calling the function with that
    input.  Hence we can remove checking *retlen for 0.

M       pp_pack.c

commit 05fefba96af5488952719c6f57af518afba9170a
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 13:00:27 2017 -0700

    pp_pack.c: Remove obsolete code
    
    This code effectively reduced to
    
     if (foo) 0 else 0
    
    because a #define was changed to 0 some releases ago.  Just replace by
    
     0

M       pp_pack.c

commit 15b010f03b2e5fb7406abea47a023c54c42402a2
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 12:58:16 2017 -0700

    utf8.c: Move comment a few lines up in the file
    
    Move it to where it makes more sense.

M       utf8.c

commit 0eb3d6a038a2f51874b4eb6b6268e2f305559b1d
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 12:57:33 2017 -0700

    utf8.h: Clarify comment

M       utf8.h

commit 7084498489de93f9ab8ba8d9c39bba8fcb2c9a6a
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 20:59:49 2017 -0700

    utf8.h: White-space, parens only
    
    Add parens to clarify grouping, white-space for legibility

M       utf8.h

commit d34818305fc99020f8962e6d266a233f2f2bbe10
Author: Karl Williamson <[email protected]>
Date:   Sat Feb 11 20:57:36 2017 -0700

    utf8.h: Add branch prediction
    
    use bytes;
    
    is unlikely to be the case.

M       utf8.h
-----------------------------------------------------------------------

Summary of changes:
 pp_pack.c | 38 ++++++++++++++++++--------------------
 utf8.c    | 18 +++++++++---------
 utf8.h    | 22 +++++++++++-----------
 3 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 737e019a74..86d138bb05 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -251,12 +251,15 @@ STATIC U8
 utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
 {
     STRLEN retlen;
-    UV val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
+    UV val;
+
+    if (*s >= end) {
+       goto croak;
+    }
+    val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-    /* We try to process malformed UTF-8 as much as possible (preferably with
-       warnings), but these two mean we make no progress in the string and
-       might enter an infinite loop */
-    if (retlen == (STRLEN) -1 || retlen == 0)
+    if (retlen == (STRLEN) -1)
+      croak:
        Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack",
                   (int) TYPE_NO_MODIFIERS(datumtype));
     if (val >= 0x100) {
@@ -290,7 +293,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, int buf_
     for (;buf_len > 0; buf_len--) {
        if (from >= end) return FALSE;
        val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
-       if (retlen == (STRLEN) -1 || retlen == 0) {
+       if (retlen == (STRLEN) -1) {
            from += UTF8SKIP(from);
            bad |= 1;
        } else from += retlen;
@@ -396,7 +399,7 @@ STMT_START {                                                        \
     STRLEN retlen;                                             \
     if (str >= end) break;                                     \
     val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags);    \
-    if (retlen == (STRLEN) -1 || retlen == 0) {                        \
+    if (retlen == (STRLEN) -1) {                               \
        *cur = '\0';                                            \
        Perl_croak(aTHX_ "Malformed UTF-8 string in pack");     \
     }                                                          \
@@ -1225,7 +1228,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                    STRLEN retlen;
                    aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
                                 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-                   if (retlen == (STRLEN) -1 || retlen == 0)
+                   if (retlen == (STRLEN) -1)
                        Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
                    s += retlen;
                  }
@@ -1248,7 +1251,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                    STRLEN retlen;
                    const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
                                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-                   if (retlen == (STRLEN) -1 || retlen == 0)
+                   if (retlen == (STRLEN) -1)
                        Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
                    s += retlen;
                    if (!checksum)
@@ -1310,7 +1313,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                                                        strend - s,
                                                        &retlen,
                                                        UTF8_ALLOW_DEFAULT));
-                   if (retlen == (STRLEN) -1 || retlen == 0)
+                   if (retlen == (STRLEN) -1)
                        Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
                    s += retlen;
                }
@@ -2594,10 +2597,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                        GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
                        end = start+SvLEN(cat)-UTF8_MAXLEN;
                    }
-                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
-                                                      auv,
-                                                      warn_utf8 ?
-                                                      0 : UNICODE_ALLOW_ANY);
+                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0);
                } else {
                    if (auv >= 0x100) {
                        if (!SvUTF8(cat)) {
@@ -2648,9 +2648,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                auv = SvUV_no_inf(fromstr, datumtype);
                if (utf8) {
                    U8 buffer[UTF8_MAXLEN], *endb;
-                   endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv),
-                                              warn_utf8 ?
-                                              0 : UNICODE_ALLOW_ANY);
+                   endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0);
                    if (cur+(endb-buffer)*UTF8_EXPAND >= end) {
                        *cur = '\0';
                        SvCUR_set(cat, cur - start);
@@ -2666,9 +2664,9 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                        GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
                        end = start+SvLEN(cat)-UTF8_MAXLEN;
                    }
-                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur, UNI_TO_NATIVE(auv),
-                                                      warn_utf8 ?
-                                                      0 : UNICODE_ALLOW_ANY);
+                   cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
+                                                       UNI_TO_NATIVE(auv),
+                                                      0);
                }
            }
            break;
diff --git a/utf8.c b/utf8.c
index 9ce72daba0..bec68a5883 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1179,14 +1179,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
     /* Save how many bytes were actually in the character */
     curlen = s - s0;
 
-    /* A convenience macro that matches either of the too-short conditions.  */
-#   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
-    if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
-        uv_so_far = uv;
-        uv = UNICODE_REPLACEMENT;
-    }
-
     /* Note that there are two types of too-short malformation.  One is when
      * there is actual wrong data before the normal termination of the
      * sequence.  The other is that the sequence wasn't complete before the end
@@ -1194,7 +1186,15 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      * This means that we were passed data for a partial character, but it is
      * valid as far as we saw.  The other is definitely invalid.  This
      * distinction could be important to a caller, so the two types are kept
-     * separate. */
+     * separate.
+     *
+     * A convenience macro that matches either of the too-short conditions.  */
+#   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+    if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+        uv_so_far = uv;
+        uv = UNICODE_REPLACEMENT;
+    }
 
     /* Check for overflow */
     if (UNLIKELY(does_utf8_overflow(s0, send))) {
diff --git a/utf8.h b/utf8.h
index 0fbe4b79d0..affa2d67f5 100644
--- a/utf8.h
+++ b/utf8.h
@@ -707,7 +707,7 @@ with a ptr argument.
 /* A Unicode character can fold to up to 3 characters */
 #define UTF8_MAX_FOLD_CHAR_EXPAND 3
 
-#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
+#define IN_BYTES UNLIKELY(CopHINTS_get(PL_curcop) & HINT_BYTES)
 
 /*
 
@@ -726,12 +726,12 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  * Is so within 'feature unicode_strings' or 'locale :not_characters', and not
  * within 'use bytes'.  UTF-8 locales are not tested for here, but perhaps
  * could be */
-#define IN_UNI_8_BIT                                                             \
-           (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT))                        \
-               || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL                 \
-                   /* -1 below is for :not_characters */                         \
-                   && _is_in_locale_category(FALSE, -1)))                        \
-              && ! IN_BYTES)
+#define IN_UNI_8_BIT                                                    \
+           ((    (      (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT))    \
+                   || (   CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \
+                            /* -1 below is for :not_characters */       \
+                       && _is_in_locale_category(FALSE, -1)))           \
+              && (! IN_BYTES))
 
 
 #define UTF8_ALLOW_EMPTY               0x0001  /* Allow a zero length string */
@@ -802,10 +802,10 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_WARN_ILLEGAL_INTERCHANGE \
                           (UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
 
-/* This is used typically for code that is willing to accept inputs of
- * illformed UTF-8 sequences, for whatever reason.  However, all such sequences
- * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
- * also present. */
+/* This is typically used for code that processes UTF-8 input and doesn't want
+ * to have to deal with any malformations that might be present.  All such will
+ * be safely replaced by the REPLACEMENT CHARACTER, unless other flags
+ * overriding this are also present. */
 #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                                \
                         |UTF8_ALLOW_NON_CONTINUATION                            \
                         |UTF8_ALLOW_SHORT                                       \

--
Perl5 Master Repository

Reply via email to