In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/f28f4d2a3736f60f4eb285910b24e2cce4656382?hp=983d792ccf473faca5827d9cddb06a1a59e5dc6d>

- Log -----------------------------------------------------------------
commit f28f4d2a3736f60f4eb285910b24e2cce4656382
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 28 15:22:08 2016 -0700

    Change name of PL_ variable
    
    This variable really means the character that replaces any embedded NULs
    when doing collation.  Change the name accordingly.  (Embedded NULs must
    be replaced because the libc function strxfrm is used, and it operates
    on C strings which have no embedded NULs.)

M       embedvar.h
M       intrpvar.h
M       locale.c

commit afc4976faee3dbcd0f85100736d54a8694d26645
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 28 09:09:23 2016 -0700

    PATCH: [perl #129953] lib/locale.t failures on FREEBSD
    
    I thought this bug was in FREEBSD, but when I went to gather the info
    needed to report it to the vendor, it turned out to be a mistake I had
    made.
    
    The problem is basically doubly encoding into UTF-8.  In order to save
    CPU time, in a UTF-8 locale, I had stored a string as UTF-8 encoded.
    This string is to be inserted into a larger string.  What I neglected to
    consider in this situation is that not all strings in such locales need
    be in UTF-8.  The UTF-8 encoded insert could get added to a non-UTF-8
    string, and the result later was switched to UTF-8, so the inserted
    string's bytes were individually converted to UTF-8, effectively a
    second time.  This is a problem only if the inserted string is different
    when encoded in UTF-8 than not, and for this particular usage, on most
    platforms it was UTF-8 invariant, so did not show up, except on those
    platforms where it was variant.
    
    The solution is to store the replacement as a code point, and encode it
    as UTF-8 only if necessary, once.  This actually simplifies the code.

M       intrpvar.h
M       locale.c

commit 1e4c96768cc9fe7008eef89b69243de628c78837
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 27 10:25:49 2016 -0700

    locale.c: Add some comments

M       locale.c

commit 535a3fb3ec9051c531a7797f1de40cbfc39e3f7f
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 28 14:16:18 2016 -0700

    lib/locale.t: Don't assume NUL is a control
    
    A test that assumed NUL would be considered a control fails in locales
    where it isn't considered a control.
    
    This was found on FREEBSD, where the locale "hi_IN.ISCII-DEV" is
    defective (based on the Wikipedia article on ISCII
    
https://en.wikipedia.org/wiki/Indian_Script_Code_for_Information_Interchange).
    Only the code points 0x09-0x0D are considered controls in this
    implementation, whereas the article says ISCII is a superset of ASCII,
    so should have 33 controls, not just 5.  (Unrelated to this ticket, but
    another apparent defect I saw is that this implementation defines 0x91,
    but the article says that code point is unassigned.)

M       lib/locale.t

commit 83cf869355710e5fa99226ae9c014afdb53a30cf
Author: Karl Williamson <[email protected]>
Date:   Sat Oct 29 12:24:21 2016 -0600

    lib/locale.t: Improve skipping of incompatible locales
    
    Perl works properly with only a subset of the possible locales out
    there.  This improves the detection of those and the skipping.

M       lib/locale.t
-----------------------------------------------------------------------

Summary of changes:
 embedvar.h   |  2 +-
 intrpvar.h   |  2 +-
 lib/locale.t | 27 +++++++++++------
 locale.c     | 94 ++++++++++++++++++++++++++++++++----------------------------
 4 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/embedvar.h b/embedvar.h
index 575b755..c413932 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -307,9 +307,9 @@
 #define PL_stderrgv            (vTHX->Istderrgv)
 #define PL_stdingv             (vTHX->Istdingv)
 #define PL_strtab              (vTHX->Istrtab)
+#define PL_strxfrm_NUL_replacement     (vTHX->Istrxfrm_NUL_replacement)
 #define PL_strxfrm_is_behaved  (vTHX->Istrxfrm_is_behaved)
 #define PL_strxfrm_max_cp      (vTHX->Istrxfrm_max_cp)
-#define PL_strxfrm_min_char    (vTHX->Istrxfrm_min_char)
 #define PL_sub_generation      (vTHX->Isub_generation)
 #define PL_subline             (vTHX->Isubline)
 #define PL_subname             (vTHX->Isubname)
diff --git a/intrpvar.h b/intrpvar.h
index 4243fc8..1aa94f7 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -565,7 +565,7 @@ PERLVAR(I, collation_name, char *)  /* Name of current 
collation */
 PERLVAR(I, collxfrm_base, Size_t)      /* Basic overhead in *xfrm() */
 PERLVARI(I, collxfrm_mult,Size_t, 2)   /* Expansion factor in *xfrm() */
 PERLVARI(I, collation_ix, U32, 0)      /* Collation generation index */
-PERLVARA(I, strxfrm_min_char, 3, char)
+PERLVARI(I, strxfrm_NUL_replacement, U8, 0)  /* Code point to replace NULs */
 PERLVARI(I, strxfrm_is_behaved, bool, TRUE)
                             /* Assume until proven otherwise that it works */
 PERLVARI(I, strxfrm_max_cp, U8, 0)      /* Highest collating cp in locale */
diff --git a/lib/locale.t b/lib/locale.t
index 4a2cb75..2f3123d 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -750,8 +750,9 @@ debug "Scanning for locales...\n";
 
 require POSIX; import POSIX ':locale_h';
 
-my @Locale = find_locales([ 'LC_CTYPE', 'LC_NUMERIC', 'LC_ALL' ]);
-my @include_incompatible_locales = find_locales('LC_CTYPE',
+my $categories = [ 'LC_CTYPE', 'LC_NUMERIC', 'LC_ALL' ];
+my @Locale = find_locales($categories);
+my @include_incompatible_locales = find_locales($categories,
                                                 'even incompatible locales');
 
 # The locales included in the incompatible list that aren't in the compatible
@@ -778,12 +779,16 @@ if (@Locale < @include_incompatible_locales) {
             push @warnings, ($warning =~ s/\n/\n# /sgr);
         };
 
-        setlocale(&POSIX::LC_CTYPE, $bad_locale);
+        my $ret = setlocale(&POSIX::LC_CTYPE, $bad_locale);
 
         my $message = "testing of locale '$bad_locale' is skipped";
         if (@warnings) {
             skip $message . ":\n# " . join "\n# ", @warnings;
         }
+        elsif (! $ret) {
+            skip("$message:\n#"
+               . " setlocale(&POSIX::LC_CTYPE, '$bad_locale') failed");
+        }
         else {
             fail $message . ", because it is was found to be incompatible with"
                           . " Perl, but could not discern reason";
@@ -1801,10 +1806,16 @@ foreach my $Locale (@Locale) {
 
         ++$locales_test_number;
         $test_names{$locales_test_number}
-                            = 'Verify that \0 sorts before any other control';
-        my $ok = $sorted_controls[0] eq "\0";
-        report_result($Locale, $locales_test_number, $ok);
-        shift @sorted_controls;
+                = 'Skip in locales where \0 is not considered a control;'
+                . ' otherwise verify that \0 sorts before any other control';
+        if ("\0" !~ /[[:cntrl:]]/) {
+            report_result($Locale, $locales_test_number, 1);
+        }
+        else {
+            my $ok = $sorted_controls[0] eq "\0";
+            report_result($Locale, $locales_test_number, $ok);
+            shift @sorted_controls;
+        }
         my $lowest_control = $sorted_controls[0];
 
         ++$locales_test_number;
@@ -1822,7 +1833,7 @@ foreach my $Locale (@Locale) {
         ++$locales_test_number;
         $test_names{$locales_test_number}
                             = 'Verify that strings with embedded NUL collate';
-        $ok = "a\0a\0a" lt "a${lowest_control}a${lowest_control}a";
+        my $ok = "a\0a\0a" lt "a${lowest_control}a${lowest_control}a";
         report_result($Locale, $locales_test_number, $ok);
 
         ++$locales_test_number;
diff --git a/locale.c b/locale.c
index e0b6793..a871b9e 100644
--- a/locale.c
+++ b/locale.c
@@ -514,7 +514,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
        PL_collxfrm_base = 0;
        PL_collxfrm_mult = 2;
         PL_in_utf8_COLLATE_locale = FALSE;
-        *PL_strxfrm_min_char = '\0';
+        PL_strxfrm_NUL_replacement = '\0';
         PL_strxfrm_max_cp = 0;
        return;
     }
@@ -530,7 +530,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
         }
 
         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
-        *PL_strxfrm_min_char = '\0';
+        PL_strxfrm_NUL_replacement = '\0';
         PL_strxfrm_max_cp = 0;
 
         /* A locale collation definition includes primary, secondary, tertiary,
@@ -1465,29 +1465,35 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
      * otherwise contain that character, but otherwise there may be
      * less-than-perfect results with that character and NUL.  This is
      * unavoidable unless we replace strxfrm with our own implementation. */
-    if (s_strlen < len) {
+    if (s_strlen < len) {   /* Only execute if there is an embedded NUL */
         char * e = s + len;
         char * sans_nuls;
-        STRLEN cur_min_char_len;
         STRLEN sans_nuls_len;
         STRLEN sans_nuls_pos;
         int try_non_controls;
-
-        /* If we don't know what control character sorts lowest for this
-         * locale, find it */
-        if (*PL_strxfrm_min_char == '\0') {
+        char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
+                                                   making sure 2nd byte is NUL.
+                                                 */
+        STRLEN this_replacement_len;
+
+        /* If we don't know what non-NUL control character sorts lowest for
+         * this locale, find it */
+        if (PL_strxfrm_NUL_replacement == '\0') {
             int j;
-#ifdef DEBUGGING
-            U8     cur_min_cp = 1; /* The code point that sorts lowest, so far 
*/
-#endif
-            char * cur_min_x = NULL;    /* And its xfrm, (except it also
+            char * cur_min_x = NULL;    /* The min_char's xfrm, (except it also
                                            includes the collation index
                                            prefixed. */
 
             DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace 
NUL\n"));
 
             /* Unlikely, but it may be that no control will work to replace
-             * NUL, in which case we instead look for any character */
+             * NUL, in which case we instead look for any character.  Controls
+             * are preferred because collation order is, in general, context
+             * sensitive, with adjoining characters affecting the order, and
+             * controls are less likely to have such interactions, allowing the
+             * NUL-replacement to stand on its own.  (Another way to look at it
+             * is to imagine what would happen if the NUL were replaced by a
+             * combining character; it wouldn't work out all that well.) */
             for (try_non_controls = 0;
                  try_non_controls < 2;
                  try_non_controls++)
@@ -1498,31 +1504,21 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                     STRLEN x_len;   /* length of 'x' */
                     STRLEN trial_len = 1;
 
-                    /* Create a 1 byte string of the current code point, but
-                     * with room to be 2 bytes */
-                    char cur_source[] = { (char) j, '\0' , '\0' };
-
-                    if (PL_in_utf8_COLLATE_locale) {
-                        if (! try_non_controls && ! isCNTRL_L1(j)) {
-                            continue;
-                        }
+                    /* Create a 1 byte string of the current code point */
+                    char cur_source[] = { (char) j, '\0' };
 
-                        /* If needs to be 2 bytes, find them */
-                        if (! UVCHR_IS_INVARIANT(j)) {
-                            char * d = cur_source;
-                            append_utf8_from_native_byte((U8) j, (U8 **) &d);
-                            trial_len = 2;
-                        }
-                    }
-                    else if (! try_non_controls && ! isCNTRL_LC(j)) {
+                    if (! try_non_controls && (PL_in_utf8_COLLATE_locale)
+                                               ? ! isCNTRL_L1(j)
+                                               : ! isCNTRL_LC(j))
+                    {
                         continue;
                     }
 
                     /* Then transform it */
                     x = _mem_collxfrm(cur_source, trial_len, &x_len,
-                                      PL_in_utf8_COLLATE_locale);
+                                      0 /* The string is not in UTF-8 */);
 
-                    /* Ignore any character that didn't successfully transform
+                    /* Ignore any character that didn't successfully transform.
                      * */
                     if (! x) {
                         continue;
@@ -1534,19 +1530,15 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                         || strLT(x         + COLLXFRM_HDR_LEN,
                                  cur_min_x + COLLXFRM_HDR_LEN))
                     {
-                        PL_strxfrm_min_char[0] = cur_source[0];
-                        PL_strxfrm_min_char[1] = cur_source[1];
-                        PL_strxfrm_min_char[2] = cur_source[2];
+                        PL_strxfrm_NUL_replacement = j;
                         cur_min_x = x;
-#ifdef DEBUGGING
-                        cur_min_cp = j;
-#endif
                     }
                     else {
                         Safefree(x);
                     }
-                } /* end of loop through all bytes */
+                } /* end of loop through all 255 characters */
 
+                /* Stop looking if found */
                 if (cur_min_x) {
                     break;
                 }
@@ -1556,7 +1548,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                  * character that works */
                 DEBUG_L(PerlIO_printf(Perl_debug_log,
                 "_mem_collxfrm: No control worked.  Trying non-controls\n"));
-            }
+            } /* End of loop to try first the controls, then any char */
 
             if (! cur_min_x) {
                 DEBUG_L(PerlIO_printf(Perl_debug_log,
@@ -1567,16 +1559,30 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
             DEBUG_L(PerlIO_printf(Perl_debug_log,
                     "_mem_collxfrm: Replacing embedded NULs in locale %s with "
-                    "0x%02X\n", PL_collation_name, cur_min_cp));
+                    "0x%02X\n", PL_collation_name, 
PL_strxfrm_NUL_replacement));
 
             Safefree(cur_min_x);
+        } /* End of determining the character that is to replace NULs */
+
+        /* If the replacement is variant under UTF-8, it must match the
+         * UTF8-ness as the original */
+        if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) {
+            this_replacement_char[0] =
+                                UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement);
+            this_replacement_char[1] =
+                                UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement);
+            this_replacement_len = 2;
+        }
+        else {
+            this_replacement_char[0] = PL_strxfrm_NUL_replacement;
+            /* this_replacement_char[1] = '\0' was done at initialization */
+            this_replacement_len = 1;
         }
 
         /* The worst case length for the replaced string would be if every
          * character in it is NUL.  Multiply that by the length of each
          * replacement, and allow for a trailing NUL */
-        cur_min_char_len = strlen(PL_strxfrm_min_char);
-        sans_nuls_len = (len * cur_min_char_len) + 1;
+        sans_nuls_len = (len * this_replacement_len) + 1;
         Newx(sans_nuls, sans_nuls_len, char);
         *sans_nuls = '\0';
         sans_nuls_pos = 0;
@@ -1590,7 +1596,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
 
             /* Do the actual replacement */
             sans_nuls_pos = my_strlcat(sans_nuls + sans_nuls_pos,
-                                       PL_strxfrm_min_char,
+                                       this_replacement_char,
                                        sans_nuls_len);
 
             /* Move past the input NUL */
@@ -1604,7 +1610,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
         /* Switch so below we transform this modified string */
         s = sans_nuls;
         len = strlen(s);
-    }
+    } /* End of replacing NULs */
 
     /* Make sure the UTF8ness of the string and locale match */
     if (utf8 != PL_in_utf8_COLLATE_locale) {

--
Perl5 Master Repository

Reply via email to