[perl.git] branch blead, updated. v5.17.6-202-g3421318

Tony Cook Sat, 08 Dec 2012 18:30:06 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/34213185c286738af0d6c7a97ef3ef00228d3a43?hp=551feee877a880cc6b02ddb3ce19271edca2143c>


- Log -----------------------------------------------------------------
commit 34213185c286738af0d6c7a97ef3ef00228d3a43
Merge: 551feee f1ee460
Author: Tony Cook <[email protected]>
Date:   Sun Dec 9 13:29:25 2012 +1100

    getline $/ = \N now reads N characters not bytes
    
    This was discussed for 5.16 but was not included as it was too late
    for inclusion.

commit f1ee460bffff6beafd031a678da686e1e5ebf4bf
Author: Tony Cook <[email protected]>
Date:   Sun Dec 9 13:27:12 2012 +1100

    remove the warning added for 5.16 and indicate the count is chars not bytes

M       pod/perlvar.pod
-----------------------------------------------------------------------

Summary of changes:
 pod/perlvar.pod |    9 +----
 sv.c            |  108 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 t/io/utf8.t     |   40 ++++++++++++++++++++-
 3 files changed, 135 insertions(+), 22 deletions(-)

diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index 47b202a..998ea42 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -1358,7 +1358,7 @@ referenced integer.  So this:
     open my $fh, "<", $myfile or die $!;
     local $_ = <$fh>;
 
-will read a record of no more than 32768 bytes from $fh.  If you're
+will read a record of no more than 32768 characters from $fh.  If you're
 not reading from a record-oriented file (or your OS doesn't have
 record-oriented files), then you'll likely get a full chunk of data
 with every read.  If a record is larger than the record size you've
@@ -1370,13 +1370,6 @@ buffering,so you must not mix record and non-record 
reads on the
 same filehandle.  Record mode mixes with line mode only when the
 same buffering layer is in use for both modes.
 
-If you perform a record read on a FILE with an encoding layer such as
-C<:encoding(latin1)> or C<:utf8>, you may get an invalid string as a
-result, may leave the FILE positioned between characters in the stream
-and may not be reading the number of bytes from the underlying file
-that you specified.  This behaviour may change without warning in a
-future version of perl.
-
 You cannot call C<input_record_separator()> on a handle, only as a
 static method.  See L<IO::Handle|IO::Handle>.
 
diff --git a/sv.c b/sv.c
index 8bc60db..c611e97 100644
--- a/sv.c
+++ b/sv.c
@@ -7667,29 +7667,111 @@ S_sv_gets_append_to_utf8(pTHX_ SV *const sv, PerlIO 
*const fp, I32 append)
 static char *
 S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
 {
-    I32 bytesread;
-    const U32 recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */
+    SSize_t bytesread;
+    const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */
       /* Grab the size of the record we're getting */
-    char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
+    char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
+    
+    /* Go yank in */
 #ifdef VMS
+#include <rms.h>
     int fd;
-#endif
+    Stat_t st;
 
-    /* Go yank in */
-#ifdef VMS
-    /* VMS wants read instead of fread, because fread doesn't respect */
-    /* RMS record boundaries. This is not necessarily a good thing to be */
-    /* doing, but we've got no other real choice - except avoid stdio
-       as implementation - perhaps write a :vms layer ?
-    */
+    /* With a true, record-oriented file on VMS, we need to use read directly
+     * to ensure that we respect RMS record boundaries.  The user is 
responsible
+     * for providing a PL_rs value that corresponds to the FAB$W_MRS (maximum
+     * record size) field.  N.B. This is likely to produce invalid results on
+     * varying-width character data when a record ends mid-character.
+     */
     fd = PerlIO_fileno(fp);
-    if (fd != -1) {
+    if (fd != -1
+       && PerlLIO_fstat(fd, &st) == 0
+       && (st.st_fab_rfm == FAB$C_VAR
+           || st.st_fab_rfm == FAB$C_VFC
+           || st.st_fab_rfm == FAB$C_FIX)) {
+
        bytesread = PerlLIO_read(fd, buffer, recsize);
     }
-    else /* in-memory file from PerlIO::Scalar */
+    else /* in-memory file from PerlIO::Scalar
+          * or not a record-oriented file
+          */
 #endif
     {
        bytesread = PerlIO_read(fp, buffer, recsize);
+
+       /* At this point, the logic in sv_get() means that sv will
+          be treated as utf-8 if the handle is utf8.
+       */
+       if (PerlIO_isutf8(fp) && bytesread > 0) {
+           char *bend = buffer + bytesread;
+           char *bufp = buffer;
+           size_t charcount = 0;
+           bool charstart = TRUE;
+           STRLEN skip = 0;
+
+           while (charcount < recsize) {
+               /* count accumulated characters */
+               while (bufp < bend) {
+                   if (charstart) {
+                       skip = UTF8SKIP(bufp);
+                   }
+                   if (bufp + skip > bend) {
+                       /* partial at the end */
+                       charstart = FALSE;
+                       break;
+                   }
+                   else {
+                       ++charcount;
+                       bufp += skip;
+                       charstart = TRUE;
+                   }
+               }
+
+               if (charcount < recsize) {
+                   STRLEN readsize;
+                   STRLEN bufp_offset = bufp - buffer;
+                   SSize_t morebytesread;
+
+                   /* originally I read enough to fill any incomplete
+                      character and the first byte of the next
+                      character if needed, but if there's many
+                      multi-byte encoded characters we're going to be
+                      making a read call for every character beyond
+                      the original read size.
+
+                      So instead, read the rest of the character if
+                      any, and enough bytes to match at least the
+                      start bytes for each character we're going to
+                      read.
+                   */
+                   if (charstart)
+                       readsize = recsize - charcount;
+                   else 
+                       readsize = skip - (bend - bufp) + recsize - charcount - 
1;
+                   buffer = SvGROW(sv, append + bytesread + readsize + 1) + 
append;
+                   bend = buffer + bytesread;
+                   morebytesread = PerlIO_read(fp, bend, readsize);
+                   if (morebytesread <= 0) {
+                       /* we're done, if we still have incomplete
+                          characters the check code in sv_gets() will
+                          warn about them.
+
+                          I'd originally considered doing
+                          PerlIO_ungetc() on all but the lead
+                          character of the incomplete character, but
+                          read() doesn't do that, so I don't.
+                       */
+                       break;
+                   }
+
+                   /* prepare to scan some more */
+                   bytesread += morebytesread;
+                   bend = buffer + bytesread;
+                   bufp = buffer + bufp_offset;
+               }
+           }
+       }
     }
 
     if (bytesread < 0)
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 4b01747..29beba2 100644
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
 no utf8; # needed for use utf8 not griping about the raw octets
 
 
-plan(tests => 55);
+plan(tests => 61);
 
 $| = 1;
 
@@ -348,3 +348,41 @@ is($failed, undef);
          "<:utf8 rcatline must warn about bad utf8");
     close F;
 }
+
+{
+    # fixed record reads
+    open F, ">:utf8", $a_file;
+    print F "foo\xE4";
+    print F "bar\xFE";
+    print F "\xC0\xC8\xCC\xD2";
+    print F "a\xE4ab";
+    print F "a\xE4a";
+    close F;
+    open F, "<:utf8", $a_file;
+    local $/ = \4;
+    my $line = <F>;
+    is($line, "foo\xE4", "readline with \$/ = \\4");
+    $line .= <F>;
+    is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
+    $line = <F>;
+    is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters");
+    $line = <F>;
+    is($line, "a\xE4ab", "readline with another boundary condition");
+    $line = <F>;
+    is($line, "a\xE4a", "readline with boundary condition");
+    close F;
+
+    # badly encoded at EOF
+    open F, ">:raw", $a_file;
+    print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl
+    close F;
+
+    use warnings 'utf8';
+    open F, "<:utf8", $a_file;
+    undef $@;
+    local $SIG{__WARN__} = sub { $@ = shift };
+    $line = <F>;
+
+    like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/,
+         "<:utf8 readline (fixed) must warn about bad utf8");
+}

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.17.6-202-g3421318

Reply via email to