In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/34213185c286738af0d6c7a97ef3ef00228d3a43?hp=551feee877a880cc6b02ddb3ce19271edca2143c>
- Log ----------------------------------------------------------------- commit 34213185c286738af0d6c7a97ef3ef00228d3a43 Merge: 551feee f1ee460 Author: Tony Cook <[email protected]> Date: Sun Dec 9 13:29:25 2012 +1100 getline $/ = \N now reads N characters not bytes This was discussed for 5.16 but was not included as it was too late for inclusion. commit f1ee460bffff6beafd031a678da686e1e5ebf4bf Author: Tony Cook <[email protected]> Date: Sun Dec 9 13:27:12 2012 +1100 remove the warning added for 5.16 and indicate the count is chars not bytes M pod/perlvar.pod ----------------------------------------------------------------------- Summary of changes: pod/perlvar.pod | 9 +---- sv.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++------- t/io/utf8.t | 40 ++++++++++++++++++++- 3 files changed, 135 insertions(+), 22 deletions(-) diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 47b202a..998ea42 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -1358,7 +1358,7 @@ referenced integer. So this: open my $fh, "<", $myfile or die $!; local $_ = <$fh>; -will read a record of no more than 32768 bytes from $fh. If you're +will read a record of no more than 32768 characters from $fh. If you're not reading from a record-oriented file (or your OS doesn't have record-oriented files), then you'll likely get a full chunk of data with every read. If a record is larger than the record size you've @@ -1370,13 +1370,6 @@ buffering,so you must not mix record and non-record reads on the same filehandle. Record mode mixes with line mode only when the same buffering layer is in use for both modes. -If you perform a record read on a FILE with an encoding layer such as -C<:encoding(latin1)> or C<:utf8>, you may get an invalid string as a -result, may leave the FILE positioned between characters in the stream -and may not be reading the number of bytes from the underlying file -that you specified. This behaviour may change without warning in a -future version of perl. - You cannot call C<input_record_separator()> on a handle, only as a static method. See L<IO::Handle|IO::Handle>. diff --git a/sv.c b/sv.c index 8bc60db..c611e97 100644 --- a/sv.c +++ b/sv.c @@ -7667,29 +7667,111 @@ S_sv_gets_append_to_utf8(pTHX_ SV *const sv, PerlIO *const fp, I32 append) static char * S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append) { - I32 bytesread; - const U32 recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */ + SSize_t bytesread; + const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */ /* Grab the size of the record we're getting */ - char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append; + char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append; + + /* Go yank in */ #ifdef VMS +#include <rms.h> int fd; -#endif + Stat_t st; - /* Go yank in */ -#ifdef VMS - /* VMS wants read instead of fread, because fread doesn't respect */ - /* RMS record boundaries. This is not necessarily a good thing to be */ - /* doing, but we've got no other real choice - except avoid stdio - as implementation - perhaps write a :vms layer ? - */ + /* With a true, record-oriented file on VMS, we need to use read directly + * to ensure that we respect RMS record boundaries. The user is responsible + * for providing a PL_rs value that corresponds to the FAB$W_MRS (maximum + * record size) field. N.B. This is likely to produce invalid results on + * varying-width character data when a record ends mid-character. + */ fd = PerlIO_fileno(fp); - if (fd != -1) { + if (fd != -1 + && PerlLIO_fstat(fd, &st) == 0 + && (st.st_fab_rfm == FAB$C_VAR + || st.st_fab_rfm == FAB$C_VFC + || st.st_fab_rfm == FAB$C_FIX)) { + bytesread = PerlLIO_read(fd, buffer, recsize); } - else /* in-memory file from PerlIO::Scalar */ + else /* in-memory file from PerlIO::Scalar + * or not a record-oriented file + */ #endif { bytesread = PerlIO_read(fp, buffer, recsize); + + /* At this point, the logic in sv_get() means that sv will + be treated as utf-8 if the handle is utf8. + */ + if (PerlIO_isutf8(fp) && bytesread > 0) { + char *bend = buffer + bytesread; + char *bufp = buffer; + size_t charcount = 0; + bool charstart = TRUE; + STRLEN skip = 0; + + while (charcount < recsize) { + /* count accumulated characters */ + while (bufp < bend) { + if (charstart) { + skip = UTF8SKIP(bufp); + } + if (bufp + skip > bend) { + /* partial at the end */ + charstart = FALSE; + break; + } + else { + ++charcount; + bufp += skip; + charstart = TRUE; + } + } + + if (charcount < recsize) { + STRLEN readsize; + STRLEN bufp_offset = bufp - buffer; + SSize_t morebytesread; + + /* originally I read enough to fill any incomplete + character and the first byte of the next + character if needed, but if there's many + multi-byte encoded characters we're going to be + making a read call for every character beyond + the original read size. + + So instead, read the rest of the character if + any, and enough bytes to match at least the + start bytes for each character we're going to + read. + */ + if (charstart) + readsize = recsize - charcount; + else + readsize = skip - (bend - bufp) + recsize - charcount - 1; + buffer = SvGROW(sv, append + bytesread + readsize + 1) + append; + bend = buffer + bytesread; + morebytesread = PerlIO_read(fp, bend, readsize); + if (morebytesread <= 0) { + /* we're done, if we still have incomplete + characters the check code in sv_gets() will + warn about them. + + I'd originally considered doing + PerlIO_ungetc() on all but the lead + character of the incomplete character, but + read() doesn't do that, so I don't. + */ + break; + } + + /* prepare to scan some more */ + bytesread += morebytesread; + bend = buffer + bytesread; + bufp = buffer + bufp_offset; + } + } + } } if (bytesread < 0) diff --git a/t/io/utf8.t b/t/io/utf8.t index 4b01747..29beba2 100644 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -10,7 +10,7 @@ BEGIN { no utf8; # needed for use utf8 not griping about the raw octets -plan(tests => 55); +plan(tests => 61); $| = 1; @@ -348,3 +348,41 @@ is($failed, undef); "<:utf8 rcatline must warn about bad utf8"); close F; } + +{ + # fixed record reads + open F, ">:utf8", $a_file; + print F "foo\xE4"; + print F "bar\xFE"; + print F "\xC0\xC8\xCC\xD2"; + print F "a\xE4ab"; + print F "a\xE4a"; + close F; + open F, "<:utf8", $a_file; + local $/ = \4; + my $line = <F>; + is($line, "foo\xE4", "readline with \$/ = \\4"); + $line .= <F>; + is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4"); + $line = <F>; + is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters"); + $line = <F>; + is($line, "a\xE4ab", "readline with another boundary condition"); + $line = <F>; + is($line, "a\xE4a", "readline with boundary condition"); + close F; + + # badly encoded at EOF + open F, ">:raw", $a_file; + print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl + close F; + + use warnings 'utf8'; + open F, "<:utf8", $a_file; + undef $@; + local $SIG{__WARN__} = sub { $@ = shift }; + $line = <F>; + + like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/, + "<:utf8 readline (fixed) must warn about bad utf8"); +} -- Perl5 Master Repository
