In perl.git, the branch tonyc/readline-fixed has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/abf116bf8d0b86e2c2d1a33fbbe8acc97b53a932?hp=83f21b046e129de1b6ba8a8efbdda658fad53c88>

- Log -----------------------------------------------------------------
commit abf116bf8d0b86e2c2d1a33fbbe8acc97b53a932
Author: Tony Cook <[email protected]>
Date:   Sat Mar 17 13:10:29 2012 +1100

    no need to FIXME, it behaves like read() which is the intent
    
    except read() doesn't complain about the invalid characters like
    sv_gets().

M       sv.c

commit b53f0b1a5adddb22c318d24e584cb87c96f596c5
Author: Tony Cook <[email protected]>
Date:   Sat Mar 17 12:54:17 2012 +1100

    fix another boundary case and hopefully improve performance
    
    The fix: the if we found ourselves at a charstart with only one
    character to read, readsize would be zero, handle that correctly.
    
    Performance: originally I read just the first byte of the next
    character, which meant as many extra read calls as there are
    characters left to read after the initial read.  So O(Nleft) reads
    where Nleft is the number of characters left to read after the initial
    read.
    
    Now read as many bytes as there are characters left to read, which
    should mean the number of reads comes down to O(log(Nleft**2)) I think
    (but don't ask me to justify that.)

M       sv.c
M       t/io/utf8.t
-----------------------------------------------------------------------

Summary of changes:
 sv.c        |   30 ++++++++++++++++++++++--------
 t/io/utf8.t |    8 +++++++-
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/sv.c b/sv.c
index 6a303cc..8d5a417 100644
--- a/sv.c
+++ b/sv.c
@@ -7610,24 +7610,38 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const 
fp, I32 append)
                }
 
                if (charcount < recsize) {
-                   /* read the rest of the current character, and maybe the
-                      beginning of the next, if we need it */
-                   STRLEN readsize = (charstart ? 0 : skip - (bend - bufp))
-                       + (charcount + 1 < recsize);
+                   STRLEN readsize;
                    STRLEN bufp_offset = bufp - buffer;
                    ssize_t morebytesread;
 
+                   /* originally I read enough to fill any incomplete
+                      character and the first byte of the next
+                      character if needed, but if there's many
+                      multi-byte encoded characters we're going to be
+                      making a read call for every character beyond
+                      the original read size.
+
+                      So instead, read the rest of the character if
+                      any, and enough bytes to match at least the
+                      start bytes for each character we're going to
+                      read.
+                   */
+                   if (charstart)
+                       readsize = recsize - charcount;
+                   else 
+                       readsize = skip - (bend - bufp) + recsize - charcount - 
1;
                    buffer = SvGROW(sv, append + bytesread + readsize + 1) + 
append;
                    bend = buffer + bytesread;
                    morebytesread = PerlIO_read(fp, bend, readsize);
                    if (morebytesread <= 0) {
                        /* we're done, if we still have incomplete
                           characters the check code in sv_gets() will
-                          warn and zero them.
+                          warn about them.
 
-                          FIXME: If we've read more than one lead
-                          character for an incomplete character, push
-                          it back.
+                          I'd originally considered doing
+                          PerlIO_ungetc() on all but the lead
+                          character of the incomplete character, but
+                          read() doesn't do that, so I don't.
                        */
                        break;
                    }
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 919b734..29beba2 100644
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
 no utf8; # needed for use utf8 not griping about the raw octets
 
 
-plan(tests => 59);
+plan(tests => 61);
 
 $| = 1;
 
@@ -354,6 +354,8 @@ is($failed, undef);
     open F, ">:utf8", $a_file;
     print F "foo\xE4";
     print F "bar\xFE";
+    print F "\xC0\xC8\xCC\xD2";
+    print F "a\xE4ab";
     print F "a\xE4a";
     close F;
     open F, "<:utf8", $a_file;
@@ -363,6 +365,10 @@ is($failed, undef);
     $line .= <F>;
     is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
     $line = <F>;
+    is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters");
+    $line = <F>;
+    is($line, "a\xE4ab", "readline with another boundary condition");
+    $line = <F>;
     is($line, "a\xE4a", "readline with boundary condition");
     close F;
 

--
Perl5 Master Repository

Reply via email to