[svn:parrot] r11980 - in trunk: src src/io t/pmc

leo Wed, 22 Mar 2006 06:53:11 -0800

Author: leo
Date: Wed Mar 22 06:53:01 2006
New Revision: 11980

Modified:
   trunk/src/io/io_utf8.c
   trunk/src/string.c
   trunk/t/pmc/io.t


Log:
strings - utf8 input filter

* reading parts of a file should be ok now
* fix string_rep_compatible
* test


Modified: trunk/src/io/io_utf8.c
==============================================================================
--- trunk/src/io/io_utf8.c      (original)
+++ trunk/src/io/io_utf8.c      Wed Mar 22 06:53:01 2006
@@ -25,6 +25,7 @@
 
 #include "parrot/parrot.h"
 #include "io_private.h"
+#include "parrot/unicode.h"
 
 /* Defined at bottom */
 static const ParrotIOLayerAPI pio_utf8_layer_api;
@@ -48,15 +49,38 @@
               STRING **buf)
 {
     size_t len;
-    STRING *s;
+    STRING *s, *s2;
+    String_iter iter;
 
     len = PIO_read_down(interpreter, layer->down, io, buf);
     s = *buf;
     s->charset  = Parrot_unicode_charset_ptr;
     s->encoding = Parrot_utf8_encoding_ptr;
     /* count chars, verify utf8 */
-    s->strlen = Parrot_utf8_encoding_ptr->codepoints(interpreter, s);
-    /* TODO buffer additional chars for next read */
+    Parrot_utf8_encoding_ptr->iter_init(interpreter, s, &iter);
+    while (iter.bytepos < s->bufused) {
+        if (iter.bytepos + 4 > s->bufused) {
+            const utf8_t *u8ptr = (utf8_t *)((char *)s->strstart + 
+                    iter.bytepos);
+            UINTVAL c = *u8ptr;
+            if (UTF8_IS_START(c)) {
+                /* need len-1 more chars */
+                UINTVAL len2 = UTF8SKIP(u8ptr) - 1;
+                s2 = NULL;
+                s2 = PIO_make_io_string(interpreter, &s2, len2);
+                s2->bufused = len2;
+                s2->charset  = Parrot_unicode_charset_ptr;
+                s2->encoding = Parrot_utf8_encoding_ptr;
+                PIO_read_down(interpreter, layer->down, io, &s2);
+                s->strlen = iter.charpos;
+                s = string_append(interpreter, s, s2, 0);
+                s->strlen = iter.charpos + 1;
+                return len + len2;
+            }
+        }
+        iter.get_and_advance(interpreter, &iter);
+    }
+    s->strlen = iter.charpos;
     return len;
 }
 

Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Wed Mar 22 06:53:01 2006
@@ -371,6 +371,11 @@
 string_rep_compatible (Interp *interpreter, STRING *a, const STRING *b,
         ENCODING **e)
 {
+    if (a->encoding == b->encoding && a->charset == b->charset) {
+        *e = a->encoding;
+        return a->charset;
+    }
+            
     /*
      * a table could possibly simplify the logic
      */

Modified: trunk/t/pmc/io.t
==============================================================================
--- trunk/t/pmc/io.t    (original)
+++ trunk/t/pmc/io.t    Wed Mar 22 06:53:01 2006
@@ -6,7 +6,7 @@
 use warnings;
 use lib qw( . lib ../lib ../../lib );
 use Test::More;
-use Parrot::Test tests => 33;
+use Parrot::Test tests => 35;
 
 =head1 NAME
 
@@ -652,5 +652,61 @@
 T\xf6tsch
 OUTPUT
 
+pir_output_is(<<'CODE', <<"OUTPUT", "utf8 read layer - readline");
+.sub main :main
+    .local pmc pio
+    .local string f
+    f = 'temp.file'
+    pio = open f, "<"
+    push pio, "utf8"
+    $S0 = readline pio
+    close pio
+    $I1 = charset $S0
+    $S2 = charsetname $I1
+    print $S2
+    print "\n"
+    $I1 = encoding $S0
+    $S2 = encodingname $I1
+    print $S2
+    print "\n"
+    $I1 = find_charset 'iso-8859-1'
+    trans_charset $S1, $S0, $I1
+    print $S1
+.end
+CODE
+unicode
+utf8
+T\xf6tsch
+OUTPUT
+pir_output_is(<<'CODE', <<"OUTPUT", "utf8 read layer, read parts");
+.sub main :main
+    .local pmc pio
+    .local int len
+    .include "stat.pasm"
+    len = stat "1", .STAT_FILESIZE
+    pio = open "1", "<"
+    push pio, "utf8"
+    $S0 = read pio, 2
+    len -= 2
+    $S1 = read pio, len
+    $S0 .= $S1
+    close pio
+    $I1 = charset $S0
+    $S2 = charsetname $I1
+    print $S2
+    print "\n"
+    $I1 = encoding $S0
+    $S2 = encodingname $I1
+    print $S2
+    print "\n"
+    $I1 = find_charset 'iso-8859-1'
+    trans_charset $S1, $S0, $I1
+    print $S1
+.end
+CODE
+unicode
+utf8
+T\xf6tsch
+OUTPUT
 
 unlink("temp.file");

[svn:parrot] r11980 - in trunk: src src/io t/pmc

Reply via email to