Author: leo
Date: Wed Mar 22 06:53:01 2006
New Revision: 11980
Modified:
trunk/src/io/io_utf8.c
trunk/src/string.c
trunk/t/pmc/io.t
Log:
strings - utf8 input filter
* reading parts of a file should be ok now
* fix string_rep_compatible
* add tests for the utf8 read layer (readline and reading a file in parts)
Modified: trunk/src/io/io_utf8.c
==============================================================================
--- trunk/src/io/io_utf8.c (original)
+++ trunk/src/io/io_utf8.c Wed Mar 22 06:53:01 2006
@@ -25,6 +25,7 @@
#include "parrot/parrot.h"
#include "io_private.h"
+#include "parrot/unicode.h"
/* Defined at bottom */
static const ParrotIOLayerAPI pio_utf8_layer_api;
@@ -48,15 +49,38 @@
STRING **buf)
{
size_t len;
- STRING *s;
+ STRING *s, *s2;
+ String_iter iter;
len = PIO_read_down(interpreter, layer->down, io, buf);
s = *buf;
s->charset = Parrot_unicode_charset_ptr;
s->encoding = Parrot_utf8_encoding_ptr;
/* count chars, verify utf8 */
- s->strlen = Parrot_utf8_encoding_ptr->codepoints(interpreter, s);
- /* TODO buffer additional chars for next read */
+ Parrot_utf8_encoding_ptr->iter_init(interpreter, s, &iter);
+ while (iter.bytepos < s->bufused) {
+ if (iter.bytepos + 4 > s->bufused) {
+ const utf8_t *u8ptr = (utf8_t *)((char *)s->strstart +
+ iter.bytepos);
+ UINTVAL c = *u8ptr;
+ if (UTF8_IS_START(c)) {
+ /* need len-1 more chars */
+ UINTVAL len2 = UTF8SKIP(u8ptr) - 1;
+ s2 = NULL;
+ s2 = PIO_make_io_string(interpreter, &s2, len2);
+ s2->bufused = len2;
+ s2->charset = Parrot_unicode_charset_ptr;
+ s2->encoding = Parrot_utf8_encoding_ptr;
+ PIO_read_down(interpreter, layer->down, io, &s2);
+ s->strlen = iter.charpos;
+ s = string_append(interpreter, s, s2, 0);
+ s->strlen = iter.charpos + 1;
+ return len + len2;
+ }
+ }
+ iter.get_and_advance(interpreter, &iter);
+ }
+ s->strlen = iter.charpos;
return len;
}
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Wed Mar 22 06:53:01 2006
@@ -371,6 +371,11 @@
string_rep_compatible (Interp *interpreter, STRING *a, const STRING *b,
ENCODING **e)
{
+ if (a->encoding == b->encoding && a->charset == b->charset) {
+ *e = a->encoding;
+ return a->charset;
+ }
+
/*
* a table could possibly simplify the logic
*/
Modified: trunk/t/pmc/io.t
==============================================================================
--- trunk/t/pmc/io.t (original)
+++ trunk/t/pmc/io.t Wed Mar 22 06:53:01 2006
@@ -6,7 +6,7 @@
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
-use Parrot::Test tests => 33;
+use Parrot::Test tests => 35;
=head1 NAME
@@ -652,5 +652,61 @@
T\xf6tsch
OUTPUT
+pir_output_is(<<'CODE', <<"OUTPUT", "utf8 read layer - readline");
+.sub main :main
+ .local pmc pio
+ .local string f
+ f = 'temp.file'
+ pio = open f, "<"
+ push pio, "utf8"
+ $S0 = readline pio
+ close pio
+ $I1 = charset $S0
+ $S2 = charsetname $I1
+ print $S2
+ print "\n"
+ $I1 = encoding $S0
+ $S2 = encodingname $I1
+ print $S2
+ print "\n"
+ $I1 = find_charset 'iso-8859-1'
+ trans_charset $S1, $S0, $I1
+ print $S1
+.end
+CODE
+unicode
+utf8
+T\xf6tsch
+OUTPUT
+pir_output_is(<<'CODE', <<"OUTPUT", "utf8 read layer, read parts");
+.sub main :main
+ .local pmc pio
+ .local int len
+ .include "stat.pasm"
+ len = stat "1", .STAT_FILESIZE
+ pio = open "1", "<"
+ push pio, "utf8"
+ $S0 = read pio, 2
+ len -= 2
+ $S1 = read pio, len
+ $S0 .= $S1
+ close pio
+ $I1 = charset $S0
+ $S2 = charsetname $I1
+ print $S2
+ print "\n"
+ $I1 = encoding $S0
+ $S2 = encodingname $I1
+ print $S2
+ print "\n"
+ $I1 = find_charset 'iso-8859-1'
+ trans_charset $S1, $S0, $I1
+ print $S1
+.end
+CODE
+unicode
+utf8
+T\xf6tsch
+OUTPUT
unlink("temp.file");