Author: simon Date: Mon Jan 26 13:12:52 2009 New Revision: 36039 Modified: branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm branches/strings/pseudocode/ParrotString.pm branches/strings/pseudocode/t/recode.t
Log: Implement char-wise equality testing, and rework the way native encoding iterates over stuff. Modified: branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm ============================================================================== --- branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm (original) +++ branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm Mon Jan 26 13:12:52 2009 @@ -1,7 +1,21 @@ -class ParrotEncoding::ParrotNative is ParrotEncoding::Base::Fixed { +class ParrotEncoding::ParrotNative { our $.width = 1; method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); } + method string_grapheme_iterate($str, $callback, $parameter) { + for (0..$str.bufused-1) { + $callback(self.grapheme_at_index($str,$_), $parameter); + } + } + method string_length($str) { + # We're not really all that fixed - borrow this from the + # variable one. You could optimize this by checking if there's + # any entries in the grapheme table. + my $data = 0; + my $callback = sub ($char, $data is rw) { $data++ }; + $str.encoding.string_char_iterate($str, $callback, $data); + return $data; + } method append_grapheme ($str, $g) { if (@($g) > 1) { my $item; @@ -16,9 +30,9 @@ method string_char_iterate ($str, $callback, $parameter) { for (0..$str.bufused-1) { - my $grapheme = grapheme_at_index($str, $_); + my $grapheme = self.grapheme_at_index($str, $_); for (@( $grapheme )) { - $callback($str.buffer.[$_], $parameter); + $callback($_, $parameter); } } } @@ -27,6 +41,7 @@ # We need to look inside each grapheme, since NFG stores individual # graphemes and graphemes are composed of multiple characters - # this could be improved with caching later but we will + # do it the slow stupid way for now ... } @@ -34,6 +49,7 @@ if (!$str.normalization) { $str.charset.normalize($str, ParrotNormalization::NFG); } + die $index~" Read off end of buffer" if $index > $str.buffer - 1; my $c = $str.buffer[$index]; if $c >= 0 { return [ $c ]; } return $str.normalization.grapheme_table.[-$c - 1]; Modified: branches/strings/pseudocode/ParrotString.pm ============================================================================== --- branches/strings/pseudocode/ParrotString.pm (original) +++ branches/strings/pseudocode/ParrotString.pm Mon Jan 26 13:12:52 2009 @@ -116,13 +116,16 @@ if ($one.strlen != $two.strlen) { return 0; } for (0 .. $one.strlen-1) { if ($one.buffer.[$_] != $two.buffer.[$_]) { - say "Oops, byte "~$_~" differed"; return 0 } } return 1; } sub Parrot_string_character_equal($one, $two) { - say "Not implemented yet"; - return 0; + my $l = Parrot_string_length($one); + return 0 if $l != Parrot_string_length($two); + for (0.. $l-1) { + return 0 if Parrot_string_index($one, $_) != Parrot_string_index($two, $_); + } + return 1; } Modified: branches/strings/pseudocode/t/recode.t ============================================================================== --- branches/strings/pseudocode/t/recode.t (original) +++ branches/strings/pseudocode/t/recode.t Mon Jan 26 13:12:52 2009 @@ -1,12 +1,13 @@ use Test; use ParrotString; -plan 4; +plan 5; # The standard NFG example... my $str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, ParrotCharset::Unicode, ParrotEncoding::UTF8); my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::ParrotNative); Parrot_string_grapheme_copy($str, $str2); is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme"); +is(Parrot_string_length($str2), 6, "One grapheme is actually two chars"); my $str3 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::UTF8); Parrot_string_grapheme_copy($str2, $str3);