In article <[EMAIL PROTECTED]>,
	Rafael Garcia-Suarez <[EMAIL PROTECTED]> writes:
> I'd prefer 3, because it's the more conservative. However, having to
> choose between 1 and 2, I think 1 is more right, since it continues
> what the current implementation does more consistently.
>
Ok, here is 1) then: diff -ru perl-rsync/pod/perlfunc.pod perl-dev/pod/perlfunc.pod --- perl-rsync/pod/perlfunc.pod Mon Mar 21 11:57:09 2005 +++ perl-dev/pod/perlfunc.pod Mon Mar 21 21:42:41 2005 @@ -3425,7 +3425,7 @@ The C<a>, C<A>, and C<Z> types gobble just one value, but pack it as a string of length count, padding with nulls or spaces as necessary. When -unpacking, C<A> strips trailing spaces and nulls, C<Z> strips everything +unpacking, C<A> strips trailing whitespace and nulls, C<Z> strips everything after the first null, and C<a> returns data verbatim. If the value-to-pack is too long, it is truncated. If too long and an diff -ru perl-rsync/pp_pack.c perl-dev/pp_pack.c --- perl-rsync/pp_pack.c Mon Mar 21 11:57:09 2005 +++ perl-dev/pp_pack.c Mon Mar 21 22:17:08 2005 @@ -1373,9 +1373,19 @@ } else if (datumtype == 'A') { /* 'A' strips both nulls and spaces */ char *ptr; - for (ptr = s+len-1; ptr >= s; ptr--) - if (*ptr != 0 && !isSPACE(*ptr)) break; - ptr++; + if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) { + for (ptr = s+len-1; ptr >= s; ptr--) + if (*ptr != 0 && !UTF8_IS_CONTINUATION(*ptr) && + !is_utf8_space(ptr)) break; + if (ptr >= s) ptr += UTF8SKIP(ptr); + else ptr++; + if (ptr > s+len) + Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); + } else { + for (ptr = s+len-1; ptr >= s; ptr--) + if (*ptr != 0 && !isSPACE(*ptr)) break; + ptr++; + } sv = newSVpvn(s, ptr-s); } else sv = newSVpvn(s, len); diff -ru perl-rsync/t/op/pack.t perl-dev/t/op/pack.t --- perl-rsync/t/op/pack.t Mon Mar 21 11:57:09 2005 +++ perl-dev/t/op/pack.t Mon Mar 21 22:18:49 2005 @@ -12,7 +12,7 @@ my $no_signedness = $] > 5.009 ? 
'' : "Signed/unsigned pack modifiers not available on this perl"; -plan tests => 14621; +plan tests => 14627; use strict; use warnings; @@ -1806,4 +1806,19 @@ is(pack("Z*/Z"), "1\0\0", "pack Z*/Z makes an extended string"); is(pack("Z*/Z", ""), "1\0\0", "pack Z*/Z makes an extended string"); is(pack("Z*/a", ""), "0\0", "pack Z*/a makes an extended string"); +} +{ + # unpack("A*", $unicode) strips general unicode spaces + is(unpack("A*", "ab \n\xa0 \0"), "ab \n\xa0", + 'normal A* strip leaves \xa0'); + is(unpack("U0C0A*", "ab \n\xa0 \0"), "ab \n\xa0", + 'normal A* strip leaves \xa0 even if it got upgraded for technical reasons'); + is(unpack("A*", pack("a*(U0U)a*", "ab \n", 0xa0, " \0")), "ab", + 'upgraded strings A* removes \xa0'); + is(unpack("A*", pack("a*(U0UU)a*", "ab \n", 0xa0, 0x1680, " \0")), "ab", + 'upgraded strings A* removes all unicode whitespace'); + is(unpack("A5", pack("a*(U0U)a*", "ab \n", 0x1680, "def", "ab")), "ab", + 'upgraded strings A5 removes all unicode whitespace'); + is(unpack("A*", pack("U", 0x1680)), "", + 'upgraded strings A* with nothing left'); }