Change 19871 by [EMAIL PROTECTED] on 2003/06/28 15:47:22
Subject: [Encode] pre-1.97 patches
From: Dan Kogai <[EMAIL PROTECTED]>
Date: Sat, 28 Jun 2003 01:20:59 +0900
Message-Id: <[EMAIL PROTECTED]>
Affected files ...
... //depot/perl/ext/Encode/Changes#59 edit
... //depot/perl/ext/Encode/Encode.pm#138 edit
... //depot/perl/ext/Encode/lib/Encode/Guess.pm#10 edit
Differences ...
==== //depot/perl/ext/Encode/Changes#59 (text) ====
Index: perl/ext/Encode/Changes
--- perl/ext/Encode/Changes#58~19811~ Wed Jun 18 05:11:07 2003
+++ perl/ext/Encode/Changes Sat Jun 28 08:47:22 2003
@@ -3,6 +3,16 @@
# $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $
#
$Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $
+! lib/Encode/Guess.pm
+ $Encode::Guess::NoUTFAutoGuess is added so you can turn off
+ automatic utf(8|16|32) guessing -- originally by Autrijus
+ Message-Id: <[EMAIL PROTECTED]>
+! Encode.pm
+ Addressed the following:
+ Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode
+ Message-Id: <[EMAIL PROTECTED]>
+
+1.96 2003/06/18 09:29:02
! lib/Encode/JP/JP.pm t/guess.t
m/(...)/ in void context then $1 is considered a Bad Thing
Message-Id: <[EMAIL PROTECTED]>
==== //depot/perl/ext/Encode/Encode.pm#138 (text) ====
Index: perl/ext/Encode/Encode.pm
--- perl/ext/Encode/Encode.pm#137~19811~ Wed Jun 18 05:11:07 2003
+++ perl/ext/Encode/Encode.pm Sat Jun 28 08:47:22 2003
@@ -147,7 +147,7 @@
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode($string,$check);
- return undef if ($check && length($string));
+ $_[1] = $string if $check;
return $octets;
}
==== //depot/perl/ext/Encode/lib/Encode/Guess.pm#10 (text) ====
Index: perl/ext/Encode/lib/Encode/Guess.pm
--- perl/ext/Encode/lib/Encode/Guess.pm#9~19325~ Thu Apr 24 12:06:29 2003
+++ perl/ext/Encode/lib/Encode/Guess.pm Sat Jun 28 08:47:22 2003
@@ -18,6 +18,7 @@
sub perlio_ok { 0 }
our @EXPORT = qw(guess_encoding);
+our $NoUTFAutoGuess = 0;
sub import { # Exporter not used so we do it on our own
my $callpkg = caller;
@@ -70,75 +71,80 @@
return unless defined $octet and length $octet;
# cheat 0: utf8 flag;
- Encode::is_utf8($octet) and return find_encoding('utf8');
+ if ( Encode::is_utf8($octet) ) {
+ return find_encoding('utf8') unless $NoUTFAutoGuess;
+ Encode::_utf8_off($octet);
+ }
# cheat 1: BOM
use Encode::Unicode;
- my $BOM = unpack('n', $octet);
- return find_encoding('UTF-16')
- if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
- $BOM = unpack('N', $octet);
- return find_encoding('UTF-32')
- if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ unless ($NoUTFAutoGuess) {
+ my $BOM = unpack('n', $octet);
+ return find_encoding('UTF-16')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+ $BOM = unpack('N', $octet);
+ return find_encoding('UTF-32')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
+ my $utf;
+ my ($be, $le) = (0, 0);
+ if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+ $utf = "UTF-32";
+ for my $char (unpack('N*', $octet)){
+ $char & 0x0000ffff and $be++;
+ $char & 0xffff0000 and $le++;
+ }
+ }else{ # UTF-16(BE|LE) assumed
+ $utf = "UTF-16";
+ for my $char (unpack('n*', $octet)){
+ $char & 0x00ff and $be++;
+ $char & 0xff00 and $le++;
+ }
+ }
+ $DEBUG and warn "$utf, be == $be, le == $le";
+ $be == $le
+ and return
+ "Encodings ambiguous between $utf BE and LE ($be, $le)";
+ $utf .= ($be > $le) ? 'BE' : 'LE';
+ return find_encoding($utf);
+ }
+ }
my %try = %{$obj->{Suspects}};
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$try{$e->name} = $e;
$DEBUG and warn "Added: ", $e->name;
}
- if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
- my $utf;
- my ($be, $le) = (0, 0);
- if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
- $utf = "UTF-32";
- for my $char (unpack('N*', $octet)){
- $char & 0x0000ffff and $be++;
- $char & 0xffff0000 and $le++;
- }
- }else{ # UTF-16(BE|LE) assumed
- $utf = "UTF-16";
- for my $char (unpack('n*', $octet)){
- $char & 0x00ff and $be++;
- $char & 0xff00 and $le++;
+ my $nline = 1;
+ for my $line (split /\r\n?|\n/, $octet){
+ # cheat 2 -- \e in the string
+ if ($line =~ /\e/o){
+ my @keys = keys %try;
+ delete @try{qw/utf8 ascii/};
+ for my $k (@keys){
+ ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
}
}
- $DEBUG and warn "$utf, be == $be, le == $le";
- $be == $le
- and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
- $utf .= ($be > $le) ? 'BE' : 'LE';
- return find_encoding($utf);
- }else{
- my $nline = 1;
- for my $line (split /\r\n?|\n/, $octet){
- # cheat 2 -- \e in the string
- if ($line =~ /\e/o){
- my @keys = keys %try;
- delete @try{qw/utf8 ascii/};
- for my $k (@keys){
- ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
- }
- }
- my %ok = %try;
- # warn join(",", keys %try);
- for my $k (keys %try){
- my $scratch = $line;
- $try{$k}->decode($scratch, FB_QUIET);
- if ($scratch eq ''){
- $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
- }else{
- use bytes ();
- $DEBUG and
- warn sprintf("%4d:%-24s not ok; %d bytes left\n",
- $nline, $k, bytes::length($scratch));
- delete $ok{$k};
- }
+ my %ok = %try;
+ # warn join(",", keys %try);
+ for my $k (keys %try){
+ my $scratch = $line;
+ $try{$k}->decode($scratch, FB_QUIET);
+ if ($scratch eq ''){
+ $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+ }else{
+ use bytes ();
+ $DEBUG and
+ warn sprintf("%4d:%-24s not ok; %d bytes left\n",
+ $nline, $k, bytes::length($scratch));
+ delete $ok{$k};
}
- %ok or return "No appropriate encodings found!";
- if (scalar(keys(%ok)) == 1){
- my ($retval) = values(%ok);
- return $retval;
- }
- %try = %ok; $nline++;
}
+ %ok or return "No appropriate encodings found!";
+ if (scalar(keys(%ok)) == 1){
+ my ($retval) = values(%ok);
+ return $retval;
+ }
+ %try = %ok; $nline++;
}
$try{ascii} or
return "Encodings too ambiguous: ", join(" or ", keys %try);
@@ -188,6 +194,10 @@
# tries all major Japanese Encodings as well
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
+
+If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
+value, no heuristics will be applied to UTF-8/16/32, and the result
+will be limited to the suspects and C<ascii>.
=over 4
End of Patch.