Here is the UTF8 patch. Is this something we want to apply now? ---------------------------------------------------------------------------
Dominic Mitchell wrote: > Before Christmas, I started a thread in dbi-users about the support for > setting the utf8 flag on returned values[1]. I've got another patch > now, which is less intrusive. This adds $dbh->{pg_do_utf8}, which will > turn on marking returned data as UTF-8 if necessary. > > I would like this to be considered for inclusion with DBD::Pg, as I feel > it's necessary to correct broken behaviour that I am seeing. > > I'm aware that Tim Bunce thinks that a better interface should be found > for this sort of thing, and I agree. But unfortunately, I need to get > this problem solved, and the attached patch would be an extremely useful > stop gap measure. > > Thanks, > -Dom > > [1] http:[EMAIL PROTECTED]/msg15428.html > > -- > | Semantico: creators of major online resources | > | URL: http://www.semantico.com/ | > | Tel: +44 (1273) 722222 | > | Address: 33 Bond St., Brighton, Sussex, BN1 1RD, UK. | > ? TESTLOG > ? TESTLOG-commented-out-utf8-bits > ? t/.nfs00a6dec800000014 > Index: Pg.pm > =================================================================== > RCS file: /usr/local/cvsroot/dbdpg/dbdpg/Pg.pm,v > retrieving revision 1.17 > diff -u -r1.17 Pg.pm > --- Pg.pm 30 Dec 2002 04:59:05 -0000 1.17 > +++ Pg.pm 10 Jan 2003 11:59:59 -0000 > @@ -1288,6 +1288,15 @@ > escaped by a backslash. Any other ASCII character can be used directly in a > string constant. > > +=item B<pg_do_utf8> (boolean) > + > +PostgreSQL specific attribute. If true, then the utf8 flag will be > +turned on for returned character data (if the data is valid utf8). For > +details about the utf8 flag, see L<Encode>. This is only relevant under > +perl 5.8 and higher. > + > +B<NB>: This attribute is experimental and may be subject to change. > + > =item B<pg_INV_READ> (integer, read-only) > > Constant to be used for the mode in lo_creat and lo_open. 
> Index: dbdimp.c > =================================================================== > RCS file: /usr/local/cvsroot/dbdpg/dbdpg/dbdimp.c,v > retrieving revision 1.10 > diff -u -r1.10 dbdimp.c > --- dbdimp.c 8 Jan 2003 22:08:17 -0000 1.10 > +++ dbdimp.c 10 Jan 2003 12:00:01 -0000 > @@ -470,6 +470,8 @@ > imp_dbh->pg_auto_escape = newval; > } else if (kl==10 && strEQ(key, "pg_bool_tf")) { > imp_dbh->pg_bool_tf = newval; > + } else if (kl==10 && strEQ(key, "pg_do_utf8")) { > + imp_dbh->pg_do_utf8 = newval; > } else { > return 0; > } > @@ -494,6 +496,8 @@ > retsv = newSViv((IV)imp_dbh->pg_auto_escape); > } else if (kl==10 && strEQ(key, "pg_bool_tf")) { > retsv = newSViv((IV)imp_dbh->pg_bool_tf); > + } else if (kl==10 && strEQ(key, "pg_do_utf8")) { > + retsv = newSViv((IV)imp_dbh->pg_do_utf8); > } else if (kl==11 && strEQ(key, "pg_INV_READ")) { > retsv = newSViv((IV)INV_READ); > } else if (kl==12 && strEQ(key, "pg_INV_WRITE")) { > @@ -1332,6 +1336,15 @@ > } > > > +int > +is_high_bit_set(val) > + char *val; > +{ > + while (*val++) > + if (*val & 0x80) return 1; > + return 0; > +} > + > AV * > dbd_st_fetch (sth, imp_sth) > SV *sth; > @@ -1403,6 +1416,14 @@ > val[val_len] = '\0'; > } > sv_setpvn(sv, val, val_len); > + if (imp_dbh->pg_do_utf8) { > + SvUTF8_off(sv); > + /* XXX Is this all the character data types? 
*/ > + if (18 == type || 25 == type || 1042 ==type || 1043 == type) { > + if (is_high_bit_set(val) && is_utf8_string(val, val_len)) > + SvUTF8_on(sv); > + } > + } > } > } > > Index: dbdimp.h > =================================================================== > RCS file: /usr/local/cvsroot/dbdpg/dbdpg/dbdimp.h,v > retrieving revision 1.4 > diff -u -r1.4 dbdimp.h > --- dbdimp.h 8 Jan 2003 22:08:17 -0000 1.4 > +++ dbdimp.h 10 Jan 2003 12:00:01 -0000 > @@ -23,6 +23,7 @@ > int init_commit; /* initialize AutoCommit */ > int pg_auto_escape; /* initialize AutoEscape */ > int pg_bool_tf; /* do bools return 't'/'f' */ > + int pg_do_utf8; /* should we attempt to make utf8 strings? */ > }; > > /* Define sth implementor data structure */ > Index: t/05fetch.t > =================================================================== > RCS file: /usr/local/cvsroot/dbdpg/dbdpg/t/05fetch.t,v > retrieving revision 1.3 > diff -u -r1.3 05fetch.t > --- t/05fetch.t 27 Nov 2002 09:24:36 -0000 1.3 > +++ t/05fetch.t 10 Jan 2003 12:00:01 -0000 > @@ -3,7 +3,7 @@ > use Test::More; > > if (defined $ENV{DBI_DSN}) { > - plan tests => 7; > + plan tests => 10; > } else { > plan skip_all => 'cannot test without DB info'; > } > @@ -80,6 +80,30 @@ > ok($rows == 1, > 'fetch one row on id' > ); > + > +# Attempt to test whether or not we can get unicode out of the database > +# correctly. Reuse the previous sth. > +SKIP: { > + eval "use Encode"; > + skip "need Encode module for unicode tests", 3 if $@; > + local $dbh->{pg_do_utf8} = 1; > + $dbh->do("INSERT INTO test (id, name, val) VALUES (4, '\x{0100}dam', 'cow')"); > + $sth->execute(4); > + my ($id, $name) = $sth->fetchrow_array(); > + ok(Encode::is_utf8($name), > + 'returned data has utf8 bit set' > + ); > + is(length($name), 4, > + 'returned utf8 data is not corrupted' > + ); > + $sth->finish(); > + $sth->execute(1); > + my ($id2, $name2) = $sth->fetchrow_array(); > + ok(! 
Encode::is_utf8($name2), > + 'returned ASCII data has not got utf8 bit set' > + ); > + $sth->finish(); > +} > > $sql = <<SQL; > SELECT id -- Bruce Momjian | http://candle.pha.pa.us [EMAIL PROTECTED] | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073