On Thu, Dec 08, 2005 at 05:54:35PM -0500, Gregory Maxwell wrote: > No, what is needed for people who care about fixing their data is a > loadable strip_invalid_utf8() that works in older versions.. then just > select * from bar where foo != strip_invalid_utf8(foo); The function > would be useful in general, for example, if you have an application > which doesn't already have much utf8 logic, you want to use a text > field, and stripping is the behaviour you want. For example, lots of > simple web applications.
Would something like the following work? It's written in pl/pgsql and does (AFAICS) the same checking as the backend in recent releases. Except the backend only supports up to 4-byte UTF-8 whereas this function checks upto six byte. For a six byte UTF-8 character, who is wrong? In any case, people should be able to do something like: SELECT field FROM table WHERE NOT utf8_verify(field,4); To check conformance with PostgreSQL 8.1. Note, I don't have large chunks of UTF-8 to test with but it works for the characters I tried with. Tested with 7.4. Have a nice day, -- Martijn van Oosterhout <[email protected]> http://svana.org/kleptog/ > Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a > tool for doing 5% of the work and then sitting around waiting for someone > else to do the other 95% so you can sue them.
create or replace function utf8_verify(bytea,integer) returns bool as '
DECLARE
str ALIAS FOR $1;
maxlen ALIAS FOR $2;
strlen INTEGER;
i integer;
j INTEGER;
len integer;
chr integer;
wchr integer;
BEGIN
i := 0;
strlen := length(str);
WHILE i < strlen LOOP
-- Check leading byte
chr := get_byte(str,i);
IF chr < 128 THEN -- 0x00 - 0x80 - single byte
len := 1;
wchr := chr;
ELSIF chr < 192 THEN -- 0x80 - 0xC0 - illegal
RETURN false;
ELSIF chr < 224 THEN -- 0xC0 - 0xE0 - two bytes
len := 2;
wchr := chr - 192;
ELSIF chr < 240 THEN -- 0xE0 - 0xF0 - three bytes
len := 3;
wchr := chr - 224;
ELSIF chr < 248 THEN -- 0xF0 - 0xF8 - four bytes
len := 4;
wchr := chr - 240;
ELSIF chr < 252 THEN -- 0xF8 - 0xFC - five bytes
len := 5;
wchr := chr - 248;
ELSIF chr < 254 THEN -- 0xFC - 0xFE - six bytes
len := 6;
wchr := chr - 252;
ELSE
RETURN false; -- FE and FF not currently defined
END IF;
IF i + len > strlen THEN
RETURN false;
END IF;
IF len > maxlen THEN
RETURN false;
END IF;
-- Check remaining characters
j := 1;
WHILE len > j LOOP
chr := get_byte(str, i+j);
IF chr < 128 OR chr >= 192 THEN
RETURN false;
END IF;
wchr := (wchr << 6) + (chr - 192);
j := j+1;
END LOOP;
-- Verify shortest possible string
IF len = 1 AND wchr >= 128 THEN
RETURN false;
ELSIF len = 2 AND (wchr < 128 OR wchr >= 2048) THEN
RETURN false;
ELSIF len = 3 AND (wchr < 2048 OR wchr >= 65536) THEN
RETURN false;
ELSIF len = 4 AND (wchr < 65536 OR wchr >= 2097152) THEN
RETURN false;
ELSIF len = 5 AND (wchr < 2097152 OR wchr >= 67108864) THEN
RETURN false;
ELSIF len = 6 AND (wchr < 67108864 OR wchr >= 2147483648) THEN
RETURN false;
END IF;
-- RAISE NOTICE ''Checked char offset %, OK (wchr=%,len=%)'', i, wchr, len;
i := i+len;
END LOOP;
RETURN true;
END;
' language plpgsql;
pgpBVYFeuKe6z.pgp
Description: PGP signature
