On Wednesday 06 May 2009 18:47:57 Tom Lane wrote: > So the ambiguous-input problem is solved if we define the new format(s) > to be started by backslash and something that the old code would reject. > I'd keep it short, like "\x", but there's still room for multiple > formats if anyone really wants to go to the trouble.
Here is a first cut at a new hex bytea input and output format. Example: SET bytea_output_hex = true; SELECT E'\\xDeAdBeEf'::bytea; bytea ------------ \xdeadbeef (1 row) Bernd did some performance testing for me, and it looked pretty good. Questions: Should this be the default format? Should the configuration parameter be a boolean or an enum, opening possibilities for other formats?
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index c944d8f..a6ac9c8 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL$ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.237 2009/04/27 16:27:35 momjian Exp $ --> <chapter id="datatype"> <title id="datatype-title">Data Types</title> @@ -1189,6 +1189,63 @@ SELECT b, char_length(b) FROM test2; </para> <para> + The <type>bytea</type> type supports two external formats for + input and output: the traditional bytea format that is particular + to PostgreSQL, and the hex format. Both of these are always + accepted on input. The output format depends on the configuration + parameter bytea_output_format; the default is hex. (Note that the + hex format was introduced in PostgreSQL 8.5; so earlier version + and some tools don't understand it.) + </para> + + <para> + The <acronym>SQL</acronym> standard defines a different binary + string type, called <type>BLOB</type> or <type>BINARY LARGE + OBJECT</type>. The input format is different from + <type>bytea</type>, but the provided functions and operators are + mostly the same. + </para> + + <sect2> + <title>Hex Format</title> + + <para> + The hex format encodes the binary data as 2 hexadecimal digits per + byte, highest significant nibble first. The entire string ist + preceded by the sequence <literal>\x</literal> (to distinguish it + from the bytea format). In SQL literals, the backslash may need + to be escaped, but it is one logical backslash as far as the + <type>bytea</type> type is concerned. The hex format is compatible with a wide + range of external applications and protocols, and it tends to be + faster than the traditional bytea format, so its use is + somewhat preferrable. + </para> + + <para> + Example: +<programlisting> +SELECT E'\\xDEADBEEF'; +</programlisting> + </para> + </sect2> + + <sect2> + <title>Traditional Bytea Format</title> + + <para> + The traditional bytea format takes the approach of representing a + binary string as a sequence of ASCII characters and escaping those + bytes that cannot be represented as an ASCII character by a + special escape sequence. If, from the point of view of the + application, representing bytes as characters makes sense, then + this representation can be convenient, but in practice it is + usually confusing becauses it fuzzes up the distinction between + binary strings and characters strings, and the particular escape + mechanism that was chosen is also somewhat unwieldy. So this + format should probably not be used for most new applications. + </para> + + <para> When entering <type>bytea</type> values, octets of certain values <emphasis>must</emphasis> be escaped (but all octet values <emphasis>can</emphasis> be escaped) when used as part @@ -1341,14 +1398,7 @@ SELECT b, char_length(b) FROM test2; have to escape line feeds and carriage returns if your interface automatically translates these. </para> - - <para> - The <acronym>SQL</acronym> standard defines a different binary - string type, called <type>BLOB</type> or <type>BINARY LARGE - OBJECT</type>. The input format is different from - <type>bytea</type>, but the provided functions and operators are - mostly the same. - </para> + </sect2> </sect1> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index eed799a..b8a3cef 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -122,8 +122,8 @@ static const int8 hexlookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; -static unsigned -hex_encode(const char *src, unsigned len, char *dst) +size_t +hex_encode(const char *src, size_t len, char *dst) { const char *end = src + len; @@ -152,8 +152,8 @@ get_hex(char c) return (char) res; } -static unsigned -hex_decode(const char *src, unsigned len, char *dst) +size_t +hex_decode(const char *src, size_t len, char *dst) { const char *s, *srcend; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 4cf3966..3c24686 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -62,6 +62,8 @@ static text *text_substring(Datum str, bool length_not_specified); static void appendStringInfoText(StringInfo str, const text *t); +bool bytea_output_hex = true; + /***************************************************************************** * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE * @@ -189,6 +191,18 @@ byteain(PG_FUNCTION_ARGS) int byte; bytea *result; + if (inputText[0] == '\\' && inputText[1] == 'x') + { + size_t len = strlen(inputText); + + byte = (len - 2)/2 + VARHDRSZ; + result = palloc(byte); + SET_VARSIZE(result, byte); + hex_decode(inputText + 2, len - 2, VARDATA(result)); + + PG_RETURN_BYTEA_P(result); + } + for (byte = 0, tp = inputText; *tp != '\0'; byte++) { if (tp[0] != '\\') @@ -269,8 +283,17 @@ byteaout(PG_FUNCTION_ARGS) char *result; char *vp; char *rp; - int val; /* holds unprintable chars */ int i; + + if (bytea_output_hex) + { + rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); + *rp++ = '\\'; + *rp++ = 'x'; + rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); + } + else + { int len; len = 1; /* empty string has 1 char */ @@ -295,6 +318,8 @@ byteaout(PG_FUNCTION_ARGS) } else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) { + int val; /* holds unprintable chars */ + val = *vp; rp[0] = '\\'; rp[3] = DIG(val & 07); @@ -307,6 +332,7 @@ byteaout(PG_FUNCTION_ARGS) else *rp++ = *vp; } + } *rp = '\0'; PG_RETURN_CSTRING(result); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4f06725..19d07ff 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -584,6 +584,14 @@ const char *const config_type_names[] = static struct config_bool ConfigureNamesBool[] = { { + {"bytea_output_hex", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("Sets the bytea output format to the hex format."), + NULL + }, + &bytea_output_hex, + true, NULL, NULL + }, + { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), NULL diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index c1b9393..d91a54c 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -704,6 +704,8 @@ extern Datum unknownout(PG_FUNCTION_ARGS); extern Datum unknownrecv(PG_FUNCTION_ARGS); extern Datum unknownsend(PG_FUNCTION_ARGS); +extern bool bytea_output_hex; + extern Datum byteain(PG_FUNCTION_ARGS); extern Datum byteaout(PG_FUNCTION_ARGS); extern Datum bytearecv(PG_FUNCTION_ARGS); @@ -728,6 +730,9 @@ extern Datum bytea_substr(PG_FUNCTION_ARGS); extern Datum bytea_substr_no_len(PG_FUNCTION_ARGS); extern Datum pg_column_size(PG_FUNCTION_ARGS); +extern size_t hex_encode(const char *src, size_t len, char *dst); +extern size_t hex_decode(const char *src, size_t len, char *dst); + /* version.c */ extern Datum pgsql_version(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index e80e1a4..3742dc0 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -1,3 +1,4 @@ +SET bytea_output_hex TO false; -- -- create user defined conversion -- diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index be8eb91..0c1fc41 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -97,6 +97,87 @@ LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; ^ DETAIL: String constants with Unicode escapes cannot be used when standard_conforming_strings is off. RESET standard_conforming_strings; +-- bytea +SET bytea_output_hex = true; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------ + \xdeadbeef +(1 row) + +SELECT E'\\xDeAdBeE'::bytea; +ERROR: invalid hexadecimal data: odd number of digits +LINE 1: SELECT E'\\xDeAdBeE'::bytea; + ^ +SELECT E'\\xDeAdBeEx'::bytea; +ERROR: invalid hexadecimal digit: "x" +LINE 1: SELECT E'\\xDeAdBeEx'::bytea; + ^ +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------ + \xde00beef +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +-------------------- + \x4465416442654566 +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +-------------------- + \x4465006442654566 +(1 row) + +SELECT E'De\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +-------------------- + \x4465536442654566 +(1 row) + +SELECT E'De\\678dBeEf'::bytea; +ERROR: invalid input syntax for type bytea +LINE 1: SELECT E'De\\678dBeEf'::bytea; + ^ +SET bytea_output_hex = false; +SELECT E'\\xDeAdBeEf'::bytea; + bytea +------------------ + \336\255\276\357 +(1 row) + +SELECT E'\\xDe00BeEf'::bytea; + bytea +------------------ + \336\000\276\357 +(1 row) + +SELECT E'DeAdBeEf'::bytea; + bytea +---------- + DeAdBeEf +(1 row) + +SELECT E'De\\000dBeEf'::bytea; + bytea +------------- + De\000dBeEf +(1 row) + +SELECT E'De\\123dBeEf'::bytea; + bytea +---------- + DeSdBeEf +(1 row) + -- -- test conversions between various string types -- E021-10 implicit casting among the character data types diff --git a/src/test/regress/input/largeobject.source b/src/test/regress/input/largeobject.source index 46ba926..43f6855 100644 --- a/src/test/regress/input/largeobject.source +++ b/src/test/regress/input/largeobject.source @@ -2,6 +2,8 @@ -- Test large object support -- +SET bytea_output_hex TO false; + -- Load a file CREATE TABLE lotest_stash_values (loid oid, fd integer); -- lo_creat(mode integer) returns oid diff --git a/src/test/regress/output/largeobject.source b/src/test/regress/output/largeobject.source index 9d69f6c..7012586 100644 --- a/src/test/regress/output/largeobject.source +++ b/src/test/regress/output/largeobject.source @@ -1,6 +1,7 @@ -- -- Test large object support -- +SET bytea_output_hex TO false; -- Load a file CREATE TABLE lotest_stash_values (loid oid, fd integer); -- lo_creat(mode integer) returns oid diff --git a/src/test/regress/output/largeobject_1.source b/src/test/regress/output/largeobject_1.source index 1fbc29c..0ece752 100644 --- a/src/test/regress/output/largeobject_1.source +++ b/src/test/regress/output/largeobject_1.source @@ -1,6 +1,7 @@ -- -- Test large object support -- +SET bytea_output_hex TO false; -- Load a file CREATE TABLE lotest_stash_values (loid oid, fd integer); -- lo_creat(mode integer) returns oid diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 99a9178..001de4e 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -1,3 +1,5 @@ +SET bytea_output_hex TO false; + -- -- create user defined conversion -- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index a28c75a..8345534 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -43,6 +43,25 @@ SELECT U&'wrong: +0061' UESCAPE '+'; RESET standard_conforming_strings; +-- bytea +SET bytea_output_hex = true; +SELECT E'\\xDeAdBeEf'::bytea; +SELECT E'\\xDeAdBeE'::bytea; +SELECT E'\\xDeAdBeEx'::bytea; +SELECT E'\\xDe00BeEf'::bytea; +SELECT E'DeAdBeEf'::bytea; +SELECT E'De\\000dBeEf'::bytea; +SELECT E'De\123dBeEf'::bytea; +SELECT E'De\\123dBeEf'::bytea; +SELECT E'De\\678dBeEf'::bytea; + +SET bytea_output_hex = false; +SELECT E'\\xDeAdBeEf'::bytea; +SELECT E'\\xDe00BeEf'::bytea; +SELECT E'DeAdBeEf'::bytea; +SELECT E'De\\000dBeEf'::bytea; +SELECT E'De\\123dBeEf'::bytea; + -- -- test conversions between various string types -- E021-10 implicit casting among the character data types
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers