Re: [HACKERS] bytea vs. pg_dump

Peter Eisentraut Tue, 07 Jul 2009 14:59:50 -0700

On Wednesday 06 May 2009 18:47:57 Tom Lane wrote:
> So the ambiguous-input problem is solved if we define the new format(s)
> to be started by backslash and something that the old code would reject.
> I'd keep it short, like "\x", but there's still room for multiple
> formats if anyone really wants to go to the trouble.


Here is a first cut at a new hex bytea input and output format.  Example:

SET bytea_output_hex = true;

SELECT E'\\xDeAdBeEf'::bytea;
   bytea
------------
 \xdeadbeef
(1 row)

Bernd did some performance testing for me, and it looked pretty good.

Questions:

Should this be the default format?

Should the configuration parameter be a boolean or an enum, opening 
possibilities for other formats?

diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index c944d8f..a6ac9c8 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL$ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.237 2009/04/27 16:27:35 momjian Exp $ -->
 
  <chapter id="datatype">
   <title id="datatype-title">Data Types</title>
@@ -1189,6 +1189,63 @@ SELECT b, char_length(b) FROM test2;
    </para>
 
    <para>
+    The <type>bytea</type> type supports two external formats for
+    input and output: the traditional bytea format that is particular
+    to PostgreSQL, and the hex format.  Both of these are always
+    accepted on input.  The output format depends on the configuration
+    parameter bytea_output_format; the default is hex.  (Note that the
+    hex format was introduced in PostgreSQL 8.5; so earlier version
+    and some tools don't understand it.)
+   </para>
+
+   <para>
+        The <acronym>SQL</acronym> standard defines a different binary
+        string type, called <type>BLOB</type> or <type>BINARY LARGE
+        OBJECT</type>.  The input format is different from
+        <type>bytea</type>, but the provided functions and operators are
+        mostly the same.
+   </para>
+
+  <sect2>
+   <title>Hex Format</title>
+
+   <para>
+    The hex format encodes the binary data as 2 hexadecimal digits per
+    byte, highest significant nibble first.  The entire string ist
+    preceded by the sequence <literal>\x</literal> (to distinguish it
+    from the bytea format).  In SQL literals, the backslash may need
+    to be escaped, but it is one logical backslash as far as the
+    <type>bytea</type> type is concerned.  The hex format is compatible with a wide
+    range of external applications and protocols, and it tends to be
+    faster than the traditional bytea format, so its use is
+    somewhat preferrable.
+   </para>
+
+   <para>
+    Example:
+<programlisting>
+SELECT E'\\xDEADBEEF';
+</programlisting>
+   </para>
+  </sect2>
+
+  <sect2>
+   <title>Traditional Bytea Format</title>
+
+   <para>
+    The traditional bytea format takes the approach of representing a
+    binary string as a sequence of ASCII characters and escaping those
+    bytes that cannot be represented as an ASCII character by a
+    special escape sequence.  If, from the point of view of the
+    application, representing bytes as characters makes sense, then
+    this representation can be convenient, but in practice it is
+    usually confusing becauses it fuzzes up the distinction between
+    binary strings and characters strings, and the particular escape
+    mechanism that was chosen is also somewhat unwieldy.  So this
+    format should probably not be used for most new applications.
+   </para>
+
+   <para>
     When entering <type>bytea</type> values, octets of certain
     values <emphasis>must</emphasis> be escaped (but all octet
     values <emphasis>can</emphasis> be escaped) when used as part
@@ -1341,14 +1398,7 @@ SELECT b, char_length(b) FROM test2;
     have to escape line feeds and carriage returns if your interface
     automatically translates these.
    </para>
-
-   <para>
-        The <acronym>SQL</acronym> standard defines a different binary
-        string type, called <type>BLOB</type> or <type>BINARY LARGE
-        OBJECT</type>.  The input format is different from
-        <type>bytea</type>, but the provided functions and operators are
-        mostly the same.
-   </para>
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index eed799a..b8a3cef 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -122,8 +122,8 @@ static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
 
-static unsigned
-hex_encode(const char *src, unsigned len, char *dst)
+size_t
+hex_encode(const char *src, size_t len, char *dst)
 {
 	const char *end = src + len;
 
@@ -152,8 +152,8 @@ get_hex(char c)
 	return (char) res;
 }
 
-static unsigned
-hex_decode(const char *src, unsigned len, char *dst)
+size_t
+hex_decode(const char *src, size_t len, char *dst)
 {
 	const char *s,
 			   *srcend;
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 4cf3966..3c24686 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -62,6 +62,8 @@ static text *text_substring(Datum str,
 			   bool length_not_specified);
 static void appendStringInfoText(StringInfo str, const text *t);
 
+bool bytea_output_hex = true;
+
 
 /*****************************************************************************
  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
@@ -189,6 +191,18 @@ byteain(PG_FUNCTION_ARGS)
 	int			byte;
 	bytea	   *result;
 
+	if (inputText[0] == '\\' && inputText[1] == 'x')
+	{
+		size_t len = strlen(inputText);
+
+		byte = (len - 2)/2 + VARHDRSZ;
+		result = palloc(byte);
+		SET_VARSIZE(result, byte);
+		hex_decode(inputText + 2, len - 2, VARDATA(result));
+
+		PG_RETURN_BYTEA_P(result);
+	}
+
 	for (byte = 0, tp = inputText; *tp != '\0'; byte++)
 	{
 		if (tp[0] != '\\')
@@ -269,8 +283,17 @@ byteaout(PG_FUNCTION_ARGS)
 	char	   *result;
 	char	   *vp;
 	char	   *rp;
-	int			val;			/* holds unprintable chars */
 	int			i;
+
+	if (bytea_output_hex)
+	{
+		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
+		*rp++ = '\\';
+		*rp++ = 'x';
+		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
+	}
+	else
+	{
 	int			len;
 
 	len = 1;					/* empty string has 1 char */
@@ -295,6 +318,8 @@ byteaout(PG_FUNCTION_ARGS)
 		}
 		else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 		{
+				int			val;			/* holds unprintable chars */
+
 			val = *vp;
 			rp[0] = '\\';
 			rp[3] = DIG(val & 07);
@@ -307,6 +332,7 @@ byteaout(PG_FUNCTION_ARGS)
 		else
 			*rp++ = *vp;
 	}
+	}
 	*rp = '\0';
 	PG_RETURN_CSTRING(result);
 }
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 4f06725..19d07ff 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -584,6 +584,14 @@ const char *const config_type_names[] =
 static struct config_bool ConfigureNamesBool[] =
 {
 	{
+		{"bytea_output_hex", PGC_USERSET, CLIENT_CONN_OTHER,
+			gettext_noop("Sets the bytea output format to the hex format."),
+			NULL
+		},
+		&bytea_output_hex,
+		true, NULL, NULL
+	},
+	{
 		{"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of sequential-scan plans."),
 			NULL
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index c1b9393..d91a54c 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -704,6 +704,8 @@ extern Datum unknownout(PG_FUNCTION_ARGS);
 extern Datum unknownrecv(PG_FUNCTION_ARGS);
 extern Datum unknownsend(PG_FUNCTION_ARGS);
 
+extern bool bytea_output_hex;
+
 extern Datum byteain(PG_FUNCTION_ARGS);
 extern Datum byteaout(PG_FUNCTION_ARGS);
 extern Datum bytearecv(PG_FUNCTION_ARGS);
@@ -728,6 +730,9 @@ extern Datum bytea_substr(PG_FUNCTION_ARGS);
 extern Datum bytea_substr_no_len(PG_FUNCTION_ARGS);
 extern Datum pg_column_size(PG_FUNCTION_ARGS);
 
+extern size_t hex_encode(const char *src, size_t len, char *dst);
+extern size_t hex_decode(const char *src, size_t len, char *dst);
+
 /* version.c */
 extern Datum pgsql_version(PG_FUNCTION_ARGS);
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index e80e1a4..3742dc0 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -1,3 +1,4 @@
+SET bytea_output_hex TO false;
 --
 -- create user defined conversion
 --
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index be8eb91..0c1fc41 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -97,6 +97,87 @@ LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
                ^
 DETAIL:  String constants with Unicode escapes cannot be used when standard_conforming_strings is off.
 RESET standard_conforming_strings;
+-- bytea
+SET bytea_output_hex = true;
+SELECT E'\\xDeAdBeEf'::bytea;
+   bytea    
+------------
+ \xdeadbeef
+(1 row)
+
+SELECT E'\\xDeAdBeE'::bytea;
+ERROR:  invalid hexadecimal data: odd number of digits
+LINE 1: SELECT E'\\xDeAdBeE'::bytea;
+               ^
+SELECT E'\\xDeAdBeEx'::bytea;
+ERROR:  invalid hexadecimal digit: "x"
+LINE 1: SELECT E'\\xDeAdBeEx'::bytea;
+               ^
+SELECT E'\\xDe00BeEf'::bytea;
+   bytea    
+------------
+ \xde00beef
+(1 row)
+
+SELECT E'DeAdBeEf'::bytea;
+       bytea        
+--------------------
+ \x4465416442654566
+(1 row)
+
+SELECT E'De\\000dBeEf'::bytea;
+       bytea        
+--------------------
+ \x4465006442654566
+(1 row)
+
+SELECT E'De\123dBeEf'::bytea;
+       bytea        
+--------------------
+ \x4465536442654566
+(1 row)
+
+SELECT E'De\\123dBeEf'::bytea;
+       bytea        
+--------------------
+ \x4465536442654566
+(1 row)
+
+SELECT E'De\\678dBeEf'::bytea;
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT E'De\\678dBeEf'::bytea;
+               ^
+SET bytea_output_hex = false;
+SELECT E'\\xDeAdBeEf'::bytea;
+      bytea       
+------------------
+ \336\255\276\357
+(1 row)
+
+SELECT E'\\xDe00BeEf'::bytea;
+      bytea       
+------------------
+ \336\000\276\357
+(1 row)
+
+SELECT E'DeAdBeEf'::bytea;
+  bytea   
+----------
+ DeAdBeEf
+(1 row)
+
+SELECT E'De\\000dBeEf'::bytea;
+    bytea    
+-------------
+ De\000dBeEf
+(1 row)
+
+SELECT E'De\\123dBeEf'::bytea;
+  bytea   
+----------
+ DeSdBeEf
+(1 row)
+
 --
 -- test conversions between various string types
 -- E021-10 implicit casting among the character data types
diff --git a/src/test/regress/input/largeobject.source b/src/test/regress/input/largeobject.source
index 46ba926..43f6855 100644
--- a/src/test/regress/input/largeobject.source
+++ b/src/test/regress/input/largeobject.source
@@ -2,6 +2,8 @@
 -- Test large object support
 --
 
+SET bytea_output_hex TO false;
+
 -- Load a file
 CREATE TABLE lotest_stash_values (loid oid, fd integer);
 -- lo_creat(mode integer) returns oid
diff --git a/src/test/regress/output/largeobject.source b/src/test/regress/output/largeobject.source
index 9d69f6c..7012586 100644
--- a/src/test/regress/output/largeobject.source
+++ b/src/test/regress/output/largeobject.source
@@ -1,6 +1,7 @@
 --
 -- Test large object support
 --
+SET bytea_output_hex TO false;
 -- Load a file
 CREATE TABLE lotest_stash_values (loid oid, fd integer);
 -- lo_creat(mode integer) returns oid
diff --git a/src/test/regress/output/largeobject_1.source b/src/test/regress/output/largeobject_1.source
index 1fbc29c..0ece752 100644
--- a/src/test/regress/output/largeobject_1.source
+++ b/src/test/regress/output/largeobject_1.source
@@ -1,6 +1,7 @@
 --
 -- Test large object support
 --
+SET bytea_output_hex TO false;
 -- Load a file
 CREATE TABLE lotest_stash_values (loid oid, fd integer);
 -- lo_creat(mode integer) returns oid
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 99a9178..001de4e 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -1,3 +1,5 @@
+SET bytea_output_hex TO false;
+
 --
 -- create user defined conversion
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index a28c75a..8345534 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -43,6 +43,25 @@ SELECT U&'wrong: +0061' UESCAPE '+';
 
 RESET standard_conforming_strings;
 
+-- bytea
+SET bytea_output_hex = true;
+SELECT E'\\xDeAdBeEf'::bytea;
+SELECT E'\\xDeAdBeE'::bytea;
+SELECT E'\\xDeAdBeEx'::bytea;
+SELECT E'\\xDe00BeEf'::bytea;
+SELECT E'DeAdBeEf'::bytea;
+SELECT E'De\\000dBeEf'::bytea;
+SELECT E'De\123dBeEf'::bytea;
+SELECT E'De\\123dBeEf'::bytea;
+SELECT E'De\\678dBeEf'::bytea;
+
+SET bytea_output_hex = false;
+SELECT E'\\xDeAdBeEf'::bytea;
+SELECT E'\\xDe00BeEf'::bytea;
+SELECT E'DeAdBeEf'::bytea;
+SELECT E'De\\000dBeEf'::bytea;
+SELECT E'De\\123dBeEf'::bytea;
+
 --
 -- test conversions between various string types
 -- E021-10 implicit casting among the character data types

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] bytea vs. pg_dump

Reply via email to