Peter Eisentraut wrote:
On lör, 2010-07-03 at 09:26 +0100, Mike Fowler wrote:
What I will do instead is implement the xml_is_well_formed function and get a patch out in the next day or two.

That sounds very useful.
Here's the patch to add the 'xml_is_well_formed' function. Paraphrasing the SGML the syntax is:

|xml_is_well_formed|(/text/)

The function |xml_is_well_formed| evaluates whether the /text/ is well formed XML content, returning a boolean. I've done some tests (included in the patch) with tables containing a mixture of well formed documents and content and the function is happily returning the expected result. Combining with IS (NOT) DOCUMENT is working nicely for pulling out content or documents from a table of text.

Unless I missed something in the original correspondence, I think this patch will solve the issue.

Regards,

--
Mike Fowler
Registered Linux user: 379787

*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 8554,8562 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
  ]]></screen>
      </para>
     </sect3>
  
     <sect3>
!     <title>XML Predicates</title>
  
      <indexterm>
       <primary>IS DOCUMENT</primary>
--- 8554,8566 ----
  ]]></screen>
      </para>
     </sect3>
+   </sect2>
+ 
+   <sect2>
+    <title>XML Predicates</title>
  
     <sect3>
!     <title>IS DOCUMENT</title>
  
      <indexterm>
       <primary>IS DOCUMENT</primary>
***************
*** 8574,8579 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
--- 8578,8653 ----
       between documents and content fragments.
      </para>
     </sect3>
+ 
+    <sect3>
+     <title>xml_is_well_formed</title>
+ 
+     <indexterm>
+      <primary>xml_is_well_formed</primary>
+      <secondary>well formed</secondary>
+     </indexterm>
+ 
+ <synopsis>
+ <function>xml_is_well_formed</function>(<replaceable>text</replaceable>)
+ </synopsis>
+ 
+     <para>
+      The function <function>xml_is_well_formed</function> evaluates whether
+      the <replaceable>text</replaceable> is well formed XML content, returning
+      a boolean.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+ ]]></screen>
+     </para>
+     <para>
+     This function can be combined with the IS DOCUMENT predicate to prevent
+     invalid XML content errors from occuring in queries. For example, given a
+     table that may have rows with invalid XML mixed in with rows of valid
+     XML, <function>xml_is_well_formed</function> can be used to filter out all
+     the invalid rows.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT * FROM mixed;
+              data
+ ------------------------------
+  <foo>bar</foo>
+  <foo>bar</foo
+  <foo>bar</foo><bar>foo</bar>
+  <foo>bar</foo><bar>foo</bar
+ (4 rows)
+ 
+ SELECT COUNT(data) FROM mixed WHERE data::xml IS DOCUMENT;
+ ERROR:  invalid XML content
+ DETAIL:  Entity: line 1: parser error : expected '>'
+ <foo>bar</foo
+              ^
+ Entity: line 1: parser error : chunk is not well balanced
+ <foo>bar</foo
+              ^
+ 
+ SELECT COUNT(data) FROM mixed WHERE xml_is_well_formed(data) AND data::xml IS DOCUMENT;
+  count
+ -------
+      1
+ (1 row)
+ ]]></screen>
+     </para>
+    </sect3>
    </sect2>
  
    <sect2 id="functions-xml-processing">
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 3293,3298 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
--- 3293,3365 ----
  }
  #endif
  
+ Datum
+ xml_is_well_formed(PG_FUNCTION_ARGS)
+ {
+ #ifdef USE_LIBXML
+ 	text				*data = PG_GETARG_TEXT_P(0);
+ 	bool				result;
+ 	int					res_code;
+ 	int32				len;
+ 	const xmlChar		*string;
+ 	xmlParserCtxtPtr	ctxt;
+ 	xmlDocPtr			doc = NULL;
+ 
+ 	len = VARSIZE(data) - VARHDRSZ;
+ 	string = xml_text2xmlChar(data);
+ 
+ 	/* Start up libxml and its parser (no-ops if already done) */
+ 	pg_xml_init();
+ 	xmlInitParser();
+ 
+ 	ctxt = xmlNewParserCtxt();
+ 	if (ctxt == NULL)
+ 		xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+ 					"could not allocate parser context");
+ 
+ 	PG_TRY();
+ 	{
+ 		size_t		count;
+ 		xmlChar    *version = NULL;
+ 		int			standalone = -1;
+ 
+ 		res_code = parse_xml_decl(string, &count, &version, NULL, &standalone);
+ 		if (res_code != 0)
+ 			xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
+ 						  "invalid XML content: invalid XML declaration",
+ 							res_code);
+ 
+ 		doc = xmlNewDoc(version);
+ 		doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
+ 		doc->standalone = 1;
+ 
+ 		res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL);
+ 
+ 		result = !res_code;
+ 	}
+ 	PG_CATCH();
+ 	{
+ 		if (doc)
+ 			xmlFreeDoc(doc);
+ 		if (ctxt)
+ 			xmlFreeParserCtxt(ctxt);
+ 
+ 		PG_RE_THROW();
+ 	}
+ 	PG_END_TRY();
+ 
+ 	if (doc)
+ 		xmlFreeDoc(doc);
+ 	if (ctxt)
+ 		xmlFreeParserCtxt(ctxt);
+ 
+ 	return result;
+ #else
+ 	NO_XML_SUPPORT();
+ 	return 0;
+ #endif
+ }
+ 
  
  /*
   * Evaluate XPath expression and return array of XML values.
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4385,4390 **** DESCR("evaluate XPath expression, with namespaces support");
--- 4385,4393 ----
  DATA(insert OID = 2932 (  xpath		 PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_ "select pg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ ));
  DESCR("evaluate XPath expression");
  
+ DATA(insert OID = 3037 (  xml_is_well_formed	PGNSP PGUID 12 1 0 0 f f f t f i 1 0 16 "25" _null_ _null_ _null_ _null_ xml_is_well_formed _null_ _null_ _null_ ));
+ DESCR("determine if a text fragment is well formed XML");
+ 
  /* uuid */
  DATA(insert OID = 2952 (  uuid_in		   PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_ _null_ uuid_in _null_ _null_ _null_ ));
  DESCR("I/O");
*** a/src/include/utils/xml.h
--- b/src/include/utils/xml.h
***************
*** 46,51 **** extern Datum query_to_xmlschema(PG_FUNCTION_ARGS);
--- 46,52 ----
  extern Datum cursor_to_xmlschema(PG_FUNCTION_ARGS);
  extern Datum table_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
  extern Datum query_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
+ extern Datum xml_is_well_formed(PG_FUNCTION_ARGS);
  
  extern Datum schema_to_xml(PG_FUNCTION_ARGS);
  extern Datum schema_to_xmlschema(PG_FUNCTION_ARGS);
*** a/src/test/regress/expected/xml.out
--- b/src/test/regress/expected/xml.out
***************
*** 502,504 **** SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
--- 502,565 ----
   {<b>two</b>,<b>etc</b>}
  (1 row)
  
+ -- Test xml_is_well_formed
+ SELECT xml_is_well_formed('<>');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('abc');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<abc/>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
*** a/src/test/regress/sql/xml.sql
--- b/src/test/regress/sql/xml.sql
***************
*** 163,165 **** SELECT xpath('', '<!-- error -->');
--- 163,179 ----
  SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
  SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]);
  SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
+ 
+ -- Test xml_is_well_formed
+ 
+ SELECT xml_is_well_formed('<>');
+ SELECT xml_is_well_formed('abc');
+ SELECT xml_is_well_formed('<abc/>');
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+ 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to