Last January I submitted a patch to raptor that added a new parser option called 'loadDTD'. The patch hasn't been applied and there's been no subsequent discussion that I've seen. Perhaps it's my fault for submitting the patch in the wrong way or the wrong place, and if so, apologies.

However I still feel that the patch is of definite advantage to raptor, in particular in its handling of entities in RDFa where it's very common to see an example like this:

  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
      "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml"
        version="XHTML+RDFa 1.0" xml:lang="en">
    <head>
      <title>Test</title>
    </head>
    <body>
      <p>This page was written by
        <span xmlns:dc="http://purl.org/dc/elements/1.1/"
              property="dc:creator">Jos&eacute;</span>.</p>
    </body>
  </html>

Note the use of the HTML eacute entity in the name José. (This got mangled in the web archive last time round.) When I submitted the patch, rapper 2.0.6 couldn't parse this, and testing today with 2.0.9 it is still the case. If Redland is to be of use with real-world RDFa, without an otherwise unnecessary additional pre-processing stage, this needs fixing.

The argument that the W3C adds a 30s delay in serving the DTDs is largely irrelevant. With the patch, DTD loading happens only if you specifically request it, and if you have a suitable XML catalog, libxml2 won't fetch the DTD from the W3C but from a local copy on your machine. (Under Debian, the w3c-sgml-lib package installs such a catalog for you.) And in any case, if you really need to parse the entity, there will be cases when the 30s delay is quite acceptable.

Is it worth me reworking the patch so that it applies cleanly against the current code base?

Richard
diff -ur raptor2-2.0.6/configure.ac raptor2-2.0.6+patch/configure.ac
--- raptor2-2.0.6/configure.ac  2011-11-24 07:15:15.000000000 +0000
+++ raptor2-2.0.6+patch/configure.ac    2012-01-06 01:51:56.815669830 +0000
@@ -700,6 +700,16 @@
 
     AC_CHECK_FUNCS(xmlSAX2InternalSubset xmlCtxtUseOptions)
 
+    AC_MSG_CHECKING(if libxml has parser option XML_PARSE_DTDLOAD)
+    AC_TRY_LINK([
+#ifdef HAVE_LIBXML_PARSER_H
+#include <libxml/parser.h>
+#endif
+], [xmlParserOption foo; foo = XML_PARSE_DTDLOAD],
+                AC_MSG_RESULT(yes)
+               AC_DEFINE(RAPTOR_LIBXML_XML_PARSE_DTDLOAD, 1, [does libxml have 
XML_PARSE_DTDLOA]),
+               AC_MSG_RESULT(no))
+
     AC_MSG_CHECKING(if libxml has parser option XML_PARSE_NONET)
     AC_TRY_LINK([
 #ifdef HAVE_LIBXML_PARSER_H
diff -ur raptor2-2.0.6/librdfa/rdfa.c raptor2-2.0.6+patch/librdfa/rdfa.c
--- raptor2-2.0.6/librdfa/rdfa.c        2011-08-22 07:05:56.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.c  2012-01-06 09:59:25.158089322 +0000
@@ -1218,6 +1218,18 @@
    rdfa_init_context(context);
 
 #ifdef LIBRDFA_IN_RAPTOR
+  /* Optionally forbid network requests in the XML parser */
+  raptor_sax2_set_option(context->sax2, 
+                         RAPTOR_OPTION_NO_NET, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(context, 
RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(context->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(context, 
RAPTOR_OPTION_LOAD_DTD));
+#endif
+
+#ifdef LIBRDFA_IN_RAPTOR
    context->base_uri=raptor_new_uri(context->sax2->world, (const unsigned 
char*)context->base);
    raptor_sax2_parse_start(context->sax2, context->base_uri);
 #endif
diff -ur raptor2-2.0.6/librdfa/rdfa.h raptor2-2.0.6+patch/librdfa/rdfa.h
--- raptor2-2.0.6/librdfa/rdfa.h        2011-04-26 19:16:35.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.h  2012-01-06 10:03:37.046101513 +0000
@@ -233,6 +233,8 @@
    raptor_sax2* sax2;
    raptor_namespace_handler namespace_handler;
    void* namespace_handler_user_data;
+  raptor_object_options options;
+   
 #else
    XML_Parser parser;
 #endif
diff -ur raptor2-2.0.6/src/raptor2.h.in raptor2-2.0.6+patch/src/raptor2.h.in
--- raptor2-2.0.6/src/raptor2.h.in      2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor2.h.in        2012-01-06 02:04:21.895705896 
+0000
@@ -494,6 +494,7 @@
  * @RAPTOR_OPTION_WRITER_XML_VERSION: Integer XML version XML 1.0 (10) or XML 
1.1 (11)
  * @RAPTOR_OPTION_WRITER_XML_DECLARATION: Write XML 1.0 or 1.1 declaration.
  * @RAPTOR_OPTION_NO_NET: Deny network requests.
+ * @RAPTOR_OPTION_LOAD_DTD: Load document DTDs.
  * @RAPTOR_OPTION_RESOURCE_BORDER: Border color of resource
  *   nodes for GraphViz DOT serializer.
  * @RAPTOR_OPTION_LITERAL_BORDER: Border color of literal nodes
@@ -568,7 +569,8 @@
   RAPTOR_OPTION_WWW_CERT_FILENAME,
   RAPTOR_OPTION_WWW_CERT_TYPE,
   RAPTOR_OPTION_WWW_CERT_PASSPHRASE,
-  RAPTOR_OPTION_LAST = RAPTOR_OPTION_WWW_CERT_PASSPHRASE
+  RAPTOR_OPTION_LOAD_DTD,
+  RAPTOR_OPTION_LAST = RAPTOR_OPTION_LOAD_DTD
 } raptor_option;
 
 
diff -ur raptor2-2.0.6/src/raptor_config.h.in 
raptor2-2.0.6+patch/src/raptor_config.h.in
--- raptor2-2.0.6/src/raptor_config.h.in        2011-11-24 07:15:46.000000000 
+0000
+++ raptor2-2.0.6+patch/src/raptor_config.h.in  2012-01-06 01:55:06.359679001 
+0000
@@ -196,6 +196,9 @@
 /* does libxml xmlSAXHandler have initialized field */
 #undef RAPTOR_LIBXML_XMLSAXHANDLER_INITIALIZED
 
+/* does libxml have XML_PARSE_DTDLOAD */
+#undef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+
 /* does libxml have XML_PARSE_NONET */
 #undef RAPTOR_LIBXML_XML_PARSE_NONET
 
diff -ur raptor2-2.0.6/src/raptor_grddl.c raptor2-2.0.6+patch/src/raptor_grddl.c
--- raptor2-2.0.6/src/raptor_grddl.c    2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_grddl.c      2012-01-06 02:07:42.351715591 
+0000
@@ -878,6 +878,10 @@
       if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_NO_NET))
         libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+      if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+        libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
       xmlCtxtUseOptions(xc, libxml_options);
 #endif
@@ -1439,6 +1443,10 @@
       if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
         libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+      if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+        libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
       xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options);
 #endif
@@ -1488,6 +1496,10 @@
         if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
           options |= HTML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+        if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+          options |= XML_PARSE_DTDLOAD;
+#endif
 
         htmlCtxtUseOptions(grddl_parser->html_ctxt, options);
  
diff -ur raptor2-2.0.6/src/raptor_librdfa.c 
raptor2-2.0.6+patch/src/raptor_librdfa.c
--- raptor2-2.0.6/src/raptor_librdfa.c  2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_librdfa.c    2012-01-06 10:05:44.150107663 
+0000
@@ -267,6 +267,8 @@
   /* returns RDFa Processing Graph error triples - not used by raptor */
   rdfa_set_processor_graph_triple_handler(librdfa_parser->context, NULL);
 
+  librdfa_parser->context->options = rdf_parser->options;
+
   rc = rdfa_parse_start(librdfa_parser->context);
   if(rc != RDFA_PARSE_SUCCESS)
     return 1;
diff -ur raptor2-2.0.6/src/raptor_option.c 
raptor2-2.0.6+patch/src/raptor_option.c
--- raptor2-2.0.6/src/raptor_option.c   2011-08-01 03:02:22.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_option.c     2012-01-06 09:40:28.342034303 
+0000
@@ -277,6 +277,12 @@
     RAPTOR_OPTION_VALUE_TYPE_STRING,
     "wwwCertPassphrase",
     "SSL client certificate passphrase"
+  },
+  { RAPTOR_OPTION_LOAD_DTD,
+    (raptor_option_area)(RAPTOR_OPTION_AREA_PARSER | RAPTOR_OPTION_AREA_SAX2),
+    RAPTOR_OPTION_VALUE_TYPE_BOOL,
+    "loadDTD",
+    "Parsers and SAX2 XML Parser should load DTDs."
   }
 };
 
diff -ur raptor2-2.0.6/src/raptor_rdfxml.c 
raptor2-2.0.6+patch/src/raptor_rdfxml.c
--- raptor2-2.0.6/src/raptor_rdfxml.c   2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rdfxml.c     2012-01-06 02:09:14.807720071 
+0000
@@ -1001,6 +1001,11 @@
   raptor_sax2_set_option(rdf_xml_parser->sax2, 
                          RAPTOR_OPTION_NO_NET, NULL,
                          RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, 
RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(rdf_xml_parser->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, 
RAPTOR_OPTION_LOAD_DTD));
   
   raptor_sax2_parse_start(rdf_xml_parser->sax2, uri);
 
diff -ur raptor2-2.0.6/src/raptor_rss.c raptor2-2.0.6+patch/src/raptor_rss.c
--- raptor2-2.0.6/src/raptor_rss.c      2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rss.c        2012-01-06 02:11:18.495726048 
+0000
@@ -249,6 +249,11 @@
   raptor_sax2_set_option(rss_parser->sax2, 
                          RAPTOR_OPTION_NO_NET, NULL,
                          RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, 
RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(rss_parser->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, 
RAPTOR_OPTION_LOAD_DTD));
   
   raptor_sax2_parse_start(rss_parser->sax2, uri);
 
diff -ur raptor2-2.0.6/src/raptor_sax2.c raptor2-2.0.6+patch/src/raptor_sax2.c
--- raptor2-2.0.6/src/raptor_sax2.c     2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_sax2.c       2012-01-06 10:06:33.994110079 
+0000
@@ -518,6 +518,10 @@
     if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_NO_NET))
       libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+    if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_LOAD_DTD))
+      libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
     xmlCtxtUseOptions(xc, libxml_options);
 #endif
diff -ur raptor2-2.0.6/src/raptor_turtle_writer.c 
raptor2-2.0.6+patch/src/raptor_turtle_writer.c
--- raptor2-2.0.6/src/raptor_turtle_writer.c    2011-11-12 21:18:03.000000000 
+0000
+++ raptor2-2.0.6+patch/src/raptor_turtle_writer.c      2012-01-06 
02:11:56.555727893 +0000
@@ -704,6 +704,7 @@
       
     /* Shared */
     case RAPTOR_OPTION_NO_NET:
+    case RAPTOR_OPTION_LOAD_DTD:
 
     /* XML writer options */
     case RAPTOR_OPTION_RELATIVE_URIS:
_______________________________________________
redland-dev mailing list
[email protected]
http://lists.librdf.org/mailman/listinfo/redland-dev

Reply via email to