mbeckerle commented on a change in pull request #560:
URL: https://github.com/apache/daffodil/pull/560#discussion_r637054288
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,226 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ sf
+ }
+
+ /**
+ * This loads the DFDL schema as an XML Schema. This will
+ * check many more things (ex: UPA) about the DFDL schema other than
+ * just whether it validates against the XML Schema for DFDL schemas.
+ *
+ * Unfortunately, we don't have control over how Xerces loads up these
schemas
+ * (other than the resolver anyway), so we can't really redirect the way
+ * it issues error messages so that it properly lays blame at say, the
schema fragments
+ * inside an embedded schema of a TDML file.
+ *
+ * So if we want good file/line/column info from this, we have to give
+ * it a plain old file or resource, and not try to play games to get it to
+ * pick up the file/line/col information from attributes of the elements.
+ *
+ * Due to limitations in the xerces newSchema() method
+ * this method should be called only after loading
+ * the schema as a regular XML file, which itself insists
+ * on the XMLUtils.setSecureDefaults, so we don't need to
+ * further check that here.
+ */
+ def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+ // first we load it, with validation explicitly against the
+ // schema for DFDL Schemas.
+ load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes =
true)
+ //
+ // Then we validate explicitly so Xerces can check things
+ // such as for UPA violations
+ //
+ val inputSource = source.newInputSource()
+ val saxSource = new SAXSource(inputSource)
+ //
+ // We would like this saxSource to be created from an XMLReader
+ // so that we can call XMLUtils.setSecureDefaults on it.
+ // but we get strange errors if I do that, where every symbol
+ // in the schema has an unrecognized namespace prefix.
+ //
+ schemaFactory.newSchema(saxSource)
+ inputSource.getByteStream().close()
+ }
- def setValidation(flag: Boolean): Unit = {
- doValidation = flag
+ // $COVERAGE-OFF$
+ override def parser = {
+ Assert.usageError("not to be called.")
}
+ // $COVERAGE-ON$
/**
+ * Obtain and initialize parser which validates the schema is defined.
+ */
+ private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+ if (optSchemaURI.isEmpty) noSchemaParser
+ else {
+ val f = parserFactory()
+ val schema = schemaFromURI(optSchemaURI)
+ f.setSchema(schema)
+ parserFromFactory(f)
+ }
+ }
+
+ private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+ val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+ sf.setErrorHandler(errorHandler)
+ sf.setResourceResolver(resolver)
+ val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+ schema
+ }
+
+ private def parserFactory() = {
+ val f = new SAXParserFactoryImpl
+ f.setNamespaceAware(true)
+ f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ f.setValidating(false)// according to javadoc, just controls DTD validation
+ f.setFeature("http://xml.org/sax/features/validation", true)
+ // not recognized by SAXParserFactory
+ // f.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations",
true)
+ f.setFeature("http://apache.org/xml/features/validation/schema", true)
+
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ f
+ }
+
+ private lazy val noSchemaParser: SAXParser = {
+ parserFromFactory(parserFactory())
+ }
+
+ private def parserFromFactory(f: SAXParserFactory) = {
+ val p = f.newSAXParser()
+ //
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ val xrdr = p.getXMLReader()
+ XMLUtils.setSecureDefaults(xrdr)
+ xrdr.setErrorHandler(errorHandler)
+ // not recognized by XMLReader
+ // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ xrdr.setContentHandler(this)
+ //
+ // This is required to get the parse to really use our resolver.
+ // The setEntityResolver(resolver) does not work.
+ //
+
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver",
resolver)
+ p
+ }
+
+ /**
+ * This is the common routine called by all the load calls to actually
+ * carry out the loading of the schema.
+ *
+ * There are no calls to this in Daffodil code base as of this writing, but
+ * the base class scala.xml.factory.XMLLoader calls it.
+ *
* Does (optional) validation,
+ *
+ * @param source The URI for the XML document which may be a XML or DFDL
schema, or just XML data.
+ * @param optSchemaURI Optional URI for XML schema for the XML source
document.
+ * @param addPositionAttributes True to add dafint:file dafint:line
attributes to all elements.
+ * Defaults to false.
+ * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF.
This should usually be true,
+ * but some special case situations may require
preservation of CRLF/CR.
+ * @return an scala.xml.Node (Element actually) which is the document
element of the source.
*/
- def load(source: DaffodilSchemaSource): scala.xml.Node = {
- if (doValidation) {
- val inputSource = source.newInputSource()
- val xercesNode = xercesAdapter.load(inputSource) // validates
- inputSource.getByteStream().close()
-
- if (xercesNode == null) return null
- // Note: we don't call xercesAdapter.validateSchema(source)
- // here, because this is an XML loader, not necessarily
- // just a DFDL schema loader. So for example the doValidation flag
- // above could be telling us to validate a TDML file or not.
+ def load(source: DaffodilSchemaSource,
+ optSchemaURI: Option[URI],
+ addPositionAttributes: Boolean = false,
+ normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+ //
+ // First we invoke the validator to explicitly validate the XML against
+ // the XML Schema (not necessarily a DFDL schema), via the
+ // javax.xml.validation.Validator's validate method.
+ //
+ optSchemaURI.foreach { schemaURI =>
+
+ val validator = XercesValidator.fromURIs(Seq(schemaURI))
+ val inputStream = source.uriForLoading.toURL.openStream()
+ validator.validateXML(inputStream, errorHandler)
+ inputStream.close()
+ //
+ // Next we have to invoke a regular xerces loader, setup for validation
+ // because that will actually interpret things like xsi:schemaLocation
attributes
+ // of the root element.
+ //
+ // scopeStack.push(TopScope) // not in scala xml v2.0.0
+ val parser = parserFromURI(optSchemaURI)
+ val xrdr = parser.getXMLReader()
+ val saxSource = scala.xml.Source.fromSysId(source.uriForLoading.toString)
+ // parser.parse(source.uriForLoading.toURL.openStream(), this)
+ xrdr.parse(saxSource)
+ // scopeStack.pop() // not in scala xml v2.0.0
+ // no result, as the errors are reported separately
}
//
// To get reliable xml nodes including conversion of CDATA syntax into
// PCData nodes, we have to use a different loader.
//
- val constructingLoader = new
DaffodilConstructingLoader(source.uriForLoading, errorHandler)
+ val constructingLoader =
+ new DaffodilConstructingLoader(source.uriForLoading,
+ errorHandler, addPositionAttributes, normalizeCRLFtoLF)
val res = constructingLoader.load() // construct the XML objects for us.
constructingLoader.input.close()
res
}
- def validateSchema(source: DaffodilSchemaSource) =
xercesAdapter.validateSchema(source)
+ // We disallow any of these to force us of our own
+ // load(source, optSchemaURI) entry point.
+ // $COVERAGE-OFF$
+ private def noWay = Assert.usageError("Operation is not supported. Use
load(uri) or loadFile(file)")
Review comment:
Removed. No longer mixing in XMLLoader, so don't need these.
I had confused mixing in XMLLoader (which required these) with mixing in
NoBindingFactoryAdapter, which is needed so this loader can server as a
contentHandler.
##########
File path:
daffodil-core/src/main/scala/org/apache/daffodil/dsom/DFDLSchemaFile.scala
##########
@@ -134,20 +134,17 @@ final class DFDLSchemaFile(
lazy val isDFDLSchemaFile = iiXMLSchemaDocument.isDFDLSchema
+ private lazy val loader = new DaffodilXMLLoader(this)
+
lazy val iiXMLSchemaDocument = LV('iiXMLSchemaDocument) {
val res = loadXMLSchemaDocument(seenBefore, Some(this))
if (res.isDFDLSchema && sset.validateDFDLSchemas) {
//
// We validate DFDL schemas, only if validation is requested.
// Some things, tests generally, want to turn this validation off.
//
-
- val ldr = new DaffodilXMLLoader(this)
- ldr.setValidation(true)
- try {
- ldr.load(schemaSource) // validate as XML file with XML Schema for
DFDL Schemas
- ldr.validateSchema(schemaSource) // validate as XSD (catches UPA
errors for example)
- } catch {
+ try loader.validateAsXMLSchema(schemaSource) // validate as XSD (catches
UPA errors for example)
+ catch {
Review comment:
It's actually 3 passes. One in loadXMLSchemaDocument, which is really
just loading, with no specific schema unless one is referenced in an
xsi:schemaLocation. Second is loading it while validating against the
XML-schema-for-dfdl-schemas, the third is constructing a schema object from it,
which is the step that catches UPA errors, and other schema-content-specific
things. For a big DFDL schema (100 files) this will definitely take time.
You are correct this overhead could be substantial. We'd need to do a
performance study to find out.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler:
org.xml.sax.ErrorHandle
super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
}
+ /**
+ * To emulate the behavior of Xerces loader (standard scala loader)
+ * we have to normalize CRLF to LF, and solitary CR to LF.
+ *
+ * This is optional controlled by a constructor parameter.
+ */
+ override def text(pos: Int, txt: String): Text = {
+ val newText:String = {
+ if (normalizeCRLFtoLF) {
+ txt.
+ replaceAll("\r\n", "\n").
+ replaceAll("\r", "\n")
+ } else {
+ txt
+ }
+ }
+ //
+ // On MS-Windows it seems somehow that the TDML Runner
+ // loads expected infosets as XML, and they have CRLFs
+ // in them. Not sure how this happens.
+ //
+ if (normalizeCRLFtoLF)
+ Assert.invariant(!newText.contains("\r"))
+ super.text(pos, newText)
+ }
+
+ /**
+ * We override this to force the ConstrutingParser to process CDATA regions
+ * specially with an override-able method named cdata.
+ *
+ * Strangely, if you look at the implementation of this in the MarkupParser
+ * trait, it calls the handler for text, but then it ignores the result of
that
+ * and constructs a PCDATA node from the original text.
+ *
+ * It's possible this is a bug fix.
+ */
+ override def xCharData: NodeSeq = {
+ xToken("[CDATA[")
+ def mkResult(pos: Int, s: String): NodeSeq = {
+ val s1 = cdata(pos, s).text
+ PCData(s1)
+ }
+ xTakeUntil(mkResult, () => pos, "]]>")
+ }
Review comment:
Reported: https://github.com/scala/scala-xml/issues/528
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
Review comment:
(3) is not a validating parser, so can't be used as a replacement for (1)
We need (3) because it provides file/line number information, and also it
does CDATA properly.
(Someday I will open a ticket on Xerces about that.....)
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent:
DFDLTestSuite) {
* */
final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
+ // TODO: We want this default to be false eventually.
+ // The rationale is if people want a default namespace
+ // in their TDML defineSchemas, they should define it. We shouldn't tack
+ // such a sensitive thing on automatically. (81 tests fail in daffodil-test
+ // if you change this however.)
+ //
+ final val DEFAULT_USE_DEFAULT_NAMESPACE_VALUE = true
Review comment:
I think this is spurious. The code just below uses this, and almost
always is going to use this since the majority of tdml:defineSchema do not have
elementFormDefault='unqualified' in them. I've been adding them as I work on
variouso tests, but most still don't have this. That means below code *does*
access this constant.
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -376,19 +400,19 @@ class DFDLTestSuite private[tdml] (
System.gc()
Thread.sleep(1) // needed to give tools like jvisualvm ability to "grab
on" quickly
}
+ val testCase = testCaseMap.get(testName) // causes loading
if (isTDMLFileValid) {
Review comment:
Added a documentation comment near top of the DFDLTestSuite class just
before all the lazy members start.
The justification for all this laziness is ultimately that it's just too
complicated to do in the context of constructors.
##########
File path: project/Dependencies.scala
##########
@@ -48,7 +48,8 @@ object Dependencies {
lazy val test = Seq(
"junit" % "junit" % "4.13.2" % "it,test",
"com.novocode" % "junit-interface" % "0.11" % "it,test",
- "org.scalacheck" %% "scalacheck" % "1.15.4" % "it,test"
+ "org.scalacheck" %% "scalacheck" % "1.15.4" % "it,test",
+ "xmlunit" % "xmlunit" % "1.6" % "test"
Review comment:
removed.
##########
File path:
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -126,16 +125,21 @@ final class TDMLDFDLProcessorFactory private (
val diags = p.getDiagnostics
Left(diags)
} else {
- val dp =
+ val dp = {
+ // Always serialize the parser/unparser objects.
+ // This insures nothing is left uncomputed by lazy evaluation.
Review comment:
I've removed this from here. It now only saves and reloads if
useSerializedProcessor.
This is the compiler's problem to insure that everything is fully evaluated,
not code out here using the data processor.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,226 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
Review comment:
This is needed. We call setContentHandler(loader) passing the loader,
and that requires it to be a handler.
I was confusing myself. We no longer mix in XMLLoader, hence a bunch of
these non-usable methods are now removed below. That had nothing to do with
NoBindingFactoryAdapter.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler:
org.xml.sax.ErrorHandle
super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
}
+ /**
+ * To emulate the behavior of Xerces loader (standard scala loader)
+ * we have to normalize CRLF to LF, and solitary CR to LF.
+ *
+ * This is optional controlled by a constructor parameter.
+ */
+ override def text(pos: Int, txt: String): Text = {
+ val newText:String = {
+ if (normalizeCRLFtoLF) {
+ txt.
+ replaceAll("\r\n", "\n").
+ replaceAll("\r", "\n")
+ } else {
+ txt
+ }
+ }
+ //
+ // On MS-Windows it seems somehow that the TDML Runner
+ // loads expected infosets as XML, and they have CRLFs
+ // in them. Not sure how this happens.
+ //
+ if (normalizeCRLFtoLF)
+ Assert.invariant(!newText.contains("\r"))
Review comment:
Removed.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/validation/XercesValidator.scala
##########
@@ -89,28 +82,35 @@ class XercesValidator(schemaFileNames: Seq[String])
private val validator = new ThreadLocal[XercesValidatorImpl] {
override def initialValue(): XercesValidatorImpl =
- initializeValidator(schema.newValidator(), resolver)
+ initializeValidator(schema.newValidator, resolver)
}
- def validateXML(document: java.io.InputStream): ValidationResult = {
+ def validateXML(document: java.io.InputStream) =
+ validateXML(document, new XercesErrorHandler)
+
+ def validateXML(
+ document: java.io.InputStream,
+ eh: ErrorHandler): ValidationResult = {
+
val documentSource = new StreamSource(document)
// get the validator instance for this thread
val xv = validator.get()
- // create a new error handler for this execution
- val eh = new XercesErrorHandler
xv.setErrorHandler(eh)
// validate the document
xv.validate(documentSource)
- // error handler contents as daffodil result
- ValidationResult(eh.warnings, eh.errors)
+ eh match {
+ case xeh: XercesErrorHandler => ValidationResult(xeh.warnings,
xeh.errors)
+ case _ => ValidationResult.empty
+ }
Review comment:
Comments inserted
##########
File path:
daffodil-core/src/test/scala/org/apache/daffodil/processor/TestSAXUtils.scala
##########
@@ -217,8 +224,10 @@ object TestSAXUtils {
val msgs = pf.getDiagnostics.map { _.getMessage() }.mkString("\n")
fail("pf compile errors: " + msgs)
}
- pf.sset.root.erd.preSerialization // force evaluation of all compile-time
constructs
+ // pf.sset.root.erd.preSerialization // force evaluation of all
compile-time constructs
Review comment:
Moving this once and for all to SchemaSet.onPath method. Nobody should
have to worry about this any more.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ sf
+ }
+
+ /**
+ * This loads the DFDL schema as an XML Schema. This will
+ * check many more things (ex: UPA) about the DFDL schema other than
+ * just whether it validates against the XML Schema for DFDL schemas.
+ *
+ * Unfortunately, we don't have control over how Xerces loads up these
schemas
+ * (other than the resolver anyway), so we can't really redirect the way
+ * it issues error messages so that it properly lays blame at say, the
schema fragments
+ * inside an embedded schema of a TDML file.
+ *
+ * So if we want good file/line/column info from this, we have to give
+ * it a plain old file or resource, and not try to play games to get it to
+ * pick up the file/line/col information from attributes of the elements.
+ *
+ * Due to limitations in the xerces newSchema() method
+ * this method should be called only after loading
+ * the schema as a regular XML file, which itself insists
+ * on the XMLUtils.setSecureDefaults, so we don't need to
+ * further check that here.
+ */
+ def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+ // first we load it, with validation explicitly against the
+ // schema for DFDL Schemas.
+ load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes =
true)
+ //
+ // Then we validate explicitly so Xerces can check things
+ // such as for UPA violations
+ //
+ val inputSource = source.newInputSource()
+ val saxSource = new SAXSource(inputSource)
+ //
+ // We would like this saxSource to be created from an XMLReader
+ // so that we can call XMLUtils.setSecureDefaults on it.
+ // but we get strange errors if I do that, where every symbol
+ // in the schema has an unrecognized namespace prefix.
+ //
+ schemaFactory.newSchema(saxSource)
+ inputSource.getByteStream().close()
+ }
- def setValidation(flag: Boolean): Unit = {
- doValidation = flag
+ // $COVERAGE-OFF$
+ override def parser = {
+ Assert.usageError("not to be called.")
}
+ // $COVERAGE-ON$
/**
+ * Obtain and initialize parser which validates the schema is defined.
+ */
+ private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+ if (optSchemaURI.isEmpty) noSchemaParser
+ else {
+ val f = parserFactory()
+ val schema = schemaFromURI(optSchemaURI)
+ f.setSchema(schema)
+ parserFromFactory(f)
+ }
+ }
+
+ private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
Review comment:
done
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
Review comment:
The exact contract of what aspects of the features/properties which of
the XML/SAX parsers/loaders/factories/jaxp implement seems ad-hoc and opaque to
me. There's a big lack of type-safety in such code. I had to try many
combinations to determine where to modify what on XML objects. E.g., like
XMLSchemaFactory() in this case. XMLReader and SAXParser in other cases.
Our SAX API needs control over those two namespaces-related features, so
I'll remove those comments as they're just wrong.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler:
org.xml.sax.ErrorHandle
super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
}
+ /**
+ * To emulate the behavior of Xerces loader (standard scala loader)
+ * we have to normalize CRLF to LF, and solitary CR to LF.
+ *
+ * This is optional controlled by a constructor parameter.
+ */
+ override def text(pos: Int, txt: String): Text = {
+ val newText:String = {
+ if (normalizeCRLFtoLF) {
+ txt.
+ replaceAll("\r\n", "\n").
+ replaceAll("\r", "\n")
+ } else {
+ txt
+ }
+ }
+ //
+ // On MS-Windows it seems somehow that the TDML Runner
+ // loads expected infosets as XML, and they have CRLFs
+ // in them. Not sure how this happens.
Review comment:
Improved comments.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -68,14 +68,28 @@ object Position {
* wrap these with CDATA, and use this loader, not Xerces, then these will be
* preserved properly.
*
- * Also, enhanced to capture file/line/column info for every element and add it
+ * Also, enhanced so that when addPositionAttributes is true, it will capture
+ * file/line/column info for every element and add it
* as attributes onto each XML element.
*
* The way the constructing loader (aka ConstructingParser (for XML))
* gets positions is different. It is given just an offset into the document
file/stream,
* and it therefore must synthesize line number/col number info itself.
+ *
+ * @param uri URI for the XML to be loaded.
+ * @param errorHandler Called back on load errors.
+ * @param addPositionAttributes Use true if you want dafint:file,
+ * dafint:col, and dafint:line attributes.
+ * Defaults to false.
+ * @param normalizeCRLFtoLF Use true to emulate the scala XML load
+ * behavior of normalizing CRLF to LF, and solitary
CR to LF.
+ * Defaults to true. Should only be changed in
special circumstances
+ * as not normalizing CRLFs is non-standard for XML.
*/
-class DaffodilConstructingLoader(uri: URI, errorHandler:
org.xml.sax.ErrorHandler)
+class DaffodilConstructingLoader(uri: URI,
+ errorHandler: org.xml.sax.ErrorHandler,
+ addPositionAttributes: Boolean = false,
+ normalizeCRLFtoLF: Boolean = true)
Review comment:
To centralize the loader in one place so that one can insure it's always
set up to be secure, you then need that one loader to support different modes
of operation.
We only need the line/file position stuff when we're loading DFDL schemas,
and TDML files since those embed DFDL Schemas. Other XML loading (like TDML
infoset files) don't need that.
The normalize CRLF to LF stuff should always be true, as that is the
behavior of the regular xerces loader, but was not the case for the
ConstructingParser which we need to use because we need CDATA nodes preserved.
There was some code depending on this non-CRLF-normalizing behavior that had
crept in. I needed this flag to figure things out.
At this point, I could remove the normalizeCRLFtoLF flag, because it is only
used by tests that show it works.
Given how non-standard loading XML, while preserving CRLFs or CRs is, I
should probably remove it.
I am leaving this feature in for now, because this is, in effect, an
incompatible change to the TDML runner. It's possible people other than us have
TDML, and that TDML may be depending on the TDML Runner loading the tdml file,
or infoset XML files, with CRLFs preserved.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -86,6 +100,16 @@ class DaffodilConstructingLoader(uri: URI, errorHandler:
org.xml.sax.ErrorHandle
Source.fromInputStream(is, enc)
}, true) {
+ /**
+ * Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
+ */
+ // $COVERAGE-OFF$
+ override def parseDTD(): Unit = {
+ val e = makeSAXParseException(pos, "DOCTYPE is disallowed.")
+ throw e
+ }
+ // $COVERAGE-ON$
Review comment:
This is in case, under maintenance, someone descides not to do one of
the earlier load passes which would error-out on the doctype. In that case,
this is a "backup plan". I will insert comment to that effect.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ sf
+ }
+
+ /**
+ * This loads the DFDL schema as an XML Schema. This will
+ * check many more things (ex: UPA) about the DFDL schema other than
+ * just whether it validates against the XML Schema for DFDL schemas.
+ *
+ * Unfortunately, we don't have control over how Xerces loads up these
schemas
+ * (other than the resolver anyway), so we can't really redirect the way
+ * it issues error messages so that it properly lays blame at say, the
schema fragments
+ * inside an embedded schema of a TDML file.
+ *
+ * So if we want good file/line/column info from this, we have to give
+ * it a plain old file or resource, and not try to play games to get it to
+ * pick up the file/line/col information from attributes of the elements.
+ *
+ * Due to limitations in the xerces newSchema() method
+ * this method should be called only after loading
+ * the schema as a regular XML file, which itself insists
+ * on the XMLUtils.setSecureDefaults, so we don't need to
+ * further check that here.
+ */
+ def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+ // first we load it, with validation explicitly against the
+ // schema for DFDL Schemas.
+ load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes =
true)
+ //
+ // Then we validate explicitly so Xerces can check things
+ // such as for UPA violations
+ //
+ val inputSource = source.newInputSource()
+ val saxSource = new SAXSource(inputSource)
+ //
+ // We would like this saxSource to be created from an XMLReader
+ // so that we can call XMLUtils.setSecureDefaults on it.
+ // but we get strange errors if I do that, where every symbol
+ // in the schema has an unrecognized namespace prefix.
+ //
+ schemaFactory.newSchema(saxSource)
+ inputSource.getByteStream().close()
+ }
- def setValidation(flag: Boolean): Unit = {
- doValidation = flag
+ // $COVERAGE-OFF$
+ override def parser = {
+ Assert.usageError("not to be called.")
}
+ // $COVERAGE-ON$
/**
+ * Obtain and initialize parser which validates the schema is defined.
+ */
+ private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+ if (optSchemaURI.isEmpty) noSchemaParser
+ else {
+ val f = parserFactory()
+ val schema = schemaFromURI(optSchemaURI)
+ f.setSchema(schema)
+ parserFromFactory(f)
+ }
+ }
+
+ private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+ val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+ sf.setErrorHandler(errorHandler)
+ sf.setResourceResolver(resolver)
+ val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+ schema
+ }
+
+ private def parserFactory() = {
+ val f = new SAXParserFactoryImpl
+ f.setNamespaceAware(true)
+ f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ f.setValidating(false)// according to javadoc, just controls DTD validation
+ f.setFeature("http://xml.org/sax/features/validation", true)
+ // not recognized by SAXParserFactory
+ // f.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations",
true)
+ f.setFeature("http://apache.org/xml/features/validation/schema", true)
+
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ f
+ }
+
+ private lazy val noSchemaParser: SAXParser = {
+ parserFromFactory(parserFactory())
+ }
+
+ private def parserFromFactory(f: SAXParserFactory) = {
+ val p = f.newSAXParser()
+ //
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ val xrdr = p.getXMLReader()
+ XMLUtils.setSecureDefaults(xrdr)
+ xrdr.setErrorHandler(errorHandler)
+ // not recognized by XMLReader
+ // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ xrdr.setContentHandler(this)
+ //
+ // This is required to get the parse to really use our resolver.
+ // The setEntityResolver(resolver) does not work.
+ //
+
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver",
resolver)
+ p
+ }
+
+ /**
+ * This is the common routine called by all the load calls to actually
+ * carry out the loading of the schema.
+ *
+ * There are no calls to this in Daffodil code base as of this writing, but
+ * the base class scala.xml.factory.XMLLoader calls it.
+ *
* Does (optional) validation,
+ *
+ * @param source The URI for the XML document which may be a XML or DFDL
schema, or just XML data.
+ * @param optSchemaURI Optional URI for XML schema for the XML source
document.
+ * @param addPositionAttributes True to add dafint:file dafint:line
attributes to all elements.
+ * Defaults to false.
+ * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF.
This should usually be true,
+ * but some special case situations may require
preservation of CRLF/CR.
+ * @return an scala.xml.Node (Element actually) which is the document
element of the source.
*/
- def load(source: DaffodilSchemaSource): scala.xml.Node = {
- if (doValidation) {
- val inputSource = source.newInputSource()
- val xercesNode = xercesAdapter.load(inputSource) // validates
- inputSource.getByteStream().close()
-
- if (xercesNode == null) return null
- // Note: we don't call xercesAdapter.validateSchema(source)
- // here, because this is an XML loader, not necessarily
- // just a DFDL schema loader. So for example the doValidation flag
- // above could be telling us to validate a TDML file or not.
+ def load(source: DaffodilSchemaSource,
+ optSchemaURI: Option[URI],
+ addPositionAttributes: Boolean = false,
+ normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+ //
+ // First we invoke the validator to explicitly validate the XML against
+ // the XML Schema (not necessarily a DFDL schema), via the
+ // javax.xml.validation.Validator's validate method.
+ //
+ optSchemaURI.foreach { schemaURI =>
+
+ val validator = XercesValidator.fromURIs(Seq(schemaURI))
+ val inputStream = source.uriForLoading.toURL.openStream()
+ validator.validateXML(inputStream, errorHandler)
+ inputStream.close()
+ //
+ // Next we have to invoke a regular xerces loader, setup for validation
+ // because that will actually interpret things like xsi:schemaLocation
attributes
+ // of the root element.
+ //
+ // The check of xsi:schemaLocation schemas seems to be the only reason we
+ // have to run this additional test.
+ //
+ // Possibly xsi:noNamespaceSchemaLocation would have the same issue, but
as of
+ // this writing, we have no tests that use that.
+ //
+ // scopeStack.push(TopScope) // not in scala xml v2.0.0
Review comment:
done
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ sf
+ }
+
+ /**
+ * This loads the DFDL schema as an XML Schema. This will
+ * check many more things (ex: UPA) about the DFDL schema other than
+ * just whether it validates against the XML Schema for DFDL schemas.
+ *
+ * Unfortunately, we don't have control over how Xerces loads up these
schemas
+ * (other than the resolver anyway), so we can't really redirect the way
+ * it issues error messages so that it properly lays blame at say, the
schema fragments
+ * inside an embedded schema of a TDML file.
+ *
+ * So if we want good file/line/column info from this, we have to give
+ * it a plain old file or resource, and not try to play games to get it to
+ * pick up the file/line/col information from attributes of the elements.
+ *
+ * Due to limitations in the xerces newSchema() method
+ * this method should be called only after loading
+ * the schema as a regular XML file, which itself insists
+ * on the XMLUtils.setSecureDefaults, so we don't need to
+ * further check that here.
+ */
+ def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
Review comment:
renamed
##########
File path: daffodil-lib/src/main/resources/org/apache/daffodil/xsd/tdml.xsd
##########
@@ -129,6 +129,22 @@
<!-- We want the default to be 'unqualified' in the longer
term but
we leave it 'qualified here to prevent lots of tests from
breaking. -->
<attribute name="elementFormDefault" type="tns:elementFormDefaultType"
default="qualified"/>
+ <!-- we want the default for useDefaultNamespace to be false longer
term but
+ we leave it as true here to prevent lots of tests from breaking.
+ 81 tests to be exact. -->
+ <attribute name="useDefaultNamespace" type="xs:boolean" default="true">
+ <annotation><documentation>
+ Controls whether a xmlns="http://example.com" default namespace prefix
+ definition will be provided for the embedded schema.
+
+ This matters when the defined schema is used to validate the infoset
xml
+ before unparsing, or after parsing, as it affects the interpretation of
+ path expressions in the DFDL schema based on the
unqualifiedPathStepPolicy
+ tunable.
+
+ False is recommended since default namespaces can be confusing.
+ </documentation></annotation>
Review comment:
https://issues.apache.org/jira/browse/DAFFODIL-2523 created
It is hard to say whether this feature caused any actual problem, or was
added as part of diagnosing issues in the SAX stuff. I couldn't figure out what
was going on in the SAX stuff, and why there were default namespaces being
created, until I looked and saw default namespaces were being created
implicitly.
That's a bad idea generally.
In TDML files, really the default namespace should eliminate the need to
type "tdml:" everywhere. (Though the TDML language itself should be using
elementFormDefault 'unqualified' which would eliminate most of them. ) Or
inside a defineSchema one should be able to use the default namespace for the
XMLSchema namespace to avoid having to type xs: everywhere. Authors should have
that freedom anyway, but if defineSchema is inserting its own default
namespace, you are stuck with it.
##########
File path:
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Evaluatable.scala
##########
@@ -319,7 +319,7 @@ abstract class Evaluatable[+T <: AnyRef](protected val ci:
DPathCompileInfo, qNa
* Looks like an attribute definition, but the value part
* may contain XML-element-like syntax. If so it is either escaped with
* CDATA bracketing (preferred), or if there are quotes in it, then
- * it is escapified meaning " < etc.
+ * it is escapified meaning &quot; &lt; etc.
Review comment:
The scaladoc/javadoc actually wasn't looking right. You couldn't tell
what it was talking about. Think of this as fixing a typo in a comment.
##########
File path:
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
* do any validation either), however, once Daffodil starts processing the
* DFDL schema nodes, it resolves references using the same one true XML
catalog resolver.
*/
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+ extends NoBindingFactoryAdapter {
def this() = this(RethrowSchemaErrorHandler)
- def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+ private def resolver = DFDLCatalogResolver.get
- //
- // Controls whether we setup Xerces for validation or not.
- //
- final var doValidation: Boolean = true
+ /**
+ * UPA errors are detected by xerces if the schema-full-checking feature is
+ * turned on, AND if you inform xerces that it is reading an XML Schema
+ * (i.e., xsd).
+ *
+ * Detecting these requires that we do THREE passes
+ * 1) load the DFDL schema as an XML document. This validates it against the
XML Schema
+ * for DFDL schemas.
+ * 2) load the DFDL schema as an XSD - xerces then does lots of more
intensive checking
+ * of the schema
+ * 3) load the schema for our own consumption by Daffodil code. This uses the
+ * constructing parser so as to preserve CDATA regions (xerces just does the
wrong
+ * thing with those,...fatally so). Then our own semantic checking is
performed
+ * as part of compiling the DFDL schema.
+ *
+ * Checks like UPA are in step (2) above. They are coded algorithmically
+ * right into Xerces. This is accomplished by
+ * using the below SchemaFactory and SchemaFactory.newSchema calls. The
+ * newSchema call is what forces schema validation to take place.
+ */
+ private lazy val schemaFactory = {
+ val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+ sf.setResourceResolver(resolver)
+ sf.setErrorHandler(errorHandler)
+ sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+ // These are not recognized by a schemaFactory
+ // sf.setFeature("http://xml.org/sax/features/validation", true)
+ // sf.setFeature("http://apache.org/xml/features/validation/schema", true)
+ //
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ sf
+ }
+
+ /**
+ * This loads the DFDL schema as an XML Schema. This will
+ * check many more things (ex: UPA) about the DFDL schema other than
+ * just whether it validates against the XML Schema for DFDL schemas.
+ *
+ * Unfortunately, we don't have control over how Xerces loads up these
schemas
+ * (other than the resolver anyway), so we can't really redirect the way
+ * it issues error messages so that it properly lays blame at say, the
schema fragments
+ * inside an embedded schema of a TDML file.
+ *
+ * So if we want good file/line/column info from this, we have to give
+ * it a plain old file or resource, and not try to play games to get it to
+ * pick up the file/line/col information from attributes of the elements.
+ *
+ * Due to limitations in the xerces newSchema() method
+ * this method should be called only after loading
+ * the schema as a regular XML file, which itself insists
+ * on the XMLUtils.setSecureDefaults, so we don't need to
+ * further check that here.
+ */
+ def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+ // first we load it, with validation explicitly against the
+ // schema for DFDL Schemas.
+ load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes =
true)
+ //
+ // Then we validate explicitly so Xerces can check things
+ // such as for UPA violations
+ //
+ val inputSource = source.newInputSource()
+ val saxSource = new SAXSource(inputSource)
+ //
+ // We would like this saxSource to be created from an XMLReader
+ // so that we can call XMLUtils.setSecureDefaults on it.
+ // but we get strange errors if I do that, where every symbol
+ // in the schema has an unrecognized namespace prefix.
+ //
+ schemaFactory.newSchema(saxSource)
+ inputSource.getByteStream().close()
+ }
- def setValidation(flag: Boolean): Unit = {
- doValidation = flag
+ // $COVERAGE-OFF$
+ override def parser = {
+ Assert.usageError("not to be called.")
}
+ // $COVERAGE-ON$
/**
+ * Obtain and initialize parser which validates the schema is defined.
+ */
+ private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+ if (optSchemaURI.isEmpty) noSchemaParser
+ else {
+ val f = parserFactory()
+ val schema = schemaFromURI(optSchemaURI)
+ f.setSchema(schema)
+ parserFromFactory(f)
+ }
+ }
+
+ private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+ val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+ sf.setErrorHandler(errorHandler)
+ sf.setResourceResolver(resolver)
+ val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+ schema
+ }
+
+ private def parserFactory() = {
+ val f = new SAXParserFactoryImpl
+ f.setNamespaceAware(true)
+ f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+ f.setValidating(false)// according to javadoc, just controls DTD validation
+ f.setFeature("http://xml.org/sax/features/validation", true)
+ // not recognized by SAXParserFactory
+ // f.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations",
true)
+ f.setFeature("http://apache.org/xml/features/validation/schema", true)
+
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking",
true)
+ f
+ }
+
+ private lazy val noSchemaParser: SAXParser = {
+ parserFromFactory(parserFactory())
+ }
+
+ private def parserFromFactory(f: SAXParserFactory) = {
+ val p = f.newSAXParser()
+ //
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+ // Not allowed on a SAXParser
+ // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ val xrdr = p.getXMLReader()
+ XMLUtils.setSecureDefaults(xrdr)
+ xrdr.setErrorHandler(errorHandler)
+ // not recognized by XMLReader
+ // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic", true)
+ xrdr.setContentHandler(this)
+ //
+ // This is required to get the parse to really use our resolver.
+ // The setEntityResolver(resolver) does not work.
+ //
+
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver",
resolver)
+ p
+ }
+
+ /**
+ * This is the common routine called by all the load calls to actually
+ * carry out the loading of the schema.
+ *
+ * There are no calls to this in Daffodil code base as of this writing, but
+ * the base class scala.xml.factory.XMLLoader calls it.
+ *
* Does (optional) validation,
+ *
+ * @param source The URI for the XML document which may be a XML or DFDL
schema, or just XML data.
+ * @param optSchemaURI Optional URI for XML schema for the XML source
document.
+ * @param addPositionAttributes True to add dafint:file dafint:line
attributes to all elements.
+ * Defaults to false.
+ * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF.
This should usually be true,
+ * but some special case situations may require
preservation of CRLF/CR.
+ * @return an scala.xml.Node (Element actually) which is the document
element of the source.
*/
- def load(source: DaffodilSchemaSource): scala.xml.Node = {
- if (doValidation) {
- val inputSource = source.newInputSource()
- val xercesNode = xercesAdapter.load(inputSource) // validates
- inputSource.getByteStream().close()
-
- if (xercesNode == null) return null
- // Note: we don't call xercesAdapter.validateSchema(source)
- // here, because this is an XML loader, not necessarily
- // just a DFDL schema loader. So for example the doValidation flag
- // above could be telling us to validate a TDML file or not.
+ def load(source: DaffodilSchemaSource,
+ optSchemaURI: Option[URI],
+ addPositionAttributes: Boolean = false,
+ normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+ //
+ // First we invoke the validator to explicitly validate the XML against
+ // the XML Schema (not necessarily a DFDL schema), via the
+ // javax.xml.validation.Validator's validate method.
+ //
+ optSchemaURI.foreach { schemaURI =>
+
+ val validator = XercesValidator.fromURIs(Seq(schemaURI))
+ val inputStream = source.uriForLoading.toURL.openStream()
+ validator.validateXML(inputStream, errorHandler)
+ inputStream.close()
+ //
+ // Next we have to invoke a regular xerces loader, setup for validation
+ // because that will actually interpret things like xsi:schemaLocation
attributes
+ // of the root element.
+ //
+ // The check of xsi:schemaLocation schemas seems to be the only reason we
+ // have to run this additional test.
+ //
+ // Possibly xsi:noNamespaceSchemaLocation would have the same issue, but
as of
+ // this writing, we have no tests that use that.
+ //
+ // scopeStack.push(TopScope) // not in scala xml v2.0.0
+ val parser = parserFromURI(optSchemaURI)
+ val xrdr = parser.getXMLReader()
+ val saxSource = scala.xml.Source.fromSysId(source.uriForLoading.toString)
+ // parser.parse(source.uriForLoading.toURL.openStream(), this)
Review comment:
done
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent:
DFDLTestSuite) {
* */
final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
+ // TODO: We want this default to be false eventually.
Review comment:
DAFFODIL-2523
##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -450,6 +464,55 @@ object XMLUtils {
val SAX_NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces"
val SAX_NAMESPACE_PREFIXES_FEATURE =
"http://xml.org/sax/features/namespace-prefixes"
+ /**
+ * Always enable this feature (which disables doctypes).
+ */
+ val XML_DISALLOW_DOCTYPE_FEATURE =
"http://apache.org/xml/features/disallow-doctype-decl"
+
+ /**
+ * Always disable this. Might not be necessary if doctypes are disallowed.
+ */
+ val XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-parameter-entities"
+
+ /**
+ * Always disable this. Might not be necessary if doctypes are disallowed.
+ */
+ val XML_EXTERNAL_GENERAL_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-general-entities"
+
+ /**
+ * Sets properties that disable insecure XML reader behaviors.
+ * @param xmlReader - the reader to change feature settings on.
+ */
+ def setSecureDefaults(xmlReader: XMLReader) : Unit = {
+ // Try all of these. Then see if any of them were rejected.
+ try{ Seq(
+ Try { xmlReader.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true) },
+ Try { xmlReader.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true) },
+ Try {
xmlReader.setFeature(XMLUtils.XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE, false) },
+ Try { xmlReader.setFeature(XMLUtils.XML_EXTERNAL_GENERAL_ENTITIES_FEATURE,
false) }
+ // not available on an XMLReader
+ // xmlReader.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ // not available on an XMLReader
+ // xmlReader.setProperty(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // System.err.println("SAX XMLReader supports disallow properties: " +
xmlReader)
+ ).find(_.isFailure).getOrElse(()) // throws the first failure, after
attempting all of them.
+ } catch {
+ case e: SAXNotRecognizedException =>
+ //
+ // Our specific SAX XMLReader does not accept these options.
+ // But this is in daffodil-lib, and the DaffodilParseXMLReader is in
+ // runtime1, so we can't use the class directly here. So we play
+ // these class name tricks to see if the reader is ours, and
+ // only re-throw if it isn't.
+ //
+ val className = Misc.getNameFromClass(xmlReader)
+ if (!className.contains("DaffodilParseXMLReader")) {
+ // $COVERAGE-OFF$
+ Assert.abort(e) // bug.
Review comment:
Gone. However, I would say, that this pluggable XML readers stuff feels
like stuff of a bygone era when everyone thought there would be competition
around XML libraries, and pluggable XML technologies from vendors, etc. This
seems silly in retrospect. But maybe it is needed for backward bug-for-bug
compatibility in some cases.
For us, I don't think pluggable XMLReader makes any sense. We're far too
sensitive to exactly their characteristics.
I think we should explicitly check for the exact XMLReader classes we
support, and error if it isn't that.
Thoughts on this check?
##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -450,6 +464,55 @@ object XMLUtils {
val SAX_NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces"
val SAX_NAMESPACE_PREFIXES_FEATURE =
"http://xml.org/sax/features/namespace-prefixes"
+ /**
+ * Always enable this feature (which disables doctypes).
+ */
+ val XML_DISALLOW_DOCTYPE_FEATURE =
"http://apache.org/xml/features/disallow-doctype-decl"
+
+ /**
+ * Always disable this. Might not be necessary if doctypes are disallowed.
+ */
+ val XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-parameter-entities"
+
+ /**
+ * Always disable this. Might not be necessary if doctypes are disallowed.
+ */
+ val XML_EXTERNAL_GENERAL_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-general-entities"
+
+ /**
+ * Sets properties that disable insecure XML reader behaviors.
+ * @param xmlReader - the reader to change feature settings on.
+ */
+ def setSecureDefaults(xmlReader: XMLReader) : Unit = {
+ // Try all of these. Then see if any of them were rejected.
+ try{ Seq(
+ Try { xmlReader.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true) },
+ Try { xmlReader.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true) },
+ Try {
xmlReader.setFeature(XMLUtils.XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE, false) },
+ Try { xmlReader.setFeature(XMLUtils.XML_EXTERNAL_GENERAL_ENTITIES_FEATURE,
false) }
+ // not available on an XMLReader
+ // xmlReader.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+ // not available on an XMLReader
+ // xmlReader.setProperty(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+ // System.err.println("SAX XMLReader supports disallow properties: " +
xmlReader)
+ ).find(_.isFailure).getOrElse(()) // throws the first failure, after
attempting all of them.
+ } catch {
+ case e: SAXNotRecognizedException =>
+ //
+ // Our specific SAX XMLReader does not accept these options.
+ // But this is in daffodil-lib, and the DaffodilParseXMLReader is in
+ // runtime1, so we can't use the class directly here. So we play
+ // these class name tricks to see if the reader is ours, and
+ // only re-throw if it isn't.
+ //
Review comment:
I went with modifying the DaffodilXMLReader to support these. You can't
set them to other than their secure values, which are the default, or it will
throw.
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -251,10 +243,17 @@ class DFDLTestSuite private[tdml] (
* our loader here accumulates load-time errors here on the
* test suite object.
*/
- val loader = new DaffodilXMLLoader(errorHandler)
- loader.setValidation(validateTDMLFile)
+ lazy val loader = new DaffodilXMLLoader(errorHandler)
+ lazy val optTDMLSchema =
+ if (validateTDMLFile) {
+ val TDMLSchemaURI =
Misc.getRequiredResource("org/apache/daffodil/xsd/tdml.xsd")
Review comment:
done
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -264,66 +263,91 @@ class DFDLTestSuite private[tdml] (
tmpDir.mkdirs()
val src = UnitTestSchemaSource(tsNode, "", Some(tmpDir))
- loader.load(src)
+
+ loader.load(src, optTDMLSchema,
+ addPositionAttributes = true) // want line numbers for TDML
//
(tsNode, src.uriForLoading)
}
case tdmlFile: File => {
log(LogLevel.Info, "loading TDML file: %s", tdmlFile)
val uri = tdmlFile.toURI()
- val newNode = loader.load(URISchemaSource(uri))
+ val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+ addPositionAttributes = true)
val res = (newNode, uri)
log(LogLevel.Debug, "done loading TDML file: %s", tdmlFile)
res
}
case uri: URI => {
- val newNode = loader.load(URISchemaSource(uri))
+ val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+ addPositionAttributes = true)
val res = (newNode, uri)
res
}
case _ => Assert.usageError("not a Node, File, or URL")
} // end match
- lazy val isTDMLFileValid = !this.isLoadingError
+ lazy val ts = {
+ if (tsRaw eq null) {
+ // must have been a loader error
+ reportLoadingErrors()
+ } else {
+ tsRaw
+ }
+ }
+
+ lazy val isTDMLFileValid = {
+ (ts ne null) &&
+ !loadingExceptions.headOption.isDefined
+ }
+
+ def reportLoadingErrors(): Nothing = {
+ log(LogLevel.Error, "TDML file %s is not valid.", tsURI)
+ throw TDMLException(loadingExceptions.toSeq, None)
+ }
var checkAllTopLevel: Boolean = compileAllTopLevel
def setCheckAllTopLevel(flag: Boolean): Unit = {
checkAllTopLevel = flag
}
- val parserTestCases = (ts \ "parserTestCase").map { node =>
ParserTestCase(node, this) }
+ lazy val parserTestCases = (ts \ "parserTestCase").map { node =>
ParserTestCase(node, this) }
//
// Note: IBM started this TDML file format. They call an unparser test a
"serializer" test.
// We call it an UnparserTestCase
//
- val unparserTestCases = (ts \ "unparserTestCase").map { node =>
UnparserTestCase(node, this) }
- val testCases: Seq[TestCase] = parserTestCases ++ unparserTestCases
- val duplicateTestCases = testCases.groupBy {
- _.tcName
- }.filter { case (name, seq) => seq.length > 1 }
- if (duplicateTestCases.nonEmpty) {
- val listOfDups = duplicateTestCases.map{ case(name, _) => name}
- throw new TDMLExceptionImpl(
- "More than one test for names: " + listOfDups.mkString(","), None)
- }
- val testCaseMap = testCases.map { tc => (tc.tcName -> tc) }.toMap
- val suiteName = (ts \ "@suiteName").text
- val suiteID = (ts \ "@ID").text
- val description = (ts \ "@description").text
- val defaultRoundTrip = {
+ lazy val unparserTestCases = (ts \ "unparserTestCase").map { node =>
UnparserTestCase(node, this) }
+ lazy val testCases = {
+ val tcs: Seq[TestCase] = parserTestCases ++ unparserTestCases
+ val dups = tcs.groupBy {
+ _.tcName
+ }.filter { case (name, seq) => seq.length > 1 }
+ if (dups.nonEmpty) {
+ val listOfDups = dups.map{ case(name, _) => name}
+ throw new TDMLExceptionImpl(
+ "More than one test for names: " + listOfDups.mkString(","), None)
Review comment:
Added TDML file to the message.
I did not find the ticket for duplicate issues. This only detects duplicates
in the same TDML file. I know there was at one time a ticket about duplicates
across files.
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent:
DFDLTestSuite) {
* */
final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
+ // TODO: We want this default to be false eventually.
+ // The rationale is if people want a default namespace
+ // in their TDML defineSchemas, they should define it. We shouldn't tack
+ // such a sensitive thing on automatically. (81 tests fail in daffodil-test
+ // if you change this however.)
+ //
+ final val DEFAULT_USE_DEFAULT_NAMESPACE_VALUE = true
+
val name = (xml \ "@name").text.toString
val elementFormDefault = {
val value = (xml \ "@elementFormDefault").text.toString
if (value == "") DEFAULT_ELEMENT_FORM_DEFAULT_VALUE
else value
}
+ val useDefaultNamespace = {
+ val value = (xml \ "@useDefaultNamespace").text.toString
+ if (value == "") DEFAULT_USE_DEFAULT_NAMESPACE_VALUE
+ else value.toBoolean
Review comment:
added test testDefineSchemaWithNoDefaultNamespace
##########
File path:
daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
##########
@@ -1027,6 +1028,7 @@ class TestScalaAPI {
// prep for SAX unparse
val unparseContentHandler = dp.newContentHandlerInstance(wbc)
val unparseXMLReader =
javax.xml.parsers.SAXParserFactory.newInstance.newSAXParser.getXMLReader
+ XMLUtils.setSecureDefaults(unparseXMLReader)
Review comment:
Removed. Uses a local version of this function now to illustrate the
technique.
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -264,66 +263,91 @@ class DFDLTestSuite private[tdml] (
tmpDir.mkdirs()
val src = UnitTestSchemaSource(tsNode, "", Some(tmpDir))
- loader.load(src)
+
+ loader.load(src, optTDMLSchema,
+ addPositionAttributes = true) // want line numbers for TDML
//
(tsNode, src.uriForLoading)
}
case tdmlFile: File => {
log(LogLevel.Info, "loading TDML file: %s", tdmlFile)
val uri = tdmlFile.toURI()
- val newNode = loader.load(URISchemaSource(uri))
+ val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+ addPositionAttributes = true)
val res = (newNode, uri)
log(LogLevel.Debug, "done loading TDML file: %s", tdmlFile)
res
}
case uri: URI => {
- val newNode = loader.load(URISchemaSource(uri))
+ val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+ addPositionAttributes = true)
val res = (newNode, uri)
res
}
case _ => Assert.usageError("not a Node, File, or URL")
} // end match
- lazy val isTDMLFileValid = !this.isLoadingError
+ lazy val ts = {
+ if (tsRaw eq null) {
+ // must have been a loader error
+ reportLoadingErrors()
+ } else {
+ tsRaw
+ }
+ }
+
+ lazy val isTDMLFileValid = {
+ (ts ne null) &&
+ !loadingExceptions.headOption.isDefined
Review comment:
This is actually a pretty tricky area. There is code where we actually
snapshot the loadingExceptions, do more loading, then look to see if it is
bigger. The code is, unfortunately, using state and timing.
This is due to this whole content-handler/error-handler event-style
approach. We should rewrite this to make loading return an
```
Either[Seq[Exception], (Node, Seq[Exception])
```
So you get back either the loading exceptions because it failed, or a loaded
Node, and sequence of warnings.
That would make this code much clearer and less fragile. Not a problem for
this change set though.
##########
File path:
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/SchemaCache.scala
##########
@@ -33,6 +33,21 @@ object ENoWarnTDML { EqualitySuppressUnusedImportWarning() }
*/
object SchemaDataProcessorCache extends SchemaCache[(Seq[Diagnostic],
DFDL.DataProcessor), Seq[Diagnostic]]
+object SchemaCache {
+ /**
+ * A key object for caching compiled DFDL schemas. Everything that effects
compilation
+ * must be captured here.
+ */
+ case class Key (
+ uri: URISchemaSource, // The DFDL schema's URI
+ useSerializedProcessor: Boolean,
+ compileAllTopLevels: Boolean,
+ optRootName: Option[String],
+ optRootNamespace: Option[String],
+ tunables: Map[String, String]) // tunables map - required since tunables
effect compilation
+ // (ex: parseUnparsePolicy, unqualifiedPathStepPolicy, perhaps others)
+
+}
Review comment:
This is part of fixing a bug where we needed the tunables in the
SchemaCache key or they would return a cached result incorrectly when tunables
had changed the compilation parameters around namespaces. That got aggravated
as part of debugging SAX tests with the various namespace calls.
It's an interesting point though. If as part of debugging a problem you fix
things and improve them that turn out to have been unrelated to your orignal
problem, do you back out those changes? Often it's not easy to identify what
goes with what, and one could spend a lot of time backing things out and
breaking stuff to isolate the changes.
I cannot tell you if, in conjunction with the other changes, this change to
the schema cache would still cause a failure. But it was not correct before
because the tunables weren't part of the key. So this is more correct, albeit
possibly unrelated. I really just do not know any more if this fix would have
mattered.
##########
File path:
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter:
ScalaXMLInfosetInputter, others: Se
override def getSimpleText(primType: NodeInfo.Kind): String = {
val res = scalaInputter.getSimpleText(primType)
val resIsEmpty = res == null || res == ""
- val othersmatch = others.forall { i =>
- val st = i.getSimpleText(primType)
+ val otherStrings = others.map { i =>
+ val firstVersion = i.getSimpleText(primType)
+ val finalVersion = i match {
+ case _ if (firstVersion eq null) => ""
+ case jsonii: JsonInfosetInputter =>
firstVersion.replaceAll("(\r\n|\r)", "\n").
+ replaceAll("\r", "\n")
+ case _ => firstVersion
+ }
+ finalVersion
+ }
+ val othersmatch = otherStrings.forall { case st: String =>
val stIsEmpty = st == null || res == ""
Review comment:
Good catch. I'll fix this and see if anything breaks.
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -2294,29 +2361,60 @@ sealed abstract class DocumentPart(part: Node, parent:
Document) {
}
case class Infoset(i: NodeSeq, parent: TestCase) {
- val Seq(dfdlInfoset) = (i \ "dfdlInfoset").map { node => DFDLInfoset(node,
this) }
- val contents = dfdlInfoset.contents
+ lazy val Seq(dfdlInfoset) = (i \ "dfdlInfoset").map { node =>
DFDLInfoset(node, this) }
+ lazy val contents = dfdlInfoset.contents
}
case class DFDLInfoset(di: Node, parent: Infoset) {
- val infosetNodeSeq = {
- (di \ "@type").toString match {
- case "infoset" | "" => di.child.filter {
- _.isInstanceOf[scala.xml.Elem]
- }
- case "file" => {
- val path = di.text.trim()
- val maybeURI = parent.parent.parent.findTDMLResource(path)
- val uri = maybeURI.getOrElse(throw new
FileNotFoundException("TDMLRunner: infoset file '" + path + "' was not found"))
- val elem = scala.xml.XML.load(uri.toURL)
- elem
+ private lazy val infosetNodeSeq = {
+ val testCase: TestCase = parent.parent
+ val loader = testCase.parent.loader
+ val optDataSchema: Option[URI] = {
+
testCase.optSchemaFileURI.orElse(testCase.optEmbeddedSchema.map{_.uriForLoading})
+ }
+ val src =
+ (di \ "@type").toString match {
+ case "infoset" | "" => {
+ val rawElem = di.child.filter {
+ _.isInstanceOf[scala.xml.Elem]
+ }.head
+ UnitTestSchemaSource(rawElem, testCase.tcName)
+ }
+ case "file" => {
+ val path = di.text.trim()
+ val maybeURI = parent.parent.parent.findTDMLResource(path)
+ val uri = maybeURI.getOrElse(throw new
FileNotFoundException("TDMLRunner: infoset file '" + path + "' was not found"))
+ URISchemaSource(uri)
+ }
+ case value => Assert.abort("Uknown value for type attribute on
dfdlInfoset: " + value)
}
- case value => Assert.abort("Uknown value for type attribute on
dfdlInfoset: " + value)
+ //
+ // TODO: DAFFODIL-288 validate the infoset also
+ // You can pass the optDataSchema, which appears to be the correct thing
+ // but in many cases it doesn't seem to be able to resolve things.
+ //
+ val testSuite = testCase.parent
+ val before = testSuite.loadingExceptions.clone()
+ // val elem: Node = loader.load(src, optDataSchema)
+ val elem = loader.load(src, None) // no schema
Review comment:
Really the right fix is to get rid of all this errorHandler event
callback stuff, and have the load call return an object that contains loader
exceptions along with the Node loaded.
Is that worth doing now?
##########
File path:
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter:
ScalaXMLInfosetInputter, others: Se
override def getSimpleText(primType: NodeInfo.Kind): String = {
val res = scalaInputter.getSimpleText(primType)
val resIsEmpty = res == null || res == ""
- val othersmatch = others.forall { i =>
- val st = i.getSimpleText(primType)
+ val otherStrings = others.map { i =>
+ val firstVersion = i.getSimpleText(primType)
+ val finalVersion = i match {
+ case _ if (firstVersion eq null) => ""
+ case jsonii: JsonInfosetInputter =>
firstVersion.replaceAll("(\r\n|\r)", "\n").
+ replaceAll("\r", "\n")
Review comment:
fixed
##########
File path:
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter:
ScalaXMLInfosetInputter, others: Se
override def getSimpleText(primType: NodeInfo.Kind): String = {
val res = scalaInputter.getSimpleText(primType)
val resIsEmpty = res == null || res == ""
- val othersmatch = others.forall { i =>
- val st = i.getSimpleText(primType)
+ val otherStrings = others.map { i =>
+ val firstVersion = i.getSimpleText(primType)
+ val finalVersion = i match {
+ case _ if (firstVersion eq null) => ""
+ case jsonii: JsonInfosetInputter =>
firstVersion.replaceAll("(\r\n|\r)", "\n").
+ replaceAll("\r", "\n")
+ case _ => firstVersion
+ }
+ finalVersion
+ }
+ val othersmatch = otherStrings.forall { case st: String =>
val stIsEmpty = st == null || res == ""
val areSame = res == st || (resIsEmpty && stIsEmpty)
- if (areSame) {
- true
- } else {
- if (i.isInstanceOf[JsonInfosetInputter]) {
- // the json infoset inputter maintains CRLF/CR, but XML converts
CRLF/CR to
- // LF. So if this is Json, then compare with the CRLF/CR converted
to LF
Review comment:
added.
##########
File path:
daffodil-tdml-processor/src/test/scala/org/apache/daffodil/tdml/TestTDMLUnparseCases.scala
##########
@@ -49,17 +47,18 @@ class TestTDMLUnparseCases {
</ts:unparserTestCase>
</ts:testSuite>
lazy val ts = new DFDLTestSuite(testSuite)
- val tc: UnparserTestCase = ts.unparserTestCases.find { utc => utc.tcName
== "test1" }.get
- // println(tc)
- tc.document
- val is = tc.inputInfoset
- // println(is)
- val bar = is.dfdlInfoset.contents
- val scala.xml.Elem(pre, label, _, _, _) = bar
- assertEquals("ex", pre)
- assertEquals("bar", label)
+// val tc: UnparserTestCase = ts.unparserTestCases.find { utc => utc.tcName
== "test1" }.get
+// // println(tc)
+// tc.document
+// val is = tc.inputInfoset
+// // println(is)
+// val bar = is.dfdlInfoset.contents
+// val scala.xml.Elem(pre, label, _, _, _) = bar
+// assertEquals("ex", pre)
+// assertEquals("bar", label)
// ts.trace
ts.runOneTest("test1")
+ println(ts.loadingDiagnosticMessages)
Review comment:
yup
##########
File path:
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -376,19 +400,19 @@ class DFDLTestSuite private[tdml] (
System.gc()
Thread.sleep(1) // needed to give tools like jvisualvm ability to "grab
on" quickly
}
+ val testCase = testCaseMap.get(testName) // causes loading
if (isTDMLFileValid) {
- loadingExceptions.foreach { le => log(LogLevel.Warning, le.toString) }
- val testCase = testCaseMap.get(testName)
+ // display warnings if any
+ // too verbose. removing
+ // loadingExceptions.foreach { le => log(LogLevel.Warning, le.toString)
}
Review comment:
https://issues.apache.org/jira/browse/DAFFODIL-2410 is this bug.
I annotated the comment here with the bug ID.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]