[GitHub] [daffodil] mbeckerle commented on a change in pull request #560: Verify that we get errors if the files contain a DOCTYPE

GitBox Fri, 21 May 2021 13:35:59 -0700


mbeckerle commented on a change in pull request #560:
URL: https://github.com/apache/daffodil/pull/560#discussion_r637054288




##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,226 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    sf
+  }
+
+  /**
+   * This loads the DFDL schema as an XML Schema. This will
+   * check many more things (ex: UPA) about the DFDL schema other than
+   * just whether it validates against the XML Schema for DFDL schemas.
+   *
+   * Unfortunately, we don't have control over how Xerces loads up these 
schemas
+   * (other than the resolver anyway), so we can't really redirect the way
+   * it issues error messages so that it properly lays blame at say, the 
schema fragments
+   * inside an embedded schema of a TDML file.
+   *
+   * So if we want good file/line/column info from this, we have to give
+   * it a plain old file or resource, and not try to play games to get it to
+   * pick up the file/line/col information from attributes of the elements.
+   *
+   * Due to limitations in the xerces newSchema() method
+   * this method should be called only after loading
+   * the schema as a regular XML file, which itself insists
+   * on the XMLUtils.setSecureDefaults, so we don't need to
+   * further check that here.
+   */
+  def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+    // first we load it, with validation explicitly against the
+    // schema for DFDL Schemas.
+    load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = 
true)
+    //
+    // Then we validate explicitly so Xerces can check things
+    // such as for UPA violations
+    //
+    val inputSource = source.newInputSource()
+    val saxSource = new SAXSource(inputSource)
+    //
+    // We would like this saxSource to be created from an XMLReader
+    // so that we can call XMLUtils.setSecureDefaults on it.
+    // but we get strange errors if I do that, where every symbol
+    // in the schema has an unrecognized namespace prefix.
+    //
+    schemaFactory.newSchema(saxSource)
+    inputSource.getByteStream().close()
+  }
 
-  def setValidation(flag: Boolean): Unit = {
-    doValidation = flag
+  // $COVERAGE-OFF$
+  override def parser = {
+    Assert.usageError("not to be called.")
   }
+  // $COVERAGE-ON$
 
   /**
+   * Obtain and initialize parser which validates the schema is defined.
+   */
+  private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+    if (optSchemaURI.isEmpty) noSchemaParser
+    else {
+      val f = parserFactory()
+      val schema = schemaFromURI(optSchemaURI)
+      f.setSchema(schema)
+      parserFromFactory(f)
+    }
+  }
+
+  private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+    val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+    sf.setErrorHandler(errorHandler)
+    sf.setResourceResolver(resolver)
+    val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+    schema
+  }
+
+  private def parserFactory() = {
+    val f = new SAXParserFactoryImpl
+    f.setNamespaceAware(true)
+    f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    f.setValidating(false)// according to javadoc, just controls DTD validation
+    f.setFeature("http://xml.org/sax/features/validation";, true)
+    // not recognized by SAXParserFactory
+    // f.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations";, 
true)
+    f.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    f
+  }
+
+  private lazy val noSchemaParser: SAXParser = {
+    parserFromFactory(parserFactory())
+  }
+
+  private def parserFromFactory(f: SAXParserFactory) = {
+    val p = f.newSAXParser()
+    //
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    val xrdr = p.getXMLReader()
+    XMLUtils.setSecureDefaults(xrdr)
+    xrdr.setErrorHandler(errorHandler)
+    // not recognized by XMLReader
+    // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    xrdr.setContentHandler(this)
+    //
+    // This is required to get the parse to really use our resolver.
+    // The setEntityResolver(resolver) does not work.
+    //
+    
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver";, 
resolver)
+    p
+  }
+
+  /**
+   * This is the common routine called by all the load calls to actually
+   * carry out the loading of the schema.
+   *
+   * There are no calls to this in Daffodil code base as of this writing, but
+   * the base class scala.xml.factory.XMLLoader calls it.
+   *
    * Does (optional) validation,
+   *
+   * @param source The URI for the XML document which may be a XML or DFDL 
schema, or just XML data.
+   * @param optSchemaURI Optional URI for XML schema for the XML source 
document.
+   * @param addPositionAttributes True to add dafint:file dafint:line 
attributes to all elements.
+   *                              Defaults to false.
+   * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF. 
This should usually be true,
+   *                          but some special case situations may require 
preservation of CRLF/CR.
+   * @return an scala.xml.Node (Element actually) which is the document 
element of the source.
    */
-  def load(source: DaffodilSchemaSource): scala.xml.Node = {
-    if (doValidation) {
-      val inputSource = source.newInputSource()
-      val xercesNode = xercesAdapter.load(inputSource) // validates
-      inputSource.getByteStream().close()
-
-      if (xercesNode == null) return null
-      // Note: we don't call xercesAdapter.validateSchema(source)
-      // here, because this is an XML loader, not necessarily
-      // just a DFDL schema loader. So for example the doValidation flag
-      // above could be telling us to validate a TDML file or not.
+  def load(source: DaffodilSchemaSource,
+    optSchemaURI: Option[URI],
+    addPositionAttributes: Boolean = false,
+    normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+    //
+    // First we invoke the validator to explicitly validate the XML against
+    // the XML Schema (not necessarily a DFDL schema), via the
+    // javax.xml.validation.Validator's validate method.
+    //
+    optSchemaURI.foreach { schemaURI =>
+
+      val validator = XercesValidator.fromURIs(Seq(schemaURI))
+      val inputStream = source.uriForLoading.toURL.openStream()
+      validator.validateXML(inputStream, errorHandler)
+      inputStream.close()
+      //
+      // Next we have to invoke a regular xerces loader, setup for validation
+      // because that will actually interpret things like xsi:schemaLocation 
attributes
+      // of the root element.
+      //
+      // scopeStack.push(TopScope) // not in scala xml v2.0.0
+      val parser = parserFromURI(optSchemaURI)
+      val xrdr = parser.getXMLReader()
+      val saxSource = scala.xml.Source.fromSysId(source.uriForLoading.toString)
+      // parser.parse(source.uriForLoading.toURL.openStream(), this)
+      xrdr.parse(saxSource)
+      // scopeStack.pop() // not in scala xml v2.0.0
+      // no result, as the errors are reported separately
     }
     //
     // To get reliable xml nodes including conversion of CDATA syntax into
     // PCData nodes, we have to use a different loader.
     //
-    val constructingLoader = new 
DaffodilConstructingLoader(source.uriForLoading, errorHandler)
+    val constructingLoader =
+      new DaffodilConstructingLoader(source.uriForLoading,
+        errorHandler, addPositionAttributes, normalizeCRLFtoLF)
     val res = constructingLoader.load() // construct the XML objects for us.
     constructingLoader.input.close()
     res
   }
 
-  def validateSchema(source: DaffodilSchemaSource) = 
xercesAdapter.validateSchema(source)
+  // We disallow any of these to force us of our own
+  // load(source, optSchemaURI) entry point.
+  // $COVERAGE-OFF$
+  private def noWay = Assert.usageError("Operation is not supported. Use 
load(uri) or loadFile(file)")

Review comment:
       Removed. No longer mixing in XMLLoader, so don't need these. 
   I had confused mixing in XMLLoader (which required these) with mixing in 
NoBindingFactoryAdapter, which is needed so this loader can server as a 
contentHandler. 

##########
File path: 
daffodil-core/src/main/scala/org/apache/daffodil/dsom/DFDLSchemaFile.scala
##########
@@ -134,20 +134,17 @@ final class DFDLSchemaFile(
 
   lazy val isDFDLSchemaFile = iiXMLSchemaDocument.isDFDLSchema
 
+  private lazy val loader = new DaffodilXMLLoader(this)
+
   lazy val iiXMLSchemaDocument = LV('iiXMLSchemaDocument) {
     val res = loadXMLSchemaDocument(seenBefore, Some(this))
     if (res.isDFDLSchema && sset.validateDFDLSchemas) {
       //
       // We validate DFDL schemas, only if validation is requested.
       // Some things, tests generally, want to turn this validation off.
       //
-
-      val ldr = new DaffodilXMLLoader(this)
-      ldr.setValidation(true)
-      try {
-        ldr.load(schemaSource) // validate as XML file with XML Schema for 
DFDL Schemas
-        ldr.validateSchema(schemaSource) // validate as XSD (catches UPA 
errors for example)
-      } catch {
+      try loader.validateAsXMLSchema(schemaSource) // validate as XSD (catches 
UPA errors for example)
+      catch {

Review comment:
       It's actually 3 passes. One in loadXMLSchemaDocument, which is really 
just loading, with no specific schema unless one is referenced in an 
xsi:schemaLocation. Second is loading it while validating against the 
XML-schema-for-dfdl-schemas, the third is constructing a schema object from it, 
which is the step that catches UPA errors, and other schema-content-specific 
things.  For a big DFDL schema (100 files) this will definitely take time. 
   
   You are correct this overhead could be substantial. We'd need to do a 
performance study to find out. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler: 
org.xml.sax.ErrorHandle
     super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
   }
 
+  /**
+   * To emulate the behavior of Xerces loader (standard scala loader)
+   * we have to normalize CRLF to LF, and solitary CR to LF.
+   *
+   * This is optional controlled by a constructor parameter.
+   */
+  override def text(pos: Int, txt: String): Text = {
+    val newText:String = {
+      if (normalizeCRLFtoLF) {
+        txt.
+          replaceAll("\r\n", "\n").
+          replaceAll("\r", "\n")
+      } else {
+        txt
+      }
+    }
+    //
+    // On MS-Windows it seems somehow that the TDML Runner
+    // loads expected infosets as XML, and they have CRLFs
+    // in them. Not sure how this happens.
+    //
+    if (normalizeCRLFtoLF)
+      Assert.invariant(!newText.contains("\r"))
+    super.text(pos, newText)
+  }
+
+  /**
+   * We override this to force the ConstrutingParser to process CDATA regions
+   * specially with an override-able method named cdata.
+   *
+   * Strangely, if you look at the implementation of this in the MarkupParser
+   * trait, it calls the handler for text, but then it ignores the result of 
that
+   * and constructs a PCDATA node from the original text.
+   *
+   * It's possible this is a bug fix.
+   */
+  override def xCharData: NodeSeq = {
+    xToken("[CDATA[")
+    def mkResult(pos: Int, s: String): NodeSeq = {
+      val s1 = cdata(pos, s).text
+      PCData(s1)
+    }
+    xTakeUntil(mkResult, () => pos, "]]>")
+  }

Review comment:
       Reported: https://github.com/scala/scala-xml/issues/528

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.

Review comment:
       (3) is not a validating parser, so can't be used as a replacement for (1)
   We need (3) because it provides file/line number information, and also it 
does CDATA properly. 
   (Someday I will open a ticket on Xerces about that.....)

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent: 
DFDLTestSuite) {
    * */
   final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
 
+  // TODO: We want this default to be false eventually.
+  // The rationale is if people want a default namespace
+  // in their TDML defineSchemas, they should define it. We shouldn't tack
+  // such a sensitive thing on automatically. (81 tests fail in daffodil-test
+  // if you change this however.)
+  //
+  final val DEFAULT_USE_DEFAULT_NAMESPACE_VALUE = true

Review comment:
       I think this is spurious. The code just below uses this, and almost 
always is going to use this since the majority of tdml:defineSchema do not have 
elementFormDefault='unqualified' in them. I've been adding them as I work on 
variouso tests, but most still don't have this. That means below code *does* 
access this constant. 

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -376,19 +400,19 @@ class DFDLTestSuite private[tdml] (
       System.gc()
       Thread.sleep(1) // needed to give tools like jvisualvm ability to "grab 
on" quickly
     }
+    val testCase = testCaseMap.get(testName) // causes loading
     if (isTDMLFileValid) {

Review comment:
       Added a documentation comment near top of the DFDLTestSuite class just 
before all the lazy members start. 
   
   The justification for all this laziness is ultimately that it's just too 
complicated to do in the context of constructors. 

##########
File path: project/Dependencies.scala
##########
@@ -48,7 +48,8 @@ object Dependencies {
   lazy val test = Seq(
     "junit" % "junit" % "4.13.2" % "it,test",
     "com.novocode" % "junit-interface" % "0.11" % "it,test",
-    "org.scalacheck" %% "scalacheck" % "1.15.4" % "it,test"
+    "org.scalacheck" %% "scalacheck" % "1.15.4" % "it,test",
+    "xmlunit" % "xmlunit" % "1.6" % "test"

Review comment:
       removed.

##########
File path: 
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -126,16 +125,21 @@ final class TDMLDFDLProcessorFactory private (
       val diags = p.getDiagnostics
       Left(diags)
     } else {
-      val dp =
+      val dp = {
+        // Always serialize the parser/unparser objects.
+        // This insures nothing is left uncomputed by lazy evaluation.

Review comment:
       I've removed this from here. It now only saves and reloads if 
useSerializedProcessor. 
   This is the compiler's problem to insure that everything is fully evaluated, 
not code out here using the data processor.

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,226 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {

Review comment:
       This is needed. We call setContentHandler(loader) passing the loader, 
and that requires it to be a handler. 
   I was confusing myself. We no longer mix in XMLLoader, hence a bunch of 
these non-usable methods are now removed below. That had nothing to do with 
NoBindingFactoryAdapter.

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler: 
org.xml.sax.ErrorHandle
     super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
   }
 
+  /**
+   * To emulate the behavior of Xerces loader (standard scala loader)
+   * we have to normalize CRLF to LF, and solitary CR to LF.
+   *
+   * This is optional controlled by a constructor parameter.
+   */
+  override def text(pos: Int, txt: String): Text = {
+    val newText:String = {
+      if (normalizeCRLFtoLF) {
+        txt.
+          replaceAll("\r\n", "\n").
+          replaceAll("\r", "\n")
+      } else {
+        txt
+      }
+    }
+    //
+    // On MS-Windows it seems somehow that the TDML Runner
+    // loads expected infosets as XML, and they have CRLFs
+    // in them. Not sure how this happens.
+    //
+    if (normalizeCRLFtoLF)
+      Assert.invariant(!newText.contains("\r"))

Review comment:
       Removed. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/validation/XercesValidator.scala
##########
@@ -89,28 +82,35 @@ class XercesValidator(schemaFileNames: Seq[String])
 
   private val validator = new ThreadLocal[XercesValidatorImpl] {
     override def initialValue(): XercesValidatorImpl =
-      initializeValidator(schema.newValidator(), resolver)
+      initializeValidator(schema.newValidator, resolver)
   }
 
-  def validateXML(document: java.io.InputStream): ValidationResult = {
+  def validateXML(document: java.io.InputStream) =
+    validateXML(document, new XercesErrorHandler)
+
+  def validateXML(
+    document: java.io.InputStream,
+    eh: ErrorHandler): ValidationResult = {
+
     val documentSource = new StreamSource(document)
 
     // get the validator instance for this thread
     val xv = validator.get()
 
-    // create a new error handler for this execution
-    val eh = new XercesErrorHandler
     xv.setErrorHandler(eh)
 
     // validate the document
     xv.validate(documentSource)
 
-    // error handler contents as daffodil result
-    ValidationResult(eh.warnings, eh.errors)
+    eh match {
+      case xeh: XercesErrorHandler => ValidationResult(xeh.warnings, 
xeh.errors)
+      case _ => ValidationResult.empty
+    }

Review comment:
       Comments inserted

##########
File path: 
daffodil-core/src/test/scala/org/apache/daffodil/processor/TestSAXUtils.scala
##########
@@ -217,8 +224,10 @@ object TestSAXUtils {
       val msgs = pf.getDiagnostics.map { _.getMessage() }.mkString("\n")
       fail("pf compile errors: " + msgs)
     }
-    pf.sset.root.erd.preSerialization // force evaluation of all compile-time 
constructs
+    // pf.sset.root.erd.preSerialization // force evaluation of all 
compile-time constructs

Review comment:
       Moving this once and for all to SchemaSet.onPath method. Nobody should 
have to worry about this any more. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    sf
+  }
+
+  /**
+   * This loads the DFDL schema as an XML Schema. This will
+   * check many more things (ex: UPA) about the DFDL schema other than
+   * just whether it validates against the XML Schema for DFDL schemas.
+   *
+   * Unfortunately, we don't have control over how Xerces loads up these 
schemas
+   * (other than the resolver anyway), so we can't really redirect the way
+   * it issues error messages so that it properly lays blame at say, the 
schema fragments
+   * inside an embedded schema of a TDML file.
+   *
+   * So if we want good file/line/column info from this, we have to give
+   * it a plain old file or resource, and not try to play games to get it to
+   * pick up the file/line/col information from attributes of the elements.
+   *
+   * Due to limitations in the xerces newSchema() method
+   * this method should be called only after loading
+   * the schema as a regular XML file, which itself insists
+   * on the XMLUtils.setSecureDefaults, so we don't need to
+   * further check that here.
+   */
+  def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+    // first we load it, with validation explicitly against the
+    // schema for DFDL Schemas.
+    load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = 
true)
+    //
+    // Then we validate explicitly so Xerces can check things
+    // such as for UPA violations
+    //
+    val inputSource = source.newInputSource()
+    val saxSource = new SAXSource(inputSource)
+    //
+    // We would like this saxSource to be created from an XMLReader
+    // so that we can call XMLUtils.setSecureDefaults on it.
+    // but we get strange errors if I do that, where every symbol
+    // in the schema has an unrecognized namespace prefix.
+    //
+    schemaFactory.newSchema(saxSource)
+    inputSource.getByteStream().close()
+  }
 
-  def setValidation(flag: Boolean): Unit = {
-    doValidation = flag
+  // $COVERAGE-OFF$
+  override def parser = {
+    Assert.usageError("not to be called.")
   }
+  // $COVERAGE-ON$
 
   /**
+   * Obtain and initialize parser which validates the schema is defined.
+   */
+  private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+    if (optSchemaURI.isEmpty) noSchemaParser
+    else {
+      val f = parserFactory()
+      val schema = schemaFromURI(optSchemaURI)
+      f.setSchema(schema)
+      parserFromFactory(f)
+    }
+  }
+
+  private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {

Review comment:
       done

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)

Review comment:
       The exact contract of what aspects of the features/properties which of 
the XML/SAX parsers/loaders/factories/jaxp implement seems ad-hoc and opaque to 
me. There's a big lack of type-safety in such code. I had to try many 
combinations to determine where to modify what on XML objects. E.g., like 
XMLSchemaFactory() in this case. XMLReader and SAXParser in other cases. 
   
   Our SAX API needs control over those two namespaces-related features, so 
I'll remove those comments as they're just wrong. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -179,6 +203,58 @@ class DaffodilConstructingLoader(uri: URI, errorHandler: 
org.xml.sax.ErrorHandle
     super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
   }
 
+  /**
+   * To emulate the behavior of Xerces loader (standard scala loader)
+   * we have to normalize CRLF to LF, and solitary CR to LF.
+   *
+   * This is optional controlled by a constructor parameter.
+   */
+  override def text(pos: Int, txt: String): Text = {
+    val newText:String = {
+      if (normalizeCRLFtoLF) {
+        txt.
+          replaceAll("\r\n", "\n").
+          replaceAll("\r", "\n")
+      } else {
+        txt
+      }
+    }
+    //
+    // On MS-Windows it seems somehow that the TDML Runner
+    // loads expected infosets as XML, and they have CRLFs
+    // in them. Not sure how this happens.

Review comment:
       Improved comments. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -68,14 +68,28 @@ object Position {
  * wrap these with CDATA, and use this loader, not Xerces, then these will be
  * preserved properly.
  *
- * Also, enhanced to capture file/line/column info for every element and add it
+ * Also, enhanced so that when addPositionAttributes is true, it will capture
+ * file/line/column info for every element and add it
  * as attributes onto each XML element.
  *
  * The way the constructing loader (aka ConstructingParser (for XML))
  * gets positions is different. It is given just an offset into the document 
file/stream,
  * and it therefore must synthesize line number/col number info itself.
+ *
+ * @param uri URI for the XML to be loaded.
+ * @param errorHandler Called back on load errors.
+ * @param addPositionAttributes Use true if you want dafint:file,
+ *                              dafint:col, and dafint:line attributes.
+ *                              Defaults to false.
+ * @param normalizeCRLFtoLF Use true to emulate the scala XML load
+ *                          behavior of normalizing CRLF to LF, and solitary 
CR to LF.
+ *                          Defaults to true. Should only be changed in 
special circumstances
+ *                          as not normalizing CRLFs is non-standard for XML.
  */
-class DaffodilConstructingLoader(uri: URI, errorHandler: 
org.xml.sax.ErrorHandler)
+class DaffodilConstructingLoader(uri: URI,
+  errorHandler: org.xml.sax.ErrorHandler,
+  addPositionAttributes: Boolean = false,
+  normalizeCRLFtoLF: Boolean = true)

Review comment:
       To centralize the loader in one place so that one can insure it's always 
set up to be secure, you then need that one loader to support different modes 
of operation. 
   
   We only need the line/file position stuff when we're loading DFDL schemas, 
and TDML files since those embed DFDL Schemas. Other XML loading (like TDML 
infoset files) don't need that. 
   
   The normalize CRLF to LF stuff should always be true, as that is the 
behavior of the regular xerces loader, but was not the case for the 
ConstructingParser which we need to use because we need CDATA nodes preserved. 
There was some code depending on this non-CRLF-normalizing behavior that had 
crept in.  I needed this flag to figure things out.
   
   At this point, I could remove the normalizeCRLFtoLF flag, because it is only 
used by tests that show it works. 
   Given how non-standard loading XML, while preserving CRLFs or CRs is, I 
should probably remove it. 
   
   I am leaving this feature in for now, because this is, in effect, an 
incompatible change to the TDML runner. It's possible people other than us have 
TDML, and that TDML may be depending on the TDML Runner loading the tdml file, 
or infoset XML files, with CRLFs preserved. 
   

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
##########
@@ -86,6 +100,16 @@ class DaffodilConstructingLoader(uri: URI, errorHandler: 
org.xml.sax.ErrorHandle
     Source.fromInputStream(is, enc)
   }, true) {
 
+  /**
+   * Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
+   */
+  // $COVERAGE-OFF$
+  override def parseDTD(): Unit = {
+    val e = makeSAXParseException(pos, "DOCTYPE is disallowed.")
+    throw e
+  }
+  // $COVERAGE-ON$

Review comment:
       This is in case, under maintenance, someone descides not to do one of 
the earlier load passes which would error-out on the doctype. In that case, 
this is a "backup plan".  I will insert comment to that effect. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    sf
+  }
+
+  /**
+   * This loads the DFDL schema as an XML Schema. This will
+   * check many more things (ex: UPA) about the DFDL schema other than
+   * just whether it validates against the XML Schema for DFDL schemas.
+   *
+   * Unfortunately, we don't have control over how Xerces loads up these 
schemas
+   * (other than the resolver anyway), so we can't really redirect the way
+   * it issues error messages so that it properly lays blame at say, the 
schema fragments
+   * inside an embedded schema of a TDML file.
+   *
+   * So if we want good file/line/column info from this, we have to give
+   * it a plain old file or resource, and not try to play games to get it to
+   * pick up the file/line/col information from attributes of the elements.
+   *
+   * Due to limitations in the xerces newSchema() method
+   * this method should be called only after loading
+   * the schema as a regular XML file, which itself insists
+   * on the XMLUtils.setSecureDefaults, so we don't need to
+   * further check that here.
+   */
+  def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+    // first we load it, with validation explicitly against the
+    // schema for DFDL Schemas.
+    load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = 
true)
+    //
+    // Then we validate explicitly so Xerces can check things
+    // such as for UPA violations
+    //
+    val inputSource = source.newInputSource()
+    val saxSource = new SAXSource(inputSource)
+    //
+    // We would like this saxSource to be created from an XMLReader
+    // so that we can call XMLUtils.setSecureDefaults on it.
+    // but we get strange errors if I do that, where every symbol
+    // in the schema has an unrecognized namespace prefix.
+    //
+    schemaFactory.newSchema(saxSource)
+    inputSource.getByteStream().close()
+  }
 
-  def setValidation(flag: Boolean): Unit = {
-    doValidation = flag
+  // $COVERAGE-OFF$
+  override def parser = {
+    Assert.usageError("not to be called.")
   }
+  // $COVERAGE-ON$
 
   /**
+   * Obtain and initialize parser which validates the schema is defined.
+   */
+  private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+    if (optSchemaURI.isEmpty) noSchemaParser
+    else {
+      val f = parserFactory()
+      val schema = schemaFromURI(optSchemaURI)
+      f.setSchema(schema)
+      parserFromFactory(f)
+    }
+  }
+
+  private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+    val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+    sf.setErrorHandler(errorHandler)
+    sf.setResourceResolver(resolver)
+    val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+    schema
+  }
+
+  private def parserFactory() = {
+    val f = new SAXParserFactoryImpl
+    f.setNamespaceAware(true)
+    f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    f.setValidating(false)// according to javadoc, just controls DTD validation
+    f.setFeature("http://xml.org/sax/features/validation";, true)
+    // not recognized by SAXParserFactory
+    // f.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations";, 
true)
+    f.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    f
+  }
+
+  private lazy val noSchemaParser: SAXParser = {
+    parserFromFactory(parserFactory())
+  }
+
+  private def parserFromFactory(f: SAXParserFactory) = {
+    val p = f.newSAXParser()
+    //
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    val xrdr = p.getXMLReader()
+    XMLUtils.setSecureDefaults(xrdr)
+    xrdr.setErrorHandler(errorHandler)
+    // not recognized by XMLReader
+    // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    xrdr.setContentHandler(this)
+    //
+    // This is required to get the parse to really use our resolver.
+    // The setEntityResolver(resolver) does not work.
+    //
+    
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver";, 
resolver)
+    p
+  }
+
+  /**
+   * This is the common routine called by all the load calls to actually
+   * carry out the loading of the schema.
+   *
+   * There are no calls to this in Daffodil code base as of this writing, but
+   * the base class scala.xml.factory.XMLLoader calls it.
+   *
    * Does (optional) validation,
+   *
+   * @param source The URI for the XML document which may be a XML or DFDL 
schema, or just XML data.
+   * @param optSchemaURI Optional URI for XML schema for the XML source 
document.
+   * @param addPositionAttributes True to add dafint:file dafint:line 
attributes to all elements.
+   *                              Defaults to false.
+   * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF. 
This should usually be true,
+   *                          but some special case situations may require 
preservation of CRLF/CR.
+   * @return an scala.xml.Node (Element actually) which is the document 
element of the source.
    */
-  def load(source: DaffodilSchemaSource): scala.xml.Node = {
-    if (doValidation) {
-      val inputSource = source.newInputSource()
-      val xercesNode = xercesAdapter.load(inputSource) // validates
-      inputSource.getByteStream().close()
-
-      if (xercesNode == null) return null
-      // Note: we don't call xercesAdapter.validateSchema(source)
-      // here, because this is an XML loader, not necessarily
-      // just a DFDL schema loader. So for example the doValidation flag
-      // above could be telling us to validate a TDML file or not.
+  def load(source: DaffodilSchemaSource,
+    optSchemaURI: Option[URI],
+    addPositionAttributes: Boolean = false,
+    normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+    //
+    // First we invoke the validator to explicitly validate the XML against
+    // the XML Schema (not necessarily a DFDL schema), via the
+    // javax.xml.validation.Validator's validate method.
+    //
+    optSchemaURI.foreach { schemaURI =>
+
+      val validator = XercesValidator.fromURIs(Seq(schemaURI))
+      val inputStream = source.uriForLoading.toURL.openStream()
+      validator.validateXML(inputStream, errorHandler)
+      inputStream.close()
+      //
+      // Next we have to invoke a regular xerces loader, setup for validation
+      // because that will actually interpret things like xsi:schemaLocation 
attributes
+      // of the root element.
+      //
+      // The check of xsi:schemaLocation schemas seems to be the only reason we
+      // have to run this additional test.
+      //
+      // Possibly xsi:noNamespaceSchemaLocation would have the same issue, but 
as of
+      // this writing, we have no tests that use that.
+      //
+      // scopeStack.push(TopScope) // not in scala xml v2.0.0

Review comment:
       done

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    sf
+  }
+
+  /**
+   * This loads the DFDL schema as an XML Schema. This will
+   * check many more things (ex: UPA) about the DFDL schema other than
+   * just whether it validates against the XML Schema for DFDL schemas.
+   *
+   * Unfortunately, we don't have control over how Xerces loads up these 
schemas
+   * (other than the resolver anyway), so we can't really redirect the way
+   * it issues error messages so that it properly lays blame at say, the 
schema fragments
+   * inside an embedded schema of a TDML file.
+   *
+   * So if we want good file/line/column info from this, we have to give
+   * it a plain old file or resource, and not try to play games to get it to
+   * pick up the file/line/col information from attributes of the elements.
+   *
+   * Due to limitations in the xerces newSchema() method
+   * this method should be called only after loading
+   * the schema as a regular XML file, which itself insists
+   * on the XMLUtils.setSecureDefaults, so we don't need to
+   * further check that here.
+   */
+  def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {

Review comment:
       renamed

##########
File path: daffodil-lib/src/main/resources/org/apache/daffodil/xsd/tdml.xsd
##########
@@ -129,6 +129,22 @@
                 <!-- We want the default to be 'unqualified' in the longer 
term but 
                      we leave it 'qualified here to prevent lots of tests from 
breaking. -->
     <attribute name="elementFormDefault" type="tns:elementFormDefaultType" 
default="qualified"/>
+        <!-- we want the default for useDefaultNamespace to be false longer 
term but
+             we leave it as true here to prevent lots of tests from breaking.
+             81 tests to be exact. -->
+    <attribute name="useDefaultNamespace" type="xs:boolean" default="true">
+      <annotation><documentation>
+        Controls whether a xmlns="http://example.com"; default namespace prefix
+        definition will be provided for the embedded schema.
+
+        This matters when the defined schema is used to validate the infoset 
xml
+        before unparsing, or after parsing, as it affects the interpretation of
+        path expressions in the DFDL schema based on the 
unqualifiedPathStepPolicy
+        tunable.
+
+        False is recommended since default namespaces can be confusing.
+      </documentation></annotation>

Review comment:
       https://issues.apache.org/jira/browse/DAFFODIL-2523 created
   
   It is hard to say whether this feature caused any actual problem, or was 
added as part of diagnosing issues in the SAX stuff. I couldn't figure out what 
was going on in the SAX stuff, and why there were default namespaces being 
created, until I looked and saw default namespaces were being created 
implicitly. 
   
   That's a bad idea generally. 
   
   In TDML files, really the default namespace should eliminate the need to 
type "tdml:" everywhere. (Though the TDML language itself should be using 
elementFormDefault 'unqualified' which would eliminate most of them. ) Or 
inside a defineSchema one should be able to use the default namespace for the 
XMLSchema namespace to avoid having to type xs: everywhere. Authors should have 
that freedom anyway, but if defineSchema is inserting its own default 
namespace, you are stuck with it. 

##########
File path: 
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Evaluatable.scala
##########
@@ -319,7 +319,7 @@ abstract class Evaluatable[+T <: AnyRef](protected val ci: 
DPathCompileInfo, qNa
    * Looks like an attribute definition, but the value part
    * may contain XML-element-like syntax. If so it is either escaped with
    * CDATA bracketing (preferred), or if there are quotes in it, then
-   * it is escapified meaning &quot; &lt; etc.
+   * it is escapified meaning &amp;quot; &amp;lt; etc.

Review comment:
       The scaladoc/javadoc actually wasn't looking right. You couldn't tell 
what it was talking about. Think of this as fixing a typo in a comment. 

##########
File path: 
daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilXMLLoader.scala
##########
@@ -538,47 +401,232 @@ trait SchemaAwareLoaderMixin {
  * do any validation either), however, once Daffodil starts processing the
  * DFDL schema nodes, it resolves references using the same one true XML 
catalog resolver.
  */
-class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) {
+class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
+  extends NoBindingFactoryAdapter {
 
   def this() = this(RethrowSchemaErrorHandler)
 
-  def xercesAdapter = new DFDLXercesAdapter(errorHandler)
+  private def resolver = DFDLCatalogResolver.get
 
-  //
-  // Controls whether we setup Xerces for validation or not.
-  //
-  final var doValidation: Boolean = true
+  /**
+   * UPA errors are detected by xerces if the schema-full-checking feature is
+   * turned on, AND if you inform xerces that it is reading an XML Schema
+   * (i.e., xsd).
+   *
+   * Detecting these requires that we do THREE passes
+   * 1) load the DFDL schema as an XML document. This validates it against the 
XML Schema
+   * for DFDL schemas.
+   * 2) load the DFDL schema as an XSD - xerces then does lots of more 
intensive checking
+   * of the schema
+   * 3) load the schema for our own consumption by Daffodil code. This uses the
+   * constructing parser so as to preserve CDATA regions (xerces just does the 
wrong
+   * thing with those,...fatally so). Then our own semantic checking is 
performed
+   * as part of compiling the DFDL schema.
+   *
+   * Checks like UPA are in step (2) above. They are coded algorithmically
+   * right into Xerces. This is accomplished by
+   * using the below SchemaFactory and SchemaFactory.newSchema calls.  The
+   * newSchema call is what forces schema validation to take place.
+   */
+  private lazy val schemaFactory = {
+    val sf = new org.apache.xerces.jaxp.validation.XMLSchemaFactory()
+    sf.setResourceResolver(resolver)
+    sf.setErrorHandler(errorHandler)
+    sf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    sf.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
+    // These are not recognized by a schemaFactory
+    // sf.setFeature("http://xml.org/sax/features/validation";, true)
+    // sf.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    // 
sf.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+    // sf.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    sf
+  }
+
+  /**
+   * This loads the DFDL schema as an XML Schema. This will
+   * check many more things (ex: UPA) about the DFDL schema other than
+   * just whether it validates against the XML Schema for DFDL schemas.
+   *
+   * Unfortunately, we don't have control over how Xerces loads up these 
schemas
+   * (other than the resolver anyway), so we can't really redirect the way
+   * it issues error messages so that it properly lays blame at say, the 
schema fragments
+   * inside an embedded schema of a TDML file.
+   *
+   * So if we want good file/line/column info from this, we have to give
+   * it a plain old file or resource, and not try to play games to get it to
+   * pick up the file/line/col information from attributes of the elements.
+   *
+   * Due to limitations in the xerces newSchema() method
+   * this method should be called only after loading
+   * the schema as a regular XML file, which itself insists
+   * on the XMLUtils.setSecureDefaults, so we don't need to
+   * further check that here.
+   */
+  def validateAsXMLSchema(source: DaffodilSchemaSource): Unit = {
+    // first we load it, with validation explicitly against the
+    // schema for DFDL Schemas.
+    load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = 
true)
+    //
+    // Then we validate explicitly so Xerces can check things
+    // such as for UPA violations
+    //
+    val inputSource = source.newInputSource()
+    val saxSource = new SAXSource(inputSource)
+    //
+    // We would like this saxSource to be created from an XMLReader
+    // so that we can call XMLUtils.setSecureDefaults on it.
+    // but we get strange errors if I do that, where every symbol
+    // in the schema has an unrecognized namespace prefix.
+    //
+    schemaFactory.newSchema(saxSource)
+    inputSource.getByteStream().close()
+  }
 
-  def setValidation(flag: Boolean): Unit = {
-    doValidation = flag
+  // $COVERAGE-OFF$
+  override def parser = {
+    Assert.usageError("not to be called.")
   }
+  // $COVERAGE-ON$
 
   /**
+   * Obtain and initialize parser which validates the schema is defined.
+   */
+  private def parserFromURI(optSchemaURI: Option[URI]): SAXParser = {
+    if (optSchemaURI.isEmpty) noSchemaParser
+    else {
+      val f = parserFactory()
+      val schema = schemaFromURI(optSchemaURI)
+      f.setSchema(schema)
+      parserFromFactory(f)
+    }
+  }
+
+  private def schemaFromURI(optSchemaURI: Option[URI]): Schema = {
+    val sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+    sf.setErrorHandler(errorHandler)
+    sf.setResourceResolver(resolver)
+    val schema = sf.newSchema(new StreamSource(optSchemaURI.get.toString))
+    schema
+  }
+
+  private def parserFactory() = {
+    val f = new SAXParserFactoryImpl
+    f.setNamespaceAware(true)
+    f.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    f.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
+    f.setValidating(false)// according to javadoc, just controls DTD validation
+    f.setFeature("http://xml.org/sax/features/validation";, true)
+    // not recognized by SAXParserFactory
+    // f.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    f.setFeature("http://apache.org/xml/features/honour-all-schemaLocations";, 
true)
+    f.setFeature("http://apache.org/xml/features/validation/schema";, true)
+    
f.setFeature("http://apache.org/xml/features/validation/schema-full-checking";, 
true)
+    f
+  }
+
+  private lazy val noSchemaParser: SAXParser = {
+    parserFromFactory(parserFactory())
+  }
+
+  private def parserFromFactory(f: SAXParserFactory) = {
+    val p = f.newSAXParser()
+    //
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAvMESPACES_FEATURE, true)
+    // Not allowed on a SAXParser
+    // p.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+    val xrdr = p.getXMLReader()
+    XMLUtils.setSecureDefaults(xrdr)
+    xrdr.setErrorHandler(errorHandler)
+    // not recognized by XMLReader
+    // xrdr.setFeature("http://xml.org/sax/features/validation/dynamic";, true)
+    xrdr.setContentHandler(this)
+    //
+    // This is required to get the parse to really use our resolver.
+    // The setEntityResolver(resolver) does not work.
+    //
+    
xrdr.setProperty("http://apache.org/xml/properties/internal/entity-resolver";, 
resolver)
+    p
+  }
+
+  /**
+   * This is the common routine called by all the load calls to actually
+   * carry out the loading of the schema.
+   *
+   * There are no calls to this in Daffodil code base as of this writing, but
+   * the base class scala.xml.factory.XMLLoader calls it.
+   *
    * Does (optional) validation,
+   *
+   * @param source The URI for the XML document which may be a XML or DFDL 
schema, or just XML data.
+   * @param optSchemaURI Optional URI for XML schema for the XML source 
document.
+   * @param addPositionAttributes True to add dafint:file dafint:line 
attributes to all elements.
+   *                              Defaults to false.
+   * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF. 
This should usually be true,
+   *                          but some special case situations may require 
preservation of CRLF/CR.
+   * @return an scala.xml.Node (Element actually) which is the document 
element of the source.
    */
-  def load(source: DaffodilSchemaSource): scala.xml.Node = {
-    if (doValidation) {
-      val inputSource = source.newInputSource()
-      val xercesNode = xercesAdapter.load(inputSource) // validates
-      inputSource.getByteStream().close()
-
-      if (xercesNode == null) return null
-      // Note: we don't call xercesAdapter.validateSchema(source)
-      // here, because this is an XML loader, not necessarily
-      // just a DFDL schema loader. So for example the doValidation flag
-      // above could be telling us to validate a TDML file or not.
+  def load(source: DaffodilSchemaSource,
+    optSchemaURI: Option[URI],
+    addPositionAttributes: Boolean = false,
+    normalizeCRLFtoLF: Boolean = true): scala.xml.Node = {
+    //
+    // First we invoke the validator to explicitly validate the XML against
+    // the XML Schema (not necessarily a DFDL schema), via the
+    // javax.xml.validation.Validator's validate method.
+    //
+    optSchemaURI.foreach { schemaURI =>
+
+      val validator = XercesValidator.fromURIs(Seq(schemaURI))
+      val inputStream = source.uriForLoading.toURL.openStream()
+      validator.validateXML(inputStream, errorHandler)
+      inputStream.close()
+      //
+      // Next we have to invoke a regular xerces loader, setup for validation
+      // because that will actually interpret things like xsi:schemaLocation 
attributes
+      // of the root element.
+      //
+      // The check of xsi:schemaLocation schemas seems to be the only reason we
+      // have to run this additional test.
+      //
+      // Possibly xsi:noNamespaceSchemaLocation would have the same issue, but 
as of
+      // this writing, we have no tests that use that.
+      //
+      // scopeStack.push(TopScope) // not in scala xml v2.0.0
+      val parser = parserFromURI(optSchemaURI)
+      val xrdr = parser.getXMLReader()
+      val saxSource = scala.xml.Source.fromSysId(source.uriForLoading.toString)
+      // parser.parse(source.uriForLoading.toURL.openStream(), this)

Review comment:
       done

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent: 
DFDLTestSuite) {
    * */
   final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
 
+  // TODO: We want this default to be false eventually.

Review comment:
       DAFFODIL-2523

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -450,6 +464,55 @@ object XMLUtils {
   val SAX_NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces";
   val SAX_NAMESPACE_PREFIXES_FEATURE = 
"http://xml.org/sax/features/namespace-prefixes";
 
+  /**
+   * Always enable this feature (which disables doctypes).
+   */
+  val XML_DISALLOW_DOCTYPE_FEATURE = 
"http://apache.org/xml/features/disallow-doctype-decl";
+
+  /**
+   * Always disable this. Might not be necessary if doctypes are disallowed.
+   */
+  val XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE = 
"http://xml.org/sax/features/external-parameter-entities";
+
+  /**
+   * Always disable this. Might not be necessary if doctypes are disallowed.
+   */
+  val XML_EXTERNAL_GENERAL_ENTITIES_FEATURE = 
"http://xml.org/sax/features/external-general-entities";
+
+  /**
+   * Sets properties that disable insecure XML reader behaviors.
+   * @param xmlReader - the reader to change feature settings on.
+   */
+  def setSecureDefaults(xmlReader: XMLReader) : Unit = {
+    // Try all of these. Then see if any of them were rejected.
+    try{ Seq(
+    Try { xmlReader.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true) },
+    Try { xmlReader.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true) },
+    Try { 
xmlReader.setFeature(XMLUtils.XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE, false) },
+    Try { xmlReader.setFeature(XMLUtils.XML_EXTERNAL_GENERAL_ENTITIES_FEATURE, 
false) }
+      // not available on an XMLReader
+      // xmlReader.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+      // not available on an XMLReader
+      // xmlReader.setProperty(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+      // System.err.println("SAX XMLReader supports disallow properties: " + 
xmlReader)
+    ).find(_.isFailure).getOrElse(()) // throws the first failure, after 
attempting all of them.
+    } catch {
+      case e: SAXNotRecognizedException =>
+        //
+        // Our specific SAX XMLReader does not accept these options.
+        // But this is in daffodil-lib, and the DaffodilParseXMLReader is in
+        // runtime1, so we can't use the class directly here. So we play
+        // these class name tricks to see if the reader is ours, and
+        // only re-throw if it isn't.
+        //
+        val className = Misc.getNameFromClass(xmlReader)
+        if (!className.contains("DaffodilParseXMLReader")) {
+          // $COVERAGE-OFF$
+          Assert.abort(e) // bug.

Review comment:
       Gone. However, I would say, that this pluggable XML readers stuff feels 
like stuff of a bygone era when everyone thought there would be competition 
around XML libraries, and pluggable XML technologies from vendors, etc. This 
seems silly in retrospect. But maybe it is needed for backward bug-for-bug 
compatibility in some cases.
   
   For us, I don't think pluggable XMLReader makes any sense. We're far too 
sensitive to exactly their characteristics.
   
   I think we should explicitly check for the exact XMLReader classes we 
support, and error if it isn't that. 
   
   Thoughts on this check?

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -450,6 +464,55 @@ object XMLUtils {
   val SAX_NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces";
   val SAX_NAMESPACE_PREFIXES_FEATURE = 
"http://xml.org/sax/features/namespace-prefixes";
 
+  /**
+   * Always enable this feature (which disables doctypes).
+   */
+  val XML_DISALLOW_DOCTYPE_FEATURE = 
"http://apache.org/xml/features/disallow-doctype-decl";
+
+  /**
+   * Always disable this. Might not be necessary if doctypes are disallowed.
+   */
+  val XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE = 
"http://xml.org/sax/features/external-parameter-entities";
+
+  /**
+   * Always disable this. Might not be necessary if doctypes are disallowed.
+   */
+  val XML_EXTERNAL_GENERAL_ENTITIES_FEATURE = 
"http://xml.org/sax/features/external-general-entities";
+
+  /**
+   * Sets properties that disable insecure XML reader behaviors.
+   * @param xmlReader - the reader to change feature settings on.
+   */
+  def setSecureDefaults(xmlReader: XMLReader) : Unit = {
+    // Try all of these. Then see if any of them were rejected.
+    try{ Seq(
+    Try { xmlReader.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true) },
+    Try { xmlReader.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true) },
+    Try { 
xmlReader.setFeature(XMLUtils.XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE, false) },
+    Try { xmlReader.setFeature(XMLUtils.XML_EXTERNAL_GENERAL_ENTITIES_FEATURE, 
false) }
+      // not available on an XMLReader
+      // xmlReader.setProperty(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
+      // not available on an XMLReader
+      // xmlReader.setProperty(XMLUtils.SAX_NAMESPACES_FEATURE, true)
+      // System.err.println("SAX XMLReader supports disallow properties: " + 
xmlReader)
+    ).find(_.isFailure).getOrElse(()) // throws the first failure, after 
attempting all of them.
+    } catch {
+      case e: SAXNotRecognizedException =>
+        //
+        // Our specific SAX XMLReader does not accept these options.
+        // But this is in daffodil-lib, and the DaffodilParseXMLReader is in
+        // runtime1, so we can't use the class directly here. So we play
+        // these class name tricks to see if the reader is ours, and
+        // only re-throw if it isn't.
+        //

Review comment:
       I went with modifying the DaffodilXMLReader to support these. You can't 
set them to other than their secure values, which are the default, or it will 
throw. 

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -251,10 +243,17 @@ class DFDLTestSuite private[tdml] (
    * our loader here accumulates load-time errors here on the
    * test suite object.
    */
-  val loader = new DaffodilXMLLoader(errorHandler)
-  loader.setValidation(validateTDMLFile)
+  lazy val loader = new DaffodilXMLLoader(errorHandler)
+  lazy val optTDMLSchema =
+    if (validateTDMLFile) {
+      val TDMLSchemaURI = 
Misc.getRequiredResource("org/apache/daffodil/xsd/tdml.xsd")

Review comment:
       done

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -264,66 +263,91 @@ class DFDLTestSuite private[tdml] (
       tmpDir.mkdirs()
 
       val src = UnitTestSchemaSource(tsNode, "", Some(tmpDir))
-      loader.load(src)
+
+      loader.load(src, optTDMLSchema,
+        addPositionAttributes = true) // want line numbers for TDML
       //
       (tsNode, src.uriForLoading)
     }
     case tdmlFile: File => {
       log(LogLevel.Info, "loading TDML file: %s", tdmlFile)
       val uri = tdmlFile.toURI()
-      val newNode = loader.load(URISchemaSource(uri))
+      val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+        addPositionAttributes = true)
       val res = (newNode, uri)
       log(LogLevel.Debug, "done loading TDML file: %s", tdmlFile)
       res
     }
     case uri: URI => {
-      val newNode = loader.load(URISchemaSource(uri))
+      val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+        addPositionAttributes = true)
       val res = (newNode, uri)
       res
     }
     case _ => Assert.usageError("not a Node, File, or URL")
   } // end match
 
-  lazy val isTDMLFileValid = !this.isLoadingError
+  lazy val ts = {
+    if (tsRaw eq null) {
+      // must have been a loader error
+      reportLoadingErrors()
+    } else {
+      tsRaw
+    }
+  }
+
+  lazy val isTDMLFileValid = {
+    (ts ne null) &&
+    !loadingExceptions.headOption.isDefined
+  }
+
+  def reportLoadingErrors(): Nothing = {
+    log(LogLevel.Error, "TDML file %s is not valid.", tsURI)
+    throw TDMLException(loadingExceptions.toSeq, None)
+  }
 
   var checkAllTopLevel: Boolean = compileAllTopLevel
 
   def setCheckAllTopLevel(flag: Boolean): Unit = {
     checkAllTopLevel = flag
   }
 
-  val parserTestCases = (ts \ "parserTestCase").map { node => 
ParserTestCase(node, this) }
+  lazy val parserTestCases = (ts \ "parserTestCase").map { node => 
ParserTestCase(node, this) }
   //
   // Note: IBM started this TDML file format. They call an unparser test a 
"serializer" test.
   // We call it an UnparserTestCase
   //
-  val unparserTestCases = (ts \ "unparserTestCase").map { node => 
UnparserTestCase(node, this) }
-  val testCases: Seq[TestCase] = parserTestCases ++ unparserTestCases
-  val duplicateTestCases = testCases.groupBy {
-    _.tcName
-  }.filter { case (name, seq) => seq.length > 1 }
-  if (duplicateTestCases.nonEmpty) {
-    val listOfDups = duplicateTestCases.map{ case(name, _) => name}
-    throw new TDMLExceptionImpl(
-      "More than one test for names: " + listOfDups.mkString(","), None)
-  }
-  val testCaseMap = testCases.map { tc => (tc.tcName -> tc) }.toMap
-  val suiteName = (ts \ "@suiteName").text
-  val suiteID = (ts \ "@ID").text
-  val description = (ts \ "@description").text
-  val defaultRoundTrip = {
+  lazy val unparserTestCases = (ts \ "unparserTestCase").map { node => 
UnparserTestCase(node, this) }
+  lazy val testCases = {
+    val tcs: Seq[TestCase] = parserTestCases ++ unparserTestCases
+    val dups = tcs.groupBy {
+      _.tcName
+    }.filter { case (name, seq) => seq.length > 1 }
+    if (dups.nonEmpty) {
+      val listOfDups = dups.map{ case(name, _) => name}
+      throw new TDMLExceptionImpl(
+        "More than one test for names: " + listOfDups.mkString(","), None)

Review comment:
       Added TDML file to the message. 
   
   I did not find the ticket for duplicate issues. This only detects duplicates 
in the same TDML file. I know there was at one time a ticket about duplicates 
across files. 

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -1751,13 +1796,26 @@ case class DefinedSchema(xml: Node, parent: 
DFDLTestSuite) {
    * */
   final val DEFAULT_ELEMENT_FORM_DEFAULT_VALUE = "qualified"
 
+  // TODO: We want this default to be false eventually.
+  // The rationale is if people want a default namespace
+  // in their TDML defineSchemas, they should define it. We shouldn't tack
+  // such a sensitive thing on automatically. (81 tests fail in daffodil-test
+  // if you change this however.)
+  //
+  final val DEFAULT_USE_DEFAULT_NAMESPACE_VALUE = true
+
   val name = (xml \ "@name").text.toString
   val elementFormDefault = {
     val value = (xml \ "@elementFormDefault").text.toString
 
     if (value == "") DEFAULT_ELEMENT_FORM_DEFAULT_VALUE
     else value
   }
+  val useDefaultNamespace = {
+    val value = (xml \ "@useDefaultNamespace").text.toString
+    if (value == "") DEFAULT_USE_DEFAULT_NAMESPACE_VALUE
+    else value.toBoolean

Review comment:
       added test testDefineSchemaWithNoDefaultNamespace

##########
File path: 
daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
##########
@@ -1027,6 +1028,7 @@ class TestScalaAPI {
     // prep for SAX unparse
     val unparseContentHandler = dp.newContentHandlerInstance(wbc)
     val unparseXMLReader = 
javax.xml.parsers.SAXParserFactory.newInstance.newSAXParser.getXMLReader
+    XMLUtils.setSecureDefaults(unparseXMLReader)

Review comment:
       Removed. Uses a local version of this function now to illustrate the 
technique. 

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -264,66 +263,91 @@ class DFDLTestSuite private[tdml] (
       tmpDir.mkdirs()
 
       val src = UnitTestSchemaSource(tsNode, "", Some(tmpDir))
-      loader.load(src)
+
+      loader.load(src, optTDMLSchema,
+        addPositionAttributes = true) // want line numbers for TDML
       //
       (tsNode, src.uriForLoading)
     }
     case tdmlFile: File => {
       log(LogLevel.Info, "loading TDML file: %s", tdmlFile)
       val uri = tdmlFile.toURI()
-      val newNode = loader.load(URISchemaSource(uri))
+      val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+        addPositionAttributes = true)
       val res = (newNode, uri)
       log(LogLevel.Debug, "done loading TDML file: %s", tdmlFile)
       res
     }
     case uri: URI => {
-      val newNode = loader.load(URISchemaSource(uri))
+      val newNode = loader.load(URISchemaSource(uri), optTDMLSchema,
+        addPositionAttributes = true)
       val res = (newNode, uri)
       res
     }
     case _ => Assert.usageError("not a Node, File, or URL")
   } // end match
 
-  lazy val isTDMLFileValid = !this.isLoadingError
+  lazy val ts = {
+    if (tsRaw eq null) {
+      // must have been a loader error
+      reportLoadingErrors()
+    } else {
+      tsRaw
+    }
+  }
+
+  lazy val isTDMLFileValid = {
+    (ts ne null) &&
+    !loadingExceptions.headOption.isDefined

Review comment:
       This is actually a pretty tricky area. There is code where we actually 
snapshot the loadingExceptions, do more loading, then look to see if it is 
bigger.  The code is, unfortunately, using state and timing. 
   
   This is due to this whole content-handler/error-handler event-style 
approach. We should rewrite this to make loading return an
   ```
   Either[Seq[Exception], (Node, Seq[Exception])
   ```
   So you get back either the loading exceptions because it failed, or a loaded 
Node, and sequence of warnings. 
   That would make this code much clearer and less fragile. Not a problem for 
this change set though. 

##########
File path: 
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/SchemaCache.scala
##########
@@ -33,6 +33,21 @@ object ENoWarnTDML { EqualitySuppressUnusedImportWarning() }
  */
 object SchemaDataProcessorCache extends SchemaCache[(Seq[Diagnostic], 
DFDL.DataProcessor), Seq[Diagnostic]]
 
+object SchemaCache {
+  /**
+   * A key object for caching compiled DFDL schemas. Everything that effects 
compilation
+   * must be captured here.
+   */
+  case class Key (
+    uri: URISchemaSource,      // The DFDL schema's URI
+    useSerializedProcessor: Boolean,
+    compileAllTopLevels: Boolean,
+    optRootName: Option[String],
+    optRootNamespace: Option[String],
+    tunables: Map[String, String])  // tunables map - required since tunables 
effect compilation
+  // (ex: parseUnparsePolicy, unqualifiedPathStepPolicy, perhaps others)
+
+}

Review comment:
       This is part of fixing a bug where we needed the tunables in the 
SchemaCache key or they would return a cached result incorrectly when tunables 
had changed the compilation parameters around namespaces. That got aggravated 
as part of debugging SAX tests with the various namespace calls. 
   
   It's an interesting point though. If as part of debugging a problem you fix 
things and improve them that turn out to have been unrelated to your orignal 
problem, do you back out those changes? Often it's not easy to identify what 
goes with what, and one could spend a lot of time backing things out and 
breaking stuff to isolate the changes. 
   
   I cannot tell you if, in conjunction with the other changes, this change to 
the schema cache would still cause a failure. But it was not correct before 
because the tunables weren't part of the key. So this is more correct, albeit 
possibly unrelated. I really just do not know any more if this fix would have 
mattered. 
   
   

##########
File path: 
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter: 
ScalaXMLInfosetInputter, others: Se
   override def getSimpleText(primType: NodeInfo.Kind): String = {
     val res = scalaInputter.getSimpleText(primType)
     val resIsEmpty = res == null || res == ""
-    val othersmatch = others.forall { i =>
-      val st = i.getSimpleText(primType)
+    val otherStrings = others.map { i =>
+      val firstVersion = i.getSimpleText(primType)
+      val finalVersion = i match {
+        case _ if (firstVersion eq null) => ""
+        case jsonii: JsonInfosetInputter => 
firstVersion.replaceAll("(\r\n|\r)", "\n").
+          replaceAll("\r", "\n")
+        case _ => firstVersion
+      }
+      finalVersion
+    }
+    val othersmatch = otherStrings.forall { case st: String =>
       val stIsEmpty = st == null || res == ""

Review comment:
       Good catch. I'll fix this and see if anything breaks. 

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -2294,29 +2361,60 @@ sealed abstract class DocumentPart(part: Node, parent: 
Document) {
 }
 
 case class Infoset(i: NodeSeq, parent: TestCase) {
-  val Seq(dfdlInfoset) = (i \ "dfdlInfoset").map { node => DFDLInfoset(node, 
this) }
-  val contents = dfdlInfoset.contents
+  lazy val Seq(dfdlInfoset) = (i \ "dfdlInfoset").map { node => 
DFDLInfoset(node, this) }
+  lazy val contents = dfdlInfoset.contents
 }
 
 case class DFDLInfoset(di: Node, parent: Infoset) {
 
-  val infosetNodeSeq = {
-    (di \ "@type").toString match {
-      case "infoset" | "" => di.child.filter {
-        _.isInstanceOf[scala.xml.Elem]
-      }
-      case "file" => {
-        val path = di.text.trim()
-        val maybeURI = parent.parent.parent.findTDMLResource(path)
-        val uri = maybeURI.getOrElse(throw new 
FileNotFoundException("TDMLRunner: infoset file '" + path + "' was not found"))
-        val elem = scala.xml.XML.load(uri.toURL)
-        elem
+  private lazy val infosetNodeSeq = {
+    val testCase: TestCase = parent.parent
+    val loader = testCase.parent.loader
+    val optDataSchema: Option[URI] = {
+      
testCase.optSchemaFileURI.orElse(testCase.optEmbeddedSchema.map{_.uriForLoading})
+    }
+    val src =
+      (di \ "@type").toString match {
+        case "infoset" | "" => {
+          val rawElem = di.child.filter {
+            _.isInstanceOf[scala.xml.Elem]
+          }.head
+          UnitTestSchemaSource(rawElem, testCase.tcName)
+        }
+        case "file" => {
+          val path = di.text.trim()
+          val maybeURI = parent.parent.parent.findTDMLResource(path)
+          val uri = maybeURI.getOrElse(throw new 
FileNotFoundException("TDMLRunner: infoset file '" + path + "' was not found"))
+          URISchemaSource(uri)
+        }
+        case value => Assert.abort("Uknown value for type attribute on 
dfdlInfoset: " + value)
       }
-      case value => Assert.abort("Uknown value for type attribute on 
dfdlInfoset: " + value)
+    //
+    // TODO: DAFFODIL-288 validate the infoset also
+    // You can pass the optDataSchema, which appears to be the correct thing
+    // but in many cases it doesn't seem to be able to resolve things.
+    //
+    val testSuite = testCase.parent
+    val before = testSuite.loadingExceptions.clone()
+    // val elem: Node = loader.load(src, optDataSchema)
+    val elem = loader.load(src, None) // no schema

Review comment:
       Really the right fix is to get rid of all this errorHandler event 
callback stuff, and have the load call return an object that contains loader 
exceptions along with the Node loaded. 
   
   Is that worth doing now?

##########
File path: 
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter: 
ScalaXMLInfosetInputter, others: Se
   override def getSimpleText(primType: NodeInfo.Kind): String = {
     val res = scalaInputter.getSimpleText(primType)
     val resIsEmpty = res == null || res == ""
-    val othersmatch = others.forall { i =>
-      val st = i.getSimpleText(primType)
+    val otherStrings = others.map { i =>
+      val firstVersion = i.getSimpleText(primType)
+      val finalVersion = i match {
+        case _ if (firstVersion eq null) => ""
+        case jsonii: JsonInfosetInputter => 
firstVersion.replaceAll("(\r\n|\r)", "\n").
+          replaceAll("\r", "\n")

Review comment:
       fixed

##########
File path: 
daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetInputter.scala
##########
@@ -68,22 +67,20 @@ class TDMLInfosetInputter(val scalaInputter: 
ScalaXMLInfosetInputter, others: Se
   override def getSimpleText(primType: NodeInfo.Kind): String = {
     val res = scalaInputter.getSimpleText(primType)
     val resIsEmpty = res == null || res == ""
-    val othersmatch = others.forall { i =>
-      val st = i.getSimpleText(primType)
+    val otherStrings = others.map { i =>
+      val firstVersion = i.getSimpleText(primType)
+      val finalVersion = i match {
+        case _ if (firstVersion eq null) => ""
+        case jsonii: JsonInfosetInputter => 
firstVersion.replaceAll("(\r\n|\r)", "\n").
+          replaceAll("\r", "\n")
+        case _ => firstVersion
+      }
+      finalVersion
+    }
+    val othersmatch = otherStrings.forall { case st: String =>
       val stIsEmpty = st == null || res == ""
       val areSame = res == st || (resIsEmpty && stIsEmpty)
-      if (areSame) {
-        true
-      } else {
-        if (i.isInstanceOf[JsonInfosetInputter]) {
-          // the json infoset inputter maintains CRLF/CR, but XML converts 
CRLF/CR to
-          // LF. So if this is Json, then compare with the CRLF/CR converted 
to LF

Review comment:
       added. 

##########
File path: 
daffodil-tdml-processor/src/test/scala/org/apache/daffodil/tdml/TestTDMLUnparseCases.scala
##########
@@ -49,17 +47,18 @@ class TestTDMLUnparseCases {
                       </ts:unparserTestCase>
                     </ts:testSuite>
     lazy val ts = new DFDLTestSuite(testSuite)
-    val tc: UnparserTestCase = ts.unparserTestCases.find { utc => utc.tcName 
== "test1" }.get
-    // println(tc)
-    tc.document
-    val is = tc.inputInfoset
-    // println(is)
-    val bar = is.dfdlInfoset.contents
-    val scala.xml.Elem(pre, label, _, _, _) = bar
-    assertEquals("ex", pre)
-    assertEquals("bar", label)
+//    val tc: UnparserTestCase = ts.unparserTestCases.find { utc => utc.tcName 
== "test1" }.get
+//    // println(tc)
+//    tc.document
+//    val is = tc.inputInfoset
+//    // println(is)
+//    val bar = is.dfdlInfoset.contents
+//    val scala.xml.Elem(pre, label, _, _, _) = bar
+//    assertEquals("ex", pre)
+//    assertEquals("bar", label)
     // ts.trace
     ts.runOneTest("test1")
+    println(ts.loadingDiagnosticMessages)

Review comment:
       yup

##########
File path: 
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -376,19 +400,19 @@ class DFDLTestSuite private[tdml] (
       System.gc()
       Thread.sleep(1) // needed to give tools like jvisualvm ability to "grab 
on" quickly
     }
+    val testCase = testCaseMap.get(testName) // causes loading
     if (isTDMLFileValid) {
-      loadingExceptions.foreach { le =>  log(LogLevel.Warning, le.toString) }
-      val testCase = testCaseMap.get(testName)
+      // display warnings if any
+      // too verbose. removing
+      // loadingExceptions.foreach { le =>  log(LogLevel.Warning, le.toString) 
}

Review comment:
       https://issues.apache.org/jira/browse/DAFFODIL-2410 is this bug. 
   I annotated the comment here with the bug ID. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [daffodil] mbeckerle commented on a change in pull request #560: Verify that we get errors if the files contain a DOCTYPE

Reply via email to