This is an automated email from the ASF dual-hosted git repository.
mbeckerle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new cf7399541 Improve schema compiler performance
cf7399541 is described below
commit cf739954171db97426d2e1ac7327cc8683293481
Author: Michael Beckerle <[email protected]>
AuthorDate: Thu Dec 19 15:15:06 2024 -0500
Improve schema compiler performance
With this change, schema compilation for
Link16 drops from 2+ minutes to 18 seconds.
Added instrumentation to compiler which shows
that most time (by far, like 80+ percent) is
spent in Xerces validating each schema
file individually. This is unnecessary. We
only need to validate the top level schema
to detect UPA errors.
Fixed other error causing repeated
evaluation of OOLAG LV values.
Eliminate isError inside synchronized block
Avoids slowdown of Daffodil test suite.
DAFFODIL-2965, DAFFODIL-2781
---
.../apache/daffodil/core/compiler/Compiler.scala | 19 ++++-
.../apache/daffodil/core/dsom/DFDLSchemaFile.scala | 19 ++++-
.../org/apache/daffodil/core/dsom/SchemaSet.scala | 50 +++++++++----
.../core/runtime1/SchemaSetRuntime1Mixin.scala | 9 ++-
.../scala/org/apache/daffodil/japi/Daffodil.scala | 4 +-
.../org/apache/daffodil/example/TestJavaAPI.java | 2 +-
.../org/apache/daffodil/lib/oolag/OOLAG.scala | 85 +++++++++++++---------
.../daffodil/lib/xml/DaffodilXMLLoader.scala | 40 ++++++++--
.../org/apache/daffodil/example/TestScalaAPI.scala | 2 +-
.../daffodil/section00/general/TestResolver.tdml | 3 +-
10 files changed, 164 insertions(+), 69 deletions(-)
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala
index 44b50ecc7..78db91234 100644
---
a/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala
+++
b/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala
@@ -144,7 +144,7 @@ final class ProcessorFactory private (
codeGenerator
}
- override def isError: Boolean = sset.isError
+ override lazy val isError: Boolean = sset.isError
def withDistinguishedRootNode(name: String, namespace: String):
ProcessorFactory = {
Assert.usage(name ne null)
@@ -378,7 +378,13 @@ class Compiler private (
tunables
)
}
-
+ // It is tempting to call pf.isError here to drive compilation to
completion before we
+ // return the pf to the caller.
+ // However, this slows down TDML-based testing in Daffodil substantially
by moving
+ // the entire isError checking pass inside the synchronized block of the
Daffodil
+ // schema compiler. This results in reduced concurrency which
substantially slows
+ // the daffodil test suite.
+ // pf.isError // don't call this here. Call it outside the synchronized
block.
pf
}
@@ -409,9 +415,16 @@ object Compiler {
optRootName: Option[String],
optRootNamespace: Option[String]
): ProcessorFactory = {
- synchronized {
+ val pf = synchronized {
c.compileSourceInternal(schemaSource, optRootName, optRootNamespace)
}
+ // Force all compilation to complete. Called here outside of synchronized
block on purpose
+ // to avoid over-serializing things (which would slow down large test
suites like Daffodil's test suite.)
+ // Note that this requires that the shared data structures which require
Daffodil schema compilation to
+ // be serialized do *not* include the data structures being modified
during isError processing (which is
+ // lots of OOLAG evaluations).
+ pf.isError
+ pf
}
}
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala
index 03ee27fbb..525a187ea 100644
---
a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala
+++
b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala
@@ -205,6 +205,18 @@ final class DFDLSchemaFile(
private lazy val errHandler = new
DFDLSchemaFileLoadErrorHandler(schemaFileLocation)
private lazy val loader = new DaffodilXMLLoader(errHandler)
+ lazy val isValidAsCompleteDFDLSchema: Boolean =
LV('isValidAsCompleteDFDLSchema) {
+ try {
+ loader.validateAsCompleteDFDLSchema(schemaSource)
+ } catch {
+ case e: org.xml.sax.SAXParseException =>
+ errHandler.error(e) // accumulate with error handler.
+ } finally {
+ errHandler.handleLoadErrors(this)
+ }
+ true
+ }.value
+
lazy val iiXMLSchemaDocument = LV('iiXMLSchemaDocument) {
val res = makeXMLSchemaDocument(seenBefore, Some(this))
if (res.isDFDLSchema && sset.shouldValidateDFDLSchemas) {
@@ -213,15 +225,16 @@ final class DFDLSchemaFile(
// Some things, tests generally, want to turn this validation off.
//
try {
- loader.validateAsDFDLSchema(schemaSource)
+ loader.validateAsIndividualDFDLSchemaFile(schemaSource)
} catch {
- // validateAsDFDLSchema doesn't always capture all exceptions in the
+ // validateAsIndividualDFDLSchemaFile doesn't always capture all
exceptions in the
// loader's error handler. Even for fatal errors it sometimes
// just throws them.
case e: org.xml.sax.SAXParseException =>
errHandler.error(e) // accumulate with error handler.
+ } finally {
+ errHandler.handleLoadErrors(this)
}
- errHandler.handleLoadErrors(this)
}
res
}.value
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala
index 62bdb796b..30a06ea47 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala
@@ -171,21 +171,34 @@ final class SchemaSet private (
lazy val schemaFileList = schemas.map(s => s.uriString)
private lazy val isValid: Boolean = {
- //
- // We use keepGoing here, because we want to gather a validation error,
- // suppress further propagation of it, and return false.
- //
- val isV = OOLAG.keepGoing(false) {
- val files = allSchemaFiles
- val fileValids = files.map {
- _.isValid
+ if (!shouldValidateDFDLSchemas)
+ true // pretend it's valid though for some specific tests it may not be
+ else {
+ //
+ // We use keepGoing here, because we want to gather a validation error,
+ // suppress further propagation of it, and return false.
+ //
+ val isEachFileIndividuallyValid = OOLAG.keepGoing(false) {
+ val files = allSchemaFiles
+ val fileValids = files.map {
+ _.isValid
+ }
+ val res = fileValids.length > 0 && fileValids.fold(true) {
+ _ && _
+ }
+ res
}
- val res = fileValids.length > 0 && fileValids.fold(true) {
- _ && _
+ val isEntireSchemaValidAsAnXSD: Boolean = OOLAG.keepGoing(false) {
+ this.root.xmlSchemaDocument.schemaFile
+ .map { primaryDfdlSchemaFile =>
+ primaryDfdlSchemaFile.isValidAsCompleteDFDLSchema
+ }
+ .getOrElse(true)
}
+
+ val res = isEachFileIndividuallyValid && isEntireSchemaValidAsAnXSD
res
}
- isV
}
lazy val validationDiagnostics = {
@@ -369,7 +382,7 @@ final class SchemaSet private (
* Or, you can leave the root unspecified, and this method will determine it
from the
* first element declaration of the first schema file.
*/
- lazy val root: Root = {
+ lazy val root: Root = LV('root) {
val re: GlobalElementDecl =
optPFRootSpec match {
case Some(rs) =>
@@ -393,7 +406,7 @@ final class SchemaSet private (
case _ => Assert.invariantFailed("illegal combination of root element
specifications")
}
re.asRoot
- }
+ }.value
/**
* Retrieve schema by namespace
@@ -670,7 +683,7 @@ final class SchemaSet private (
* and finally the AST objects are checked for errors, which recursively
* demands that all other aspects of compilation occur.
*/
- override def isError: Boolean = {
+ override lazy val isError: Boolean = {
if (!isValid) true
else if (
// use keepGoing so we can capture errors and
@@ -681,6 +694,15 @@ final class SchemaSet private (
}
) true
else {
+ // we must check for errors here via the super method, as that iterates
over
+ // all the objects evaluating them for any errors in their required
evaluations.
+ // This has to be after everything else that could report an error here
(on some
+ // other object) has been done.
+ //
+ // That is to say, if we called super.isError at the top of this method
that would
+ // be incorrect since isValid and areComponentsConstructed above might
cause errors
+ // to be recorded or objects created that this call to super.isError
would then not
+ // take into account.
val hasErrors = super.isError
if (!hasErrors) {
// must be called after compilation is done
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala
index 25a1290f8..4b2ee489a 100644
---
a/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala
+++
b/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala
@@ -48,21 +48,24 @@ trait SchemaSetRuntime1Mixin {
}.value
lazy val parser = LV('parser) {
- val par = if (generateParser) root.document.parser else new
NotParsableParser(root.erd)
+ val par =
+ if (generateParser) root.document.parser
+ else new NotParsableParser(root.erd)
Processor.initialize(par)
par
}.value
lazy val unparser = LV('unparser) {
val unp =
- if (generateUnparser) root.document.unparser else new
NotUnparsableUnparser(root.erd)
+ if (generateUnparser) root.document.unparser
+ else new NotUnparsableUnparser(root.erd)
Processor.initialize(unp)
unp
}.value
private lazy val layerRuntimeCompiler = new LayerRuntimeCompiler
- lazy val allLayers: Seq[LayerRuntimeData] = LV('allLayers) {
+ private lazy val allLayers: Seq[LayerRuntimeData] = LV('allLayers) {
val lrds: Seq[LayerRuntimeData] = self.allSchemaComponents
.collect {
case stb: SequenceTermBase if (stb.isLayered) => stb
diff --git
a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala
b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala
index 9e590c860..4114e91a0 100644
--- a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala
+++ b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala
@@ -298,7 +298,9 @@ class ProcessorFactory private[japi] (private var pf:
SProcessorFactory)
*/
def onPath(path: String) = {
val dp = pf.onPath(path).asInstanceOf[SDataProcessor]
- new DataProcessor(dp)
+ val res = new DataProcessor(dp)
+ res.isError // ensure all errors have been detected before we return the DP
+ res
}
/**
diff --git
a/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java
b/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java
index a42df23e3..0942ef79b 100644
--- a/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java
+++ b/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java
@@ -1252,7 +1252,7 @@ public class TestJavaAPI {
Throwable cause = e.getCause();
assertTrue(cause.toString().contains("Must call isError"));
assertTrue(cause.getCause().toString().contains("Schema Definition
Error"));
- assertTrue(cause.getCause().toString().contains("Cannot resolve
the name 'tns:nonExistent'"));
+
assertTrue(cause.getCause().toString().contains("tns:nonExistent"));
}
}
diff --git
a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala
b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala
index 56b2725b4..31856bff7 100644
--- a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala
@@ -478,16 +478,13 @@ object OOLAG {
}
/**
- * Currently we depend on being able to evaluate these
- * repeatedly, and get different answers.
- *
- * because it forces evaluation of all the requiredEvaluationsAlways(...)
+ * Forces evaluation of all the requiredEvaluationsAlways(...)
* or requiredEvaluationsIfActivated(...)
* on all objects first, but that is only for the objects
* that have been created and activated at the time this is called.
*/
-
- def isError: Boolean = {
+ def isError: Boolean = isErrorOnce
+ private lazy val isErrorOnce: Boolean = {
oolagRoot.checkErrors
val errorCount = oolagRoot.errors.size
errorCount > 0
@@ -511,18 +508,27 @@ object OOLAG {
sealed abstract class OOLAGValueBase(
val oolagContext: OOLAGHost,
nameArg: String,
- body: => Any
+ bodyArg: => Any
) {
+ // SCHEMA COMPILER PERFORMANCE INSTRUMENTATION
+ // To get timing on every OOLAG LV, use these 3 lines to define the 'body'
lazy var.
+ // private lazy val bodyOnce = bodyArg
+ // private lazy val timedBody = TimeTracker.track(name) { bodyOnce }
+ // private lazy val body = timedBody
+ // You will also need to call TimeTracker.logTimes() at the end of
Compiler.compileSource.
+ private lazy val body = bodyArg
+
Assert.usage(oolagContext != null)
final lazy val name = nameArg
- private var alreadyTriedThis = false
+ private var errorAlreadyHandled: Maybe[ErrorAlreadyHandled] = Nope
+ private var alreadyTriedThis: Maybe[AlreadyTried] = Nope
protected final def hasValue: Boolean = value_.isDefined
private var value_ : Maybe[AnyRef] = Nope
- protected final def wasTried = alreadyTriedThis
+ private final def wasTried = alreadyTriedThis.isDefined
// private def warn(th: Diagnostic): Unit = oolagContext.warn(th)
private def error(th: Diagnostic): Unit = oolagContext.error(th)
@@ -549,9 +555,9 @@ object OOLAG {
}
}
- override def toString = thisThing
+ override def toString: String = thisThing
- protected final def toss(th: Throwable) = {
+ protected final def toss(th: Throwable): Nothing = {
throw th
}
@@ -603,7 +609,8 @@ object OOLAG {
// Typically this will be for a Schema Definition Error
//
Assert.invariant(hasValue == false)
- Assert.invariant(alreadyTriedThis == true)
+ Assert.invariant(alreadyTriedThis.isDefined)
+ Assert.invariant(errorAlreadyHandled.isEmpty)
Logger.log.trace(" " * indent + s"${thisThing} has no value due to
${e}")
error(e)
@@ -611,11 +618,15 @@ object OOLAG {
// Catch this if you can carry on with more error gathering
// from other contexts. Otherwise just let it propagate.
//
- toss(new ErrorAlreadyHandled(e, this))
+ val eah = new ErrorAlreadyHandled(e, this)
+ errorAlreadyHandled = One(eah) // in theory, saving this doesn't
matter.
+ toss(eah)
}
case e @ ErrorsNotYetRecorded(diags) => {
+ Assert.invariant(alreadyTriedThis.isDefined)
+ Assert.invariant(!hasValue)
diags.foreach { error(_) }
- toss(new AlreadyTried(this))
+ toss(alreadyTriedThis.get)
}
case th => toss(th)
}
@@ -639,12 +650,12 @@ object OOLAG {
Logger.log.trace(" " * indent + s"LV: ${thisThing} CIRCULAR")
toss(c)
}
- if (alreadyTriedThis) {
+ if (alreadyTriedThis.isDefined) {
Logger.log.trace(" " * indent + s"LV: ${thisThing} was tried and
failed")
- val e = AlreadyTried(this)
+ val e = alreadyTriedThis.get
toss(e)
}
- alreadyTriedThis = true
+ alreadyTriedThis = One(AlreadyTried(this))
Logger.log.trace(" " * indent + s"Evaluating ${thisThing}")
}
@@ -660,7 +671,7 @@ object OOLAG {
oolagContext.currentOVList = oolagContext.currentOVList.tail
}
- final def hasError = alreadyTriedThis && !hasValue
+ final def hasError = alreadyTriedThis.isDefined && !hasValue
/**
* forces the value, then boolean result tells you if
@@ -670,7 +681,7 @@ object OOLAG {
final def isError = {
val res =
- if (alreadyTriedThis) !hasValue
+ if (alreadyTriedThis.isDefined) !hasValue
else {
try {
valueAsAny
@@ -687,22 +698,26 @@ object OOLAG {
final lazy val valueAsAny: Any = {
if (hasValue) value_.get
- val res =
- try {
- oolagBefore
- val v = body // good place for a breakpoint
- oolagAfterValue(v.asInstanceOf[AnyRef])
- v
- } catch {
- case npe: NullPointerException => throw npe
- case s: scala.util.control.ControlThrowable => throw s
- case u: UnsuppressableException => throw u
- case e: Error => throw e
- case th: Throwable => oolagCatch(th)
- } finally {
- oolagFinalize
- }
- res
+ else {
+ // egad.... on 2024-12-19 this was always re-evaluating due to missing
`else` keyword.
+ // this should make a substantial difference in schema compilation
time.
+ val res =
+ try {
+ oolagBefore
+ val v = body // good place for a breakpoint
+ oolagAfterValue(v.asInstanceOf[AnyRef])
+ v
+ } catch {
+ case npe: NullPointerException => throw npe
+ case s: scala.util.control.ControlThrowable => throw s
+ case u: UnsuppressableException => throw u
+ case e: Error => throw e
+ case th: Throwable => oolagCatch(th)
+ } finally {
+ oolagFinalize
+ }
+ res
+ }
}
protected lazy val toOptionAny: Option[Any] = {
diff --git
a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala
b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala
index 34eb2fda5..dd876c736 100644
---
a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala
+++
b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala
@@ -542,11 +542,42 @@ class DaffodilXMLLoader(val errorHandler:
org.xml.sax.ErrorHandler)
sf
}
+ /**
+ * checks that a DFDL schema file, viewed as an XML document is valid
+ * relative to the XML Schema for DFDL Schemas.
+ *
+ * For example this ensures that you can only have minOccurs and maxOccurs
+ * on local element declarations and element refs, not on
sequence/choice/group.
+ */
+ def validateAsIndividualDFDLSchemaFile(source: DaffodilSchemaSource): Unit =
{
+ // first we load it, with validation explicitly against the
+ // schema for DFDL Schemas.
+ try {
+ load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes
= true)
+ } catch {
+ // fatal errors are thrown.
+ // validation errors are never fatal.
+ case e: SAXParseException => {
+ // Capturing this would be redundant.
+ // It will already have been passed to the errorHandler.fatalError
+ // method.
+ // So it is explicitly ok to just rethrow this exception.
+ // we don't want to record it again, but we do want to stop with a
+ // fatal error because the schema was invalid. Daffodil assumes the
+ // schema is valid all over its code base.
+ throw e
+ }
+ }
+ }
+
/**
* This loads the DFDL schema as an XML Schema. This will
* check many more things (ex: UPA) about the DFDL schema other than
* just whether it validates against the XML Schema for DFDL schemas.
*
+ * This should only be called once, for the entire schema, not per file of
+ * the schema.
+ *
* Unfortunately, we don't have control over how Xerces loads up these
schemas
* (other than the resolver anyway), so we can't really redirect the way
* it issues error messages so that it properly lays blame at say, the
schema fragments
@@ -562,13 +593,10 @@ class DaffodilXMLLoader(val errorHandler:
org.xml.sax.ErrorHandler)
* on the XMLUtils.setSecureDefaults, so we don't need to
* further check that here.
*/
- def validateAsDFDLSchema(source: DaffodilSchemaSource): Unit = {
- // first we load it, with validation explicitly against the
- // schema for DFDL Schemas.
+ def validateAsCompleteDFDLSchema(source: DaffodilSchemaSource): Boolean = {
try {
- load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes
= true)
//
- // Then we validate explicitly so Xerces can check things
+ // We validate explicitly so Xerces can check things
// such as for UPA violations
//
val inputSource = source.newInputSource()
@@ -584,6 +612,7 @@ class DaffodilXMLLoader(val errorHandler:
org.xml.sax.ErrorHandler)
} finally {
inputSource.getByteStream().close()
}
+ true
} catch {
// fatal errors are thrown.
// validation errors are never fatal.
@@ -598,7 +627,6 @@ class DaffodilXMLLoader(val errorHandler:
org.xml.sax.ErrorHandler)
throw e
}
}
-
}
// $COVERAGE-OFF$ These three functions should only be used if someone calls
one of the
diff --git
a/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
b/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
index 040667290..2456c474e 100644
---
a/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
+++
b/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala
@@ -1234,7 +1234,7 @@ class TestScalaAPI {
cause.getCause.toString.contains("Schema Definition Error")
)
assertTrue(
- cause.getCause.toString.contains("Cannot resolve the name
'tns:nonExistent'")
+ cause.getCause.toString.contains("tns:nonExistent")
)
}
}
diff --git
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml
index 7176b7710..8374ebd19 100644
---
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml
+++
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml
@@ -36,9 +36,8 @@
<tdml:document/>
<tdml:errors>
<tdml:error>Schema Definition Error</tdml:error>
- <tdml:error>Unable to resolve</tdml:error>
<tdml:error>this/does/not/exist/schema.dfdl.xsd</tdml:error>
- <tdml:error>Schema context: Location in</tdml:error>
+ <tdml:error>Include Location</tdml:error>
</tdml:errors>
</tdml:parserTestCase>