stevedlawrence commented on code in PR #912: URL: https://github.com/apache/daffodil/pull/912#discussion_r1072319306
########## daffodil-cli/src/it/scala/org/apache/daffodil/xml/TestXMLConversionControl.scala: ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.daffodil.xml + +import org.apache.commons.io.FileUtils +import org.junit.Test +import org.apache.daffodil.CLI.Util._ +import org.apache.daffodil.Main.ExitCode +import org.junit.Assert.assertTrue + +import java.nio.charset.StandardCharsets + +class TestXMLConversionControl { + + // + // To run tests conveniently under IntelliJ IDEA, + // rename the src/test dir to src/test1. Rename the src/it dir to src/test. + // Then modify this val to be "test". + // Then you can run these as ordinary junit-style tests under the IDE. 
+ val test = "it" + + @Test def test_CLI_XMLConversionControlConvertCR(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-convertCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { + cli => cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\ndef\nghi</ex:a>")) + } + } + + @Test def test_CLI_XMLConversionControlPreserveCRParse(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { cli => + cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\uE00D\ndef\uE00D\nghi</ex:a>")) + } + } + + @Test def test_CLI_XMLConversionControlPreserveCRRoundTrip(): Unit = { + withTempFile { output => + withTempFile { xmlOut => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + var cmd = args"parse -s $schema -c $config --root a -o $xmlOut $input " + runCLI(cmd) { cli => + cli.expect(s"") Review Comment: The 
body of the runCLI function can be empty. You don't need to expect the empty string. ########## daffodil-cli/src/it/scala/org/apache/daffodil/xml/TestXMLConversionControl.scala: ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.daffodil.xml + +import org.apache.commons.io.FileUtils +import org.junit.Test +import org.apache.daffodil.CLI.Util._ +import org.apache.daffodil.Main.ExitCode +import org.junit.Assert.assertTrue + +import java.nio.charset.StandardCharsets + +class TestXMLConversionControl { + + // + // To run tests conveniently under IntelliJ IDEA, + // rename the src/test dir to src/test1. Rename the src/it dir to src/test. + // Then modify this val to be "test". + // Then you can run these as ordinary junit-style tests under the IDE. + val test = "it" Review Comment: I'd prefer we solve this issue and update our entire integration suite in a separate PR. Making one-off changes to a subset of our integration tests made it really difficult to update everything to be consistent. 
########## daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala: ########## @@ -997,17 +1008,22 @@ object Main { val validate = unparseOpts.validate.toOption.get - val optDafConfig = unparseOpts.config.toOption.map{ DaffodilConfig.fromFile(_) } + val dafConfig = unparseOpts.config.toOption.map{ + DaffodilConfig.fromFile(_) + }getOrElse( + DaffodilConfig() + ) val processor = { if (unparseOpts.parser.isDefined) { createProcessorFromParser(unparseOpts.parser(), unparseOpts.path.toOption, validate) } else { - val tunables = DaffodilTunables.configPlusMoreTunablesMap(unparseOpts.tunables, optDafConfig) + val tunables = DaffodilTunables.configPlusMoreTunablesMap(unparseOpts.tunables, dafConfig) createProcessorFromSchema(unparseOpts.schema(), unparseOpts.rootNS.toOption, unparseOpts.path.toOption, tunables, validate) } - }.map{ _.withExternalVariables(combineExternalVariables(unparseOpts.vars, optDafConfig)) } + }.map{ _.withExternalVariables(combineExternalVariables(unparseOpts.vars, dafConfig)) } .map{ _.withValidationMode(validate) } + .map{ _.withDaffodilConfig(dafConfig)} Review Comment: This seems a little odd. We parse a bunch of stuff from the config (like variables, tunables, etc.), set those on the DataProcessor, and then also set the config on the data processor. Why are we doing that? Up until now, the config file was external to Daffodil and was just a way for the CLI/TDMLRunner to configure it. Seems like we shouldn't need to pass in the config file. ########## daffodil-japi/src/main/java/org/apache/daffodil/japi/package-info.java: ########## @@ -118,7 +118,7 @@ * * <pre> * {@code - * JDOMInfosetOutputter jdomOutputter= new JDOMInfosetOutputter(); + * JDOMInfosetOutputter jdomOutputter= new JDOMInfosetOutputter(dp.daffodilConfig().xmlConversionControl()); Review Comment: This feels like a weird API to suggest to the user, especially since there might be cases where the infoset outputter is created before we even have a DataProcessor. 
Access to the DP shouldn't be needed to configure this setting. ########## daffodil-japi/src/main/scala/org/apache/daffodil/japi/infoset/Infoset.scala: ########## @@ -217,10 +218,10 @@ abstract class InfosetOutputter extends SInfosetOutputter { * * @param showFormatInfo add additional properties to each scala.xml.Node for debug purposes */ -class ScalaXMLInfosetOutputter(showFormatInfo: Boolean = false) +class ScalaXMLInfosetOutputter(xcc: XMLConversionControl, showFormatInfo: Boolean = false) Review Comment: Backwards incompatible change, these new XCC parameters should all be optional. ########## daffodil-core/src/test/scala/org/apache/daffodil/dsom/TestExternalVariables.scala: ########## @@ -332,7 +332,7 @@ class TestExternalVariables { val dp1 = pf.onPath("/") val dp2 = pf.onPath("/").withExternalVariables(variables) - val outputter = new ScalaXMLInfosetOutputter() + val outputter = new ScalaXMLInfosetOutputter(dp2.daffodilConfig.xmlConversionControl) Review Comment: Related to a previous comment, it seems odd to be getting the XCC from the DataProcessor when all the other information is read directly from the config file and then passed to the DataProcessor. Seems like Daffodil internals shouldn't know about the config file, and only the external interfaces should? ########## daffodil-lib/src/main/scala/org/apache/daffodil/api/DaffodilConfig.scala: ########## @@ -26,8 +28,105 @@ import java.net.URI import scala.xml.Elem import scala.xml.Node -object DaffodilConfig { +/** + * Makes it simple to define enum corresponding to an XSD attribute declaration. + * + * TODO: Move to daffodil-lib xml package. 
+ */ +trait AttrEnum extends Enumeration { + type Type = Value + + final def fromXML(xml: Node): Value = { + var rawtxt = (xml \ ("@" + attributeName)).text + val opt: Option[Value] = + if (rawtxt == "") + None + else + Some(withName(rawtxt)) + opt.getOrElse(default) + } + + def default: Value + + private lazy val nameFromClass = + Misc.toInitialLowerCaseUnlessAllUpperCase( + Misc.getNameFromClass(this) + ) + + /** + * object class name must match the config file element name + * except for starting with upper case letter. + * + * Or you can override. + */ + def attributeName = nameFromClass + + /** + * Appended to diagnostic messages. But these + * should not happen if the XML Loading does validation. + */ + protected def adviceString: String +} + +/** + * Makes it easy to construct structures corresponding + * to sub-elements. + */ +trait SubElement extends Serializable { + + final def parseFromParentXML(parentXML: Node): Option[Node] = { + var optNode = (parentXML \ subElementName).headOption + optNode + } + + private lazy val nameFromClass = + Misc.toInitialLowerCaseUnlessAllUpperCase( + Misc.getNameFromClass(this) + ) + def subElementName: String = nameFromClass +} + +/** + * For use with Config files for defining + * enums for XSD attributes. + */ +trait ConfigAttrEnum extends AttrEnum { + override protected def adviceString = "Config files should be XSD validated before processing them." +} + + +object XMLConversionControl extends SubElement { Review Comment: The more I think about the .properties file I wonder if that would alleviate a lot of this complexity (and a number of bugs we already have surrounding incorrect parsing the config files). People hate XML, maybe we should consider moving our config file away from it? A number of key value pairs is so much simpler to parse. And our config files really aren't complex enough to warrant the extra complexity that XML provides. 
A property file that contains `xmlConversionControl = convertCR2LF` could be parsed with a one-liner like this: ```scala val xcc = XMLConversionControl.withName(properties.getProperty("xmlConversionControl")) ``` And we can "namespace" properties pretty easily, e.g. the new CDATA escaping used only in the XMLTextInfosetOutputter might look like ```properties infoset.xmlTextInfosetOutputter.escapeStyle = cdata ``` ########## daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala: ########## @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.daffodil.util + +/** + * A abstract base for Remappers which convert strings. + * + * The public interface is just `def remap(s: String): String`. + * + * There are protected methods that implementations must provide. + * + * Contains shared implementation methods also. + * + * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast. + * Use a java-like coding style. While loops, not map/flatmap/etc. avoid tuples. + */ +trait CharacterSetRemapper { + + /** + * Remaps the string. Returns the original string object if no remapping is required. 
+ */ + def remap(s: String): String = remapImpl(s) Review Comment: Should we just make this final and put remapImpl's content here? Nothing should need to override this remap function, right? Only the `remap(Int,Int,Int)` function? ########## daffodil-runtime1/src/main/scala/org/apache/daffodil/infoset/NullInfosetInputter.scala: ########## @@ -41,8 +45,49 @@ object NullInfosetInputter { simpleText: String = null, isNilled: MaybeBoolean = MaybeBoolean.Nope, ) +} + +/** + * InfosetInputter that has the minimum possible amount of overhead during + * unparse operations, intended to be used for performance comparisons. The + * events array should be created by calling NullInfosetInputter.toEvents() + * prior to any performance testing and outside any critical sections, and + * passed into a new NullInfosetInputter for unparsing. + */ +class NullInfosetInputter(is: InputStream, + override val xmlConversionControl: XMLConversionControl) extends InfosetInputter + with XMLInfosetInputterMixin { + + lazy val events: Array[NullInfosetInputter.Event] = toEvents(is) Review Comment: I think this is incorrect. Converting to an array of events must happen outside of the creation of the NullInfosetInputter to avoid including converting the stream to events in performance metrics. That means the code converting the stream to events must know about the XCC, not the infoset inputter. I think changes to this file need to be reverted. ########## daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd: ########## @@ -98,504 +117,1034 @@ - minExclusive - maxExclusive --> - <xs:element name="tunables"> - <xs:complexType> - <xs:all> - <xs:element name="allowExpressionResultCoercion" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines how Daffodil coerces expressions where the result type differs - from the expected type. As an example, assume the expected type of an - expression is an xs:string, but the expression is { 3 }. 
In this case, the - expression result is an xs:int, which should not be automatically coerced - to an xs:string. Instead, the expression should be { xs:string(3) } or { "3" } - If the value of this tunable is false, these types of expressions will - result in a schema definition error. If the value is true, Daffodil will - provide a warning and attempt to coerce the result type to the expected - type. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="allowExternalPathExpressions" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - By default, path expressions in Daffodil will only work correctly if path - steps are used in an expression defined in the schema when compiled. To - enable the use of other expressions (e.g. during debugging, where not all - expressions are known at schema compile time), set this tunable to true. - This may cause a degredation of performance in path expression evaluation, - so this should be avoided when in production. This flag is automatically - enabled when debugging is enabled. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="blobChunkSizeInBytes" default="4096" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When reading/writing blob data, the maximum number of bytes to read/write - at a time. This is also used when parsing xs:hexBinary data. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - <xs:maxInclusive value="268435455" /> <!-- Limit to (MaxInt / 8) because some places convert this tunable to bits --> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="defaultEmptyElementParsePolicy" type="daf:TunableEmptyElementParsePolicy" default="treatAsEmpty" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines the default empty element parse policy to use if it is not defined - in a schema. 
This is only used if requireEmptyElementParsePolicyProperty is - false. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="defaultInitialRegexMatchLimitInChars" default="32" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="errorOnUnsupportedJavaVersion" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="generatedNamespacePrefixStem" type="xs:string" default="tns" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Stem to use when generating a namespace prefix when one is not defined for - the target naespace. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="initialElementOccurrencesHint" default="10" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Initial array buffer size allocated for recurring elements/arrays. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="initialRegexMatchLimitInCharacters" default="64" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Initial number of characters to match when performing regular expression - matches on input data. When a regex fails to match, more data may be - consumed up to the maximumRegexMatchLengthInCharacters tunable. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="infosetWalkerSkipMin" default="32" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter, skipping at least this number of walk attempts. Larger values - mean delayed InfosetOutputter events and more memory usage; Smaller values mean - more CPU usage. Set this value to zero to never skip any walk attempts. This is - specifically for advanced testing behavior and should not need to be changed by users. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="infosetWalkerSkipMax" default="2048" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter. On walks where no progress is made, the number of walks to skip - is increased with the assumption that something is blocking it (like an - unresolved point of uncertainty), up to this maximum value. Higher values mean - less attempts are made when blocked for a long time, but with potentially more - delays and memory usage before InfosetOutputter events are created. This is - specifically for advanced testing behavior and should not need to be changed by users. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="inputFileMemoryMapLowThreshold" type="xs:int" default="33554432" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. 
- </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="maxBinaryDecimalVirtualPoint" default="200" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The largest allowed value of the dfdl:binaryDecimalVirtualPoint property. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxByteArrayOutputStreamBufferSizeInBytes" default="2097152000" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unparsing, this is the maximum size of the buffer that the - ByteArrayOutputStream can grow to before switching to a file based - output stream. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxDataDumpSizeInBytes" default="256" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The maximum size of data to retrive when when getting data to display - for debugging. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxFieldContentLengthInBytes" type="xs:int" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="maxHexBinaryLengthInBytes" default="2147483647" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The maximum size allowed for an xs:hexBinary element. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxLengthForVariableLengthDelimiterDisplay" default="10" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unexpected text is found where a delimiter is expected, this is the maximum - number of bytes (characters) to display when the expected delimiter is a variable - length delimiter. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxLookaheadFunctionBits" default="512" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Max distance that the DPath lookahead function is permitted to look. - Distance is defined by the distance to the last bit accessed, and - so it is offset+bitsize. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:long"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxOccursBounds" default="2147483647" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of occurances of an array element. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:long"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxSkipLengthInBytes" default="1024" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of bytes allowed to skip in a skip region. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxValidYear" default="9999" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must - have SimpleDateFormat parse in lenient mode, which allows the year value - to overflow with very large years into possibly negative years. This - tunable tunable sets an upper limit for values to prevent overflow. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maximumRegexMatchLengthInCharacters" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of characters to match when performing regular expression - matches on input data. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maximumSimpleElementSizeInCharacters" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of characters to parse when parsing string data. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="minBinaryDecimalVirtualPoint" default="-200" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The smallest allowed value of the dfdl:binaryDecimalVirtualPoint property. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:maxInclusive value="-1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="minValidYear" type="xs:int" default="0" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must - have SimpleDateFormat parse in lenient mode, which allows the year value - to overflow with very large years into possibly negative years. This - tunable tunable sets an upper limit for values to prevent underflow. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="outputStreamChunkSizeInBytes" default="65536" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When writing file data to the output stream during unparse, this - is the maximum number of bytes to write at a time. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="parseUnparsePolicy" type="daf:TunableParseUnparsePolicyTunable" default="fromRoot" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Whether to compile a schema to support only parsing, only unparsing, both, or to - use the daf:parseUnparsePolicy from the root node. All child elements of the root - must have a compatable daf:parseUnaprsePolicy property. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="readerByteBufferSize" type="xs:int" default="8192" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. 
- </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="releaseUnneededInfoset" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil will periodically release internal infoset elements that it determines - are no longer needed, thus freeing memory. Setting this value to false will - prevent this from taking place. This should usually only be used while debugging - or with very specific tests. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireBitOrderProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:bitOrder property is specified. If false, use a - default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireEmptyElementParsePolicyProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:emptyElementParsePolicy property is specified in - the schema. If false, and not defined in the schema, uses the - defaultEmptyElementParsePolicy as the value of emptyElementParsePolicy. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireEncodingErrorPolicyProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:encodingErrorPolicy property is specified. If - false, use a default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireFloatingProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:floating property is specified. If - false, use a default value if the property is not defined in the schema. 
- </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireTextBidiProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:testBidi property is specified. If - false, use a default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireTextStandardBaseProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:textStandardBase property is specified. If false - and the property is missing, behave as if the property is set to 10. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="saxUnparseEventBatchSize" default="100" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil's SAX Unparse API allows events to be batched in memory to minimize the - frequency of context switching between the SAXInfosetInputter thread that processes - the events, and the DaffodilUnparseContentHandler thread that generates the events. - Setting this value to a low number will increase the frequency of context switching, - but will reduce the memory footprint. Swtting it to a high number will decrease the - frequency of context switching, but increase the memory footprint. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="suppressSchemaDefinitionWarnings" type="daf:TunableSuppressSchemaDefinitionWarnings" default="emptyElementParsePolicyError" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Space-separated list of schema definition warnings that should be ignored, - or "all" to ignore all warnings. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="tempFilePath" type="xs:string" default="This string is ignored. 
Default value is taken from java.io.tmpdir property" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unparsing, use this path to store temporary files that may be genrated. - The default value (empty string) will result in the use of the java.io.tmpdir - property being used as the path. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="unqualifiedPathStepPolicy" type="daf:TunableUnqualifiedPathStepPolicy" default="noNamespace" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines how to lookup DFDL expression path steps that to not include a - namespace prefix. Values are: - - noNamespace: only match elements that do not have a namespace - - defaultNamespace: only match elements defined in the default namespace - - preferDefaultNamespace: match elements defined in the default namespace; - if non are found, match elemnts that do not have a namespace - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="unparseSuspensionWaitOld" default="100" minOccurs="0"> - <xs:annotation> - <xs:documentation> - While unparsing, some unparse actions require "suspending" which - requires buffering unparse output until the suspension can be - evaluated. Daffodil periodically attempts to reevaluate these - suspensions so that these buffers can be released. We attempt to - evaluate young suspensions shortly after creation with the hope - that it will succeed and we can release associated buffers. But if - a young suspension fails it is moved to the old suspension list. - Old suspensions are evaluated less frequently since they are less - likely to succeeded. This minimizes the overhead related to - evaluating suspensions that are likely to fail. The - unparseSuspensionWaitYoung and unparseSuspensionWaitOld - values determine how many elements are unparsed before evaluating - young and old suspensions, respectively. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="unparseSuspensionWaitYoung" default="5" minOccurs="0"> - <xs:annotation> - <xs:documentation> - See unparseSuspensionWaitOld - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - </xs:all> - </xs:complexType> - </xs:element> + <xs:element name="tunables" type="tns:tunablesType"/> Review Comment: Why this change? It's hard to tell if there are actual changes to the tunables. ########## daffodil-cli/src/it/scala/org/apache/daffodil/xml/TestXMLConversionControl.scala: ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.daffodil.xml + +import org.apache.commons.io.FileUtils +import org.junit.Test +import org.apache.daffodil.CLI.Util._ +import org.apache.daffodil.Main.ExitCode +import org.junit.Assert.assertTrue + +import java.nio.charset.StandardCharsets + +class TestXMLConversionControl { + + // + // To run tests conveniently under IntelliJ IDEA, + // rename the src/test dir to src/test1. Rename the src/it dir to src/test. + // Then modify this val to be "test". + // Then you can run these as ordinary junit-style tests under the IDE. + val test = "it" + + @Test def test_CLI_XMLConversionControlConvertCR(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-convertCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { + cli => cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\ndef\nghi</ex:a>")) + } + } + + @Test def test_CLI_XMLConversionControlPreserveCRParse(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { cli => + cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\uE00D\ndef\uE00D\nghi</ex:a>")) + } + } + + @Test def 
test_CLI_XMLConversionControlPreserveCRRoundTrip(): Unit = { + withTempFile { output => + withTempFile { xmlOut => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + var cmd = args"parse -s $schema -c $config --root a -o $xmlOut $input " + runCLI(cmd) { cli => + cli.expect(s"") + }(ExitCode.Success) + + cmd = args"unparse -s $schema -c $config --root a -o $output $xmlOut" + runCLI(cmd) { cli => + cli.expect(s"") + }(ExitCode.Success) + + + val xml = FileUtils.readFileToString(xmlOut.toFile, StandardCharsets.UTF_8) + println(xml) + assertTrue(xml.toString.contains("abc\uE00D\ndef\uE00D\nghi")) + } + + val xml = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(xml.toString.contains("abc\r\ndef\r\nghi")) + } + } + + @Test def test_CLI_XMLConversionControlPreserveCRUnparseToFile(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + + runCLI(args"unparse -s $schema -c $config --root a -o $output ") { cli => + cli.send("<ex:a xmlns:ex='urn:ex'>abc\uE00D\ndef\uE00D\nghi</ex:a>", inputDone = true) + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("abc\r\ndef\r\nghi")) + } + } + + // + // Illustrates a problem with the expect library (perhaps?) Review Comment: This is because of the line in Util.scala: ```scala eb.withInputFilters(replaceInString("\r\n", "\n")) ``` Windows and Linux output different new lines. 
By doing this we don't have to worry about the CLI outputting a CRLF or LF and we just convert it all to a LF. We may need another alternative either to your tests or the test infrastructure if this is something we need. ########## daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat: ########## @@ -0,0 +1,3 @@ +abc +def +ghi Review Comment: I don't think you can rely on git not munging these CRLFs on windows. We need an alternative method to ensure that we actually send CRLF as test data. One option is to use withTempFile, write the bytes directly to that and then parse with the input file being the temp file. ########## daffodil-cli/src/it/scala/org/apache/daffodil/xml/TestXMLConversionControl.scala: ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.daffodil.xml + +import org.apache.commons.io.FileUtils +import org.junit.Test +import org.apache.daffodil.CLI.Util._ +import org.apache.daffodil.Main.ExitCode +import org.junit.Assert.assertTrue + +import java.nio.charset.StandardCharsets + +class TestXMLConversionControl { + + // + // To run tests conveniently under IntelliJ IDEA, + // rename the src/test dir to src/test1. 
Rename the src/it dir to src/test. + // Then modify this val to be "test". + // Then you can run these as ordinary junit-style tests under the IDE. + val test = "it" + + @Test def test_CLI_XMLConversionControlConvertCR(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-convertCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") Review Comment: Suggest moving these outside the withTempFile block since they don't need it. Makes it more consistent with the rest of our test suite. ########## daffodil-cli/src/it/scala/org/apache/daffodil/xml/TestXMLConversionControl.scala: ########## @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.daffodil.xml + +import org.apache.commons.io.FileUtils +import org.junit.Test +import org.apache.daffodil.CLI.Util._ +import org.apache.daffodil.Main.ExitCode +import org.junit.Assert.assertTrue + +import java.nio.charset.StandardCharsets + +class TestXMLConversionControl { + + // + // To run tests conveniently under IntelliJ IDEA, + // rename the src/test dir to src/test1. Rename the src/it dir to src/test. + // Then modify this val to be "test". + // Then you can run these as ordinary junit-style tests under the IDE. + val test = "it" + + @Test def test_CLI_XMLConversionControlConvertCR(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-convertCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { + cli => cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\ndef\nghi</ex:a>")) + } + } + + @Test def test_CLI_XMLConversionControlPreserveCRParse(): Unit = { + withTempFile { output => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + runCLI(args"parse -s $schema -c $config --root a -o $output $input") { cli => + cli.expect("") + }(ExitCode.Success) + + val res = FileUtils.readFileToString(output.toFile, StandardCharsets.UTF_8) + assertTrue(res.contains("<ex:a xmlns:ex=\"urn:ex\">abc\uE00D\ndef\uE00D\nghi</ex:a>")) + } + } + + @Test def 
test_CLI_XMLConversionControlPreserveCRRoundTrip(): Unit = { + withTempFile { output => + withTempFile { xmlOut => + val schema = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/aString.dfdl.xsd") + val config = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/config-preserveCR.cfg.xml") + val input = path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/input/inputWithCRLFs.txt.dat") + + var cmd = args"parse -s $schema -c $config --root a -o $xmlOut $input " + runCLI(cmd) { cli => + cli.expect(s"") + }(ExitCode.Success) + + cmd = args"unparse -s $schema -c $config --root a -o $output $xmlOut" + runCLI(cmd) { cli => + cli.expect(s"") + }(ExitCode.Success) + + + val xml = FileUtils.readFileToString(xmlOut.toFile, StandardCharsets.UTF_8) + println(xml) Review Comment: Remove println ########## daffodil-cli/src/main/scala/org/apache/daffodil/InfosetTypes.scala: ########## @@ -473,18 +475,18 @@ case class NULLInfosetHandler(dataProcessor: DataProcessor) } def unparse(data: AnyRef, output: DFDL.Output): UnparseResult = { - val events = data.asInstanceOf[Array[NullInfosetInputter.Event]] - val input = new NullInfosetInputter(events) + val is = data match { + case bytes: Array[Byte] => new ByteArrayInputStream(bytes) + case is: InputStream => is Review Comment: These changes to the NullInfosetInputter feel wrong. The goal of the unparse NullInfosetInputter is to avoid overhead related to parsing a stream or bytes to events, so the stream/bytes must be converted to the NullInfosetInputter Event array in the dataToInfoset function like it was doing before. This way actual reading/parsing the infoset happens outside the performance loop. 
########## daffodil-cli/src/main/scala/org/apache/daffodil/InfosetTypes.scala: ########## @@ -414,15 +414,17 @@ class ScalaXMLInfosetParseResult(parseResult: ParseResult, output: ScalaXMLInfos case class W3CDOMInfosetHandler(dataProcessor: DataProcessor) extends InfosetHandler { + private val xcc = dataProcessor.daffodilConfig.xmlConversionControl Review Comment: You didn't create this var in other handlers, for consistency I'd suggest we don't do it here. ########## daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala: ########## @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.daffodil.util + +/** + * A abstract base for Remappers which convert strings. + * + * The public interface is just `def remap(s: String): String`. + * + * There are protected methods that implementations must provide. + * + * Contains shared implementation methods also. + * + * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast. + * Use a java-like coding style. While loops, not map/flatmap/etc. avoid tuples. + */ +trait CharacterSetRemapper { + + /** + * Remaps the string. Returns the original string object if no remapping is required. 
+ */ + def remap(s: String): String = remapImpl(s) + + /** + * Remaps 1 character, does not consider any context. + */ + def remapChar(c: Char): Char = remap(0, c, 0).toChar + + /** + * Remaps characters. Provides the previous and following characters since some remappings + * require this context. + * + * Plays a trick with negating the return value in order to avoid having to + * return more than one value, which is potentially less efficient. + * + * @param prev The character prior to the one being considered. (Needed for surrogates) + * @param curr The character under consideration for remapping. + * @param next The next character afterwards. (Needed for surrogates and CRLF pairs) + * @return The remapped character (as an Int) or that same remapped character Int + * value negated, which signals that curr+next was remapped to a single character. + * Such as is needed if CRLF is remapped to just LF. + */ + protected def remap (prev: Int, curr: Int, next: Int): Int + + private def needsRemapping(s: String): Boolean = { + // a one liner in scala, + // + // `s.exists{ remapChar(_) != _ }` + // + // but we need a fast java-like while loop... + var pos = 0 + var c = 0.toChar + val len = s.length + if (len != 0) + while (pos < len) { + c = s(pos) + if (remapChar(c) != c) Review Comment: Do surrogate pairs not matter when considering if something needs remapping? ########## daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala: ########## @@ -486,6 +484,14 @@ class DataProcessor private[japi] (private var dp: SDataProcessor) catch { case e: SExternalVariableException => throw new ExternalVariableException(e.getMessage) } } + def daffodilConfig: DaffodilConfig = dp.daffodilConfig + + def withDaffodilConfig(uri: URI): DataProcessor = + copy(dp = dp.withDaffodilConfig(uri)) + + def withDaffodilConfig(dc: DaffodilConfig): DataProcessor = + copy(dp = dp.withDaffodilConfig(dc)) + Review Comment: It's not totally clear to me the need for this. 
It feels like you're just storing the config on the DP so you can access it later, when we should really just be passing the config or XCC information to the places that need this information (i.e. infoset outputter creators). This might be interesting if we automatically read tunables/variable etc from this config, but it doesn't look like we do that. ########## daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala: ########## @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.daffodil.util + +/** + * A abstract base for Remappers which convert strings. + * + * The public interface is just `def remap(s: String): String`. + * + * There are protected methods that implementations must provide. + * + * Contains shared implementation methods also. + * + * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast. + * Use a java-like coding style. While loops, not map/flatmap/etc. avoid tuples. + */ +trait CharacterSetRemapper { + + /** + * Remaps the string. Returns the original string object if no remapping is required. + */ + def remap(s: String): String = remapImpl(s) + + /** + * Remaps 1 character, does not consider any context. 
+ */ + def remapChar(c: Char): Char = remap(0, c, 0).toChar + + /** + * Remaps characters. Provides the previous and following characters since some remappings + * require this context. + * + * Plays a trick with negating the return value in order to avoid having to + * return more than one value, which is potentially less efficient. + * + * @param prev The character prior to the one being considered. (Needed for surrogates) + * @param curr The character under consideration for remapping. + * @param next The next character afterwards. (Needed for surrogates and CRLF pairs) + * @return The remapped character (as an Int) or that same remapped character Int + * value negated, which signals that curr+next was remapped to a single character. + * Such as is needed if CRLF is remapped to just LF. + */ + protected def remap (prev: Int, curr: Int, next: Int): Int + + private def needsRemapping(s: String): Boolean = { Review Comment: A thought instead of this needsRemapping thing. Maybe we just make the remap function copy on write? So as long as the remap function returns the same thing we do nothing, but if remap changes something then it copies the string up until that point to the StringBuilder and then all chars are appended from that point on. This way, we avoid potentially different logic in needsRemapping (which doesn't take context into account) versus the remap function (which does). 
########## daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java: ########## @@ -127,7 +127,7 @@ public void testJavaAPI1() throws IOException, ClassNotFoundException { java.io.File file = getResource("/test/japi/myData.dat"); java.io.FileInputStream fis = new java.io.FileInputStream(file); InputSourceDataInputStream dis = new InputSourceDataInputStream(fis); - JDOMInfosetOutputter outputter = new JDOMInfosetOutputter(); + JDOMInfosetOutputter outputter = new JDOMInfosetOutputter(dp.daffodilConfig().xmlConversionControl()); Review Comment: Ideally we wouldn't need to change most of these tests, so all these infoset outputters could just use the defaults. ########## daffodil-runtime1/src/main/scala/org/apache/daffodil/debugger/InteractiveDebugger.scala: ########## @@ -435,9 +435,9 @@ class InteractiveDebugger(runner: InteractiveDebuggerRunner, eCompilers: Express } } - private def infosetToString(ie: InfosetElement): String = { + private def infosetToString(ie: InfosetElement, daffodilConfig: DaffodilConfig): String = { val bos = new java.io.ByteArrayOutputStream() - val xml = new XMLTextInfosetOutputter(bos, true) + val xml = new XMLTextInfosetOutputter(bos, true, daffodilConfig.xmlConversionControl) Review Comment: I suggest the debugger just be opinionated about these XML conversions, or if we do want it to be modifiable, then it should be a debugger setting with the `set` command. Having to pass around this config for one option feels unnecessary. ########## daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd: ########## @@ -98,504 +117,1034 @@ - minExclusive - maxExclusive --> - <xs:element name="tunables"> - <xs:complexType> - <xs:all> - <xs:element name="allowExpressionResultCoercion" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines how Daffodil coerces expressions where the result type differs - from the expected type. 
As an example, assume the expected type of an - expression is an xs:string, but the expression is { 3 }. In this case, the - expression result is an xs:int, which should not be automatically coerced - to an xs:string. Instead, the expression should be { xs:string(3) } or { "3" } - If the value of this tunable is false, these types of expressions will - result in a schema definition error. If the value is true, Daffodil will - provide a warning and attempt to coerce the result type to the expected - type. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="allowExternalPathExpressions" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - By default, path expressions in Daffodil will only work correctly if path - steps are used in an expression defined in the schema when compiled. To - enable the use of other expressions (e.g. during debugging, where not all - expressions are known at schema compile time), set this tunable to true. - This may cause a degredation of performance in path expression evaluation, - so this should be avoided when in production. This flag is automatically - enabled when debugging is enabled. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="blobChunkSizeInBytes" default="4096" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When reading/writing blob data, the maximum number of bytes to read/write - at a time. This is also used when parsing xs:hexBinary data. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - <xs:maxInclusive value="268435455" /> <!-- Limit to (MaxInt / 8) because some places convert this tunable to bits --> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="defaultEmptyElementParsePolicy" type="daf:TunableEmptyElementParsePolicy" default="treatAsEmpty" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines the default empty element parse policy to use if it is not defined - in a schema. This is only used if requireEmptyElementParsePolicyProperty is - false. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="defaultInitialRegexMatchLimitInChars" default="32" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="errorOnUnsupportedJavaVersion" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="generatedNamespacePrefixStem" type="xs:string" default="tns" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Stem to use when generating a namespace prefix when one is not defined for - the target naespace. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="initialElementOccurrencesHint" default="10" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Initial array buffer size allocated for recurring elements/arrays. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="initialRegexMatchLimitInCharacters" default="64" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Initial number of characters to match when performing regular expression - matches on input data. When a regex fails to match, more data may be - consumed up to the maximumRegexMatchLengthInCharacters tunable. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="infosetWalkerSkipMin" default="32" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter, skipping at least this number of walk attempts. Larger values - mean delayed InfosetOutputter events and more memory usage; Smaller values mean - more CPU usage. Set this value to zero to never skip any walk attempts. This is - specifically for advanced testing behavior and should not need to be changed by users. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="infosetWalkerSkipMax" default="2048" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter. On walks where no progress is made, the number of walks to skip - is increased with the assumption that something is blocking it (like an - unresolved point of uncertainty), up to this maximum value. Higher values mean - less attempts are made when blocked for a long time, but with potentially more - delays and memory usage before InfosetOutputter events are created. 
This is - specifically for advanced testing behavior and should not need to be changed by users. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="inputFileMemoryMapLowThreshold" type="xs:int" default="33554432" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="maxBinaryDecimalVirtualPoint" default="200" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The largest allowed value of the dfdl:binaryDecimalVirtualPoint property. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxByteArrayOutputStreamBufferSizeInBytes" default="2097152000" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unparsing, this is the maximum size of the buffer that the - ByteArrayOutputStream can grow to before switching to a file based - output stream. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="0" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxDataDumpSizeInBytes" default="256" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The maximum size of data to retrive when when getting data to display - for debugging. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxFieldContentLengthInBytes" type="xs:int" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. 
This tunable no longer has any affect and is only kept for - backwards compatability. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="maxHexBinaryLengthInBytes" default="2147483647" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The maximum size allowed for an xs:hexBinary element. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxLengthForVariableLengthDelimiterDisplay" default="10" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unexpected text is found where a delimiter is expected, this is the maximum - number of bytes (characters) to display when the expected delimiter is a variable - length delimiter. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxLookaheadFunctionBits" default="512" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Max distance that the DPath lookahead function is permitted to look. - Distance is defined by the distance to the last bit accessed, and - so it is offset+bitsize. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:long"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxOccursBounds" default="2147483647" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of occurances of an array element. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:long"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxSkipLengthInBytes" default="1024" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of bytes allowed to skip in a skip region. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maxValidYear" default="9999" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must - have SimpleDateFormat parse in lenient mode, which allows the year value - to overflow with very large years into possibly negative years. This - tunable tunable sets an upper limit for values to prevent overflow. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maximumRegexMatchLengthInCharacters" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of characters to match when performing regular expression - matches on input data. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="maximumSimpleElementSizeInCharacters" default="1048576" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Maximum number of characters to parse when parsing string data. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="minBinaryDecimalVirtualPoint" default="-200" minOccurs="0"> - <xs:annotation> - <xs:documentation> - The smallest allowed value of the dfdl:binaryDecimalVirtualPoint property. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:maxInclusive value="-1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="minValidYear" type="xs:int" default="0" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must - have SimpleDateFormat parse in lenient mode, which allows the year value - to overflow with very large years into possibly negative years. This - tunable tunable sets an upper limit for values to prevent underflow. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="outputStreamChunkSizeInBytes" default="65536" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When writing file data to the output stream during unparse, this - is the maximum number of bytes to write at a time. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="parseUnparsePolicy" type="daf:TunableParseUnparsePolicyTunable" default="fromRoot" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Whether to compile a schema to support only parsing, only unparsing, both, or to - use the daf:parseUnparsePolicy from the root node. All child elements of the root - must have a compatable daf:parseUnaprsePolicy property. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="readerByteBufferSize" type="xs:int" default="8192" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Deprecated. This tunable no longer has any affect and is only kept for - backwards compatability. 
- </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="releaseUnneededInfoset" type="xs:boolean" default="true" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil will periodically release internal infoset elements that it determines - are no longer needed, thus freeing memory. Setting this value to false will - prevent this from taking place. This should usually only be used while debugging - or with very specific tests. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireBitOrderProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:bitOrder property is specified. If false, use a - default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireEmptyElementParsePolicyProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:emptyElementParsePolicy property is specified in - the schema. If false, and not defined in the schema, uses the - defaultEmptyElementParsePolicy as the value of emptyElementParsePolicy. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireEncodingErrorPolicyProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:encodingErrorPolicy property is specified. If - false, use a default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireFloatingProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:floating property is specified. If - false, use a default value if the property is not defined in the schema. 
- </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireTextBidiProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:testBidi property is specified. If - false, use a default value if the property is not defined in the schema. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="requireTextStandardBaseProperty" type="xs:boolean" default="false" minOccurs="0"> - <xs:annotation> - <xs:documentation> - If true, require that the dfdl:textStandardBase property is specified. If false - and the property is missing, behave as if the property is set to 10. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="saxUnparseEventBatchSize" default="100" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Daffodil's SAX Unparse API allows events to be batched in memory to minimize the - frequency of context switching between the SAXInfosetInputter thread that processes - the events, and the DaffodilUnparseContentHandler thread that generates the events. - Setting this value to a low number will increase the frequency of context switching, - but will reduce the memory footprint. Swtting it to a high number will decrease the - frequency of context switching, but increase the memory footprint. - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="suppressSchemaDefinitionWarnings" type="daf:TunableSuppressSchemaDefinitionWarnings" default="emptyElementParsePolicyError" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Space-separated list of schema definition warnings that should be ignored, - or "all" to ignore all warnings. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="tempFilePath" type="xs:string" default="This string is ignored. 
Default value is taken from java.io.tmpdir property" minOccurs="0"> - <xs:annotation> - <xs:documentation> - When unparsing, use this path to store temporary files that may be genrated. - The default value (empty string) will result in the use of the java.io.tmpdir - property being used as the path. - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="unqualifiedPathStepPolicy" type="daf:TunableUnqualifiedPathStepPolicy" default="noNamespace" minOccurs="0"> - <xs:annotation> - <xs:documentation> - Defines how to lookup DFDL expression path steps that to not include a - namespace prefix. Values are: - - noNamespace: only match elements that do not have a namespace - - defaultNamespace: only match elements defined in the default namespace - - preferDefaultNamespace: match elements defined in the default namespace; - if non are found, match elemnts that do not have a namespace - </xs:documentation> - </xs:annotation> - </xs:element> - <xs:element name="unparseSuspensionWaitOld" default="100" minOccurs="0"> - <xs:annotation> - <xs:documentation> - While unparsing, some unparse actions require "suspending" which - requires buffering unparse output until the suspension can be - evaluated. Daffodil periodically attempts to reevaluate these - suspensions so that these buffers can be released. We attempt to - evaluate young suspensions shortly after creation with the hope - that it will succeed and we can release associated buffers. But if - a young suspension fails it is moved to the old suspension list. - Old suspensions are evaluated less frequently since they are less - likely to succeeded. This minimizes the overhead related to - evaluating suspensions that are likely to fail. The - unparseSuspensionWaitYoung and unparseSuspensionWaitOld - values determine how many elements are unparsed before evaluating - young and old suspensions, respectively. 
- </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - <xs:element name="unparseSuspensionWaitYoung" default="5" minOccurs="0"> - <xs:annotation> - <xs:documentation> - See unparseSuspensionWaitOld - </xs:documentation> - </xs:annotation> - <xs:simpleType> - <xs:restriction base="xs:int"> - <xs:minInclusive value="1" /> - </xs:restriction> - </xs:simpleType> - </xs:element> - </xs:all> - </xs:complexType> - </xs:element> + <xs:element name="tunables" type="tns:tunablesType"/> + + <!-- + Keep tunablesType and tunablesTypeUnqualified in synch. All changes to one must + also be made to the other. + + Note that tunablesTypeUnqualified is IDENTICAL except each child element carries + a form='unqualified' so that the name does not have to be qualified. + --> + <xs:complexType name="tunablesType"> + <xs:all> + <!-- + TODO: really these all should have been XML attributes + --> + <xs:element name="allowExpressionResultCoercion" type="xs:boolean" default="true" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Defines how Daffodil coerces expressions where the result type differs + from the expected type. As an example, assume the expected type of an + expression is an xs:string, but the expression is { 3 }. In this case, the + expression result is an xs:int, which should not be automatically coerced + to an xs:string. Instead, the expression should be { xs:string(3) } or { "3" } + If the value of this tunable is false, these types of expressions will + result in a schema definition error. If the value is true, Daffodil will + provide a warning and attempt to coerce the result type to the expected + type. 
+ </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="allowExternalPathExpressions" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + By default, path expressions in Daffodil will only work correctly if path + steps are used in an expression defined in the schema when compiled. To + enable the use of other expressions (e.g. during debugging, where not all + expressions are known at schema compile time), set this tunable to true. + This may cause a degredation of performance in path expression evaluation, + so this should be avoided when in production. This flag is automatically + enabled when debugging is enabled. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="blobChunkSizeInBytes" default="4096" minOccurs="0"> + <xs:annotation> + <xs:documentation> + When reading/writing blob data, the maximum number of bytes to read/write + at a time. This is also used when parsing xs:hexBinary data. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + <xs:maxInclusive + value="268435455"/> <!-- Limit to (MaxInt / 8) because some places convert this tunable to bits --> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="defaultEmptyElementParsePolicy" type="daf:TunableEmptyElementParsePolicy" default="treatAsEmpty" + minOccurs="0"> + <xs:annotation> + <xs:documentation> + Defines the default empty element parse policy to use if it is not defined + in a schema. This is only used if requireEmptyElementParsePolicyProperty is + false. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="defaultInitialRegexMatchLimitInChars" default="32" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Deprecated. This tunable no longer has any affect and is only kept for + backwards compatability. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="errorOnUnsupportedJavaVersion" type="xs:boolean" default="true" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Deprecated. This tunable no longer has any affect and is only kept for + backwards compatability. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="generatedNamespacePrefixStem" type="xs:string" default="tns" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Stem to use when generating a namespace prefix when one is not defined for + the target naespace. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="initialElementOccurrencesHint" default="10" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Initial array buffer size allocated for recurring elements/arrays. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="initialRegexMatchLimitInCharacters" default="64" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Initial number of characters to match when performing regular expression + matches on input data. When a regex fails to match, more data may be + consumed up to the maximumRegexMatchLengthInCharacters tunable. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="infosetWalkerSkipMin" default="32" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Daffodil periodically walks the internal infoset to send events to the configured + InfosetOutputter, skipping at least this number of walk attempts. 
Larger values + mean delayed InfosetOutputter events and more memory usage; Smaller values mean + more CPU usage. Set this value to zero to never skip any walk attempts. This is + specifically for advanced testing behavior and should not need to be changed by users. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="0"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="infosetWalkerSkipMax" default="2048" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Daffodil periodically walks the internal infoset to send events to the configured + InfosetOutputter. On walks where no progress is made, the number of walks to skip + is increased with the assumption that something is blocking it (like an + unresolved point of uncertainty), up to this maximum value. Higher values mean + less attempts are made when blocked for a long time, but with potentially more + delays and memory usage before InfosetOutputter events are created. This is + specifically for advanced testing behavior and should not need to be changed by users. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="0"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="inputFileMemoryMapLowThreshold" type="xs:int" default="33554432" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Deprecated. This tunable no longer has any affect and is only kept for + backwards compatability. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="maxBinaryDecimalVirtualPoint" default="200" minOccurs="0"> + <xs:annotation> + <xs:documentation> + The largest allowed value of the dfdl:binaryDecimalVirtualPoint property. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxByteArrayOutputStreamBufferSizeInBytes" default="2097152000" minOccurs="0"> + <xs:annotation> + <xs:documentation> + When unparsing, this is the maximum size of the buffer that the + ByteArrayOutputStream can grow to before switching to a file based + output stream. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="0"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxDataDumpSizeInBytes" default="256" minOccurs="0"> + <xs:annotation> + <xs:documentation> + The maximum size of data to retrive when when getting data to display + for debugging. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxFieldContentLengthInBytes" type="xs:int" default="1048576" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Deprecated. This tunable no longer has any affect and is only kept for + backwards compatability. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="maxHexBinaryLengthInBytes" default="2147483647" minOccurs="0"> + <xs:annotation> + <xs:documentation> + The maximum size allowed for an xs:hexBinary element. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxLengthForVariableLengthDelimiterDisplay" default="10" minOccurs="0"> + <xs:annotation> + <xs:documentation> + When unexpected text is found where a delimiter is expected, this is the maximum + number of bytes (characters) to display when the expected delimiter is a variable + length delimiter. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxLookaheadFunctionBits" default="512" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Max distance that the DPath lookahead function is permitted to look. + Distance is defined by the distance to the last bit accessed, and + so it is offset+bitsize. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:long"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxOccursBounds" default="2147483647" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Maximum number of occurances of an array element. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:long"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxSkipLengthInBytes" default="1024" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Maximum number of bytes allowed to skip in a skip region. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maxValidYear" default="9999" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must + have SimpleDateFormat parse in lenient mode, which allows the year value + to overflow with very large years into possibly negative years. This + tunable tunable sets an upper limit for values to prevent overflow. 
+ </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maximumRegexMatchLengthInCharacters" default="1048576" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Maximum number of characters to match when performing regular expression + matches on input data. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="maximumSimpleElementSizeInCharacters" default="1048576" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Maximum number of characters to parse when parsing string data. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="minBinaryDecimalVirtualPoint" default="-200" minOccurs="0"> + <xs:annotation> + <xs:documentation> + The smallest allowed value of the dfdl:binaryDecimalVirtualPoint property. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:maxInclusive value="-1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="minValidYear" type="xs:int" default="0" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Due to differences in the DFDL spec and ICU4J SimpleDateFormat, we must + have SimpleDateFormat parse in lenient mode, which allows the year value + to overflow with very large years into possibly negative years. This + tunable tunable sets an upper limit for values to prevent underflow. 
+ </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="outputStreamChunkSizeInBytes" default="65536" minOccurs="0"> + <xs:annotation> + <xs:documentation> + When writing file data to the output stream during unparse, this + is the maximum number of bytes to write at a time. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="parseUnparsePolicy" type="daf:TunableParseUnparsePolicyTunable" default="fromRoot" + minOccurs="0"> + <xs:annotation> + <xs:documentation> + Whether to compile a schema to support only parsing, only unparsing, both, or to + use the daf:parseUnparsePolicy from the root node. All child elements of the root + must have a compatable daf:parseUnaprsePolicy property. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="readerByteBufferSize" type="xs:int" default="8192" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Deprecated. This tunable no longer has any affect and is only kept for + backwards compatability. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="releaseUnneededInfoset" type="xs:boolean" default="true" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Daffodil will periodically release internal infoset elements that it determines + are no longer needed, thus freeing memory. Setting this value to false will + prevent this from taking place. This should usually only be used while debugging + or with very specific tests. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireBitOrderProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:bitOrder property is specified. If false, use a + default value if the property is not defined in the schema. 
+ </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireEmptyElementParsePolicyProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:emptyElementParsePolicy property is specified in + the schema. If false, and not defined in the schema, uses the + defaultEmptyElementParsePolicy as the value of emptyElementParsePolicy. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireEncodingErrorPolicyProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:encodingErrorPolicy property is specified. If + false, use a default value if the property is not defined in the schema. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireFloatingProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:floating property is specified. If + false, use a default value if the property is not defined in the schema. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireTextBidiProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:testBidi property is specified. If + false, use a default value if the property is not defined in the schema. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="requireTextStandardBaseProperty" type="xs:boolean" default="false" minOccurs="0"> + <xs:annotation> + <xs:documentation> + If true, require that the dfdl:textStandardBase property is specified. If false + and the property is missing, behave as if the property is set to 10. 
+ </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="saxUnparseEventBatchSize" default="100" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Daffodil's SAX Unparse API allows events to be batched in memory to minimize the + frequency of context switching between the SAXInfosetInputter thread that processes + the events, and the DaffodilUnparseContentHandler thread that generates the events. + Setting this value to a low number will increase the frequency of context switching, + but will reduce the memory footprint. Swtting it to a high number will decrease the + frequency of context switching, but increase the memory footprint. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="suppressSchemaDefinitionWarnings" type="daf:TunableSuppressSchemaDefinitionWarnings" + default="emptyElementParsePolicyError" minOccurs="0"> + <xs:annotation> + <xs:documentation> + Space-separated list of schema definition warnings that should be ignored, + or "all" to ignore all warnings. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="tempFilePath" type="xs:string" + default="This string is ignored. Default value is taken from java.io.tmpdir property" minOccurs="0"> + <xs:annotation> + <xs:documentation> + When unparsing, use this path to store temporary files that may be genrated. + The default value (empty string) will result in the use of the java.io.tmpdir + property being used as the path. + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="unqualifiedPathStepPolicy" type="daf:TunableUnqualifiedPathStepPolicy" default="noNamespace" + minOccurs="0"> + <xs:annotation> + <xs:documentation> + Defines how to lookup DFDL expression path steps that to not include a + namespace prefix. 
Values are: + - noNamespace: only match elements that do not have a namespace + - defaultNamespace: only match elements defined in the default namespace + - preferDefaultNamespace: match elements defined in the default namespace; + if non are found, match elemnts that do not have a namespace + </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="unparseSuspensionWaitOld" default="100" minOccurs="0"> + <xs:annotation> + <xs:documentation> + While unparsing, some unparse actions require "suspending" which + requires buffering unparse output until the suspension can be + evaluated. Daffodil periodically attempts to reevaluate these + suspensions so that these buffers can be released. We attempt to + evaluate young suspensions shortly after creation with the hope + that it will succeed and we can release associated buffers. But if + a young suspension fails it is moved to the old suspension list. + Old suspensions are evaluated less frequently since they are less + likely to succeeded. This minimizes the overhead related to + evaluating suspensions that are likely to fail. The + unparseSuspensionWaitYoung and unparseSuspensionWaitOld + values determine how many elements are unparsed before evaluating + young and old suspensions, respectively. + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + <xs:element name="unparseSuspensionWaitYoung" default="5" minOccurs="0"> + <xs:annotation> + <xs:documentation> + See unparseSuspensionWaitOld + </xs:documentation> + </xs:annotation> + <xs:simpleType> + <xs:restriction base="xs:int"> + <xs:minInclusive value="1"/> + </xs:restriction> + </xs:simpleType> + </xs:element> + </xs:all> + </xs:complexType> + + <!-- + Keep tunablesType and tunablesTypeUnqualified in synch. All changes to one must + also be made to the other. 
+ + Note that tunablesTypeUnqualified is IDENTICAL except each child element carries + a form='unqualified' so that the name does not have to be qualified. + --> + <xs:complexType name="tunablesTypeUnqualified"> + <xs:all> + <!-- + TODO: really these all should have been XML attributes + --> + <xs:element name="allowExpressionResultCoercion" type="xs:boolean" default="true" minOccurs="0" form="unqualified"> Review Comment: Is this all copy paste from above, just with a different type? Is there a way around this? This is going to be a pain to keep these in sync. ########## daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd: ########## @@ -696,13 +1245,97 @@ </xs:list> </xs:simpleType> - <xs:element name="dfdlConfig"> - <xs:complexType> - <xs:sequence> - <xs:element ref="tns:externalVariableBindings" minOccurs="0" /> - <xs:element ref="tns:tunables" minOccurs="0" /> - </xs:sequence> - </xs:complexType> - </xs:element> + <!-- + This is just an element with attributes, i.e., ordinary XSD + + Currently we're not anticipating a flurry of controls, rather + just a few, so there is no attempt at code-generation from + this type. + --> + <xs:complexType name="xmlConversionControlType"> + <xs:annotation> + <xs:documentation> + Allows control over the XML conversion process + when the DFDL Infoset is converted to/from XML. + + For example, one can control whether CRLFs and CRs are replaced + by LF when data strings are converted to XML. + </xs:documentation> + </xs:annotation> + <xs:sequence/> + <xs:attribute name="carriageReturnMapping" type="tns:carriageReturnMappingType" + use="optional" default="ConvertCR2LF"> + <xs:annotation> + <xs:documentation> + Specifies how CR (carriage return, 0x0d) is treated when data strings are converted + into XML. + + XML readers always convert CRLF pairs to single LF, and isolated CR to LF. 
+ + This control allows one to choose that same behavior for Daffodil, + or to choose a setting where CR are preserved by remapping them + to characters that XML does not modify. This mapping in inverted when + unparsing data so that CR characters are preserved. + </xs:documentation> + </xs:annotation> + </xs:attribute> + </xs:complexType> + + <xs:simpleType name="carriageReturnMappingType"> + <xs:restriction base="xs:token"> + <xs:enumeration value="PreserveCR"> + <xs:annotation> + <xs:documentation> + Preserves CR found in the data (not delimiters, actual data strings) + by remapping them by remapping them into the Unicode Public Use Area (PUA). + CR has code U+000D, and is remapped to U+E00D. + Unparsing inverts this mapping so that unparser output will + recreate CR characters faithfully. + + This is not usually how XML processing behaves. The usual + XML reader behavior is not PreserveCR, but ReplaceCR. + </xs:documentation> + </xs:annotation> + </xs:enumeration> + <xs:enumeration value="ConvertCR2LF"> + <xs:annotation> + <xs:documentation> + This is the standard behavior for XML readers. + CRLF is converted to LF and isolated CR is also + converted to LF. + + This is, unfortunately, lossy. Tests + where string data (not delimiters, actual data) + contains CR characters will not + round trip without twoPass testing. 
+ </xs:documentation> + </xs:annotation> + </xs:enumeration> + </xs:restriction> + </xs:simpleType> + + <xs:complexType name="dfdlConfigType"> + <xs:sequence> + <xs:choice> + <!-- allow qualified or unqualified form --> + <xs:element ref="tns:externalVariableBindings" minOccurs="0"/> + <xs:element name="externalVariableBindings" minOccurs="0" + type="daf:externalVarType" form="unqualified"/> + </xs:choice> + <xs:choice> + <!-- allow qualified or unqualified form --> + <xs:element ref="tns:tunables" minOccurs="0"/> + <xs:element name="tunables" minOccurs="0" type="tns:tunablesTypeUnqualified" form="unqualified"/> Review Comment: I suggest we just pick one and go with it (or switch to an easier config format like .properties) and not worry about backwards compatibility for config files. I don't think they are used that often, and it's easy to change them. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
