This is an automated email from the ASF dual-hosted git repository.
mbeckerle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new 182f157d6 Added test to show preexisting PUA chars are preserved.
182f157d6 is described below
commit 182f157d64d515af84e94576625bb3510c5126af
Author: Michael Beckerle <[email protected]>
AuthorDate: Thu May 30 12:39:41 2024 -0400
Added test to show preexisting PUA chars are preserved.
See test_puaPreexistingInfosetChars
test_puaPreexistingInfosetChars_remapped
Note that a JSON workaround was required in the TDML runner.
DAFFODIL-2883
---
.../processor/tdml/TDMLInfosetInputter.scala | 36 +++++++++-
.../section00/general/testUnparserGeneral.tdml | 82 +++++++++++++++++++---
.../section00/general/TestUnparserGeneral.scala | 17 ++++-
3 files changed, 118 insertions(+), 17 deletions(-)
diff --git
a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
index 5e4cb2d59..36d8e067c 100644
---
a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
+++
b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
@@ -22,6 +22,7 @@ import java.net.URISyntaxException
import org.apache.daffodil.lib.util.MaybeBoolean
import org.apache.daffodil.lib.util.Misc
+import org.apache.daffodil.lib.xml.XMLUtils
import org.apache.daffodil.runtime1.dpath.NodeInfo
import org.apache.daffodil.runtime1.infoset.InfosetInputter
import org.apache.daffodil.runtime1.infoset.InfosetInputterEventType
@@ -76,12 +77,18 @@ class TDMLInfosetInputter(
val res = scalaInputter.getSimpleText(primType, runtimeProperties)
val resIsEmpty = res == null || res == ""
val otherStrings = others.map { i =>
+ // Note in an unparserTestCase, there are no others (infoset inputters),
because the input infoset is
+ // coming from the TDML file, which is already XML.
+ // Rather, this is used in a parserTestCase where after populating a
TDMLInfosetOutputter
+ // which contains every kind of infoset (jdom, dom, JSON, etc.),
+ // the toInfosetInputter is called, which creates this kind of infoset
inputter where this
+ // method has each kind of infoset inputter (jdom, dom, JSON, etc.) so
that it can verify they're
+ // all equivalent.
val firstVersion = i.getSimpleText(primType, runtimeProperties)
val finalVersion = i match {
case _ if (firstVersion eq null) => ""
- // the json infoset inputter maintains CRLF/CR, but XML converts
CRLF/CR to
- // LF. So if this is Json, then we want the CRLF/CR converted to LF
- case jsonii: JsonInfosetInputter =>
firstVersion.replaceAll("(\r\n|\r)", "\n")
+ case jsonii: JsonInfosetInputter =>
+ convertJSONInfosetStringToXMLEquivalent(firstVersion)
case _ => firstVersion
}
finalVersion
@@ -143,4 +150,27 @@ class TDMLInfosetInputter(
}
override val supportsNamespaces = true
+
+ /**
+ * Converts a JSON infoset string to its XML equivalent.
+ *
+ * This enables comparing a string parsed by DFDL into a JSON infoset string
with the same string
+ * parsed by DFDL into an XML infoset string.
+ *
+ * Unlike XML, JSON preserves CRLF (Carriage Return Line Feed) and CR
(Carriage Return)
+ * characters, and every Unicode character without the need for PUA (Private
Use Area) remapping.
+ * If the input string is in JSON format, this method converts CRLF and CR
characters to LF (Line Feed).
+ * Additionally, the JSON infoset inputter maintains Unicode PUA characters,
while the XML infoset inputters remap
+ * some characters from PUA back to XML-illegal characters. For consistent
comparison with other XML infoset inputters,
+ * this method remaps the characters as if they were XML.
+ *
+ * @param jsonString the JSON infoset string to be converted
+ * @return the XML equivalent of the given JSON infoset string
+ */
+ private def convertJSONInfosetStringToXMLEquivalent(jsonString: String) = {
+ val withLFString =
+ jsonString.replaceAll("(\r\n|\r)", "\n") // because parsing into JSON
didn't do this.
+ val xmlString = XMLUtils.remapPUAToXMLIllegalCharacters(withLFString)
+ xmlString
+ }
}
diff --git
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
index 6988f554f..ef3403ce5 100644
---
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
+++
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
@@ -47,6 +47,14 @@
</tdml:defineSchema>
+ <tdml:defineSchema name="utf8Chars">
+ <xs:include
schemaLocation="/org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd" />
+ <dfdl:format ref="ex:GeneralFormat" encoding="utf-8"/>
+
+ <xs:element name="e1" dfdl:lengthKind="delimited" type="xs:string"/>
+
+ </tdml:defineSchema>
+
<!--
Test Name: apostrophe_01
Schema: fixedLengthStrings
@@ -104,30 +112,82 @@
</tdml:parserTestCase>
-<!--
- Test Name: puaInfosetChars_03
+ <!--
+ Test Name: puaInfosetChars_CR_CRLF_01
Schema: illegalChars
- Purpose: When parsing illegal XML characters, they get mapped to PUA.
This test demonstrates that unparsing and unparsing actual PUA values is
handled correctly.
+ Purpose: When unparsing XML that has the remapped CR (0xE00D) code
point, those get remapped back to actual CR.
+ Note that unparsing (without round trip) never deals with
CR/CRLF replacement by LF in XML.
+ Nor is there any comparison of JSON infoset strings to XML
infoset strings involved because we're
+ starting from XML Infoset in the TDML file in a TDML test.
-->
- <tdml:parserTestCase name="puaInfosetChars_03" root="e1"
model="illegalChars" roundTrip="true">
- <tdml:document></tdml:document>
+ <tdml:unparserTestCase name="puaInfosetChars_CR_CRLF_01" root="e1"
model="illegalChars" roundTrip="none">
+ <tdml:document>
+ <tdml:documentPart type="byte">010203</tdml:documentPart>
+ <tdml:documentPart type="text"
replaceDFDLEntities="true">A%CR;B%CR;%LF;C</tdml:documentPart>
+ </tdml:document>
<tdml:infoset>
<tdml:dfdlInfoset>
- <ex:e1></ex:e1>
+ <ex:e1>AB
C</ex:e1>
+ </tdml:dfdlInfoset>
+ </tdml:infoset>
+
+ </tdml:unparserTestCase>
+
+ <!--
+ Test Name: puaInfosetChars_CR_CRLF_02
+ Schema: illegalChars
+ Purpose: When parsing, the DFDL infoset can contain CR and CRLF, but
+ when outputting XML from there, CR/CRLF get turned into LF
consistent with XML readers.
+-->
+
+ <tdml:parserTestCase name="puaInfosetChars_CR_CRLF_02" root="e1"
model="illegalChars" roundTrip="none">
+ <tdml:document>
+ <tdml:documentPart type="byte">010203</tdml:documentPart>
+ <tdml:documentPart type="text"
replaceDFDLEntities="true">A%CR;B%CR;%LF;C</tdml:documentPart>
+ </tdml:document>
+
+ <tdml:infoset>
+ <tdml:dfdlInfoset>
+ <ex:e1>A
B
C</ex:e1>
</tdml:dfdlInfoset>
</tdml:infoset>
</tdml:parserTestCase>
<!--
- Test Name: puaInfosetChars_04
- Schema: illegalChars
- Purpose: When parsing illegal XML characters, they get mapped to PUA.
This test demonstrates that when unparsing these mapped infosets, the
characters get mapped back to their original values.
+ Test Name: puaPreexistingInfosetChars
+ Schema: utf8Chars
+ Purpose: When parsing legal PUA chars that are in the data, the PUA
chars are preserved unless they collide with
+ our remapped PUA area.
+-->
+
+ <tdml:parserTestCase name="puaPreexistingInfosetChars" root="e1"
model="utf8Chars" roundTrip="true">
+
+ <tdml:infoset>
+ <tdml:dfdlInfoset>
+ <ex:e1></ex:e1>
+ </tdml:dfdlInfoset>
+ </tdml:infoset>
+
+ <tdml:document></tdml:document>
+
+ </tdml:parserTestCase>
+
+ <!--
+ Test Name: puaPreexistingInfosetChars_remapped
+ Schema: utf8Chars
+ Purpose: When parsing legal PUA chars that are in the data, but happen
to collide with our remapping region,
+ the PUA chars are NOT preserved because we can't
differentiate whether they were remapped onto
+ the PUA, or were pre-existing in the PUA. This is just an XML
limitation due to its illegal chars.
+
+ This test is twoPass because on unparse we will NOT get back
the PUA characters, but if we
+ parse again we will get the same infoset as the first parse,
and unparse a second time will create the
+ same unparse output as the first unparse.
-->
- <tdml:unparserTestCase name="puaInfosetChars_04" root="e1"
model="illegalChars" roundTrip="true">
+ <tdml:parserTestCase name="puaPreexistingInfosetChars_remapped" root="e1"
model="utf8Chars" roundTrip="twoPass">
<tdml:infoset>
<tdml:dfdlInfoset>
@@ -137,7 +197,7 @@
<tdml:document></tdml:document>
- </tdml:unparserTestCase>
+ </tdml:parserTestCase>
<!--
Test Name: unparseFixedLengthString01
diff --git
a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
index 8de3888e2..9dd4de267 100644
---
a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
+++
b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
@@ -36,9 +36,20 @@ class TestUnparserGeneral {
@Test def test_apostrophe_01(): Unit = { runner.runOneTest("apostrophe_01") }
- // DFDL-1395
- // @Test def test_puaInfosetChars_03() {
runner.runOneTest("puaInfosetChars_03") }
- // @Test def test_puaInfosetChars_04() {
runner.runOneTest("puaInfosetChars_04") }
+ @Test def test_puaPreexistingInfosetChars(): Unit = {
+ runner.runOneTest("puaPreexistingInfosetChars")
+ }
+ @Test def test_puaPreexistingInfosetChars_remapped(): Unit = {
+ runner.runOneTest("puaPreexistingInfosetChars_remapped")
+ }
+
+ @Test def test_puaInfosetChars_CR_CRLF_01(): Unit = {
+ runner.runOneTest("puaInfosetChars_CR_CRLF_01")
+ }
+
+ @Test def test_puaInfosetChars_CR_CRLF_02(): Unit = {
+ runner.runOneTest("puaInfosetChars_CR_CRLF_02")
+ }
@Test def test_puaInfosetChars_01(): Unit = {
runner.runOneTest("puaInfosetChars_01") }
@Test def test_puaInfosetChars_02(): Unit = {
runner.runOneTest("puaInfosetChars_02") }