(daffodil) branch main updated: Added test to show preexisting PUA chars are preserved.

mbeckerle Fri, 31 May 2024 10:35:54 -0700

This is an automated email from the ASF dual-hosted git repository.

mbeckerle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git



The following commit(s) were added to refs/heads/main by this push:
     new 182f157d6 Added test to show preexisting PUA chars are preserved.
182f157d6 is described below

commit 182f157d64d515af84e94576625bb3510c5126af
Author: Michael Beckerle <[email protected]>
AuthorDate: Thu May 30 12:39:41 2024 -0400

    Added test to show preexisting PUA chars are preserved.
    
    See test_puaPreexistingInfosetChars
    test_puaPreexistingInfosetChars_remapped
    
    Note that a JSON workaround was required in the TDML runner.
    
    DAFFODIL-2883
---
 .../processor/tdml/TDMLInfosetInputter.scala       | 36 +++++++++-
 .../section00/general/testUnparserGeneral.tdml     | 82 +++++++++++++++++++---
 .../section00/general/TestUnparserGeneral.scala    | 17 ++++-
 3 files changed, 118 insertions(+), 17 deletions(-)

diff --git 
a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
 
b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
index 5e4cb2d59..36d8e067c 100644
--- 
a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
+++ 
b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/processor/tdml/TDMLInfosetInputter.scala
@@ -22,6 +22,7 @@ import java.net.URISyntaxException
 
 import org.apache.daffodil.lib.util.MaybeBoolean
 import org.apache.daffodil.lib.util.Misc
+import org.apache.daffodil.lib.xml.XMLUtils
 import org.apache.daffodil.runtime1.dpath.NodeInfo
 import org.apache.daffodil.runtime1.infoset.InfosetInputter
 import org.apache.daffodil.runtime1.infoset.InfosetInputterEventType
@@ -76,12 +77,18 @@ class TDMLInfosetInputter(
     val res = scalaInputter.getSimpleText(primType, runtimeProperties)
     val resIsEmpty = res == null || res == ""
     val otherStrings = others.map { i =>
+      // Note in an unparserTestCase, there are no others (infoset inputters), 
because the input infoset is
+      // coming from the TDML file, which is already XML.
+      // Rather, this is used in a parserTestCase where after populating a 
TDMLInfosetOutputter
+      // which contains every kind of infoset (jdom, dom, JSON, etc.),
+      // the toInfosetInputter is called, which creates this kind of infoset 
inputter where this
+      // method has each kind of infoset inputter (jdom, dom, JSON, etc.) so 
that it can verify they're
+      // all equivalent.
       val firstVersion = i.getSimpleText(primType, runtimeProperties)
       val finalVersion = i match {
         case _ if (firstVersion eq null) => ""
-        // the json infoset inputter maintains CRLF/CR, but XML converts 
CRLF/CR to
-        // LF. So if this is Json, then we want the CRLF/CR converted to LF
-        case jsonii: JsonInfosetInputter => 
firstVersion.replaceAll("(\r\n|\r)", "\n")
+        case jsonii: JsonInfosetInputter =>
+          convertJSONInfosetStringToXMLEquivalent(firstVersion)
         case _ => firstVersion
       }
       finalVersion
@@ -143,4 +150,27 @@ class TDMLInfosetInputter(
   }
 
   override val supportsNamespaces = true
+
+  /**
+   * Converts a JSON infoset string to its XML equivalent.
+   *
+   * This enables comparing a string parsed by DFDL into a JSON infoset string 
with the same string
+   * parsed by DFDL into an XML infoset string.
+   *
+   * Unlike XML, JSON preserves CRLF (Carriage Return Line Feed) and CR 
(Carriage Return)
+   * characters, and every Unicode character without the need for PUA (Private 
Use Area) remapping.
+   * To match what XML readers do, this method converts CRLF and CR 
characters to LF (Line Feed).
+   * Additionally, the JSON infoset inputter maintains Unicode PUA characters, 
while the XML infoset inputters remap
+   * some characters from PUA back to XML-illegal characters. For consistent 
comparison with other XML infoset inputters,
+   * this method remaps the characters as if they were XML.
+   *
+   * @param jsonString the JSON infoset string to be converted
+   * @return the XML equivalent of the given JSON infoset string
+   */
+  private def convertJSONInfosetStringToXMLEquivalent(jsonString: String) = {
+    val withLFString =
+      jsonString.replaceAll("(\r\n|\r)", "\n") // because parsing into JSON 
didn't do this.
+    val xmlString = XMLUtils.remapPUAToXMLIllegalCharacters(withLFString)
+    xmlString
+  }
 }
diff --git 
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
 
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
index 6988f554f..ef3403ce5 100644
--- 
a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
+++ 
b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/testUnparserGeneral.tdml
@@ -47,6 +47,14 @@
 
   </tdml:defineSchema>
 
+  <tdml:defineSchema name="utf8Chars">
+    <xs:include 
schemaLocation="/org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd" />
+    <dfdl:format ref="ex:GeneralFormat" encoding="utf-8"/>
+
+    <xs:element name="e1" dfdl:lengthKind="delimited" type="xs:string"/>
+
+  </tdml:defineSchema>
+
 <!--
       Test Name: apostrophe_01
          Schema: fixedLengthStrings
@@ -104,30 +112,82 @@
 
   </tdml:parserTestCase>
 
-<!--
-      Test Name: puaInfosetChars_03
+  <!--
+      Test Name: puaInfosetChars_CR_CRLF_01
          Schema: illegalChars
-        Purpose: When parsing illegal XML characters, they get mapped to PUA. 
This test demonstrates that unparsing and unparsing actual PUA values is 
handled correctly.
+        Purpose: When unparsing XML that has the remapped CR (0xE00D) code 
point, those get remapped back to actual CR.
+                 Note that unparsing (without round trip) never deals with 
CR/CRLF replacement by LF in XML.
+                 Nor is there any comparison of JSON infoset strings to XML 
infoset strings involved because we're
+                 starting from XML Infoset in the TDML file in a TDML test.
 -->
 
-  <tdml:parserTestCase name="puaInfosetChars_03" root="e1" 
model="illegalChars" roundTrip="true">
-    <tdml:document>&#xE001;</tdml:document>
+  <tdml:unparserTestCase name="puaInfosetChars_CR_CRLF_01" root="e1" 
model="illegalChars" roundTrip="none">
+    <tdml:document>
+      <tdml:documentPart type="byte">010203</tdml:documentPart>
+      <tdml:documentPart type="text" 
replaceDFDLEntities="true">A%CR;B%CR;%LF;C</tdml:documentPart>
+    </tdml:document>
 
     <tdml:infoset>
       <tdml:dfdlInfoset>
-        <ex:e1>&#xE001;</ex:e1>
+        <ex:e1>&#xE001;&#xE002;&#xE003;A&#xE00D;B&#xE00D;&#x0A;C</ex:e1>
+      </tdml:dfdlInfoset>
+    </tdml:infoset>
+
+  </tdml:unparserTestCase>
+
+  <!--
+    Test Name: puaInfosetChars_CR_CRLF_02
+       Schema: illegalChars
+      Purpose: When parsing, the DFDL infoset can contain CR and CRLF, but
+               when outputting XML from there, CR/CRLF get turned into LF 
consistent with XML readers.
+-->
+
+  <tdml:parserTestCase name="puaInfosetChars_CR_CRLF_02" root="e1" 
model="illegalChars" roundTrip="none">
+    <tdml:document>
+      <tdml:documentPart type="byte">010203</tdml:documentPart>
+      <tdml:documentPart type="text" 
replaceDFDLEntities="true">A%CR;B%CR;%LF;C</tdml:documentPart>
+    </tdml:document>
+
+    <tdml:infoset>
+      <tdml:dfdlInfoset>
+        <ex:e1>&#xE001;&#xE002;&#xE003;A&#x0A;B&#x0A;C</ex:e1>
       </tdml:dfdlInfoset>
     </tdml:infoset>
 
   </tdml:parserTestCase>
 
 <!--
-      Test Name: puaInfosetChars_04
-         Schema: illegalChars
-        Purpose: When parsing illegal XML characters, they get mapped to PUA. 
This test demonstrates that when unparsing these mapped infosets, the 
characters get mapped back to their original values.
+      Test Name: puaPreexistingInfosetChars
+         Schema: utf8Chars
+        Purpose: When parsing legal PUA chars that are in the data, the PUA 
chars are preserved unless they collide with
+                 our remapped PUA area.
+-->
+
+  <tdml:parserTestCase name="puaPreexistingInfosetChars" root="e1" 
model="utf8Chars" roundTrip="true">
+
+    <tdml:infoset>
+      <tdml:dfdlInfoset>
+        <ex:e1>&#xE101;&#xE102;&#xE103;</ex:e1>
+      </tdml:dfdlInfoset>
+    </tdml:infoset>
+
+    <tdml:document>&#xE101;&#xE102;&#xE103;</tdml:document>
+
+  </tdml:parserTestCase>
+
+  <!--
+      Test Name: puaPreexistingInfosetChars_remapped
+         Schema: utf8Chars
+        Purpose: When parsing legal PUA chars that are in the data, but happen 
to collide with our remapping region
+                 the PUA chars are NOT preserved because we can't 
differentiate whether they were remapped onto
+                 the PUA, or were pre-existing in the PUA. This is just an XML 
limitation due to its illegal chars.
+
+                 This test is twoPass because on unparse we will NOT get back 
the PUA characters, but if we
+                 parse again we will get the same infoset as the first parse, 
and unparse a second time will create the
+                 same unparse output as the first unparse.
 -->
 
-  <tdml:unparserTestCase name="puaInfosetChars_04" root="e1" 
model="illegalChars" roundTrip="true">
+  <tdml:parserTestCase name="puaPreexistingInfosetChars_remapped" root="e1" 
model="utf8Chars" roundTrip="twoPass">
 
     <tdml:infoset>
       <tdml:dfdlInfoset>
@@ -137,7 +197,7 @@
 
     <tdml:document>&#xE001;&#xE002;&#xE003;</tdml:document>
 
-  </tdml:unparserTestCase>
+  </tdml:parserTestCase>
 
 <!--
       Test Name: unparseFixedLengthString01
diff --git 
a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
 
b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
index 8de3888e2..9dd4de267 100644
--- 
a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
+++ 
b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestUnparserGeneral.scala
@@ -36,9 +36,20 @@ class TestUnparserGeneral {
 
   @Test def test_apostrophe_01(): Unit = { runner.runOneTest("apostrophe_01") }
 
-  // DFDL-1395
-  // @Test def test_puaInfosetChars_03() { 
runner.runOneTest("puaInfosetChars_03") }
-  // @Test def test_puaInfosetChars_04() { 
runner.runOneTest("puaInfosetChars_04") }
+  @Test def test_puaPreexistingInfosetChars(): Unit = {
+    runner.runOneTest("puaPreexistingInfosetChars")
+  }
+  @Test def test_puaPreexistingInfosetChars_remapped(): Unit = {
+    runner.runOneTest("puaPreexistingInfosetChars_remapped")
+  }
+
+  @Test def test_puaInfosetChars_CR_CRLF_01(): Unit = {
+    runner.runOneTest("puaInfosetChars_CR_CRLF_01")
+  }
+
+  @Test def test_puaInfosetChars_CR_CRLF_02(): Unit = {
+    runner.runOneTest("puaInfosetChars_CR_CRLF_02")
+  }
 
   @Test def test_puaInfosetChars_01(): Unit = { 
runner.runOneTest("puaInfosetChars_01") }
   @Test def test_puaInfosetChars_02(): Unit = { 
runner.runOneTest("puaInfosetChars_02") }

(daffodil) branch main updated: Added test to show preexisting PUA chars are preserved.

Reply via email to