[daffodil] branch main updated: Rewrite XML to/from PUA remappers

mbeckerle Fri, 20 Jan 2023 10:28:39 -0800

This is an automated email from the ASF dual-hosted git repository.

mbeckerle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git



The following commit(s) were added to refs/heads/main by this push:
     new f5ca77ff4 Rewrite XML to/from PUA remappers
f5ca77ff4 is described below

commit f5ca77ff4f61b6b734b67e6e79e04b2bf5d2c9b5
Author: Michael Beckerle <[email protected]>
AuthorDate: Thu Jan 19 17:33:48 2023 -0500

    Rewrite XML to/from PUA remappers
    
    DAFFODIL-1559
---
 .../daffodil/util/CharacterSetRemapper.scala       | 136 +++++++++++++
 .../org/apache/daffodil/xml/PUARemappers.scala     | 128 ++++++++++++
 .../scala/org/apache/daffodil/xml/XMLUtils.scala   | 216 ++-------------------
 .../org/apache/daffodil/util/TestPUARemapper.scala | 151 ++++++++++++++
 .../daffodil/xml/test/unit/TestXMLUtils.scala      |  18 --
 .../org/apache/daffodil/tdml/TDMLRunner.scala      |   8 +-
 6 files changed, 440 insertions(+), 217 deletions(-)

diff --git 
a/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
 
b/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
new file mode 100644
index 000000000..25d9aee33
--- /dev/null
+++ 
b/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.util
+
+/**
+ * An abstract base for Remappers which convert strings.
+ *
+ * The public interface is just `def remap(s: String): String`.
+ *
+ * There are protected methods that implementations must provide.
+ *
+ * Contains shared implementation methods also.
+ *
+ * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast.
+ * Use a java-like coding style: while loops, not map/flatmap/etc., and avoid tuples.
+ */
+trait CharacterSetRemapper {
+
+  /**
+   * Remaps one character. The previous and following characters are provided
+   * because some remappings require this context (Surrogate Pairs, CRLF->LF,
+   * etc.)
+   *
+   * Plays a trick with negating the return value in order to avoid having to
+   * return more than one value, which is potentially less efficient.
+   *
+   * The string-level remap method passes 0.toChar as prev when curr is the
+   * first character of the string, and 0.toChar as next when curr is the last.
+   *
+   * @param prev The character prior to the one being considered. (Needed for
+   *             surrogates)
+   * @param curr The character under consideration for remapping.
+   * @param next The next character afterwards. (Needed for surrogates and
+   *             CRLF pairs)
+   * @return The remapped character (as an Int), or that same remapped character
+   *         Int value negated, which signals that curr+next was remapped to a
+   *         single character, such as is needed if CRLF is remapped to just LF.
+   */
+  protected def remap (prev: Char, curr: Char, next: Char): Int
+
+  /**
+   * Remaps the string. Returns the original string object if no remapping is
+   * required.
+   *
+   * Because of surrogate pairs, and the difference between 16-bit string
+   * codepoints and real character codes, lots of things that traverse strings
+   * need to consider either the codepoint after (if current is a leading
+   * surrogate) or codepoint before (if current is a trailing surrogate).
+   *
+   * This is not the only kind of character set remapping. In particular this
+   * is restricted to replace 1 character with 0 or 1 character in the remapped
+   * string. Other character set remappers can convert 1 character into 0 to
+   * many characters or even change from characters to bytes. Simple 1 to 1
+   * remappings can be done with just a map, and 1 to N remappings can be done
+   * with flatmap if no context is needed for surrogates or CRLFs.
+   *
+   * See XMLUtils.walkUnicodeString for a more general kind of remapping that
+   * can replace 1 character with N as well as being context sensitive about
+   * adjacent characters before and after.
+   *
+   * This algorithm uses a StringBuilder which is not synchronized
+   * so it is noticeably faster than StringBuffer, and since the StringBuilder
+   * is local to the function, we don't have to worry about any threading
+   * issues. This makes for a noticeable speed increase.
+   *
+   * This remapper is called for every piece of string data, both when parsing
+   * and when unparsing. It is very important for it to be high performance.
+   * Hence, this very Java loop-oriented coding style, avoiding map, or
+   * returning tuples or any other potentially inefficient scala-isms.
+   *
+   * @param s The string to remap.
+   * @return s itself (the same object) when no character needed remapping,
+   *         otherwise a new string holding the remapped characters.
+   */
+  final def remap(s: String): String = {
+
+    val len = s.length
+    if (len == 0) return s
+
+    // Use a java StringBuilder because it has an
+    // append(charsequence, start, end) method which lets us easily copy
+    // a prefix of the string into the stringbuilder.
+    // scala StringBuilder doesn't have this method.
+    var sb: java.lang.StringBuilder = null // created only if remapping proves 
to be needed
+
+    def isRemapNeeded = sb ne null
+    var pos = 0;
+    var prev = 0.toChar
+    var curr = s(0).toChar
+    var next = 0.toChar
+    var newCurr: Int = 0 // positive normally, but will be negative if we're 
to skip a char
+
+    while (pos < len) {
+      next = if (pos + 1 < len) s(pos + 1) else 0.toChar
+      //
+      // sign of newCurr is negative if we're to skip 1 character
+      // such as if the prior iteration collapsed a CRLF to just LF.
+      //
+      if (newCurr >= 0) {
+        // don't skip any character
+        newCurr = remap(prev, curr, next)
+        if (!isRemapNeeded && newCurr != curr) {
+          // we have just hit our first character that
+          // needs remapping.
+          // This block happens only once.
+          sb = new java.lang.StringBuilder(s.length)
+          sb.append(s, 0, pos)
+          // Now we have a string builder, and can proceed as
+          // if we always had one accumulating the characters
+        }
+        if (isRemapNeeded) {
+          // something in the string needed remapping, so
+          // now we have to always append characters to the
+          // string builder.
+          //
+          // if newCurr is negative, it's still the replacement
+          // remapped character code, just negated to indicate need to skip
+          val c = (if (newCurr < 0) -newCurr else newCurr).toChar
+          sb.append(c)
+        }
+      } else {
+        // Skip a character: nothing is appended for curr on this iteration.
+        newCurr = -newCurr // flip it so we only skip once
+      }
+      prev = curr
+      curr = next
+      pos += 1
+    }
+
+    val res = if (isRemapNeeded) sb.toString else s
+    res
+  }
+}
diff --git 
a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala 
b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala
new file mode 100644
index 000000000..ab2c9fda4
--- /dev/null
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.xml
+
+import org.apache.daffodil.exceptions.Assert
+import org.apache.daffodil.util.CharacterSetRemapper
+
+/**
+ * Remaps illegal XML chars to the Unicode Private use Area (PUA), and 
optionally CR also to the PUA.
+ *
+ * The Unicode PUA is a set of characters reserved for application-specific 
uses.
+ * Daffodil is one of many tools that use the PUA so as to preserve characters 
XML doesn't support.
+ *
+ * Handles unpaired Unicode surrogate code points properly (remaps them).
+ *
+ * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] 
| [#x10000-#x10FFFF]
+ *
+ * Normally XML also remaps CRLF to LF and CR (isolated) to LF, but this is 
problematic
+ * when data must be preserved perfectly so we have options to turn that off.
+ *
+ * We also can check and error if the string contains conflicting PUA 
characters to begin with.
+ *
+ * See https://daffodil.apache.org/infoset/, specifically the section "XML 
Illegal Characters", for
+ * more discussion.
+ */
+final class RemapXMLIllegalCharToPUA (
+  checkForExistingPUA: Boolean,
+  replaceCRWithLF: Boolean)
+  extends CharacterSetRemapper {
+
+  /**
+   * Remaps one character to the PUA when XML cannot carry it unchanged.
+   *
+   * @param prev The character before curr. Used to recognize curr as the low
+   *             half of a well formed surrogate pair.
+   * @param curr The character being remapped.
+   * @param next The character after curr. Used to recognize CRLF, and curr as
+   *             the high half of a well formed surrogate pair.
+   * @return The replacement character code. It is negated only when a CRLF is
+   *         collapsed to LF, which signals that the following LF is to be
+   *         skipped.
+   * @throws RemapPUACharDetected if checkForExistingPUA is true and curr is
+   *                              already a PUA character (E000 to F8FF).
+   */
+  override protected def remap(prev: Char, curr: Char, next: Char): Int = {
+    val res: Int = curr match {
+      case 0x9 => curr
+      case 0xA => curr
+      case 0xD =>
+        if (next == 0xA) {
+          // CRLF case.
+          if (replaceCRWithLF)
+            -0xA // CRLF => LF, standard XML behavior. Note negated.
+          else
+            0xE00D // remap CR to preserve it. Leave LF alone.
+        } else {
+          // isolated CR case
+          if (replaceCRWithLF)
+            0xA // isolated CR => LF, standard XML behavior. Note NOT negated.
+          else
+            0xE00D // remap isolated CR to preserve it.
+        }
+      case _ if (curr < 0x20) => curr + 0xE000 // ascii c0 controls
+      // no remapping for the so called C1 controls (0x80-0x9F) Those are not XML illegal.
+      case _ if Character.isSurrogate(curr) => {
+        if ((Character.isHighSurrogate(curr) && Character.isLowSurrogate(next)) ||
+          (Character.isLowSurrogate(curr) && Character.isHighSurrogate(prev))) {
+          // well formed surrogate pairs are preserved
+          curr
+        } else {
+          // curr is an isolated surrogate, so to preserve we must remap to PUA
+          curr + 0x1000
+        }
+      }
+      case _ if (curr >= 0xE000 && curr <= 0xF8FF) => { // Unicode PUA is E000 to F8FF.
+        if (checkForExistingPUA)
+          throw new RemapPUACharDetected(curr)
+        else curr
+      }
+      case _ if (curr < 0xFFFE) => curr
+      // 0xFFFE and 0xFFFF are regular Unicode chars, but XML illegal.
+      // (XML only allows up to 0xFFFD)
+      // They can't remap into the PUA by the basic techniques of adding
+      // 0xE000 or 0x1000 like with control chars or unpaired surrogate code points.
+      // So we just pick two adhoc, but recognizable, PUA code points to use by subtracting
+      // 0x0F00 from them.
+      case 0xFFFE => 0xF0FE // U+FFFE is not a legal XML char. Can't remap to PUA the regular way.
+      case 0xFFFF => 0xF0FF // U+FFFF is not a legal XML char
+      case bad =>
+        // $COVERAGE-OFF$
+        // This is a final class, so this only gets called with characters by the
+        // base class remap(s: String) method. Those chars are only
+        // taken from Scala/Java strings, hence, the char codes cannot be beyond 0xFFFF
+        //
+        // The code is formatted as %04X (four zero-padded hex digits) and converted
+        // with toInt first, because the %X conversion does not accept a Char argument.
+        Assert.impossibleCase("Scala/Java character code cannot be beyond 0xFFFF but was 0x%04X".format(bad.toInt))
+        // $COVERAGE-ON$
+    }
+    res
+  }
+
+}
+
+/**
+ * Thrown by RemapXMLIllegalCharToPUA when it is checking for existing PUA
+ * characters and the data already contains one.
+ *
+ * @param char The PUA character that was found in the data.
+ */
+class RemapPUACharDetected(val char: Char)
+  extends Exception(
+    String.format(
+      "Pre-existing Private Use Area (PUA) character found in data: U+%04X.",
+      Int.box(char.toInt)))
+
+/**
+ * Reverse of the RemapXMLIllegalCharToPUA mapping.
+ */
+final class RemapPUAToXMLIllegalChar()
+  extends CharacterSetRemapper {
+
+  /**
+   * Maps a single PUA character back to the XML-illegal character it stands
+   * for. Neither neighbouring character is consulted, and the result is never
+   * negated, so no character is ever skipped in this direction.
+   */
+  override protected def remap(prevIgnored: Char, c: Char, nextIgnored: Char): Int = {
+    val code: Int = c.toInt
+    if (code >= 0xE000 && code <= 0xE01F) code - 0xE000 // Ascii c0 controls
+    else if (code >= 0xE800 && code <= 0xEFFF) code - 0x1000 // isolated remapped surrogate codepoints
+    else if (code == 0xF0FE) 0xFFFE // FFFE is illegal in XML
+    else if (code == 0xF0FF) 0xFFFF // FFFF is illegal in XML
+    else code // anything else is passed through unchanged
+  }
+}
diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala 
b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
index 57ba052e9..da032cd65 100644
--- a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
@@ -64,111 +64,6 @@ object XMLUtils {
   val NegativeInfinityString = "-INF"
   val NaNString = "NaN"
 
-  /**
-   * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | 
[#xE000-#xFFFD] | [#x10000-#x10FFFF]
-   *
-   * Note that this function is curried. You first close over the parameters 
of the algorithm
-   * to obtain a function that converts individual characters.
-   */
-  def remapXMLIllegalCharToPUA(
-    checkForExistingPUA: Boolean = true,
-    replaceCRWithLF: Boolean = true)(c: Char): Char = {
-    val cInt = c.toInt
-    val res = cInt match {
-      case 0x9 => c
-      case 0xA => c
-      case 0xD =>
-        if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
-        else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and 
preserved.
-      case _ if (cInt < 0x20) => (cInt + 0xE000).toChar // ascii c0 controls
-      case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar // 
surrogate code points
-      case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => { // Unicode PUA is E000 
to F8FF.
-        if (checkForExistingPUA)
-          Assert.usageError("Pre-existing Private Use Area (PUA) character 
found in data: '%s'".format(c))
-        else c
-      }
-      case 0xFFFE => 0xF0FE.toChar
-      case 0xFFFF => 0xF0FF.toChar
-      case _ if (cInt > 0x10FFFF) => {
-        Assert.invariantFailed("Character code beyond U+10FFFF found in data. 
Codepoint: %s".format(cInt))
-      }
-      case _ => c
-
-    }
-    res
-  }
-
-  /**
-   * Scans the string looking for XML-illegal characters. True if any are 
found.
-   *
-   * Note that this considers CR (0x0d) to be a character that requires 
remapping.
-   */
-  def needsXMLToPUARemapping(s: String): Boolean = {
-    var i = 0
-    val len = s.length
-    while (i < len) {
-      val v = s.charAt(i).toInt
-      if ((v < 0x20 && !(v == 0xA || v == 0x9)) ||
-        (v > 0xD7FF && v < 0xE000) ||
-        (v >= 0xE000 && v <= 0xF8FF) ||  // Unicode PUA is E000 to F8FF
-        (v == 0xFFFE) ||
-        (v == 0xFFFF) ||
-        (v > 0x10FFFF)) {
-        return true
-      }
-      i += 1
-    }
-    false
-  }
-
-  /**
-   * Reverse of the remapXMLIllegalCharToPUA method
-   */
-  def remapPUAToXMLIllegalChar(c: Char): Char = {
-    val cInt = c.toInt
-    val res = cInt match {
-      case _ if (c >= 0xE000 && c <= 0xE01F) => (c - 0xE000).toChar // Ascii 
c0 controls
-      case _ if (c >= 0xE800 && c <= 0xEFFF) => (c - 0x1000).toChar // 
surrogate codepoints
-      case 0xF0FE => 0xFFFE.toChar
-      case 0xF0FF => 0xFFFF.toChar
-      case _ if (c > 0x10FFFF) => {
-        Assert.invariantFailed("Character code beyond U+10FFFF found in data. 
Codepoint: %s".format(c.toInt))
-      }
-      case _ => c
-    }
-    res
-  }
-
-  /**
-   * Determines if we need to unmap PUA-mapped characters back to the (XML 
illegal) original characters.
-   *
-   * Used to save allocating a string every time, given that these PUA mapped 
chars are rare.
-   */
-  def needsPUAToXMLRemapping(s: String): Boolean = {
-    var i = 0
-    val len = s.length
-    while (i < len) {
-      val v = s.charAt(i).toInt
-      if ((v == 0xD) || // not PUA, but string still needs remapping since CR 
must be mapped to LF
-          (v >= 0xE000 && v <= 0xE01F) || // PUA chars that are Ascii C0 
controls.
-          (v >= 0xE800 && v <= 0xEFFF) || // Surrogate codepoints
-          (v == 0xF0FE) || (v == 0xF0FF) || // FFFE and FFFF illegal chars
-          (v > 0x10FFFF)) {
-        return true
-      }
-      i += 1
-    }
-    false
-  }
-
-  def isLeadingSurrogate(c: Char) = {
-    c >= 0xD800 && c <= 0xDBFF
-  }
-
-  def isTrailingSurrogate(c: Char) = {
-    c >= 0xDC00 && c <= 0xDFFF
-  }
-
   /**
    * Length where a surrogate pair counts as 1 character, not two.
    */
@@ -189,10 +84,16 @@ object XMLUtils {
    * This calls a body function with prev, current, next bound to those.
    * For first codepoint prev will be 0. For last codepoint next will be 0.
    *
-   * NOTE: This function contains the same algorithm as
-   * remapXMLIllegalCharactersToPUA, but is more general and is a bit slower.
-   * Any changes made to this function probably need to be incorporated into
-   * the other.
+   * NOTE: This function contains a similar algorithm as
+   * CharacterSetRemapper, but is more general in that this can create an 
output
+   * that is a Seq[T] (e.g., a Seq[Byte]) so it can replace a character with
+   * multiple characters, e.g., this can express an encoder for UTF-8 where two
+   * adjacent surrogate code points become a sequence of 4 bytes in the UTF-8
+   * encoding.
+   *
+   * CharacterSetRemapper can only map characters to other characters, and can
+   * only remove 1 character (skip 1) when doing so.
+   *
    */
   def walkUnicodeString[T](str: String)(bodyFunc: (Char, Char, Char) => T): 
Seq[T] = {
     val len = str.length
@@ -216,89 +117,15 @@ object XMLUtils {
     list
   }
 
-  /*
-   * This function contains the same string traversal algorithm as
-   * walkUnicodeString. The only difference is that it uses a StringBuilder
-   * rather than a ListBuffer[T] that would be used in walkUnicodeString. Note
-   * that since StringBuilder is not synchronized it is noticably faster than
-   * StringBuffer, and since the StringBuilder is local to the function, we
-   * don't have to worry about any threading issues. This specificity makes for
-   * a noticable speed increase, so much so that the code duplication is worth
-   * it. Any changes made to this function probably need to be incorporated
-   * into the other.
-   */
-  def remapXMLCharacters(dfdlString: String, remapFunc: (Char) => Char): 
String = {
-    // we want to remap XML-illegal characters
-    // but leave legal surrogate-pair character pairs alone.
-    def remapOneChar(previous: Char, current: Char, next: Char): Char = {
-      if (isLeadingSurrogate(current) && isTrailingSurrogate(next)) return 
current
-      if (isTrailingSurrogate(current) && isLeadingSurrogate(previous)) return 
current
-      remapFunc(current)
-    }
-
-    val len = dfdlString.length
-    if (len == 0) return dfdlString
-
-    val sb = new StringBuilder()
-
-    var pos = 0;
-    var prev = 0.toChar
-    var curr = dfdlString(0)
-    var next = 0.toChar
-
-    while (pos < len) {
-      next = if (pos + 1 < len) dfdlString(pos + 1) else 0.toChar
-      if (curr == 0xD) {
-        if (next != 0xA) {
-          // This is a lone CR (i.e. not a CRLF), so convert the CR to a LF
-          sb.append(0xA.toChar)
-        } else {
-          // This is a CRLF. Skip the CR, essentially converting the CRLF to
-          // just LF. Do nothing.
-        }
-      } else {
-        sb.append(remapOneChar(prev, curr, next))
-      }
-      prev = curr
-      curr = next
+  private val remapXMLToPUA =
+    new RemapXMLIllegalCharToPUA(checkForExistingPUA = true, replaceCRWithLF = 
true)
 
-      pos += 1
-    }
+  def remapXMLIllegalCharactersToPUA(s: String): String = 
remapXMLToPUA.remap(s)
 
-    sb.toString
-  }
+  private val remapPUAToXML = new RemapPUAToXMLIllegalChar()
 
-  def remapXMLIllegalCharactersToPUA(dfdlString: String): String = {
-    if (needsXMLToPUARemapping(dfdlString)) {
-      // This essentially doubles the work if remapping is needed (since we
-      // scan the string once to see if it's needed, then scan again for
-      // remapping). But the common case is that remapping is not needed, so we
-      // only need to scan the string once AND we avoid allocating a new string
-      // with characters remapped.
-      remapXMLCharacters(dfdlString, remapXMLIllegalCharToPUA(false))
-    } else {
-      dfdlString
-    }
-  }
+  def remapPUAToXMLIllegalCharacters(text: String) = remapPUAToXML.remap(text)
 
-  /**
-   * Converts PUA characters back into the original (XML Illegal) characters
-   * they represent.
-   *
-   *
-   */
-  def remapPUAToXMLIllegalCharacters(dfdlString: String): String = {
-    if (needsPUAToXMLRemapping(dfdlString)) {
-      // This essentially doubles the work if remapping is needed (since we
-      // scan the string once to see if it's needed, then scan again for
-      // remapping). But the common case is that remapping is not needed, so we
-      // only need to scan the string once AND we avoid allocating a new string
-      // with characters remapped.
-      remapXMLCharacters(dfdlString, remapPUAToXMLIllegalChar)
-    } else {
-      dfdlString
-    }
-  }
 
   def coalesceAllAdjacentTextNodes(node: Node): Node = {
     node match {
@@ -1299,11 +1126,11 @@ Differences were (path, expected, actual):
    * We have to use our own PUA remapping trick if we want to be sure to 
preserve
    * CR in XML.
    */
-  def escape(str: String, sb: StringBuilder = new StringBuilder()): 
StringBuilder = {
+  def escape(s: String, sb: StringBuilder = new StringBuilder()): 
StringBuilder = {
     var i = 0
+    val str = xmlRemapperPreservingCR.remap(s)
     while (i < str.length) {
-      val x = str(i)
-      val c = escapeMapper(x)
+      val c = str(i)
       i += 1
       c match {
         case '\'' => sb.append("&#x27;") // don't use "&apos;" because it's 
not universally accepted (HTML doesn't have it in early versions)
@@ -1323,10 +1150,8 @@ Differences were (path, expected, actual):
     sb
   }
 
-  private val escapeMapper =
-    remapXMLIllegalCharToPUA(
-      checkForExistingPUA = false,
-      replaceCRWithLF = false) _
+  private val xmlRemapperPreservingCR =
+    new RemapXMLIllegalCharToPUA(checkForExistingPUA = false, replaceCRWithLF 
= false)
 
   def toNumericCharacterEntity(c: Char, sb: StringBuilder) = {
     val i = c.toInt
@@ -1489,3 +1314,4 @@ class QNamePrefixNotInScopeException(pre: String, loc: 
LookupLocation)
 //    res
 //  }
 //}
+
diff --git 
a/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala 
b/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala
new file mode 100644
index 000000000..239024486
--- /dev/null
+++ b/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.util
+
+import org.apache.daffodil.Implicits.intercept
+import org.apache.daffodil.xml.RemapPUACharDetected
+import org.junit.Assert._
+import org.junit.Test
+import org.apache.daffodil.xml.RemapPUAToXMLIllegalChar
+import org.apache.daffodil.xml.RemapXMLIllegalCharToPUA
+
+object TestPUARemapper {
+
+  /**
+   * For testing surrogate pairs (good ones, and isolated halves).
+   *
+   * The string holds one elephant character, pasted in directly.
+   * On my ubuntu linux using IntelliJ IDEA it looks like an elephant to me.
+   *
+   * Here is the same elephant between two vertical bars |𓃰| visible if
+   * you have fantastic Unicode support.
+   *
+   * That character is EGYPTIAN HIEROGLYPH E026
+   * Hex code point    130F0
+   * Hex UTF-8 bytes   F0 93 83 B0
+   * Hex UTF-16 Surrogates     D80C DCF0
+   *
+   * Being outside the 16-bit range, it occupies two chars (a surrogate pair)
+   * in a Scala/Java string.
+   */
+  val elephant = "𓃰"
+}
+
+/**
+ * Tests of RemapXMLIllegalCharToPUA: XML-illegal characters, pre-existing PUA
+ * characters, surrogates (paired and isolated), and CR/CRLF handling.
+ */
+class TestPUARemapper {
+  import TestPUARemapper._
+
+  // Constructor arguments are (checkForExistingPUA, replaceCRWithLF).
+  val xmlToPUANoCR2LF = new RemapXMLIllegalCharToPUA(true, false)
+  val xmlToPUAUnsafe = new RemapXMLIllegalCharToPUA(false, false)
+  val xmlToPUAAndCRToLF = new RemapXMLIllegalCharToPUA(true, true)
+
+  @Test def testRemapXMLToPUAUnnecessary(): Unit = {
+    val input = "no remapping needed."
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertTrue(actual eq input) // same exact string object.
+  }
+
+  @Test def testRemapXMLToPUAPrexistingPUAOk(): Unit = {
+    val input = "abc\uE001def\u0002ghi" // one pre-existing, one to remap
+    val actual = xmlToPUAUnsafe.remap(input)
+    assertEquals("abc\uE001def\uE002ghi", actual) // both are in the PUA
+  }
+
+  @Test def testRemapXMLToPUAPrexistingPUAError(): Unit = {
+    val input = "abc\uE001def\u0002ghi" // one pre-existing, one to remap
+    val e = intercept[RemapPUACharDetected] {
+      xmlToPUANoCR2LF.remap(input)
+    }
+    val msg = e.getMessage
+    assertEquals(0xE001, e.char)
+    assertTrue(msg.contains("Pre-existing Private Use Area (PUA) character"))
+    assertTrue(msg.contains("U+E001"))
+  }
+
+  @Test def testRemapXMLIllegalCharToPUA(): Unit = {
+    val ec = xmlToPUANoCR2LF.remap("\u0000")
+    assertEquals("\uE000", ec)
+    val ed = xmlToPUANoCR2LF.remap("\uD880")
+    assertEquals("\uE880", ed)
+    val crInPUA = xmlToPUANoCR2LF.remap("\u000d") // CR
+    assertEquals("\uE00D", crInPUA)
+    val lf = xmlToPUAAndCRToLF.remap("\u000D") // CR
+    assertEquals("\u000A", lf)
+  }
+
+  @Test def testRemapXMLIllegalCharactersToPUA(): Unit = {
+    val input = "nul\u0000ctrlA\u0001cr\u000d_fffe\ufffe_ffff\uffffdone"
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertEquals("nul\uE000ctrlA\uE001cr\uE00d_fffe\uf0fe_ffff\uf0ffdone", 
actual)
+  }
+
+  @Test def testRemapXMLSurrogateCharactersToPUA(): Unit = {
+    assertEquals(2, elephant.length)
+    assertEquals("\uD80C\uDCF0", elephant)
+    val input = "elephant" + elephant + "isolatedHigh\uD80CisolatedLow\uDCF0"
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertEquals("elephant\uD80C\uDCF0isolatedHigh\uE80CisolatedLow\uECF0", 
actual)
+  }
+
+  @Test def testRemapXMLSurrogateCharactersToPUA2(): Unit = {
+    val input = "badSurrogateOrder lowFirst\uDCF0\uD80CthenHigh"
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertEquals("badSurrogateOrder lowFirst\uECF0\uE80CthenHigh", actual)
+  }
+
+  @Test def testRemapXMLSurrogateCharactersToPUA3(): Unit = {
+    val input = "badSurrogateOrder 
isolatedLowFirst\uDCF0\uD80C\uDCF0thenACorrectPair"
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertEquals("badSurrogateOrder 
isolatedLowFirst\uECF0\uD80C\uDCF0thenACorrectPair", actual)
+  }
+
+  @Test def testRemapReplacesCRLFWithLF(): Unit = {
+    val input = "abc\r\ndef\r\nghi\r\njkl"
+    val actual = xmlToPUAAndCRToLF.remap(input)
+    assertEquals("abc\ndef\nghi\njkl", actual)
+  }
+
+  @Test def testRemapDoesNotReplaceCRLFWithLF(): Unit = {
+    val input = "abc\r\ndef\r\nghi\r\njkl"
+    val actual = xmlToPUANoCR2LF.remap(input)
+    assertEquals("abc\uE00D\ndef\uE00D\nghi\uE00D\njkl", actual)
+  }
+}
+
+/**
+ * Tests of RemapPUAToXMLIllegalChar, the reverse direction: PUA characters
+ * are mapped back to the XML-illegal characters they stand for.
+ */
+class TestRemapPUAToXML() {
+  import TestPUARemapper._
+
+  val puaToXML = new RemapPUAToXMLIllegalChar()
+
+  @Test def testRemapPUAToXMLIllegalChar(): Unit = {
+    val ec = puaToXML.remap("\uE000")
+    assertEquals("\u0000", ec)
+    val ed = puaToXML.remap("\uE880")
+    assertEquals("\uD880", ed)
+    val cr = puaToXML.remap("\uE00D")
+    assertEquals("\u000D", cr)
+  }
+
+  @Test def testRemapPUAToIllegalXMLChars(): Unit = {
+    val input = "nul\uE000ctrlA\uE001cr\uE00d_fffe\uf0fe_ffff\uf0ffdone"
+    val actual = puaToXML.remap(input)
+    assertEquals("nul\u0000ctrlA\u0001cr\u000d_fffe\ufffe_ffff\uffffdone", 
actual)
+  }
+
+  @Test def testRemapPUAToSurrogateChars(): Unit = {
+    val input = "elephant\uD80C\uDCF0isolatedHigh\uE80CisolatedLow\uECF0"
+    val actual = puaToXML.remap(input)
+    assertEquals("elephant" + elephant + 
"isolatedHigh\uD80CisolatedLow\uDCF0", actual)
+  }
+
+}
diff --git 
a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
 
b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
index 0b7a67044..9722feb29 100644
--- 
a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
+++ 
b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
@@ -21,13 +21,9 @@ import java.nio.file.Files
 import java.nio.file.Path
 import java.nio.file.Paths
 import java.nio.file.StandardOpenOption
-
 import scala.xml._
-
 import org.junit.Assert._
 import org.junit.Test
-
-import org.apache.daffodil.Implicits._
 import org.apache.daffodil.util.Misc
 import org.apache.daffodil.xml.JDOMUtils
 import org.apache.daffodil.xml.NS
@@ -121,20 +117,6 @@ class TestXMLUtils {
     assertFalse(JDOMUtils.isNil(d2))
   }
 
-  @Test def testRemapXMLIllegalCharToPUA(): Unit = {
-    val ec = XMLUtils.remapXMLIllegalCharToPUA(false)(0x0)
-    assertEquals(0xE000, ec)
-    val ed = XMLUtils.remapXMLIllegalCharToPUA(false)(0xd880)
-    assertEquals(0xE880, ed)
-  }
-
-  @Test def testRemapPUAToXMLIllegalChar(): Unit = {
-    val ec = XMLUtils.remapPUAToXMLIllegalChar(0xE000)
-    assertEquals(0x0, ec)
-    val ed = XMLUtils.remapPUAToXMLIllegalChar(0xE880)
-    assertEquals(0xD880, ed)
-  }
-
   @Test def testWalkUnicodeString1(): Unit = {
     val s = "abc"
     val Seq((ab, 'a', 'b'), ('a', 'b', 'c'), ('b', 'c', ca)) = 
XMLUtils.walkUnicodeString(s)((p, c, n) => (p, c, n))
diff --git 
a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala 
b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
index f3a3eba90..3f6b99010 100644
--- a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
+++ b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
@@ -2564,9 +2564,9 @@ object UTF8Encoder {
         val high5 = ((byte1 & 0x07) << 2) | (byte2 >> 6)
         byteList(high5 | 0xC0, low6 | 0x80)
       }
-      case _ if (XMLUtils.isLeadingSurrogate(c)) => {
+      case _ if (Character.isHighSurrogate(c)) => {
         // High (initial) Surrogate character case.
-        if (XMLUtils.isTrailingSurrogate(next)) {
+        if (Character.isLowSurrogate(next)) {
           // Next codepoint is a low surrogate.
           // We need to create a 4-byte representation from the
           // two surrogate characters.
@@ -2576,9 +2576,9 @@ object UTF8Encoder {
           threeByteEncode()
         }
       }
-      case _ if (XMLUtils.isTrailingSurrogate(c)) => {
+      case _ if (Character.isLowSurrogate(c)) => {
         // Low (subsequent) Surrogate character case.
-        if (XMLUtils.isLeadingSurrogate(prev)) {
+        if (Character.isHighSurrogate(prev)) {
           // Previous codepoint was a high surrogate.
           // This codepoint was handled as part of converting the
           // surrogate pair.

[daffodil] branch main updated: Rewrite XML to/from PUA remappers

Reply via email to