This is an automated email from the ASF dual-hosted git repository.
mbeckerle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new f5ca77ff4 Rewrite XML to/from PUA remappers
f5ca77ff4 is described below
commit f5ca77ff4f61b6b734b67e6e79e04b2bf5d2c9b5
Author: Michael Beckerle <[email protected]>
AuthorDate: Thu Jan 19 17:33:48 2023 -0500
Rewrite XML to/from PUA remappers
DAFFODIL-1559
---
.../daffodil/util/CharacterSetRemapper.scala | 136 +++++++++++++
.../org/apache/daffodil/xml/PUARemappers.scala | 128 ++++++++++++
.../scala/org/apache/daffodil/xml/XMLUtils.scala | 216 ++-------------------
.../org/apache/daffodil/util/TestPUARemapper.scala | 151 ++++++++++++++
.../daffodil/xml/test/unit/TestXMLUtils.scala | 18 --
.../org/apache/daffodil/tdml/TDMLRunner.scala | 8 +-
6 files changed, 440 insertions(+), 217 deletions(-)
diff --git
a/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
b/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
new file mode 100644
index 000000000..25d9aee33
--- /dev/null
+++
b/daffodil-lib/src/main/scala/org/apache/daffodil/util/CharacterSetRemapper.scala
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.util
+
+/**
+ * An abstract base for Remappers which convert strings.
+ *
+ * The public interface is just `def remap(s: String): String`.
+ *
+ * There are protected methods that implementations must provide.
+ *
+ * Contains shared implementation methods also.
+ *
+ * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast.
+ * Use a Java-like coding style: while loops, not map/flatMap/etc. Avoid tuples.
+ */
+trait CharacterSetRemapper {
+
+ /**
+ * Remaps one character. The previous and following characters are provided
+ * since some remappings require this context (surrogate pairs, CRLF->LF, etc.).
+ *
+ * Plays a trick with negating the return value in order to avoid having to
+ * return more than one value, which is potentially less efficient.
+ *
+ * NOTE(review): a replacement of U+0000 cannot be negated (-0 == 0), so a skip
+ * cannot be signalled together with a remap to character code 0.
+ *
+ * @param prev The character prior to the one being considered (needed for surrogates).
+ * The string-level remap passes 0 here when curr is the first character.
+ * @param curr The character under consideration for remapping.
+ * @param next The next character afterwards (needed for surrogates and CRLF pairs).
+ * The string-level remap passes 0 here when curr is the last character.
+ * @return The remapped character (as an Int), or that same remapped character Int
+ * value negated, which signals that curr+next was remapped to a single
+ * character, such as is needed if CRLF is remapped to just LF.
+ */
+ protected def remap (prev: Char, curr: Char, next: Char): Int
+
+ /**
+ * Remaps the string. Returns the original string object if no remapping is required.
+ *
+ * Because of surrogate pairs, and the difference between 16-bit string code units
+ * and real character codes, lots of things that traverse strings need
+ * to consider either the code unit after (if current is a leading surrogate)
+ * or the code unit before (if current is a trailing surrogate).
+ *
+ * This is not the only kind of character set remapping. In particular this is
+ * restricted to replacing 1 character with 0 or 1 character in the remapped string:
+ * each character examined contributes exactly one output character, and the
+ * character following it is dropped only when the per-character remap returns
+ * a negated value.
+ * Other character set remappers can convert 1 character into 0 to many characters
+ * or even change from characters to bytes. Simple 1 to 1 remappings can be done
+ * with just a map, and 1 to N remappings can be done with flatMap if no context is
+ * needed for surrogates or CRLFs.
+ *
+ * See XMLUtils.walkUnicodeString for a more general kind of remapping that can
+ * replace 1 character with N as well as being context sensitive about adjacent
+ * characters before and after.
+ *
+ * This algorithm uses a StringBuilder, which is not synchronized,
+ * so it is noticeably faster than StringBuffer, and since the StringBuilder
+ * is local to the function, we don't have to worry about any threading issues.
+ * The StringBuilder is allocated only once the first character that actually
+ * changes is found; until then nothing is copied.
+ *
+ * This remapper is called for every piece of string data, both when parsing
+ * and when unparsing. It is very important for it to be high performance.
+ * Hence, this very Java loop-oriented coding style,
+ * avoiding map, or returning tuples or any other potentially inefficient Scala-isms.
+ *
+ * @param s the string to remap; an empty string is returned as-is.
+ * @return the remapped string, or `s` itself (same object) if nothing changed.
+ */
+ final def remap(s: String): String = {
+
+ val len = s.length
+ if (len == 0) return s
+
+ // Use a Java StringBuilder because it has an
+ // append(CharSequence, start, end) method which lets us easily copy
+ // a prefix of the string into the StringBuilder.
+ // Scala's StringBuilder doesn't have this method.
+ var sb: java.lang.StringBuilder = null // created only if remapping proves
to be needed
+
+ def isRemapNeeded = sb ne null
+ var pos = 0;
+ var prev = 0.toChar
+ var curr = s(0).toChar
+ var next = 0.toChar
+ var newCurr: Int = 0 // positive normally, but will be negative if we're
to skip a char
+
+ while (pos < len) {
+ next = if (pos + 1 < len) s(pos + 1) else 0.toChar
+ //
+ // The sign of newCurr is negative if we're to skip 1 character,
+ // such as if the prior iteration collapsed a CRLF to just LF.
+ //
+ if (newCurr >= 0) {
+ // don't skip any character
+ newCurr = remap(prev, curr, next)
+ if (!isRemapNeeded && newCurr != curr) {
+ // We have just hit our first character that
+ // needs remapping.
+ // This block happens only once.
+ sb = new java.lang.StringBuilder(s.length)
+ sb.append(s, 0, pos)
+ // Now we have a string builder, and can proceed as
+ // if we always had one accumulating the characters
+ }
+ if (isRemapNeeded) {
+ // Something in the string needed remapping, so
+ // now we have to always append characters to the
+ // string builder.
+ //
+ // If newCurr is negative, it's still the replacement
+ // remapped character code, just negated to indicate need to skip
+ val c = (if (newCurr < 0) -newCurr else newCurr).toChar
+ sb.append(c)
+ }
+ } else {
+ // Skip a character
+ newCurr = -newCurr // flip it so we only skip once
+ }
+ prev = curr
+ curr = next
+ pos += 1
+ }
+
+ val res = if (isRemapNeeded) sb.toString else s
+ res
+ }
+}
diff --git
a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala
b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala
new file mode 100644
index 000000000..ab2c9fda4
--- /dev/null
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/PUARemappers.scala
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.xml
+
+import org.apache.daffodil.exceptions.Assert
+import org.apache.daffodil.util.CharacterSetRemapper
+
+/**
+ * Remaps XML-illegal characters to the Unicode Private Use Area (PUA), and
+ * optionally remaps CR to the PUA as well.
+ *
+ * The Unicode PUA is a set of characters reserved for application-specific uses.
+ * Daffodil is one of many tools that use the PUA so as to preserve characters
+ * XML doesn't support.
+ *
+ * Handles unpaired Unicode surrogate code points properly (remaps them).
+ *
+ * Legal XML v1.0 chars are
+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ *
+ * Normally XML also remaps CRLF to LF and CR (isolated) to LF, but this is
+ * problematic when data must be preserved perfectly, so there is an option to
+ * turn that off.
+ *
+ * We also can check and error if the string contains conflicting PUA characters
+ * to begin with.
+ *
+ * See https://daffodil.apache.org/infoset/, specifically the section
+ * "XML Illegal Characters", for more discussion.
+ *
+ * @param checkForExistingPUA if true, a character already in the PUA range
+ *                            (U+E000 to U+F8FF) causes RemapPUACharDetected
+ *                            to be thrown; if false it passes through unchanged.
+ * @param replaceCRWithLF     if true, CRLF and isolated CR both become LF
+ *                            (standard XML behavior); if false, CR is remapped
+ *                            to U+E00D so that it is preserved.
+ */
+final class RemapXMLIllegalCharToPUA (
+  checkForExistingPUA: Boolean,
+  replaceCRWithLF: Boolean)
+  extends CharacterSetRemapper {
+
+  /**
+   * Remaps one character to the PUA if it is XML-illegal.
+   *
+   * Note the return value is the negated char code of the replacement char
+   * when the following character is to be skipped (only the CRLF => LF case).
+   *
+   * @param prev the preceding character, used to recognize a well-formed surrogate pair
+   * @param curr the character being remapped
+   * @param next the following character, used for surrogate pairs and CRLF
+   * @return the replacement character code, negated if `next` must be skipped
+   * @throws RemapPUACharDetected if `checkForExistingPUA` is set and `curr`
+   *                              is already in the PUA
+   */
+  override protected def remap(prev: Char, curr: Char, next: Char): Int = {
+    val res: Int = curr match {
+      case 0x9 => curr
+      case 0xA => curr
+      case 0xD =>
+        if (next == 0xA) {
+          // CRLF case.
+          if (replaceCRWithLF)
+            -0xA // CRLF => LF, standard XML behavior. Note negated.
+          else
+            0xE00D // remap CR to preserve it. Leave LF alone.
+        } else {
+          // isolated CR case
+          if (replaceCRWithLF)
+            0xA // isolated CR => LF, standard XML behavior. Note NOT negated.
+          else
+            0xE00D // remap isolated CR to preserve it.
+        }
+      case _ if (curr < 0x20) => curr + 0xE000 // ascii c0 controls
+      // No remapping for the so called C1 controls (0x80-0x9F).
+      // Those are not XML illegal.
+      case _ if Character.isSurrogate(curr) => {
+        if ((Character.isHighSurrogate(curr) && Character.isLowSurrogate(next)) ||
+          (Character.isLowSurrogate(curr) && Character.isHighSurrogate(prev))) {
+          // well formed surrogate pairs are preserved
+          curr
+        } else {
+          // curr is an isolated surrogate, so to preserve we must remap to PUA
+          curr + 0x1000
+        }
+      }
+      case _ if (curr >= 0xE000 && curr <= 0xF8FF) => { // Unicode PUA is E000 to F8FF.
+        if (checkForExistingPUA)
+          throw new RemapPUACharDetected(curr)
+        else curr
+      }
+      case _ if (curr < 0xFFFE) => curr
+      // 0xFFFE and 0xFFFF are regular Unicode chars, but XML illegal.
+      // (XML only allows up to 0xFFFD)
+      // They can't remap into the PUA by the basic techniques of adding
+      // 0xE000 or 0x1000 like with control chars or unpaired surrogate code points.
+      // So we just pick two adhoc, but recognizable, PUA code points to use
+      // by subtracting 0x0F00 from them.
+      case 0xFFFE => 0xF0FE // U+FFFE is not a legal XML char. Can't remap to PUA the regular way.
+      case 0xFFFF => 0xF0FF // U+FFFF is not a legal XML char
+      case bad =>
+        // $COVERAGE-OFF$
+        // This is a final class, so this only gets called with characters by
+        // the base class remap(s: String) method. Those chars are only
+        // taken from Scala/Java strings, hence, the char codes cannot be
+        // beyond 0xFFFF.
+        //
+        // The value is formatted as a zero-padded 4-digit hex number. It is
+        // converted to Int first because the %X conversion rejects a Char
+        // argument (IllegalFormatConversionException).
+        Assert.impossibleCase(
+          "Scala/Java character code cannot be beyond 0xFFFF but was 0x%04X".format(bad.toInt))
+        // $COVERAGE-ON$
+    }
+    res
+  }
+
+}
+
+/**
+ * Thrown when a character that is already in the Unicode Private Use Area
+ * is found in data that is being remapped into the PUA.
+ *
+ * @param char the offending pre-existing PUA character; its code point is
+ *             included in the exception message as U+XXXX.
+ */
+class RemapPUACharDetected(val char: Char)
+  extends Exception(
+    "Pre-existing Private Use Area (PUA) character found in data: U+%04X.".format(char.toInt))
+
+/**
+ * Inverse of the RemapXMLIllegalCharToPUA mapping: turns PUA stand-ins back
+ * into the XML-illegal characters they represent.
+ */
+final class RemapPUAToXMLIllegalChar()
+  extends CharacterSetRemapper {
+
+  /**
+   * Maps one character back from the PUA.
+   *
+   * This direction is the simple one: the neighboring characters are not
+   * consulted, and the result is never negated (no character is ever skipped).
+   *
+   * @param prevIgnored unused
+   * @param c           the character to map back
+   * @param nextIgnored unused
+   * @return the original character code, or `c` itself if it is not a remapped character
+   */
+  override protected def remap(prevIgnored: Char, c: Char, nextIgnored: Char): Int = {
+    val code: Int = c
+    if (code >= 0xE000 && code <= 0xE01F)
+      code - 0xE000 // remapped ASCII C0 controls
+    else if (code >= 0xE800 && code <= 0xEFFF)
+      code - 0x1000 // remapped isolated surrogate code points
+    else if (code == 0xF0FE)
+      0xFFFE // U+FFFE is illegal in XML
+    else if (code == 0xF0FF)
+      0xFFFF // U+FFFF is illegal in XML
+    else
+      code
+  }
+}
diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
index 57ba052e9..da032cd65 100644
--- a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
@@ -64,111 +64,6 @@ object XMLUtils {
val NegativeInfinityString = "-INF"
val NaNString = "NaN"
- /**
- * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] |
[#xE000-#xFFFD] | [#x10000-#x10FFFF]
- *
- * Note that this function is curried. You first close over the parameters
of the algorithm
- * to obtain a function that converts individual characters.
- */
- def remapXMLIllegalCharToPUA(
- checkForExistingPUA: Boolean = true,
- replaceCRWithLF: Boolean = true)(c: Char): Char = {
- val cInt = c.toInt
- val res = cInt match {
- case 0x9 => c
- case 0xA => c
- case 0xD =>
- if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
- else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and
preserved.
- case _ if (cInt < 0x20) => (cInt + 0xE000).toChar // ascii c0 controls
- case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar //
surrogate code points
- case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => { // Unicode PUA is E000
to F8FF.
- if (checkForExistingPUA)
- Assert.usageError("Pre-existing Private Use Area (PUA) character
found in data: '%s'".format(c))
- else c
- }
- case 0xFFFE => 0xF0FE.toChar
- case 0xFFFF => 0xF0FF.toChar
- case _ if (cInt > 0x10FFFF) => {
- Assert.invariantFailed("Character code beyond U+10FFFF found in data.
Codepoint: %s".format(cInt))
- }
- case _ => c
-
- }
- res
- }
-
- /**
- * Scans the string looking for XML-illegal characters. True if any are
found.
- *
- * Note that this considers CR (0x0d) to be a character that requires
remapping.
- */
- def needsXMLToPUARemapping(s: String): Boolean = {
- var i = 0
- val len = s.length
- while (i < len) {
- val v = s.charAt(i).toInt
- if ((v < 0x20 && !(v == 0xA || v == 0x9)) ||
- (v > 0xD7FF && v < 0xE000) ||
- (v >= 0xE000 && v <= 0xF8FF) || // Unicode PUA is E000 to F8FF
- (v == 0xFFFE) ||
- (v == 0xFFFF) ||
- (v > 0x10FFFF)) {
- return true
- }
- i += 1
- }
- false
- }
-
- /**
- * Reverse of the remapXMLIllegalCharToPUA method
- */
- def remapPUAToXMLIllegalChar(c: Char): Char = {
- val cInt = c.toInt
- val res = cInt match {
- case _ if (c >= 0xE000 && c <= 0xE01F) => (c - 0xE000).toChar // Ascii
c0 controls
- case _ if (c >= 0xE800 && c <= 0xEFFF) => (c - 0x1000).toChar //
surrogate codepoints
- case 0xF0FE => 0xFFFE.toChar
- case 0xF0FF => 0xFFFF.toChar
- case _ if (c > 0x10FFFF) => {
- Assert.invariantFailed("Character code beyond U+10FFFF found in data.
Codepoint: %s".format(c.toInt))
- }
- case _ => c
- }
- res
- }
-
- /**
- * Determines if we need to unmap PUA-mapped characters back to the (XML
illegal) original characters.
- *
- * Used to save allocating a string every time, given that these PUA mapped
chars are rare.
- */
- def needsPUAToXMLRemapping(s: String): Boolean = {
- var i = 0
- val len = s.length
- while (i < len) {
- val v = s.charAt(i).toInt
- if ((v == 0xD) || // not PUA, but string still needs remapping since CR
must be mapped to LF
- (v >= 0xE000 && v <= 0xE01F) || // PUA chars that are Ascii C0
controls.
- (v >= 0xE800 && v <= 0xEFFF) || // Surrogate codepoints
- (v == 0xF0FE) || (v == 0xF0FF) || // FFFE and FFFF illegal chars
- (v > 0x10FFFF)) {
- return true
- }
- i += 1
- }
- false
- }
-
- def isLeadingSurrogate(c: Char) = {
- c >= 0xD800 && c <= 0xDBFF
- }
-
- def isTrailingSurrogate(c: Char) = {
- c >= 0xDC00 && c <= 0xDFFF
- }
-
/**
* Length where a surrogate pair counts as 1 character, not two.
*/
@@ -189,10 +84,16 @@ object XMLUtils {
* This calls a body function with prev, current, next bound to those.
* For first codepoint prev will be 0. For last codepoint next will be 0.
*
- * NOTE: This function contains the same algorithm as
- * remapXMLIllegalCharactersToPUA, but is more general and is a bit slower.
- * Any changes made to this function probably need to be incorporated into
- * the other.
+ * NOTE: This function contains an algorithm similar to that of
+ * CharacterSetRemapper, but is more general in that this can create an output
+ * that is a Seq[T] (e.g., a Seq[Byte]) so it can replace a character with
+ * multiple characters, e.g., this can express an encoder for UTF-8 where two
+ * adjacent surrogate code points become a sequence of 4 bytes in the UTF-8
+ * encoding.
+ *
+ * CharacterSetRemapper can only map characters to other characters, and can
+ * only remove 1 character (skip 1) when doing so.
+ *
*/
def walkUnicodeString[T](str: String)(bodyFunc: (Char, Char, Char) => T):
Seq[T] = {
val len = str.length
@@ -216,89 +117,15 @@ object XMLUtils {
list
}
- /*
- * This function contains the same string traversal algorithm as
- * walkUnicodeString. The only difference is that it uses a StringBuilder
- * rather than a ListBuffer[T] that would be used in walkUnicodeString. Note
- * that since StringBuilder is not synchronized it is noticably faster than
- * StringBuffer, and since the StringBuilder is local to the function, we
- * don't have to worry about any threading issues. This specificity makes for
- * a noticable speed increase, so much so that the code duplication is worth
- * it. Any changes made to this function probably need to be incorporated
- * into the other.
- */
- def remapXMLCharacters(dfdlString: String, remapFunc: (Char) => Char):
String = {
- // we want to remap XML-illegal characters
- // but leave legal surrogate-pair character pairs alone.
- def remapOneChar(previous: Char, current: Char, next: Char): Char = {
- if (isLeadingSurrogate(current) && isTrailingSurrogate(next)) return
current
- if (isTrailingSurrogate(current) && isLeadingSurrogate(previous)) return
current
- remapFunc(current)
- }
-
- val len = dfdlString.length
- if (len == 0) return dfdlString
-
- val sb = new StringBuilder()
-
- var pos = 0;
- var prev = 0.toChar
- var curr = dfdlString(0)
- var next = 0.toChar
-
- while (pos < len) {
- next = if (pos + 1 < len) dfdlString(pos + 1) else 0.toChar
- if (curr == 0xD) {
- if (next != 0xA) {
- // This is a lone CR (i.e. not a CRLF), so convert the CR to a LF
- sb.append(0xA.toChar)
- } else {
- // This is a CRLF. Skip the CR, essentially converting the CRLF to
- // just LF. Do nothing.
- }
- } else {
- sb.append(remapOneChar(prev, curr, next))
- }
- prev = curr
- curr = next
+ private val remapXMLToPUA =
+ new RemapXMLIllegalCharToPUA(checkForExistingPUA = true, replaceCRWithLF =
true)
- pos += 1
- }
+ def remapXMLIllegalCharactersToPUA(s: String): String =
remapXMLToPUA.remap(s)
- sb.toString
- }
+ private val remapPUAToXML = new RemapPUAToXMLIllegalChar()
- def remapXMLIllegalCharactersToPUA(dfdlString: String): String = {
- if (needsXMLToPUARemapping(dfdlString)) {
- // This essentially doubles the work if remapping is needed (since we
- // scan the string once to see if it's needed, then scan again for
- // remapping). But the common case is that remapping is not needed, so we
- // only need to scan the string once AND we avoid allocating a new string
- // with characters remapped.
- remapXMLCharacters(dfdlString, remapXMLIllegalCharToPUA(false))
- } else {
- dfdlString
- }
- }
+ def remapPUAToXMLIllegalCharacters(text: String) = remapPUAToXML.remap(text)
- /**
- * Converts PUA characters back into the original (XML Illegal) characters
- * they represent.
- *
- *
- */
- def remapPUAToXMLIllegalCharacters(dfdlString: String): String = {
- if (needsPUAToXMLRemapping(dfdlString)) {
- // This essentially doubles the work if remapping is needed (since we
- // scan the string once to see if it's needed, then scan again for
- // remapping). But the common case is that remapping is not needed, so we
- // only need to scan the string once AND we avoid allocating a new string
- // with characters remapped.
- remapXMLCharacters(dfdlString, remapPUAToXMLIllegalChar)
- } else {
- dfdlString
- }
- }
def coalesceAllAdjacentTextNodes(node: Node): Node = {
node match {
@@ -1299,11 +1126,11 @@ Differences were (path, expected, actual):
* We have to use our own PUA remapping trick if we want to be sure to
preserve
* CR in XML.
*/
- def escape(str: String, sb: StringBuilder = new StringBuilder()):
StringBuilder = {
+ def escape(s: String, sb: StringBuilder = new StringBuilder()):
StringBuilder = {
var i = 0
+ val str = xmlRemapperPreservingCR.remap(s)
while (i < str.length) {
- val x = str(i)
- val c = escapeMapper(x)
+ val c = str(i)
i += 1
c match {
case '\'' => sb.append("'") // don't use "'" because it's
not universally accepted (HTML doesn't have it in early versions)
@@ -1323,10 +1150,8 @@ Differences were (path, expected, actual):
sb
}
- private val escapeMapper =
- remapXMLIllegalCharToPUA(
- checkForExistingPUA = false,
- replaceCRWithLF = false) _
+ private val xmlRemapperPreservingCR =
+ new RemapXMLIllegalCharToPUA(checkForExistingPUA = false, replaceCRWithLF
= false)
def toNumericCharacterEntity(c: Char, sb: StringBuilder) = {
val i = c.toInt
@@ -1489,3 +1314,4 @@ class QNamePrefixNotInScopeException(pre: String, loc:
LookupLocation)
// res
// }
//}
+
diff --git
a/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala
b/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala
new file mode 100644
index 000000000..239024486
--- /dev/null
+++ b/daffodil-lib/src/test/scala/org/apache/daffodil/util/TestPUARemapper.scala
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.daffodil.util
+
+import org.apache.daffodil.Implicits.intercept
+import org.apache.daffodil.xml.RemapPUACharDetected
+import org.junit.Assert._
+import org.junit.Test
+import org.apache.daffodil.xml.RemapPUAToXMLIllegalChar
+import org.apache.daffodil.xml.RemapXMLIllegalCharToPUA
+
+object TestPUARemapper {
+
+ /**
+ * Test fixture for surrogate pairs (well-formed ones, and isolated halves).
+ *
+ * The string holds a single elephant character, pasted in directly.
+ * Here is the same elephant between two vertical bars |𓃰| which is visible if
+ * your editor and font have good Unicode support.
+ *
+ * That character is EGYPTIAN HIEROGLYPH E026
+ * Hex code point 130F0
+ * Hex UTF-8 bytes F0 93 83 B0
+ * Hex UTF-16 Surrogates D80C DCF0
+ *
+ * As a Scala/Java string it therefore has length 2.
+ */
+ val elephant = "𓃰"
+}
+
+class TestPUARemapper { // Tests for the XML-illegal-character -> PUA direction.
+ import TestPUARemapper._
+
+ val xmlToPUANoCR2LF = new RemapXMLIllegalCharToPUA(true, false) // checkForExistingPUA = true, replaceCRWithLF = false
+ val xmlToPUAUnsafe = new RemapXMLIllegalCharToPUA(false, false) // checkForExistingPUA = false, replaceCRWithLF = false
+ val xmlToPUAAndCRToLF = new RemapXMLIllegalCharToPUA(true, true) // checkForExistingPUA = true, replaceCRWithLF = true
+
+ @Test def testRemapXMLToPUAUnnecessary(): Unit = {
+ val input = "no remapping needed."
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertTrue(actual eq input) // the very same string object, not merely an equal one.
+ }
+
+ @Test def testRemapXMLToPUAPrexistingPUAOk(): Unit = {
+ val input = "abc\uE001def\u0002ghi" // one pre-existing PUA char, one char to remap
+ val actual = xmlToPUAUnsafe.remap(input)
+ assertEquals("abc\uE001def\uE002ghi", actual) // both are now in the PUA
+ }
+
+ @Test def testRemapXMLToPUAPrexistingPUAError(): Unit = {
+ val input = "abc\uE001def\u0002ghi" // one pre-existing PUA char, one char to remap
+ val e = intercept[RemapPUACharDetected] {
+ xmlToPUANoCR2LF.remap(input)
+ }
+ val msg = e.getMessage
+ assertEquals(0xE001, e.char)
+ assertTrue(msg.contains("Pre-existing Private Use Area (PUA) character"))
+ assertTrue(msg.contains("U+E001"))
+ }
+
+ @Test def testRemapXMLIllegalCharToPUA(): Unit = {
+ val ec = xmlToPUANoCR2LF.remap("\u0000")
+ assertEquals("\uE000", ec)
+ val ed = xmlToPUANoCR2LF.remap("\uD880")
+ assertEquals("\uE880", ed)
+ val crInPUA = xmlToPUANoCR2LF.remap("\u000d") // isolated CR, preserved via the PUA
+ assertEquals("\uE00D", crInPUA)
+ val lf = xmlToPUAAndCRToLF.remap("\u000D") // isolated CR, replaced by LF
+ assertEquals("\u000A", lf)
+ }
+
+ @Test def testRemapXMLIllegalCharactersToPUA(): Unit = {
+ val input = "nul\u0000ctrlA\u0001cr\u000d_fffe\ufffe_ffff\uffffdone"
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertEquals("nul\uE000ctrlA\uE001cr\uE00d_fffe\uf0fe_ffff\uf0ffdone",
actual)
+ }
+
+ @Test def testRemapXMLSurrogateCharactersToPUA(): Unit = {
+ assertEquals(2, elephant.length)
+ assertEquals("\uD80C\uDCF0", elephant)
+ val input = "elephant" + elephant + "isolatedHigh\uD80CisolatedLow\uDCF0"
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertEquals("elephant\uD80C\uDCF0isolatedHigh\uE80CisolatedLow\uECF0",
actual)
+ }
+
+ @Test def testRemapXMLSurrogateCharactersToPUA2(): Unit = {
+ val input = "badSurrogateOrder lowFirst\uDCF0\uD80CthenHigh"
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertEquals("badSurrogateOrder lowFirst\uECF0\uE80CthenHigh", actual)
+ }
+
+ @Test def testRemapXMLSurrogateCharactersToPUA3(): Unit = {
+ val input = "badSurrogateOrder
isolatedLowFirst\uDCF0\uD80C\uDCF0thenACorrectPair"
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertEquals("badSurrogateOrder
isolatedLowFirst\uECF0\uD80C\uDCF0thenACorrectPair", actual)
+ }
+
+ @Test def testRemapReplacesCRLFWithLF(): Unit = {
+ val input = "abc\r\ndef\r\nghi\r\njkl"
+ val actual = xmlToPUAAndCRToLF.remap(input)
+ assertEquals("abc\ndef\nghi\njkl", actual)
+ }
+
+ @Test def testRemapDoesNotReplaceCRLFWithLF(): Unit = {
+ val input = "abc\r\ndef\r\nghi\r\njkl"
+ val actual = xmlToPUANoCR2LF.remap(input)
+ assertEquals("abc\uE00D\ndef\uE00D\nghi\uE00D\njkl", actual)
+ }
+}
+
+class TestRemapPUAToXML() { // Tests for the PUA -> XML-illegal-character direction.
+ import TestPUARemapper._
+
+ val puaToXML = new RemapPUAToXMLIllegalChar() // the remapper under test
+
+ @Test def testRemapPUAToXMLIllegalChar(): Unit = {
+ val ec = puaToXML.remap("\uE000") // remapped NUL
+ assertEquals("\u0000", ec)
+ val ed = puaToXML.remap("\uE880") // remapped isolated surrogate
+ assertEquals("\uD880", ed)
+ val cr = puaToXML.remap("\uE00D") // remapped CR
+ assertEquals("\u000D", cr)
+ }
+
+ @Test def testRemapPUAToIllegalXMLChars(): Unit = {
+ val input = "nul\uE000ctrlA\uE001cr\uE00d_fffe\uf0fe_ffff\uf0ffdone"
+ val actual = puaToXML.remap(input)
+ assertEquals("nul\u0000ctrlA\u0001cr\u000d_fffe\ufffe_ffff\uffffdone",
actual)
+ }
+
+ @Test def testRemapPUAToSurrogateChars(): Unit = {
+ val input = "elephant\uD80C\uDCF0isolatedHigh\uE80CisolatedLow\uECF0"
+ val actual = puaToXML.remap(input)
+ assertEquals("elephant" + elephant +
"isolatedHigh\uD80CisolatedLow\uDCF0", actual)
+ }
+
+}
diff --git
a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
index 0b7a67044..9722feb29 100644
---
a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
+++
b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
@@ -21,13 +21,9 @@ import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.nio.file.StandardOpenOption
-
import scala.xml._
-
import org.junit.Assert._
import org.junit.Test
-
-import org.apache.daffodil.Implicits._
import org.apache.daffodil.util.Misc
import org.apache.daffodil.xml.JDOMUtils
import org.apache.daffodil.xml.NS
@@ -121,20 +117,6 @@ class TestXMLUtils {
assertFalse(JDOMUtils.isNil(d2))
}
- @Test def testRemapXMLIllegalCharToPUA(): Unit = {
- val ec = XMLUtils.remapXMLIllegalCharToPUA(false)(0x0)
- assertEquals(0xE000, ec)
- val ed = XMLUtils.remapXMLIllegalCharToPUA(false)(0xd880)
- assertEquals(0xE880, ed)
- }
-
- @Test def testRemapPUAToXMLIllegalChar(): Unit = {
- val ec = XMLUtils.remapPUAToXMLIllegalChar(0xE000)
- assertEquals(0x0, ec)
- val ed = XMLUtils.remapPUAToXMLIllegalChar(0xE880)
- assertEquals(0xD880, ed)
- }
-
@Test def testWalkUnicodeString1(): Unit = {
val s = "abc"
val Seq((ab, 'a', 'b'), ('a', 'b', 'c'), ('b', 'c', ca)) =
XMLUtils.walkUnicodeString(s)((p, c, n) => (p, c, n))
diff --git
a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
index f3a3eba90..3f6b99010 100644
--- a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
+++ b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
@@ -2564,9 +2564,9 @@ object UTF8Encoder {
val high5 = ((byte1 & 0x07) << 2) | (byte2 >> 6)
byteList(high5 | 0xC0, low6 | 0x80)
}
- case _ if (XMLUtils.isLeadingSurrogate(c)) => {
+ case _ if (Character.isHighSurrogate(c)) => {
// High (initial) Surrogate character case.
- if (XMLUtils.isTrailingSurrogate(next)) {
+ if (Character.isLowSurrogate(next)) {
// Next codepoint is a low surrogate.
// We need to create a 4-byte representation from the
// two surrogate characters.
@@ -2576,9 +2576,9 @@ object UTF8Encoder {
threeByteEncode()
}
}
- case _ if (XMLUtils.isTrailingSurrogate(c)) => {
+ case _ if (Character.isLowSurrogate(c)) => {
// Low (subsequent) Surrogate character case.
- if (XMLUtils.isLeadingSurrogate(prev)) {
+ if (Character.isHighSurrogate(prev)) {
// Previous codepoint was a high surrogate.
// This codepoint was handled as part of converting the
// surrogate pair.