This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git


The following commit(s) were added to refs/heads/main by this push:
     new ba35b9b  Fix handling of %#x25; character entity
ba35b9b is described below

commit ba35b9bcb78ce91f707f3f811debef9d5f2428b0
Author: Steve Lawrence <[email protected]>
AuthorDate: Tue Sep 21 11:24:07 2021 -0400

    Fix handling of %#x25; character entity
    
    We sort of have a two-pass system for handling DFDL strings that contain
    character entities related to delimiters. The first pass converts
    character entities to the character values, excluding things like
    escaped percents and character classes like NL, WSP, etc. The second pass
    handles those excluded entities.
    
    However, this first pass replaces the hex entity %#x25; (and the
    corresponding decimal entity) with a percent sign, which can then
    interfere with the second pass, which should never see single percent
    signs except for the exceptions mentioned above.
    
    This modifies the first pass so that character entities that result in a
    single percent are replaced with a double-percent. This will then be
    converted to a single percent using the escaped-percent logic elsewhere.
    
    This also refactors related existing code to be more robust and
    reduce duplication.
    
    Also removed dead function, `replaceNLForUnparse`, that contains a
    potentially buggy `replaceAll` call that does not escape input.
    
    DAFFODIL-2479
---
 .../apache/daffodil/cookers/EntityReplacer.scala   | 47 ++++++++--------
 .../daffodil/section06/entities/Entities.tdml      | 64 ++++++++++++++++++++++
 .../daffodil/section06/entities/TestEntities.scala |  2 +
 3 files changed, 91 insertions(+), 22 deletions(-)

diff --git 
a/daffodil-lib/src/main/scala/org/apache/daffodil/cookers/EntityReplacer.scala 
b/daffodil-lib/src/main/scala/org/apache/daffodil/cookers/EntityReplacer.scala
index 2f4af1e..c60785b 100755
--- 
a/daffodil-lib/src/main/scala/org/apache/daffodil/cookers/EntityReplacer.scala
+++ 
b/daffodil-lib/src/main/scala/org/apache/daffodil/cookers/EntityReplacer.scala
@@ -128,6 +128,25 @@ final class EntityReplacer {
   def hasHexCodePoint(input: String): Boolean = isMatched(input, hexPattern)
   def hasByteCodePoint(input: String): Boolean = isMatched(input, bytePattern)
 
+  private def replaceEntityWithChar(input: String, entity: String, newChar: 
Char): String = {
+    val replacement =
+      if (newChar == '%') {
+        // Some character entities are not replaced in this EntityReplacer,
+        // such as double percents or character classes (NL, WSP, etc.). If
+        // this character entity results in a percent character (e.g. %#x25;),
+        // we must replace it with an escaped percent to be handled later.
+        "%%"
+      } else {
+        // This character might mean something special to the replaceAll method
+        // we are about to use (e.g. a dollar sign for regex group references).
+        // To be safe, call quoteReplaement which escapes any characters that
+        // mean something special in a replacement string so they are replaced
+        // with the literal value.
+        Matcher.quoteReplacement(newChar.toString)
+      }
+    input.replaceAll(entity, replacement)
+  }
+
   def replaceHex(input: String, prefix: String): String = {
     var res: String = input
 
@@ -141,12 +160,8 @@ final class EntityReplacer {
         val rawStr = m.group().toString()
         val trimmedStr = rawStr.replace(prefix, "").replace(";", "")
         val intStr = Integer.parseInt(trimmedStr, 16)
-        val newChar = intStr.toChar.toString
-        // Special case here
-        // $ is used by replaceAll to refer to prior groups
-        // so $ must be escaped into \$
-        val newCharNotDollar = if (newChar == "$") """\$""" else newChar
-        res = res.replaceAll(rawStr, newCharNotDollar)
+        val newChar = intStr.toChar
+        res = replaceEntityWithChar(res, rawStr, newChar)
       }
     }
     res
@@ -165,8 +180,8 @@ final class EntityReplacer {
         val rawStr = m.group().toString()
         val trimmedStr = rawStr.replace(prefix, "").replace(";", "")
         val intStr = Integer.parseInt(trimmedStr, 10)
-
-        res = res.replaceAll(rawStr, intStr.asInstanceOf[Char].toString())
+        val newChar = intStr.toChar
+        res = replaceEntityWithChar(res, rawStr, newChar)
       }
     }
 
@@ -188,8 +203,8 @@ final class EntityReplacer {
         val upperNibble: Int = JByte.parseByte(trimmedStr.substring(0, 1), 16) 
<< 4
         val lowerNibble: Byte = JByte.parseByte(trimmedStr.substring(1, 2), 16)
         val byteStr: Int = upperNibble | lowerNibble
-
-        res = res.replaceAll(rawStr, byteStr.toChar.toString)
+        val newChar = byteStr.toChar
+        res = replaceEntityWithChar(res, rawStr, newChar)
       }
     }
 
@@ -258,18 +273,6 @@ final class EntityReplacer {
     s2
   }
 
-  /**
-   * replace marked NL entity with replacement string (from dfdl:outputNewline 
computation)
-   * and replace double-percent markers with "%".
-   */
-  def replaceNLForUnparse(input: String, replacement: String): String = {
-    markerForNLMatcher.reset(input)
-    val a = markerForNLMatcher.replaceAll(replacement)
-    markerForDPMatcher.reset(a)
-    val b = markerForDPMatcher.replaceAll("%")
-    b
-  }
-
   private def stripLeadingPercent(s: String) = if (s.startsWith("%")) s 
substring (1) else s
 
   /**
diff --git 
a/daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/Entities.tdml
 
b/daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/Entities.tdml
index 720b55d..400aa05 100644
--- 
a/daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/Entities.tdml
+++ 
b/daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/Entities.tdml
@@ -1363,4 +1363,68 @@ is multiple bytes in UTF-8 encoding that is used"
     </tdml:infoset>
   </tdml:parserTestCase>
 
+  <tdml:defineSchema name="AllAsciiCodePointEntities">
+
+    <xs:include 
schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
+    <dfdl:format ref="ex:GeneralFormat" lengthKind="delimited" 
encoding="ISO-8859-1" />
+
+    <xs:element name="hexEntities" type="xs:string" 
dfdl:initiator="%#x00;%#x01;%#x02;%#x03;%#x04;%#x05;%#x06;%#x07;%#x08;%#x09;%#x0A;%#x0B;%#x0C;%#x0D;%#x0E;%#x0F;%#x10;%#x11;%#x12;%#x13;%#x14;%#x15;%#x16;%#x17;%#x18;%#x19;%#x1A;%#x1B;%#x1C;%#x1D;%#x1E;%#x1F;%#x20;%#x21;%#x22;%#x23;%#x24;%#x25;%#x26;%#x27;%#x28;%#x29;%#x2A;%#x2B;%#x2C;%#x2D;%#x2E;%#x2F;%#x30;%#x31;%#x32;%#x33;%#x34;%#x35;%#x36;%#x37;%#x38;%#x39;%#x3A;%#x3B;%#x3C;%#x3D;%#x3E;%#x3F;%#x40;%#x41;%#x42;%#x43;%#x44;%#x45;%#x4
 [...]
+    <xs:element name="decEntities" type="xs:string" 
dfdl:initiator="%#0;%#1;%#2;%#3;%#4;%#5;%#6;%#7;%#8;%#9;%#10;%#11;%#12;%#13;%#14;%#15;%#16;%#17;%#18;%#19;%#20;%#21;%#22;%#23;%#24;%#25;%#26;%#27;%#28;%#29;%#30;%#31;%#32;%#33;%#34;%#35;%#36;%#37;%#38;%#39;%#40;%#41;%#42;%#43;%#44;%#45;%#46;%#47;%#48;%#49;%#50;%#51;%#52;%#53;%#54;%#55;%#56;%#57;%#58;%#59;%#60;%#61;%#62;%#63;%#64;%#65;%#66;%#67;%#68;%#69;%#70;%#71;%#72;%#73;%#74;%#75;%#76;%#77;%#78;%#79;%#80;%#81;%#82;%#83;%#84;%#85;%#86
 [...]
+
+  </tdml:defineSchema>
+
+  <tdml:parserTestCase name="allAsciiHexEntities" root="hexEntities" 
model="AllAsciiCodePointEntities">
+    <tdml:document>
+      <tdml:documentPart type="byte">00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 
0E 0F</tdml:documentPart>
+      <tdml:documentPart type="byte">10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 
1E 1F</tdml:documentPart>
+      <tdml:documentPart type="byte">20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 
2E 2F</tdml:documentPart>
+      <tdml:documentPart type="byte">30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 
3E 3F</tdml:documentPart>
+      <tdml:documentPart type="byte">40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 
4E 4F</tdml:documentPart>
+      <tdml:documentPart type="byte">50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 
5E 5F</tdml:documentPart>
+      <tdml:documentPart type="byte">60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 
6E 6F</tdml:documentPart>
+      <tdml:documentPart type="byte">70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 
7E 7F</tdml:documentPart>
+      <tdml:documentPart type="byte">80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 
8E 8F</tdml:documentPart>
+      <tdml:documentPart type="byte">90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 
9E 9F</tdml:documentPart>
+      <tdml:documentPart type="byte">A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD 
AE AF</tdml:documentPart>
+      <tdml:documentPart type="byte">B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD 
BE BF</tdml:documentPart>
+      <tdml:documentPart type="byte">C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD 
CE CF</tdml:documentPart>
+      <tdml:documentPart type="byte">D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD 
DE DF</tdml:documentPart>
+      <tdml:documentPart type="byte">E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED 
EE EF</tdml:documentPart>
+      <tdml:documentPart type="byte">F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD 
FE FF</tdml:documentPart>
+      <tdml:documentPart type="text">string</tdml:documentPart>
+    </tdml:document>
+    <tdml:infoset>
+      <tdml:dfdlInfoset>
+        <ex:hexEntities>string</ex:hexEntities>
+      </tdml:dfdlInfoset>
+    </tdml:infoset>
+  </tdml:parserTestCase>
+
+  <tdml:parserTestCase name="allAsciiDecEntities" root="decEntities" 
model="AllAsciiCodePointEntities">
+    <tdml:document>
+      <tdml:documentPart type="byte">00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 
0E 0F</tdml:documentPart>
+      <tdml:documentPart type="byte">10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 
1E 1F</tdml:documentPart>
+      <tdml:documentPart type="byte">20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 
2E 2F</tdml:documentPart>
+      <tdml:documentPart type="byte">30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 
3E 3F</tdml:documentPart>
+      <tdml:documentPart type="byte">40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 
4E 4F</tdml:documentPart>
+      <tdml:documentPart type="byte">50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 
5E 5F</tdml:documentPart>
+      <tdml:documentPart type="byte">60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 
6E 6F</tdml:documentPart>
+      <tdml:documentPart type="byte">70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 
7E 7F</tdml:documentPart>
+      <tdml:documentPart type="byte">80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 
8E 8F</tdml:documentPart>
+      <tdml:documentPart type="byte">90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 
9E 9F</tdml:documentPart>
+      <tdml:documentPart type="byte">A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD 
AE AF</tdml:documentPart>
+      <tdml:documentPart type="byte">B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD 
BE BF</tdml:documentPart>
+      <tdml:documentPart type="byte">C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD 
CE CF</tdml:documentPart>
+      <tdml:documentPart type="byte">D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD 
DE DF</tdml:documentPart>
+      <tdml:documentPart type="byte">E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED 
EE EF</tdml:documentPart>
+      <tdml:documentPart type="byte">F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD 
FE FF</tdml:documentPart>
+      <tdml:documentPart type="text">string</tdml:documentPart>
+    </tdml:document>
+    <tdml:infoset>
+      <tdml:dfdlInfoset>
+        <ex:decEntities>string</ex:decEntities>
+      </tdml:dfdlInfoset>
+    </tdml:infoset>
+  </tdml:parserTestCase>
+
 </tdml:testSuite>
diff --git 
a/daffodil-test/src/test/scala/org/apache/daffodil/section06/entities/TestEntities.scala
 
b/daffodil-test/src/test/scala/org/apache/daffodil/section06/entities/TestEntities.scala
index cf9dcc65..e8fa7f4 100644
--- 
a/daffodil-test/src/test/scala/org/apache/daffodil/section06/entities/TestEntities.scala
+++ 
b/daffodil-test/src/test/scala/org/apache/daffodil/section06/entities/TestEntities.scala
@@ -119,4 +119,6 @@ class TestEntities {
   @Test def test_invalid_entity_06(): Unit = { 
runnerInvalid.runOneTest("text_invalid_entity_among_multiple_valid_combined") }
   @Test def test_invalid_entity_07(): Unit = { 
runnerInvalid.runOneTest("text_invalid_entity_escaped") }
 
+  @Test def test_allAsciiHexEntities(): Unit = { 
runner_01.runOneTest("allAsciiHexEntities") }
+  @Test def test_allAsciiDecEntities(): Unit = { 
runner_01.runOneTest("allAsciiDecEntities") }
 }

Reply via email to