stevedlawrence commented on code in PR #799:
URL: https://github.com/apache/daffodil/pull/799#discussion_r896722304


##########
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala:
##########
@@ -2360,6 +2354,45 @@ sealed abstract class DocumentPart(part: Node, parent: 
Document) {
 
 }
 
+object CanonData {
+  private lazy val doubleForwardPattern = "//.*".r
+  private lazy val openClosePattern = "(?s)/[*].*?[*]/".r
+  private lazy val noWarnCharsSet = "|()[].Xx \n"
+
+  /*
+  * Allow "//" and "/* */" to act as comments.
+  * Any valid XML characters not explicitly allowed are also considered 
comments and are removed.
+  */
+  def canonicalizeData(validCharactersSet : String, userData : String) : 
String = {

Review Comment:
   Scala style is to not have a space before the colons.



##########
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala:
##########
@@ -2360,6 +2354,45 @@ sealed abstract class DocumentPart(part: Node, parent: 
Document) {
 
 }
 
+object CanonData {
+  private lazy val doubleForwardPattern = "//.*".r
+  private lazy val openClosePattern = "(?s)/[*].*?[*]/".r
+  private lazy val noWarnCharsSet = "|()[].Xx \n"
+
+  /*
+  * Allow "//" and "/* */" to act as comments.
+  * Any valid XML characters not explicitly allowed are also considered 
comments and are removed.
+  */
+  def canonicalizeData(validCharactersSet : String, userData : String) : 
String = {
+    var doWarning: Boolean = false
+    Assert.invariant(!userData.contains('\r'))      // \r should not exist in 
userData
+
+    //Remove the comments (//) and (/* */)
+    val noCommentsForward = doubleForwardPattern.replaceAllIn(userData,"")
+    val noCommentsBothFormats = 
openClosePattern.replaceAllIn(noCommentsForward,"")
+
+    //Throw exception if /* or */ still found. This means user input was not 
formatted correctly.
+    if (noCommentsBothFormats.contains("/*") || 
noCommentsBothFormats.contains("*/")) {
+      throw TDMLException("Improper formatting of /* */ style comment", None)
+    }
+
+    //Check value of the characters, if invalid character found create log and 
skip over it
+    val validData = noCommentsBothFormats.filter { ch =>
+      if (validCharactersSet.contains(ch)) true
+      else {
+        if (!noWarnCharsSet.contains(ch)) doWarning = true
+        false
+      }
+    }
+
+    if (doWarning) {
+        Logger.log.warn("Data contains invalid character(s). Consider using a 
comment (// or /* */).")
+    }
+
+    return validData

Review Comment:
   There shouldn't be a `return` keyword here. Just simply `validData` as this 
line is the scala standard.



##########
daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala:
##########
@@ -2360,6 +2354,45 @@ sealed abstract class DocumentPart(part: Node, parent: 
Document) {
 
 }
 
+object CanonData {
+  private lazy val doubleForwardPattern = "//.*".r
+  private lazy val openClosePattern = "(?s)/[*].*?[*]/".r
+  private lazy val noWarnCharsSet = "|()[].Xx \n"
+
+  /*
+  * Allow "//" and "/* */" to act as comments.
+  * Any valid XML characters not explicitly allowed are also considered 
comments and are removed.
+  */
+  def canonicalizeData(validCharactersSet : String, userData : String) : 
String = {
+    var doWarning: Boolean = false
+    Assert.invariant(!userData.contains('\r'))      // \r should not exist in 
userData
+
+    //Remove the comments (//) and (/* */)
+    val noCommentsForward = doubleForwardPattern.replaceAllIn(userData,"")
+    val noCommentsBothFormats = 
openClosePattern.replaceAllIn(noCommentsForward,"")

Review Comment:
   Scala style is to have a space after the comma in parameter lists.



##########
daffodil-tdml-lib/src/test/scala/org/apache/daffodil/tdml/UnitTestTDMLRunner.scala:
##########
@@ -490,4 +490,355 @@ class UnitTestTDMLRunner {
     assertTrue(dataElem ne null)
     runner.reset
   }
-}
+
+  @Test def testCommentBit(): Unit = {
+    val xml = <document bitOrder="LSBFirst"><documentPart type="bits">00000010 
//this is a label111</documentPart></document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("00000010", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitWithNewLine(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">01 01 11 //flagByte1
+                                        1 //bool2</documentPart>
+              </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("0101111", firstPart.bitDigits)
+  }
+
+    @Test def testCommentBitJustComments(): Unit = {
+      val xml = <document bitOrder="LSBFirst">
+                  <documentPart type="bits">
+                  // this doc part contains no bits
+                  // at all. It is just comments.
+                  // 101010101
+                  </documentPart>
+                </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitNoLineEnding(): Unit = {
+    val xml = <document bitOrder="LSBFirst"><documentPart type="bits">01011010 
// just a comment here no line ending </documentPart>
+              </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("01011010", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitBothCommentFormatsNewLine(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0100110110 /*C0mment 01011111 _01*/11
+100111//D1fferent sty1e c0mment</documentPart>
+              </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("010011011011100111", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitBothCommentFormats(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0100110110 /*C0mment 01011111 
_01*/100111//D1fferent sty1e c0mment</documentPart>
+              </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("0100110110100111", firstPart.bitDigits)
+  }
+
+  @Test def testBitBadCommentFormatException(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0100110110 C0mment 01011111 
_01100*/111//D1fferent sty1e c0mment</documentPart>
+              </document>
+    val exc = intercept[TDMLException] {
+      val doc = new Document(xml, null)
+      doc.documentParts.collect { case x: BitsDocumentPart => x }
+      val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+      val firstPart = dp(0).bitDigits
+    }
+    assertTrue(exc.getMessage().contains("Improper formatting of /* */ style 
comment"))
+  }
+
+  @Test def testCommentBitNoWarningCharacters(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">01|01|00
+                                        (10).[01]</documentPart></document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("0101001001", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitNoWarningCharactersWithInvalid(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">01|01|00 !!
+                                        (10).[01]</documentPart></document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("0101001001", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitNonGreedy(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0101 /*Data 1*/ 0101 /*Data 
2*/</documentPart></document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("01010101", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitNonGreedyNewLine(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0101 /*Data 1
+                                              Explanation*/
+                                        0101 /*Data 2
+                                              Explanation*/
+              </documentPart>
+              </document>
+    val doc = new Document(xml, null)
+    val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+    val firstPart = dp(0)
+    assertEquals("01010101", firstPart.bitDigits)
+  }
+
+  @Test def testCommentBitCarriageReturn(): Unit = {
+    val xml = <document bitOrder="LSBFirst">
+              <documentPart type="bits">0101 /*Data
+                                             Explanation*/
+                                        0101 /*Data 2&#13;Explanation*/
+              </documentPart>
+              </document>
+    val exc = intercept[Exception] {
+      val doc = new Document(xml, null)
+      doc.documentParts.collect { case x: BitsDocumentPart => x }
+      val dp = doc.documentParts.collect { case x: BitsDocumentPart => x }
+      val firstPart = dp(0).bitDigits
+    }
+    assertTrue(exc.getMessage().contains("Invariant broken"))

Review Comment:
   The point of the invariant check is that the check should never fail. If 
there is a way for it to fail, it means the invariant doesn't hold it means 
there's a bug (not in this case) or it's valid behavior and our code just needs 
to account for that possibility.
   
   So it seems by using `&#13;` it's possible to get a carriage return in the 
data, and so we need to make sure the code works correctly with a carriage 
return. We use DOTALL so the regex should be fine. I think maybe the only thing 
that needs to change is to add `\r` to the noWarn list? That's the only place 
we currently use `\n` so I think everything else is fine?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to