mbeckerle commented on code in PR #900:
URL: https://github.com/apache/daffodil/pull/900#discussion_r1059411322
##########
daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/PrimitivesTextNumber.scala:
##########
@@ -46,8 +46,226 @@ case class ConvertTextCombinator(e: ElementBase, value:
Gram, converter: Gram)
override lazy val unparser = new
ConvertTextCombinatorUnparser(e.termRuntimeData, value.unparser,
converter.unparser)
}
-case class ConvertTextNumberPrim(e: ElementBase)
- extends Terminal(e, true) {
+// This is a separate object for unit testing purposes.
+private[primitives]
+object TextNumberPatternUtils {
+
+ /**
+ * A regex which matches textNumberPatterns that legally use the V
+ * (implied decimal point) character.
+ */
+ private[primitives] lazy val vregexStandard = {
+ // DFDL v1.0 Spec says
+ // It is a Schema Definition Error if any symbols other than "0", "1"
through "9" or #
+ // are used in the vpinteger region of the pattern.
+ //
+ // The prefix and suffix chars can surround the vpinteger region, and
there can be
+ // a positive and a negative pattern.
+ //
+ // The prefix and suffix cannot be digits, # or P or V. No quoted chars in
prefix either.
+ //
+ val prefix = """([^0-9#PV']?)"""
+ val suffix = prefix // same syntax as prefix
+ val vpinteger = """(#*[0-9]+)V([0-9]+)"""
+ val subpattern = s"""${prefix}${vpinteger}${suffix}"""
+ // don't forget the ^ and $ (start of data, end of data) because we want
+ // the match to consume all the characters, starting at the beginning.
+ val vPattern = s"""^${subpattern}(?:;${subpattern})?$$"""
+ val re = vPattern.r
+ re
+ }
+
+ def textDecimalVirtualPointForStandard(
+ e: ImplementsThrowsOrSavesSDE,
+ originalPattern: String,
+ patternStripped: String): Int = {
+ val r = TextNumberPatternUtils.vregexStandard
+ r.findFirstMatchIn(patternStripped) match {
+ case Some(r(_, _, afterV, _, _, _, _, _)) => afterV.length
+ case None =>
+ e.SDE(
+ s"""The dfdl:textNumberPattern '%s' contains 'V' (virtual decimal
point).
+ | Other than the sign indicators, it can contain only
+ | '#', then digits 0-9 then 'V' then digits 0-9.
+ | The positive part of the dfdl:textNumberPattern is
mandatory.""".stripMargin('|'),
+ originalPattern)
+ }
+ }
+
+ private[primitives] lazy val vregexZoned= {
+ // Note: for zoned, can only have a positive pattern
+ // Prefix or suffix can only be '+' character, to
+ // indicate leading or trailing sign, and there can be only
+ // one of those.
+ //
+ // Also we're not allowing the # character, since I think that
+ // makes no sense for zoned with virtual decimal point.
+ //
+ val vPattern = """^(\+?)([0-9]+)V([0-9]+)(\+?)$""" // only positive
pattern allowed, only + for prefix/suffix
+ val re = vPattern.r
+ re
+ }
+
+ /**
+ * Checks a pattern for suitability with V (virtual decimal point) in the
pattern
+ * in the context of zoned textNumberRep.
+ *
+ * This is broken out separately for unit testing purposes.
+ *
+ * @param pattern the dfdl:textNumberPattern pattern
+ * @return None if the pattern is illegal syntax for use with V (virtual
decimal point),
+ * otherwise Some(n) where n is the number of digits to the right of the V character.
+ */
+ private[primitives] def textDecimalVirtualPointForZoned(pattern: String):
Option[Int] = {
+ val r = TextNumberPatternUtils.vregexZoned
+ r.findFirstMatchIn(pattern) match {
+ // note: cannot have both a prefix and suffix. Only one of them.
+ case Some(r(pre, _, afterV, suf)) if (pre.length + suf.length <= 1) =>
Some(afterV.length)
+ case _ => None
+ }
+ }
+
+ def textDecimalVirtualPointForZoned(
+ e: ImplementsThrowsOrSavesSDE,
+ originalPattern: String,
+ patternStripped: String): Int = {
+ textDecimalVirtualPointForZoned(patternStripped).getOrElse {
+ e.SDE(
+ s"""The dfdl:textNumberPattern '%s' contains 'V' (virtual decimal
point).
+ |Other than the leading or trailing '+' sign indicator,
+ |it can contain only digits 0-9.""".stripMargin('|'),
+ originalPattern)
+ }
+ }
+
+ /**
+ * The apos character (') is the quoting character in ICU patterns (and DFDL
textNumberPattern).
+ * This removes any unquoted P or V characters (which are not implemented by
ICU)
+ * leaving a pattern string that is suitable for use with ICU.
+ * @param pattern the textNumberPattern string
+ * @return the pattern string with all unquoted P and unquoted V removed.
+ */
+ def removeUnquotedPV(pattern: String) : String = {
+ val uniqueQQString = "alsdflslkjskjslkkjlkkjfppooiipsldsflj"
+ // A single regex that matches an unquoted character
+ // where the quoting char is self quoting requires
+ // a zero-width look behind that matches a potentially
+ // unbounded number of quote chars. That's not allowed.
+ //
+ // Consider ''''''P. We want a regex that matches only the P here
+ // because it is preceded by an even number of quotes.
+ //
+ // I did some tests, and the formulations you think might work
+ // such as from stack-overflow, don't work.
+ // (See test howNotToUseRegexLookBehindWithReplaceAll)
+ //
+ // So we use this brute force technique of replacing all ''
+ // first, so we have only single quotes to deal with.
+ pattern.replaceAll("''", uniqueQQString).
+ replaceAll("(?<!')P", "").
+ replaceAll("(?<!')V", "").
+ replaceAll(uniqueQQString, "''")
+ }
+}
+
+trait ConvertTextNumberMixin {
+
+ protected def pattern: String
+
+
+ // The pattern with escaped characters removed.
+ // This can be reliably searched for pattern characters,
+ // but cannot be used as an operational pattern given that
+ // potentially lots of things have been removed from it.
+ final protected lazy val patternStripped = {
+ // note: tick == apos == ' == single quote.
+ // First remove entirely all escaped ticks, i.e., ''
+ val noEscapedTicksRegex = """''""".r
+ val patternNoEscapedTicks = noEscapedTicksRegex.replaceAllIn(pattern, "")
+ // Next remove all tick-escaped characters entirely
+ val noQuotedRegex = """'[^']+'""".r
+ val patternNoQuoted = noQuotedRegex.replaceAllIn(patternNoEscapedTicks, "")
+ // the remaining string contains only pattern special characters
+ // and regular non-pattern characters
+ patternNoQuoted
+ }
+
+ protected final lazy val hasV = patternStripped.contains("V")
+ protected final lazy val hasP = patternStripped.contains("P")
+
+ // analogous to the property dfdl:binaryDecimalVirtualPoint
+ protected final def computeTextDecimalVirtualPoint(e: ElementBase) = {
+ if (!hasV && !hasP) 0 // no virtual point
+ else {
+ if (hasP) {
+ e.notYetImplemented("textNumberPattern with P symbol")
+ }
+ Assert.invariant(hasV)
+ //
+ // check for things incompatible with "V"
+ //
+ val virtualPoint = e.textNumberRep match {
+ case TextNumberRep.Standard =>
+ TextNumberPatternUtils.textDecimalVirtualPointForStandard(e,
pattern, patternStripped)
+ case TextNumberRep.Zoned =>
+ TextNumberPatternUtils.textDecimalVirtualPointForZoned(e, pattern,
patternStripped)
+ }
+
+ Assert.invariant(virtualPoint >= 1) // if this fails the regex is broken.
+ virtualPoint
+ }
+ }
+
+ //
+ // We need 2 patterns
+ // 1. The original textNumberPattern string from the schema
+ // 2. Same but with the P or V removed.
+ //
+ // We need to use the original textNumberPattern in diagnostic messages
+ // since that's what the schema author wrote and expects to see.
+ // But if the pattern has P or V, then we need ICU at runtime to use the
original
+ // but with the P or V removed since the P and V don't actually represent
anything in the data.
+ final protected lazy val runtimePattern = {
+ if (hasV || hasP) {
+ val patternWithoutPV = TextNumberPatternUtils.removeUnquotedPV(pattern)
+ patternWithoutPV
+ }
+ else pattern
+ }
+
+ final protected def checkPatternWithICU(e: ElementBase) = {
+ // Load the pattern to make sure it is valid
+ try {
+ if (hasV || hasP) {
+ new DecimalFormat(runtimePattern)
+ } else {
+ new DecimalFormat(pattern)
+ }
+ } catch {
+ case ex: IllegalArgumentException =>
+ if (hasV || hasP) {
+ // we don't know what the diagnostic message will say here
+ // since it is from the ICU library.
+ // That library has only seen the pattern with the P and V
+ // removed. The messages might show that pattern which would
+ // confuse the schema author since that's not the pattern
+ // they wrote.
+ //
+ // So we try to explain....
+ e.SDE(
+ """Invalid textNumberPattern.
+ | The errors are about the pattern with the P (decimal scaling
position)
+ | and V (virtual decimal point) characters removed:
%s""".stripMargin, ex)
Review Comment:
I am actually not sure it's possible to exercise this path. The regexes for
the V pattern (and P eventually) constrain things pretty strongly in terms of
the pattern that's left when it is handed to ICU.
So this is more of a "just in case" there is something ICU rejects even
after we've pre-conditioned it by removing the P and V.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]