escaped characters in string literals

sarutak Tue, 02 Feb 2021 08:07:47 -0800

This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new d308794  [SPARK-34263][SQL] Simplify the code for treating 
unicode/octal/escaped characters in string literals
d308794 is described below

commit d308794adb821d301847772de3ee1ef3166aaf5b
Author: Kousuke Saruta <[email protected]>
AuthorDate: Wed Feb 3 01:07:12 2021 +0900

    [SPARK-34263][SQL] Simplify the code for treating unicode/octal/escaped 
characters in string literals
    
    ### What changes were proposed in this pull request?
    
    In the current master, the code for treating unicode/octal/escaped 
characters in string literals is a little bit complex so let's simplify it.
    
    ### Why are the changes needed?
    
    To keep it easy to maintain.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    `ParserUtilsSuite` passes.
    
    Closes #31362 from sarutak/refactor-unicode-escapes.
    
    Authored-by: Kousuke Saruta <[email protected]>
    Signed-off-by: Kousuke Saruta <[email protected]>
---
 .../spark/sql/catalyst/parser/ParserUtils.scala    | 77 ++++++++--------------
 .../sql/catalyst/parser/ParserUtilsSuite.scala     |  7 ++
 2 files changed, 34 insertions(+), 50 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index 711b507..f7cf2ba 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.catalyst.parser
 
 import java.lang.{Long => JLong}
+import java.nio.CharBuffer
 import java.util
 
 import scala.collection.mutable.StringBuilder
@@ -33,6 +34,12 @@ import org.apache.spark.sql.errors.QueryParsingErrors
  * A collection of utility methods for use during the parsing process.
  */
 object ParserUtils {
+
+  val U16_CHAR_PATTERN = """\\u([a-fA-F0-9]{4})(?s).*""".r
+  val U32_CHAR_PATTERN = """\\U([a-fA-F0-9]{8})(?s).*""".r
+  val OCTAL_CHAR_PATTERN = """\\([01][0-7]{2})(?s).*""".r
+  val ESCAPED_CHAR_PATTERN = """\\((?s).)(?s).*""".r
+
   /** Get the command which created the token. */
   def command(ctx: ParserRuleContext): String = {
     val stream = ctx.getStart.getInputStream
@@ -131,7 +138,6 @@ object ParserUtils {
 
   /** Unescape backslash-escaped string enclosed by quotes. */
   def unescapeSQLString(b: String): String = {
-    var enclosure: Character = null
     val sb = new StringBuilder(b.length())
 
     def appendEscapedChar(n: Char): Unit = {
@@ -152,34 +158,19 @@ object ParserUtils {
       }
     }
 
-    var i = 0
-    val strLength = b.length
-    while (i < strLength) {
-      val currentChar = b.charAt(i)
-      if (enclosure == null) {
-        if (currentChar == '\'' || currentChar == '\"') {
-          enclosure = currentChar
-        }
-      } else if (enclosure == currentChar) {
-        enclosure = null
-      } else if (currentChar == '\\') {
-
-        if ((i + 6 < strLength) && b.charAt(i + 1) == 'u') {
-          // \u0000 style 16-bit unicode character literals.
+    // Skip the first and last quotations enclosing the string literal.
+    val charBuffer = CharBuffer.wrap(b, 1, b.length - 1)
 
-          val base = i + 2
-          val code = (0 until 4).foldLeft(0) { (mid, j) =>
-            val digit = Character.digit(b.charAt(j + base), 16)
-            (mid << 4) + digit
-          }
-          sb.append(code.asInstanceOf[Char])
-          i += 5
-        } else if ((i + 10 < strLength) && b.charAt(i + 1) == 'U' &&
-                   (2 until 10).forall(j => Character.digit(b.charAt(i + j), 
16) != -1)) {
+    while (charBuffer.remaining() > 0) {
+      charBuffer match {
+        case U16_CHAR_PATTERN(cp) =>
+          // \u0000 style 16-bit unicode character literals.
+          sb.append(Integer.parseInt(cp, 16).toChar)
+          charBuffer.position(charBuffer.position() + 6)
+        case U32_CHAR_PATTERN(cp) =>
           // \U00000000 style 32-bit unicode character literals.
-
           // Use Long to treat codePoint as unsigned in the range of 32-bit.
-          val codePoint = JLong.parseLong(b.substring(i + 2, i + 10), 16)
+          val codePoint = JLong.parseLong(cp, 16)
           if (codePoint < 0x10000) {
             sb.append((codePoint & 0xFFFF).toChar)
           } else {
@@ -188,33 +179,19 @@ object ParserUtils {
             sb.append(highSurrogate.toChar)
             sb.append(lowSurrogate.toChar)
           }
-          i += 9
-        } else if (i + 4 < strLength) {
+          charBuffer.position(charBuffer.position() + 10)
+        case OCTAL_CHAR_PATTERN(cp) =>
           // \000 style character literals.
-
-          val i1 = b.charAt(i + 1)
-          val i2 = b.charAt(i + 2)
-          val i3 = b.charAt(i + 3)
-
-          if ((i1 >= '0' && i1 <= '1') && (i2 >= '0' && i2 <= '7') && (i3 >= 
'0' && i3 <= '7')) {
-            val tmp = ((i3 - '0') + ((i2 - '0') << 3) + ((i1 - '0') << 
6)).asInstanceOf[Char]
-            sb.append(tmp)
-            i += 3
-          } else {
-            appendEscapedChar(i1)
-            i += 1
-          }
-        } else if (i + 2 < strLength) {
+          sb.append(Integer.parseInt(cp, 8).toChar)
+          charBuffer.position(charBuffer.position() + 4)
+        case ESCAPED_CHAR_PATTERN(c) =>
           // escaped character literals.
-          val n = b.charAt(i + 1)
-          appendEscapedChar(n)
-          i += 1
-        }
-      } else {
-        // non-escaped character literals.
-        sb.append(currentChar)
+          appendEscapedChar(c.charAt(0))
+          charBuffer.position(charBuffer.position() + 2)
+        case _ =>
+          // non-escaped character literals.
+          sb.append(charBuffer.get())
       }
-      i += 1
     }
     sb.toString()
   }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
index 7306309..35f0900 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala
@@ -105,6 +105,13 @@ class ParserUtilsSuite extends SparkFunSuite {
 
     // String including surrogate pair characters (U+1F408 is a cat and 
U+1F415 is a dog in Emoji).
     assert(unescapeSQLString("\"\\U0001F408 \\U0001F415\"") == "\uD83D\uDC08 
\uD83D\uDC15")
+
+    // String including escaped normal characters.
+    assert(unescapeSQLString(
+      """"ab\
+        |cd\ef"""".stripMargin) ==
+      """ab
+        |cdef""".stripMargin)
     // scalastyle:on nonascii
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-34263][SQL] Simplify the code for treating unicode/octal/escaped characters in string literals

Reply via email to