Re: [PR] [SPARK-54876][SQL] Fix splitSemiColon dropping statement ending with block comment [spark]

via GitHub Fri, 29 May 2026 02:28:14 -0700


cloud-fan commented on code in PR #55466:
URL: https://github.com/apache/spark/pull/55466#discussion_r3323420818



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala:
##########
@@ -279,139 +279,73 @@ object StringUtils extends Logging {
     pattern.matches(cleanText)
   }
 
-  private def containsNonWhiteSpaceCharacters(inputString: String): Boolean = {
-    val pattern = "\\S".r
-    pattern.findFirstIn(inputString).isDefined
-  }
-
-  // Implementation is grabbed from SparkSQLCLIDriver.splitSemiColon, the only 
difference is this
-  // implementation handles backtick and treat it as single/double quote.
-  // Below comments are from the source:
-  // Adapted splitSemiColon from Hive 2.3's CliDriver.splitSemiColon.
-  // Note: [SPARK-31595] if there is a `'` in a double quoted string, or a `"` 
in a single quoted
-  // string, the origin implementation from Hive will not drop the trailing 
semicolon as expected,
-  // hence we refined this function a little bit.
-  // Note: [SPARK-33100] Ignore a semicolon inside a bracketed comment in 
spark-sql.
+  // Structural scanner for splitting SQL by semicolons.
+  // Handles: quoted strings with escapes, line comments (--), nested block 
comments (/* */),
+  // and semicolons inside strings/comments are not treated as delimiters.
+  // Note: [SPARK-31595], [SPARK-33100], [SPARK-54876]
   def splitSemiColonWithIndex(line: String, enableSqlScripting: Boolean): 
List[String] = {
-    var insideSingleQuote = false
-    var insideDoubleQuote = false
-    var insideBacktick = false
-    var insideSimpleComment = false
-    var bracketedCommentLevel = 0
-    var escape = false
-    var beginIndex = 0
-    var leavingBracketedComment = false
-    var hasPrecedingNonCommentString = false
-    var isStatement = false
-    val ret = new ArrayBuffer[String]()
-
     lazy val insideSqlScript: Boolean = isSqlScript(line)
+    if (enableSqlScripting && insideSqlScript) return List(line)
 
-    def insideBracketedComment: Boolean = bracketedCommentLevel > 0
-    def insideComment: Boolean = insideSimpleComment || insideBracketedComment
-    def statementInProgress(index: Int): Boolean =
-      isStatement || (!insideComment &&
-        index > beginIndex && !s"${line.charAt(index)}".trim.isEmpty)
-
-    for (index <- 0 until line.length) {
-      // Checks if we need to decrement a bracketed comment level; the last 
character '/' of
-      // bracketed comments is still inside the comment, so 
`insideBracketedComment` must keep
-      // true in the previous loop and we decrement the level here if needed.
-      if (leavingBracketedComment) {
-        bracketedCommentLevel -= 1
-        leavingBracketedComment = false
+    val ret = new ArrayBuffer[String]()
+    val n = line.length
+    var i = 0
+    var chunkStart = 0
+    var chunkHasSql = false
+    var chunkHasUnclosed = false
+
+    def consumeString(start: Int, quote: Char): Int = {
+      var p = start + 1
+      while (p < n) {
+        val c = line.charAt(p)
+        if (c == '\\' && p + 1 < n) p += 2
+        else if (c == quote) return p + 1
+        else p += 1
       }
+      chunkHasUnclosed = true; n
+    }
 
-      if (line.charAt(index) == '\'' && !insideComment) {
-        // take a look to see if it is escaped
-        // See the comment above about SPARK-31595
-        if (!escape && !insideDoubleQuote && !insideBacktick) {
-          // flip the boolean variable
-          insideSingleQuote = !insideSingleQuote
-        }
-      } else if (line.charAt(index) == '\"' && !insideComment) {
-        // take a look to see if it is escaped
-        // See the comment above about SPARK-31595
-        if (!escape && !insideSingleQuote && !insideBacktick) {
-          // flip the boolean variable
-          insideDoubleQuote = !insideDoubleQuote
-        }
-      } else if (line.charAt(index) == '`' && !insideComment) {
-        // take a look to see if it is escaped
-        // See the comment above about SPARK-31595
-        if (!escape && !insideSingleQuote && !insideDoubleQuote) {
-          // flip the boolean variable
-          insideBacktick = !insideBacktick
-        }
-      } else if (line.charAt(index) == '-') {
-        val hasNext = index + 1 < line.length
-        if (insideDoubleQuote || insideSingleQuote || insideBacktick || 
insideComment) {
-          // Ignores '-' in any case of quotes or comment.
-          // Avoids to start a comment(--) within a quoted segment or already 
in a comment.
-          // Sample query: select "quoted value --"
-          //                                    ^^ avoids starting a comment 
if inside quotes.
-        } else if (hasNext && line.charAt(index + 1) == '-') {
-          // ignore quotes and ; in simple comment
-          insideSimpleComment = true
-        }
-      } else if (line.charAt(index) == ';') {
-        if (insideSingleQuote || insideDoubleQuote || insideBacktick || 
insideComment) {
-          // do not split
-        } else if (enableSqlScripting && insideSqlScript) {
-          // do not split
-        } else {
-          if (isStatement) {
-            // split, do not include ; itself
-            ret.append(line.substring(beginIndex, index))
-          }
-          beginIndex = index + 1
-          isStatement = false
-        }
-      } else if (line.charAt(index) == '\n') {
-        // with a new line the inline simple comment should end.
-        if (!escape) {
-          insideSimpleComment = false
-        }
-      } else if (line.charAt(index) == '/' && !insideSimpleComment) {
-        val hasNext = index + 1 < line.length
-        if (insideSingleQuote || insideDoubleQuote || insideBacktick) {
-          // Ignores '/' in any case of quotes
-        } else if (insideBracketedComment && line.charAt(index - 1) == '*') {
-          // Decrements `bracketedCommentLevel` at the beginning of the next 
loop
-          leavingBracketedComment = true
-        } else if (hasNext && line.charAt(index + 1) == '*') {
-          bracketedCommentLevel += 1
-          // Check if there's non-comment characters(non space, non newline 
characters) before
-          // multiline comments.
-          hasPrecedingNonCommentString = beginIndex != index && 
containsNonWhiteSpaceCharacters(
-            line.substring(beginIndex, index)
-          )
-        }
-      }
-      // set the escape
-      if (escape) {
-        escape = false
-      } else if (line.charAt(index) == '\\') {
-        escape = true
-      }
+    def consumeLineComment(start: Int): Int = {
+      var p = start + 2
+      while (p < n && line.charAt(p) != '\n') p += 1
+      p
+    }
 
-      isStatement = statementInProgress(index)
+    def consumeBlockComment(start: Int): Int = {
+      var p = start + 2
+      var level = 1
+      while (p + 1 < n && level > 0) {
+        val c0 = line.charAt(p); val c1 = line.charAt(p + 1)
+        if (c0 == '/' && c1 == '*') { level += 1; p += 2 }
+        else if (c0 == '*' && c1 == '/') { level -= 1; p += 2 }
+        else p += 1
+      }
+      if (level > 0) { chunkHasUnclosed = true; n } else p
     }
-    // Check the last char is end of nested bracketed comment.
-    val endOfBracketedComment = leavingBracketedComment && 
bracketedCommentLevel == 1 &&
-      !hasPrecedingNonCommentString
-    // Spark SQL support simple comment and nested bracketed comment in query 
body.
-    // But if Spark SQL receives a comment alone, it will throw parser 
exception.
-    // In Spark SQL CLI, if there is a completed comment in the end of whole 
query,
-    // since Spark SQL CLL use `;` to split the query, CLI will pass the 
comment
-    // to the backend engine and throw exception. CLI should ignore this 
comment,
-    // If there is an uncompleted statement or an uncompleted bracketed 
comment in the end,
-    // CLI should also pass this part to the backend engine, which may throw 
an exception
-    // with clear error message (for incomplete statement, if there's non 
comment characters,
-    // we would still append the string).
-    if (!endOfBracketedComment && (isStatement || insideBracketedComment)) {
-      ret.append(line.substring(beginIndex))
+
+    while (i < n) {
+      val c = line.charAt(i)
+      def peek(ch: Char): Boolean = i + 1 < n && line.charAt(i + 1) == ch
+      if (c == '\'' || c == '"' || c == '`') {
+        chunkHasSql = true; i = consumeString(i, c)
+      } else if (c == '-' && peek('-')) {
+        i = consumeLineComment(i)
+      } else if (c == '/' && peek('*')) {
+        i = consumeBlockComment(i)
+      } else if (c == ';') {
+        if (chunkHasSql) ret += line.substring(chunkStart, i)
+        chunkStart = i + 1; chunkHasSql = false; chunkHasUnclosed = false; i 
+= 1
+      } else {
+        if (!Character.isWhitespace(c)) chunkHasSql = true
+        i += 1
+      }
     }
+    if (chunkHasSql || chunkHasUnclosed) ret += line.substring(chunkStart)
     ret.toList
   }
+
+  /** Convenience wrapper: splits on semicolons without SQL scripting 
awareness. */
+  def splitSemiColon(line: String): List[String] = {

Review Comment:
   This `splitSemiColon` wrapper has no callers anywhere in the repo. The 
consolidation wired `SparkSQLCLIDriver.splitSemiColon` straight to 
`splitSemiColonWithIndex(line, enableSqlScripting = false)`, so this wrapper is 
orphaned.
   
   I'd just drop it. Both production callers (`SparkSQLCLIDriver`, 
`SqlGraphRegistrationContext`) pass `enableSqlScripting = false` explicitly, 
and routing them through this wouldn't read better -- it stacks a second 
`splitSemiColon` on top of the misleadingly-named `splitSemiColonWithIndex` 
(which returns `List[String]`, no index). The explicit two-arg call at two 
sites is self-documenting. If you want a cleaner `splitSemiColon(line)` call 
site, better as a follow-up that renames `splitSemiColonWithIndex` than adding 
a third method here.
   
   (Dropping it also makes the PR description accurate -- it says the driver 
"delegates to `StringUtils.splitSemiColon`", but it delegates to 
`splitSemiColonWithIndex`.)



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala:
##########
@@ -279,139 +279,73 @@ object StringUtils extends Logging {
     pattern.matches(cleanText)
   }
 
-  private def containsNonWhiteSpaceCharacters(inputString: String): Boolean = {
-    val pattern = "\\S".r
-    pattern.findFirstIn(inputString).isDefined
-  }
-
-  // Implementation is grabbed from SparkSQLCLIDriver.splitSemiColon, the only 
difference is this
-  // implementation handles backtick and treat it as single/double quote.
-  // Below comments are from the source:
-  // Adapted splitSemiColon from Hive 2.3's CliDriver.splitSemiColon.
-  // Note: [SPARK-31595] if there is a `'` in a double quoted string, or a `"` 
in a single quoted
-  // string, the origin implementation from Hive will not drop the trailing 
semicolon as expected,
-  // hence we refined this function a little bit.
-  // Note: [SPARK-33100] Ignore a semicolon inside a bracketed comment in 
spark-sql.
+  // Structural scanner for splitting SQL by semicolons.
+  // Handles: quoted strings with escapes, line comments (--), nested block 
comments (/* */),
+  // and semicolons inside strings/comments are not treated as delimiters.

Review Comment:
   The list mixes noun phrases with a trailing full clause; reads cleaner as 
two sentences:
   ```suggestion
     // Handles quoted strings with escapes, line comments (--), and nested 
block comments (/* */).
     // Semicolons inside strings or comments are not treated as delimiters.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-54876][SQL] Fix splitSemiColon dropping statement ending with block comment [spark]

Reply via email to