pan3793 commented on code in PR #54946: URL: https://github.com/apache/spark/pull/54946#discussion_r3469350800
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SqlStatementSplitter.scala: ########## @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.parser + +import java.util.{ArrayList => JArrayList} + +import scala.collection.mutable + +import org.antlr.v4.runtime._ +import org.antlr.v4.runtime.atn.PredictionMode +import org.antlr.v4.runtime.misc.ParseCancellationException + +import org.apache.spark.sql.internal.SqlApiConf + +/** + * Represents a single complete SQL statement together with the delimiter that + * terminated it (always `";"` for now). + * + * @param statement the SQL statement text, with surrounding whitespace trimmed and + * without the terminator + * @param terminator the delimiter string that terminated the statement + */ +case class SqlStatement(statement: String, terminator: String) { + override def toString: String = statement + terminator +} + +/** + * Result of splitting a SQL string into individual statements. + * + * @param completeStatements statements that are fully terminated by `;` + * @param partialStatement trailing text after the last `;` that has not yet + * formed a complete statement; an empty string when the + * input ends with `;` or contains no significant + * trailing content + * @param hasUnclosedComment true when [[partialStatement]] contains an unclosed + * bracketed comment (`/* ...` with no matching `*/`). + * Interactive CLIs may want to flush the partial to the + * backend in this case (so the user sees a parse error) + * rather than keep buffering, since the input cannot be + * completed simply by appending more SQL. + */ +case class SqlStatementSplitResult( + completeStatements: Seq[SqlStatement], + partialStatement: String, + hasUnclosedComment: Boolean = false) { + def isEmpty: Boolean = completeStatements.isEmpty && partialStatement.isEmpty +} + +/** + * A parser-based SQL statement splitter, inspired by Trino's + * `io.trino.cli.lexer.StatementSplitter`. + * + * Each candidate statement is consumed and confirmed by the ANTLR-generated + * [[SqlBaseParser]] via the existing `compoundOrSingleStatement` rule (the same + * rule that the normal Spark SQL parser uses). The splitter: + * + * 1. Tokenizes the input once. + * 2. Walks through the token stream. At each significant position, the + * splitter asks the parser whether the prefix ending at the next `;` is a + * complete statement; if not, it extends the prefix to the next `;` and + * re-tries. This is how SQL scripting `BEGIN ... END` blocks (whose body + * contains semicolons) end up emitted as a single statement: only the + * prefix that includes a matching `END` is accepted by the parser. + * 3. When the parser fails because it reached EOF mid-rule (e.g. an + * un-terminated `BEGIN ... END`, a SELECT with a missing operand), the + * remaining input is treated as a partial statement so an interactive + * caller can keep buffering. + * 4. When the parser fails on a non-EOF token (the input is structurally + * invalid), the splitter falls back to splitting at the next `;` so the + * surrounding delimiters still emit chunks and the backend can report + * the error per chunk. + * + * Quoted strings, single-line and bracketed (nested) comments are honored + * throughout. An unterminated bracketed comment is surfaced via + * [[SqlStatementSplitResult.hasUnclosedComment]]. + */ +object SqlStatementSplitter { + + /** Split the given SQL text into individual statements at `;` boundaries. */ + def split(sqlText: String): SqlStatementSplitResult = { + require(sqlText != null, "sqlText must not be null") + + val lexer = new SqlBaseLexer(new UpperCaseCharStream(CharStreams.fromString(sqlText))) + lexer.removeErrorListeners() + val tokenStream = new CommonTokenStream(lexer) + tokenStream.fill() + + val numTokens = tokenStream.size() + // Pre-compute the positions of `;` tokens (on the default channel). + val delimiterPositions: Array[Int] = { + val acc = mutable.ArrayBuffer.empty[Int] + var i = 0 + while (i < numTokens) { + if (tokenStream.get(i).getType == SqlBaseLexer.SEMICOLON) acc += i + i += 1 + } + acc.toArray + } + + val completeStatements = mutable.ArrayBuffer.empty[SqlStatement] + val buffer = new StringBuilder() + // Whether `buffer` contains any non-hidden token (i.e. any actual SQL content + // beyond whitespace and comments). Chunks that only contain whitespace/comments + // are dropped, matching the spark-sql CLI's long-standing behavior. + var bufferHasContent = false + var index = 0 + var stopOuter = false + // The first index in `delimiterPositions` that is still > our cursor. + var delimSearchStart = 0 + + // Snapshot the session config once -- the splitter is short-lived and the + // splitter's parser must agree with the session config on grammar + // interpretation (e.g. `double_quoted_identifiers`). + val conf = SqlApiConf.get + + while (!stopOuter && index < numTokens) { + val startIdx = nextSignificantTokenIndex(tokenStream, index) + if (startIdx < 0) { + // Just hidden trailing tokens (whitespace / closed comments). Drain + // them into the buffer; if any are an unclosed comment, the lexer + // flag will surface it as a partial. + while (index < numTokens) { + val tok = tokenStream.get(index) + index += 1 + if (tok.getType != Token.EOF) buffer.append(tok.getText) + } + stopOuter = true + } else if (tokenStream.get(startIdx).getType == SqlBaseLexer.SEMICOLON) { + // The next significant token is itself a `;`. This is an empty + // statement (e.g. `;;` or leading `;`); drop everything from the + // cursor through this delimiter and continue. + index = startIdx + 1 + } else { + // Advance the search for delimiters past our current cursor. + while (delimSearchStart < delimiterPositions.length && + delimiterPositions(delimSearchStart) <= startIdx) { + delimSearchStart += 1 + } + + // Try increasingly long prefixes (each ending at a `;`) until the Review Comment: This also seems to be more complex due to the tricky `SET` syntax ... A non-EOF terminated single-statement rule would drop this to O(n), but Spark's `setResetStatement: SET .*?` / `RESET .*?` wildcards need an EOF anchor to terminate deterministically, so the simple rule rewrite doesn't drop in cleanly. I would like to defer this if non-blocking. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
