Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

via GitHub Tue, 28 Oct 2025 10:34:49 -0700


srielau commented on code in PR #52638:
URL: https://github.com/apache/spark/pull/52638#discussion_r2470451034



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -45,40 +75,206 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
     withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
   }
 
-  override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
-    Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
+  /**
+   * Visits a stringLit context that may contain multiple singleStringLit children (which can be
+   * either singleStringLitWithoutMarker or parameterMarker). When multiple children are present,
+   * they are coalesced into a single token.
+   */
+  override def visitStringLit(ctx: StringLitContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    import scala.jdk.CollectionConverters._
+
+    // Collect tokens from all singleStringLit children.
+    // Each child is either a singleStringLitWithoutMarker or a parameterMarker.
+    val tokens = ctx
+      .singleStringLit()
+      .asScala
+      .map { child =>
+        visit(child).asInstanceOf[Token]
+      }
+      .toSeq
+
+    if (tokens.isEmpty) {
+      null
+    } else if (tokens.size == 1) {
+      // Fast path: single token, return unchanged
+      tokens.head
+    } else {
+      // Multiple tokens: create coalesced token
+      createCoalescedStringToken(tokens)
+    }
+  }
+
+  /**
+   * Visits a stringLitWithoutMarker context that contains one or more string literal terminals.
+   * Multiple literals are automatically coalesced into a single CoalescedStringToken.
+   */
+  override def visitStringLitWithoutMarker(ctx: StringLitWithoutMarkerContext): Token = {
+    if (ctx == null) {
+      return null
+    }
 
-  override def visitDoubleQuotedStringLiteralValue(
-      ctx: DoubleQuotedStringLiteralValueContext): Token =
-    Option(ctx).map(_.DOUBLEQUOTED_STRING.getSymbol).orNull
+    // Collect all string literal terminals (could be multiple with stringLitWithoutMarker+)
+    val allTerminals = collectStringTerminals(ctx)
 
+    if (allTerminals.isEmpty) {
+      null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Visits singleStringLitWithoutMarker alternatives and returns the token. Always returns
+   * exactly one token without coalescing.
+   */
+  override def visitSingleStringLiteralValue(ctx: SingleStringLiteralValueContext): Token = {
+    ctx.STRING_LITERAL().getSymbol
+  }
+
+  override def visitSingleDoubleQuotedStringLiteralValue(
+      ctx: SingleDoubleQuotedStringLiteralValueContext): Token = {
+    ctx.DOUBLEQUOTED_STRING().getSymbol
+  }
+
+  /**
+   * Visits an integerVal alternative and returns the INTEGER_VALUE token.
+   *
+   * @param ctx
+   *   The integerVal context to process.
+   * @return
+   *   The INTEGER_VALUE token, or null if context is null.
+   */
   override def visitIntegerVal(ctx: IntegerValContext): Token =
     Option(ctx).map(_.INTEGER_VALUE.getSymbol).orNull
 
-  override def visitStringLiteralInContext(ctx: StringLiteralInContextContext): Token = {
-    visit(ctx.stringLitWithoutMarker).asInstanceOf[Token]
+  /**
+   * Collects all string literal terminals from a stringLitWithoutMarker context. The grammar rule
+   * allows one or more consecutive string literals, which are collected in source order for
+   * coalescing.
+   *
+   * @param ctx
+   *   The stringLitWithoutMarker context to process.
+   * @return
+   *   A sequence of terminal nodes representing the string literals.
+   */
+  private def collectStringTerminals(
+      ctx: StringLitWithoutMarkerContext): Seq[org.antlr.v4.runtime.tree.TerminalNode] = {
+    // With the grammar change to singleStringLitWithoutMarker+, we visit each child context.
+    // Each singleStringLitWithoutMarker has labeled alternatives that we need to handle.
+    import scala.jdk.CollectionConverters._
+    ctx
+      .singleStringLitWithoutMarker()
+      .asScala
+      .map { child =>
+        // Visit the child to get its token (handled by visitSingleStringLiteralValue or
+        // visitSingleDoubleQuotedStringLiteralValue)
+        val token = visit(child).asInstanceOf[Token]
+        // Get the terminal node from the parse tree
+        child.getChild(0).asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode]
+      }
+      .toSeq
+  }
+
+  /**
+   * Checks if a token's text represents an R-string (raw string literal).
+   *
+   * An R-string has the format `R'...'` or `r"..."` where the first character is 'R' or 'r'
+   * (case-insensitive) followed by a quote character (single or double).
+   *
+   * @param tokenText
+   *   The text content of the token to check.
+   * @return
+   *   true if the token represents an R-string, false otherwise.
+   */
+  private def isRString(tokenText: String): Boolean = {
+    tokenText.length >= 2 &&
+    (tokenText.charAt(0) == 'R' || tokenText.charAt(0) == 'r') &&
+    (tokenText.charAt(1) == '\'' || tokenText.charAt(1) == '"')
+  }
+
+  /**
+   * Creates a CoalescedStringToken from multiple string literal tokens.
+   *
+   * This method processes escape sequences in each token individually (respecting R-string
+   * semantics), then concatenates the results. This preserves the correct behavior when mixing
+   * R-strings and regular strings.
+   *
+   * For example: 'hello\n' R'world\t' -> "hello<NEWLINE>world\t" (first \n processed, second \t
+   * not)
+   *
+   * @param tokens
+   *   A sequence of tokens to coalesce (must be non-empty).
+   * @return
+   *   A CoalescedStringToken representing the concatenated value.
+   */
+  private def createCoalescedStringToken(tokens: Seq[Token]): Token = {
+    val firstToken = tokens.head
+    val lastToken = tokens.last
+
+    // Process each token individually, respecting R-string semantics
+    val processedStrings = tokens.map { token =>
+      val text = token.getText
+      // Call string() which internally calls unescapeSQLString
+      // This will handle R-strings correctly (no escape processing) and
+      // regular strings correctly (escape processing)
+      string(token)

Review Comment:
   Reworked



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

Reply via email to