Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

via GitHub Sat, 18 Oct 2025 09:15:34 -0700


dtenedor commented on code in PR #52638:
URL: https://github.com/apache/spark/pull/52638#discussion_r2437423502



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala:
##########
@@ -3645,15 +3646,14 @@ class AstBuilder extends DataTypeAstBuilder
    * converted into "helloworld".
    *
    * Special characters can be escaped by using Hive/C-style escaping.
+   *
+   * Note: With the modified stringLit grammar rule, visitStringLit now 
returns a Token
+   * that already represents the coalesced value of multiple literals.
    */
   private def createString(ctx: StringLiteralContext): String = {
-    if (conf.escapedStringLiterals) {
-      ctx.stringLit.asScala.map(x => 
stringWithoutUnescape(visitStringLit(x))).mkString
-    } else if (conf.getConf(LEGACY_CONSECUTIVE_STRING_LITERALS)) {

Review Comment:
   With this change, we now ignore the `LEGACY_CONSECUTIVE_STRING_LITERALS` 
config [1]. Is this intended?
   
   [1] 
https://github.com/apache/spark/blob/eb117a642bcd217b81f6ac4980f2030fcf17d46b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4382-L4390__



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -46,15 +47,80 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
   }
 
   override def visitStringLit(ctx: StringLitContext): Token = {
-    if (ctx != null) {
-      if (ctx.STRING_LITERAL != null) {
-        ctx.STRING_LITERAL.getSymbol
-      } else {
-        ctx.DOUBLEQUOTED_STRING.getSymbol
-      }
-    } else {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with stringLit+)
+    val allTerminals = collectStringTerminals(ctx)
+
+    if (allTerminals.isEmpty) {
       null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Collect all STRING_LITERAL and DOUBLEQUOTED_STRING terminals from the 
context. With the
+   * stringLit+ grammar rule, there can be multiple terminals.
+   */
+  private def collectStringTerminals(
+      ctx: StringLitContext): Seq[org.antlr.v4.runtime.tree.TerminalNode] = {
+    val stringLiterals = if (ctx.STRING_LITERAL != null) {
+      // With stringLit+, ANTLR generates a List
+      ctx.STRING_LITERAL
+        .asInstanceOf[java.util.List[_]]
+        .asScala
+        .map(_.asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode])
+        .toSeq
+    } else {
+      Seq.empty
+    }
+
+    val doubleQuoted = if (ctx.DOUBLEQUOTED_STRING != null) {
+      // With stringLit+, ANTLR generates a List
+      ctx.DOUBLEQUOTED_STRING
+        .asInstanceOf[java.util.List[_]]

Review Comment:
   This code until L91 looks like a dup of L77-80 above. Should we dedup into a 
helper method?



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -46,15 +47,80 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
   }
 
   override def visitStringLit(ctx: StringLitContext): Token = {
-    if (ctx != null) {
-      if (ctx.STRING_LITERAL != null) {
-        ctx.STRING_LITERAL.getSymbol
-      } else {
-        ctx.DOUBLEQUOTED_STRING.getSymbol
-      }
-    } else {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with stringLit+)
+    val allTerminals = collectStringTerminals(ctx)
+
+    if (allTerminals.isEmpty) {
       null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Collect all STRING_LITERAL and DOUBLEQUOTED_STRING terminals from the 
context. With the
+   * stringLit+ grammar rule, there can be multiple terminals.
+   */
+  private def collectStringTerminals(
+      ctx: StringLitContext): Seq[org.antlr.v4.runtime.tree.TerminalNode] = {
+    val stringLiterals = if (ctx.STRING_LITERAL != null) {
+      // With stringLit+, ANTLR generates a List
+      ctx.STRING_LITERAL
+        .asInstanceOf[java.util.List[_]]
+        .asScala
+        .map(_.asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode])
+        .toSeq
+    } else {
+      Seq.empty
+    }
+
+    val doubleQuoted = if (ctx.DOUBLEQUOTED_STRING != null) {
+      // With stringLit+, ANTLR generates a List
+      ctx.DOUBLEQUOTED_STRING
+        .asInstanceOf[java.util.List[_]]
+        .asScala
+        .map(_.asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode])
+        .toSeq
+    } else {
+      Seq.empty
     }
+
+    // Combine and sort by position in source
+    (stringLiterals ++ doubleQuoted).sortBy(_.getSymbol.getStartIndex)
+  }
+
+  /**
+   * Create a CoalescedStringToken from multiple string literal tokens. 
Processes each token
+   * through string() to handle escaping, then concatenates.
+   */
+  private def createCoalescedStringToken(tokens: Seq[Token]): Token = {
+    val firstToken = tokens.head
+    val lastToken = tokens.last
+
+    // Concatenate the raw content of each token (without the outer quotes).
+    // The getText() on CoalescedStringToken will wrap this in quotes,
+    // and the final string() call will handle unescaping based on the config.
+    val coalescedRawContent = tokens.map { token =>
+      val text = token.getText
+      // Remove outer quotes: first and last character

Review Comment:
   What about R-strings? Let's cover and add testing for this?
   
   ```
   SELECT r'C:\Users\JohnDoe\Documents';
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

Reply via email to