dtenedor commented on code in PR #52638:
URL: https://github.com/apache/spark/pull/52638#discussion_r2446274298


##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -45,18 +46,188 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
     withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
   }
 
-  override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
-    Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
+  /**
+   * Visits a stringLit context that may contain multiple singleStringLit children (which can be
+   * either singleStringLitWithoutMarker or parameterMarker). When multiple children are present,
+   * they are coalesced into a single token.
+   */
+  override def visitStringLit(ctx: StringLitContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    import scala.jdk.CollectionConverters._
+
+    // Collect tokens from all singleStringLit children.
+    // Each child is either a singleStringLitWithoutMarker or a parameterMarker.
+    val tokens = ctx
+      .singleStringLit()
+      .asScala
+      .map { child =>
+        visit(child).asInstanceOf[Token]
+      }
+      .toSeq
+
+    if (tokens.isEmpty) {
+      null
+    } else if (tokens.size == 1) {
+      // Fast path: single token, return unchanged
+      tokens.head
+    } else {
+      // Multiple tokens: create coalesced token
+      createCoalescedStringToken(tokens)
+    }
+  }
+
+  /**
+   * Visits a stringLitWithoutMarker context that contains one or more string literal terminals.
+   * Multiple literals are automatically coalesced into a single CoalescedStringToken.
+   */
+  override def visitStringLitWithoutMarker(ctx: StringLitWithoutMarkerContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with stringLitWithoutMarker+)
+    val allTerminals = collectStringTerminals(ctx)
 
-  override def visitDoubleQuotedStringLiteralValue(
-      ctx: DoubleQuotedStringLiteralValueContext): Token =
-    Option(ctx).map(_.DOUBLEQUOTED_STRING.getSymbol).orNull
+    if (allTerminals.isEmpty) {
+      null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Visits singleStringLitWithoutMarker alternatives and returns the token. Always returns
+   * exactly one token without coalescing.
+   */
+  override def visitSingleStringLiteralValue(ctx: SingleStringLiteralValueContext): Token = {
+    ctx.STRING_LITERAL().getSymbol
+  }
+
+  override def visitSingleDoubleQuotedStringLiteralValue(
+      ctx: SingleDoubleQuotedStringLiteralValueContext): Token = {
+    ctx.DOUBLEQUOTED_STRING().getSymbol
+  }
 
+  /**
+   * Visits an integerVal alternative and returns the INTEGER_VALUE token.
+   *
+   * @param ctx
+   *   The integerVal context to process.
+   * @return
+   *   The INTEGER_VALUE token, or null if context is null.
+   */
   override def visitIntegerVal(ctx: IntegerValContext): Token =
     Option(ctx).map(_.INTEGER_VALUE.getSymbol).orNull
 
-  override def visitStringLiteralInContext(ctx: StringLiteralInContextContext): Token = {
-    visit(ctx.stringLitWithoutMarker).asInstanceOf[Token]
+  /**
+   * Collects all string literal terminals from a stringLitWithoutMarker context. The grammar rule
+   * allows one or more consecutive string literals, which are collected in source order for
+   * coalescing.
+   *
+   * @param ctx
+   *   The stringLitWithoutMarker context to process.
+   * @return
+   *   A sequence of terminal nodes representing the string literals.
+   */
+  private def collectStringTerminals(
+      ctx: StringLitWithoutMarkerContext): Seq[org.antlr.v4.runtime.tree.TerminalNode] = {
+    // With the grammar change to singleStringLitWithoutMarker+, we visit each child context.
+    // Each singleStringLitWithoutMarker has labeled alternatives that we need to handle.
+    import scala.jdk.CollectionConverters._
+    ctx
+      .singleStringLitWithoutMarker()
+      .asScala
+      .map { child =>
+        // Visit the child to get its token (handled by visitSingleStringLiteralValue or
+        // visitSingleDoubleQuotedStringLiteralValue)
+        val token = visit(child).asInstanceOf[Token]
+        // Get the terminal node from the parse tree
+        child.getChild(0).asInstanceOf[org.antlr.v4.runtime.tree.TerminalNode]
+      }
+      .toSeq
+  }
+
+  /**
+   * Creates a CoalescedStringToken from multiple string literal tokens.
+   *
+   * This method concatenates the raw content of the tokens (with outer quotes removed but escape
+   * sequences preserved). The resulting token preserves R-string status and quote character type
+   * from the original tokens.
+   *
+   * @param tokens
+   *   A sequence of tokens to coalesce (must be non-empty).
+   * @return
+   *   A CoalescedStringToken representing the concatenated value.
+   */
+  private def createCoalescedStringToken(tokens: Seq[Token]): Token = {
+    val firstToken = tokens.head
+    val lastToken = tokens.last
+
+    // Check if any of the tokens are R-strings.
+    val hasRString = tokens.exists { token =>
+      val text = token.getText

Review Comment:
   This looks duplicated with L187-190 and L207-210 below; let's dedup into a helper? Bonus points for adding a bunch of unit tests specifically for the helper with many different input strings (AI can maybe help generate a bunch of test cases).



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala:
##########
@@ -4293,9 +4333,19 @@ class AstBuilder extends DataTypeAstBuilder
   override def visitExpressionPropertyList(
       ctx: ExpressionPropertyListContext): OptionList = {
     val options = ctx.expressionProperty.asScala.map { property =>
-      val key: String = visitPropertyKey(property.key)
-      val value: Expression = Option(property.value).map(expression).getOrElse {
-        operationNotAllowed(s"A value must be specified for the key: $key.", ctx)
+      val (key, value) = property match {
+        case p: ExpressionPropertyWithKeyAndEqualsContext =>
+          val k = visitPropertyKeyOrStringLit(p.key)
+          val v = Option(p.value).map(expression).getOrElse {
+            operationNotAllowed(s"A value must be specified for the key: $k.", 
ctx)
+          }
+          (k, v)
+        case p: ExpressionPropertyWithKeyNoEqualsContext =>
+          val k = visitPropertyKeyOrStringLitNoCoalesce(p.key)
+          val v = Option(p.value).map(expression).getOrElse {
+            operationNotAllowed(s"A value must be specified for the key: $k.", 
ctx)

Review Comment:
   this looks duplicated with the above, please dedup?
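
   One possible dedup, sketched under the assumptions that the key visitors return `String` and that `p.value` has the same context type in both alternatives (the helper name below is illustrative, not from the PR):
   ```
   // Hypothetical shared helper for the two property alternatives.
   def keyAndRequiredValue(key: String, valueCtx: ExpressionContext): (String, Expression) = {
     val value = Option(valueCtx).map(expression).getOrElse {
       operationNotAllowed(s"A value must be specified for the key: $key.", ctx)
     }
     (key, value)
   }

   val (key, value) = property match {
     case p: ExpressionPropertyWithKeyAndEqualsContext =>
       keyAndRequiredValue(visitPropertyKeyOrStringLit(p.key), p.value)
     case p: ExpressionPropertyWithKeyNoEqualsContext =>
       keyAndRequiredValue(visitPropertyKeyOrStringLitNoCoalesce(p.key), p.value)
   }
   ```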



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,739 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.test.SharedSparkSession
+
+/**
+ * Test suite for string literal coalescing across all SQL grammar contexts.
+ *
+ * String literal coalescing allows multiple consecutive string literals to be
+ * automatically concatenated: 'hello' 'world' becomes 'helloworld'.
+ *
+ * This feature works in all contexts where string literals are accepted,
+ * not just in expressions.
+ */
+class StringLiteralCoalescingSuite extends QueryTest with SharedSparkSession {
+
+  // ========================================================================
+  // Basic String Literal Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing in SELECT expressions") {
+    checkAnswer(
+      sql("SELECT 'hello' 'world'"),
+      Row("helloworld")
+    )
+
+    checkAnswer(
+      sql("SELECT 'one' 'two' 'three'"),
+      Row("onetwothree")
+    )
+  }
+
+  test("string coalescing with mixed quote styles") {
+    checkAnswer(
+      sql("SELECT 'hello' \"world\""),
+      Row("helloworld")
+    )
+
+    checkAnswer(
+      sql("SELECT \"hello\" 'world' \"!\""),
+      Row("helloworld!")
+    )
+  }
+
+  // ========================================================================
+  // DDL Context Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing in DDL LOCATION clause") {
+    withTempPath { dir =>
+      withTable("t") {
+        val path1 = dir.getAbsolutePath
+        val path2 = "/test_data"
+        sql(s"CREATE TABLE t (id INT) USING parquet LOCATION '$path1' 
'$path2'")
+
+        val location = sql("DESCRIBE EXTENDED t")
+          .filter("col_name = 'Location'")
+          .select("data_type")
+          .collect()
+          .head
+          .getString(0)
+
+        assert(location.contains(path1 + path2))
+      }
+    }
+  }
+
+  test("string coalescing in DDL COMMENT clause") {
+    withTable("t") {
+      sql("CREATE TABLE t (id INT) COMMENT 'This is ' 'a multi' 'part 
comment'")
+
+      val comment = sql("DESCRIBE EXTENDED t")
+        .filter("col_name = 'Comment'")
+        .select("data_type")
+        .collect()
+        .head
+        .getString(0)
+
+      assert(comment == "This is a multipart comment")
+    }
+  }
+
+  test("string coalescing in column COMMENT") {
+    withTable("t") {
+      sql("CREATE TABLE t (id INT COMMENT 'User ' 'ID' ' number')")
+
+      val comment = sql("DESCRIBE t")
+        .filter("col_name = 'id'")
+        .select("comment")
+        .collect()
+        .head
+        .getString(0)
+
+      assert(comment == "User ID number")
+    }
+  }
+
+  // ========================================================================
+  // LIKE and Pattern Matching Tests
+  // ========================================================================
+
+  test("string coalescing in LIKE pattern") {
+    checkAnswer(
+      sql("SELECT 'test_value' LIKE 'test' '_value'"),
+      Row(true)
+    )
+
+    checkAnswer(
+      sql("SELECT 'test_value' LIKE 'test' '_xyz'"),
+      Row(false)
+    )
+
+    checkAnswer(
+      sql("SELECT 'prefix_middle_suffix' LIKE 'prefix' '%' 'suffix'"),
+      Row(true)
+    )
+  }
+
+  test("string coalescing in ESCAPE clause") {
+    // Test escape character coalescing (though typically escape is single char)
+    checkAnswer(
+      sql("SELECT 'test%value' LIKE 'test' '\\%' 'value' ESCAPE '\\\\'"),
+      Row(true)
+    )
+  }
+
+  test("string coalescing in table options") {
+    withTable("t") {
+      withTempPath { dir =>
+        val path1 = s"${dir.getAbsolutePath}/part"
+        val path2 = "1"
+        // Test that LOCATION paths can be coalesced
+        sql(s"CREATE TABLE t (a STRING, b STRING) USING parquet LOCATION 
'$path1' '$path2'")
+        assert(spark.catalog.tableExists("t"))
+        val location = spark.sessionState.catalog.getTableMetadata(
+          spark.sessionState.sqlParser.parseTableIdentifier("t")).location
+        assert(location.toString.contains("part1"))
+      }
+    }
+  }
+
+  test("string coalescing in SHOW TABLES LIKE pattern") {
+    withTable("test_table_123", "test_table_456", "other_table") {
+      sql("CREATE TABLE test_table_123 (id INT)")
+      sql("CREATE TABLE test_table_456 (id INT)")
+      sql("CREATE TABLE other_table (id INT)")

Review Comment:
   do a Seq of the table names and .foreach on it?
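
   For example (sketch of the suggested change):
   ```
   Seq("test_table_123", "test_table_456", "other_table").foreach { name =>
     sql(s"CREATE TABLE $name (id INT)")
   }
   ```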



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,739 @@
+  test("string coalescing in SHOW TABLES LIKE pattern") {
+    withTable("test_table_123", "test_table_456", "other_table") {
+      sql("CREATE TABLE test_table_123 (id INT)")
+      sql("CREATE TABLE test_table_456 (id INT)")
+      sql("CREATE TABLE other_table (id INT)")
+
+      // The pattern is coalesced into 'test_table_*' (regex pattern where * matches any chars)
+      val result = sql("SHOW TABLES LIKE 'test' '_table_' '*'").collect()
+      assert(result.length == 2,

Review Comment:
   should we `checkAnswer` on it?
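
   A sketch of what that could look like, assuming the default namespace and that only the `tableName` column matters for this test:
   ```
   checkAnswer(
     sql("SHOW TABLES LIKE 'test' '_table_' '*'").select("tableName"),
     Seq(Row("test_table_123"), Row("test_table_456")))
   ```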



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,739 @@
+  test("string coalescing in LIKE pattern") {
+    checkAnswer(

Review Comment:
   There are many tests of this format, with
   ```
   test("category") {
     checkAnswer(
       sql("some query"),
       Row(true)
     )
   }
   ```
   
   It seems like we could simplify this a bunch by just putting a big `Seq` of strings, doing a `.foreach` on it, and calling `checkAnswer(sql(query), Row(true))` on each. We can still port the existing test case names as comments in the `Seq`.
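
   A sketch of that table-driven form, reusing queries quoted above (each is expected to return `true`):
   ```
   Seq(
     // string coalescing in LIKE pattern
     "SELECT 'test_value' LIKE 'test' '_value'",
     "SELECT 'prefix_middle_suffix' LIKE 'prefix' '%' 'suffix'",
     // string coalescing in ESCAPE clause
     "SELECT 'test%value' LIKE 'test' '\\%' 'value' ESCAPE '\\\\'"
   ).foreach { query =>
     checkAnswer(sql(query), Row(true))
   }
   ```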



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
