Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

via GitHub Sat, 25 Oct 2025 20:19:31 -0700


cloud-fan commented on code in PR #52638:
URL: https://github.com/apache/spark/pull/52638#discussion_r2463174814



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,786 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.test.SharedSparkSession
+
+/**
+ * Test suite for string literal coalescing across all SQL grammar contexts.
+ *
+ * String literal coalescing allows multiple consecutive string literals to be
+ * automatically concatenated: 'hello' 'world' becomes 'helloworld'.
+ *
+ * This feature works in all contexts where string literals are accepted,
+ * not just in expressions.
+ */
+class StringLiteralCoalescingSuite extends QueryTest with SharedSparkSession {
+
+  // ========================================================================
+  // Basic String Literal Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing - basic expressions") {
+    val testCases = Seq(
+      // Two literals
+      ("SELECT 'hello' 'world'", "helloworld"),
+      // Three literals
+      ("SELECT 'one' 'two' 'three'", "onetwothree"),
+      // Mixed quote styles: single and double
+      ("SELECT 'hello' \"world\"", "helloworld"),
+      // Mixed quote styles: multiple
+      ("SELECT \"hello\" 'world' \"!\"", "helloworld!"),
+      // Empty strings: middle empty
+      ("SELECT '' 'hello' ''", "hello"),
+      // Empty strings: start and end empty
+      ("SELECT 'start' '' 'end'", "startend")
+    )
+
+    testCases.foreach { case (query, expected) =>
+      checkAnswer(sql(query), Row(expected))
+    }
+  }
+
+  // ========================================================================
+  // DDL Context Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing in DDL LOCATION clause") {
+    withTempPath { dir =>
+      withTable("t") {
+        val path1 = dir.getAbsolutePath
+        val path2 = "/test_data"
+        sql(s"CREATE TABLE t (id INT) USING parquet LOCATION '$path1' 
'$path2'")
+
+        val location = sql("DESCRIBE EXTENDED t")
+          .filter("col_name = 'Location'")
+          .select("data_type")
+          .collect()
+          .head
+          .getString(0)
+
+        assert(location.contains(path1 + path2))
+      }
+    }
+  }
+
+  test("string coalescing in DDL COMMENT clause") {
+    withTable("t") {
+      sql("CREATE TABLE t (id INT) COMMENT 'This is ' 'a multi' 'part 
comment'")
+
+      val comment = sql("DESCRIBE EXTENDED t")
+        .filter("col_name = 'Comment'")
+        .select("data_type")
+        .collect()
+        .head
+        .getString(0)
+
+      assert(comment == "This is a multipart comment")
+    }
+  }
+
+  test("string coalescing in column COMMENT") {
+    withTable("t") {
+      sql("CREATE TABLE t (id INT COMMENT 'User ' 'ID' ' number')")
+
+      val comment = sql("DESCRIBE t")
+        .filter("col_name = 'id'")
+        .select("comment")
+        .collect()
+        .head
+        .getString(0)
+
+      assert(comment == "User ID number")
+    }
+  }
+
+  // ========================================================================
+  // LIKE and Pattern Matching Tests
+  // ========================================================================
+
+  test("string coalescing in LIKE patterns") {
+    val testCases = Seq(
+      // Coalescing with underscore wildcard - match
+      ("SELECT 'test_value' LIKE 'test' '_value'", true),
+      // Coalescing with underscore wildcard - no match
+      ("SELECT 'test_value' LIKE 'test' '_xyz'", false),
+      // Coalescing with percent wildcard
+      ("SELECT 'prefix_middle_suffix' LIKE 'prefix' '%' 'suffix'", true),
+      // ESCAPE clause with coalescing
+      ("SELECT 'test%value' LIKE 'test' '\\%' 'value' ESCAPE '\\\\'", true)
+    )
+
+    testCases.foreach { case (query, expected) =>
+      checkAnswer(sql(query), Row(expected))
+    }
+  }
+
+  test("string coalescing in table options") {
+    withTable("t") {
+      withTempPath { dir =>
+        val path1 = s"${dir.getAbsolutePath}/part"
+        val path2 = "1"
+        // Test that LOCATION paths can be coalesced
+        sql(s"CREATE TABLE t (a STRING, b STRING) USING parquet LOCATION 
'$path1' '$path2'")
+        assert(spark.catalog.tableExists("t"))
+        val location = spark.sessionState.catalog.getTableMetadata(
+          spark.sessionState.sqlParser.parseTableIdentifier("t")).location
+        assert(location.toString.contains("part1"))
+      }
+    }
+  }
+
+  test("string coalescing in SHOW TABLES LIKE pattern") {
+    val tableNames = Seq("test_table_123", "test_table_456", "other_table")
+    withTable(tableNames: _*) {
+      tableNames.foreach { tableName =>
+        sql(s"CREATE TABLE $tableName (id INT)")
+      }
+
+      // The pattern is coalesced into 'test_table_*' (regex pattern where * 
matches any chars)
+      // SHOW TABLES returns: namespace, tableName, isTemporary
+      checkAnswer(
+        sql("SHOW TABLES LIKE 'test' '_table_' '*'"),
+        Seq(
+          Row("default", "test_table_123", false),
+          Row("default", "test_table_456", false)
+        )
+      )
+    }
+  }
+
+  test("string coalescing across multiple lines") {
+    val result = sql("""
+      SELECT 'line'
+             'one'
+             'two'
+    """).collect().head.getString(0)
+    assert(result == "lineonetwo")
+  }
+
+  test("string coalescing in WHERE clause") {
+    withTable("t") {
+      sql("CREATE TABLE t (name STRING) USING parquet")
+      sql("INSERT INTO t VALUES ('helloworld'), ('hello'), ('world')")
+
+      checkAnswer(
+        sql("SELECT * FROM t WHERE name = 'hello' 'world'"),
+        Row("helloworld")
+      )
+    }
+  }
+
+  // ========================================================================
+  // R-String (Raw String) Coalescing Tests
+  // ========================================================================
+
+  test("R-string detection - basic cases and edge cases") {
+    val testCases = Seq(
+      // Basic cases: uppercase and lowercase R with single/double quotes
+      ("""SELECT R'\n'""", """\n"""),

Review Comment:
   The test is incorrect. `"""\n"""` in scala still does escape, what we really 
want is `raw"\n"`, as Scala has a similar raw string syntax as SQL.



##########
sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4:
##########
@@ -602,14 +602,25 @@ propertyList
     ;
 
 property
-    : key=propertyKey (EQ? value=propertyValue)?
+    : key=propertyKeyOrStringLit EQ value=propertyValue               
#propertyWithKeyAndEquals
+    | key=propertyKeyOrStringLitNoCoalesce value=propertyValue?       
#propertyWithKeyNoEquals
     ;
 
 propertyKey
     : errorCapturingIdentifier (DOT errorCapturingIdentifier)*
+    ;
+
+propertyKeyOrStringLit
+    : propertyKey
+    | stringLit

Review Comment:
   nit: duplicated `stringLit`.



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,786 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.test.SharedSparkSession
+
+/**
+ * Test suite for string literal coalescing across all SQL grammar contexts.
+ *
+ * String literal coalescing allows multiple consecutive string literals to be
+ * automatically concatenated: 'hello' 'world' becomes 'helloworld'.
+ *
+ * This feature works in all contexts where string literals are accepted,
+ * not just in expressions.
+ */
+class StringLiteralCoalescingSuite extends QueryTest with SharedSparkSession {
+
+  // ========================================================================
+  // Basic String Literal Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing - basic expressions") {
+    val testCases = Seq(
+      // Two literals
+      ("SELECT 'hello' 'world'", "helloworld"),
+      // Three literals
+      ("SELECT 'one' 'two' 'three'", "onetwothree"),
+      // Mixed quote styles: single and double
+      ("SELECT 'hello' \"world\"", "helloworld"),
+      // Mixed quote styles: multiple
+      ("SELECT \"hello\" 'world' \"!\"", "helloworld!"),
+      // Empty strings: middle empty
+      ("SELECT '' 'hello' ''", "hello"),
+      // Empty strings: start and end empty

Review Comment:
   ```suggestion
         // Empty strings: middle empty
   ```



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -399,3 +569,54 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
     (start.getOrElse(defaultStart), step.getOrElse(defaultStep))
   }
 }
+
+/**
+ * A synthetic token representing multiple coalesced string literals.
+ *
+ * When the parser encounters consecutive string literals (e.g., 'hello' 
'world'), they are
+ * automatically coalesced into a single logical string. This token class 
represents such
+ * coalesced strings while maintaining the Token interface.
+ *
+ * The coalescedValue contains the raw concatenated content from all the 
string literals (with
+ * outer quotes removed but escape sequences preserved). The getText() method 
wraps this in
+ * quotes, and when SparkParserUtils.string() is called, it will unescape the 
content based on the
+ * current SQL configuration (respecting ESCAPED_STRING_LITERALS).
+ *
+ * @param source
+ *   The token source and input stream
+ * @param tokenType
+ *   The ANTLR token type (typically STRING_LITERAL)
+ * @param channel
+ *   The token channel
+ * @param start
+ *   The start index of the first literal in the input stream
+ * @param stop
+ *   The stop index of the last literal in the input stream
+ * @param coalescedValue
+ *   The raw concatenated content (without outer quotes, escape sequences NOT 
processed)
+ */
+private[parser] class CoalescedStringToken(
+    source: Pair[TokenSource, CharStream],
+    tokenType: Int,
+    channel: Int,
+    start: Int,
+    stop: Int,
+    private val coalescedValue: String,
+    private val isRawString: Boolean = false,
+    private val quoteChar: Char = '\'')
+    extends CommonToken(source, tokenType, channel, start, stop) {
+
+  override def getText: String = {
+    if (isRawString) {

Review Comment:
   if any of the string lit is raw string, the final coalesced string is raw 
string? it doesn't seems to be reasonable...



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -45,18 +46,193 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
     withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
   }
 
-  override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
-    Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
+  /**
+   * Visits a stringLit context that may contain multiple singleStringLit 
children (which can be
+   * either singleStringLitWithoutMarker or parameterMarker). When multiple 
children are present,
+   * they are coalesced into a single token.
+   */
+  override def visitStringLit(ctx: StringLitContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    import scala.jdk.CollectionConverters._
+
+    // Collect tokens from all singleStringLit children.
+    // Each child is either a singleStringLitWithoutMarker or a 
parameterMarker.
+    val tokens = ctx
+      .singleStringLit()
+      .asScala
+      .map { child =>
+        visit(child).asInstanceOf[Token]
+      }
+      .toSeq
+
+    if (tokens.isEmpty) {
+      null
+    } else if (tokens.size == 1) {
+      // Fast path: single token, return unchanged
+      tokens.head
+    } else {
+      // Multiple tokens: create coalesced token
+      createCoalescedStringToken(tokens)
+    }
+  }
 
-  override def visitDoubleQuotedStringLiteralValue(
-      ctx: DoubleQuotedStringLiteralValueContext): Token =
-    Option(ctx).map(_.DOUBLEQUOTED_STRING.getSymbol).orNull
+  /**
+   * Visits a stringLitWithoutMarker context that contains one or more string 
literal terminals.
+   * Multiple literals are automatically coalesced into a single 
CoalescedStringToken.
+   */
+  override def visitStringLitWithoutMarker(ctx: 
StringLitWithoutMarkerContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with 
stringLitWithoutMarker+)
+    val allTerminals = collectStringTerminals(ctx)
 
+    if (allTerminals.isEmpty) {
+      null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Visits singleStringLitWithoutMarker alternatives and returns the token. 
Always returns
+   * exactly one token without coalescing.
+   */
+  override def visitSingleStringLiteralValue(ctx: 
SingleStringLiteralValueContext): Token = {

Review Comment:
   The parser rule is `: STRING_LITERAL        #singleStringLiteralValue`, so 
this override is kind of the default implementation already?



##########
sql/core/src/test/scala/org/apache/spark/sql/StringLiteralCoalescingSuite.scala:
##########
@@ -0,0 +1,786 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.test.SharedSparkSession
+
+/**
+ * Test suite for string literal coalescing across all SQL grammar contexts.
+ *
+ * String literal coalescing allows multiple consecutive string literals to be
+ * automatically concatenated: 'hello' 'world' becomes 'helloworld'.
+ *
+ * This feature works in all contexts where string literals are accepted,
+ * not just in expressions.
+ */
+class StringLiteralCoalescingSuite extends QueryTest with SharedSparkSession {
+
+  // ========================================================================
+  // Basic String Literal Coalescing Tests
+  // ========================================================================
+
+  test("string coalescing - basic expressions") {
+    val testCases = Seq(
+      // Two literals
+      ("SELECT 'hello' 'world'", "helloworld"),
+      // Three literals
+      ("SELECT 'one' 'two' 'three'", "onetwothree"),
+      // Mixed quote styles: single and double
+      ("SELECT 'hello' \"world\"", "helloworld"),
+      // Mixed quote styles: multiple
+      ("SELECT \"hello\" 'world' \"!\"", "helloworld!"),
+      // Empty strings: middle empty

Review Comment:
   ```suggestion
         // Empty strings: start and end empty
   ```



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -45,18 +46,193 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
     withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
   }
 
-  override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
-    Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
+  /**
+   * Visits a stringLit context that may contain multiple singleStringLit 
children (which can be
+   * either singleStringLitWithoutMarker or parameterMarker). When multiple 
children are present,
+   * they are coalesced into a single token.
+   */
+  override def visitStringLit(ctx: StringLitContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    import scala.jdk.CollectionConverters._
+
+    // Collect tokens from all singleStringLit children.
+    // Each child is either a singleStringLitWithoutMarker or a 
parameterMarker.
+    val tokens = ctx
+      .singleStringLit()
+      .asScala
+      .map { child =>
+        visit(child).asInstanceOf[Token]
+      }
+      .toSeq
+
+    if (tokens.isEmpty) {
+      null
+    } else if (tokens.size == 1) {
+      // Fast path: single token, return unchanged
+      tokens.head
+    } else {
+      // Multiple tokens: create coalesced token
+      createCoalescedStringToken(tokens)
+    }
+  }
 
-  override def visitDoubleQuotedStringLiteralValue(
-      ctx: DoubleQuotedStringLiteralValueContext): Token =
-    Option(ctx).map(_.DOUBLEQUOTED_STRING.getSymbol).orNull
+  /**
+   * Visits a stringLitWithoutMarker context that contains one or more string 
literal terminals.
+   * Multiple literals are automatically coalesced into a single 
CoalescedStringToken.
+   */
+  override def visitStringLitWithoutMarker(ctx: 
StringLitWithoutMarkerContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with 
stringLitWithoutMarker+)
+    val allTerminals = collectStringTerminals(ctx)
 
+    if (allTerminals.isEmpty) {
+      null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Visits singleStringLitWithoutMarker alternatives and returns the token. 
Always returns
+   * exactly one token without coalescing.
+   */
+  override def visitSingleStringLiteralValue(ctx: 
SingleStringLiteralValueContext): Token = {
+    ctx.STRING_LITERAL().getSymbol
+  }
+
+  override def visitSingleDoubleQuotedStringLiteralValue(

Review Comment:
   I think what we should override is `visitSingleStringLit` to ignore the 
parameter marker.



##########
sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala:
##########
@@ -45,18 +46,193 @@ class DataTypeAstBuilder extends 
SqlBaseParserBaseVisitor[AnyRef] {
     withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList)))
   }
 
-  override def visitStringLiteralValue(ctx: StringLiteralValueContext): Token =
-    Option(ctx).map(_.STRING_LITERAL.getSymbol).orNull
+  /**
+   * Visits a stringLit context that may contain multiple singleStringLit 
children (which can be
+   * either singleStringLitWithoutMarker or parameterMarker). When multiple 
children are present,
+   * they are coalesced into a single token.
+   */
+  override def visitStringLit(ctx: StringLitContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    import scala.jdk.CollectionConverters._
+
+    // Collect tokens from all singleStringLit children.
+    // Each child is either a singleStringLitWithoutMarker or a 
parameterMarker.
+    val tokens = ctx
+      .singleStringLit()
+      .asScala
+      .map { child =>
+        visit(child).asInstanceOf[Token]
+      }
+      .toSeq
+
+    if (tokens.isEmpty) {
+      null
+    } else if (tokens.size == 1) {
+      // Fast path: single token, return unchanged
+      tokens.head
+    } else {
+      // Multiple tokens: create coalesced token
+      createCoalescedStringToken(tokens)
+    }
+  }
 
-  override def visitDoubleQuotedStringLiteralValue(
-      ctx: DoubleQuotedStringLiteralValueContext): Token =
-    Option(ctx).map(_.DOUBLEQUOTED_STRING.getSymbol).orNull
+  /**
+   * Visits a stringLitWithoutMarker context that contains one or more string 
literal terminals.
+   * Multiple literals are automatically coalesced into a single 
CoalescedStringToken.
+   */
+  override def visitStringLitWithoutMarker(ctx: 
StringLitWithoutMarkerContext): Token = {
+    if (ctx == null) {
+      return null
+    }
+
+    // Collect all string literal terminals (could be multiple with 
stringLitWithoutMarker+)
+    val allTerminals = collectStringTerminals(ctx)
 
+    if (allTerminals.isEmpty) {
+      null
+    } else if (allTerminals.size == 1) {
+      // Fast path: single literal, return original token unchanged
+      allTerminals.head.getSymbol
+    } else {
+      // Multiple literals: create coalesced token
+      createCoalescedStringToken(allTerminals.map(_.getSymbol).toSeq)
+    }
+  }
+
+  /**
+   * Visits singleStringLitWithoutMarker alternatives and returns the token. 
Always returns
+   * exactly one token without coalescing.
+   */
+  override def visitSingleStringLiteralValue(ctx: 
SingleStringLiteralValueContext): Token = {
+    ctx.STRING_LITERAL().getSymbol
+  }
+
+  override def visitSingleDoubleQuotedStringLiteralValue(

Review Comment:
   ditto



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-53573][SQL] Allow coalescing string literals everywhere [spark]

Reply via email to