This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1c2d0620390e [SPARK-52990][CORE] Support `StringSubstitutor`
1c2d0620390e is described below

commit 1c2d0620390e97d13088503bc1e88d966a387d15
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Tue Jul 29 07:59:43 2025 -0700

    [SPARK-52990][CORE] Support `StringSubstitutor`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `StringSubstitutor` in `common/utils` module. In 
addition, new Scalastyle rule is added to ban 
`org.apache.commons.text.StringSubstitutor`.
    
    ### Why are the changes needed?
    
    - Improve String utility. New one is faster.
    
    ```scala
    scala> import scala.jdk.CollectionConverters._
    import scala.jdk.CollectionConverters._
    
    scala> val s = "a" * 500_000_000 + "${a}"
    
    scala> spark.time((new org.apache.commons.text.StringSubstitutor(Map("a" -> 
"b").asJava)).replace(s)).length
    Time taken: 706 ms
    val res0: Int = 500000001
    
    scala> spark.time((new org.apache.spark.StringSubstitutor(Map("a" -> 
"b"))).replace(s)).length
    Time taken: 312 ms
    val res1: Int = 500000001
    ```
    
    - Removes `commons-text` compile dependency from `common` modules.
      - After this PR, `common/unsafe` module has the only  `commons-text` test 
dependency under `common` directory.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Pass the existing CIs because this is a replacement.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #51697 from dongjoon-hyun/SPARK-52990.
    
    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 common/network-shuffle/pom.xml                     |   5 +
 common/utils/pom.xml                               |   4 -
 .../org/apache/spark/ErrorClassesJSONReader.scala  |   7 +-
 .../scala/org/apache/spark/StringSubstitutor.scala | 126 +++++++++++++++++++++
 scalastyle-config.xml                              |   5 +
 sql/api/pom.xml                                    |   4 +
 6 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index adfc55d28c35..60ad97157399 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -42,6 +42,11 @@
       <version>${project.version}</version>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+    </dependency>
+
     <dependency>
       <groupId>io.dropwizard.metrics</groupId>
       <artifactId>metrics-core</artifactId>
diff --git a/common/utils/pom.xml b/common/utils/pom.xml
index 44771938439a..249add11c9d8 100644
--- a/common/utils/pom.xml
+++ b/common/utils/pom.xml
@@ -51,10 +51,6 @@
       <groupId>com.fasterxml.jackson.module</groupId>
       <artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
     </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-text</artifactId>
-    </dependency>
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
diff --git 
a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala 
b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
index 85d460f618a7..2ff811298901 100644
--- a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
+++ b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
@@ -19,13 +19,10 @@ package org.apache.spark
 
 import java.net.URL
 
-import scala.jdk.CollectionConverters._
-
 import com.fasterxml.jackson.annotation.JsonIgnore
 import com.fasterxml.jackson.core.`type`.TypeReference
 import com.fasterxml.jackson.databind.json.JsonMapper
 import com.fasterxml.jackson.module.scala.DefaultScalaModule
-import org.apache.commons.text.StringSubstitutor
 
 import org.apache.spark.annotation.DeveloperApi
 
@@ -48,9 +45,7 @@ class ErrorClassesJsonReader(jsonFileURLs: Seq[URL]) {
       case (key, null) => key -> "null"
       case (key, value) => key -> value
     }
-    val sub = new StringSubstitutor(sanitizedParameters.asJava)
-    sub.setEnableUndefinedVariableException(true)
-    sub.setDisableSubstitutionInValues(true)
+    val sub = new StringSubstitutor(sanitizedParameters)
     val errorMessage = try {
       sub.replace(ErrorClassesJsonReader.TEMPLATE_REGEX.replaceAllIn(
         messageTemplate, "\\$\\{$1\\}"))
diff --git 
a/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala 
b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala
new file mode 100644
index 000000000000..6513c7dc7e8e
--- /dev/null
+++ b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark
+
+import scala.collection.mutable.{ListBuffer, StringBuilder}
+
+class StringSubstitutor(
+    resolver: Map[String, Any],
+    prefix: String = "${",
+    suffix: String = "}",
+    escape: Char = '$',
+    valueDelimiter: String = ":-",
+    preserveEscapes: Boolean = true,
+    enableSubstitutionInVariables: Boolean = false,
+    enableUndefinedVariableException: Boolean = true) {
+
+  private val prefixLen = prefix.length
+  private val suffixLen = suffix.length
+
+  def replace(source: String): String = {
+    if (source == null) return null
+    val buf = new StringBuilder(source)
+    if (!substitute(buf, 0, buf.length, ListBuffer.empty[String])) {
+      source
+    } else {
+      buf.toString
+    }
+  }
+
+  private def substitute(
+      buf: StringBuilder,
+      offset: Int,
+      length: Int,
+      priorVariables: ListBuffer[String]): Boolean = {
+    var altered = false
+    var currentPos = offset
+    val endPos = offset + length
+
+    while (currentPos < endPos) {
+      // Find prefix
+      val prefixPos = buf.indexOf(prefix, currentPos)
+      if (prefixPos < 0 || prefixPos >= endPos) {
+        return altered
+      }
+
+      if (prefixPos > 0 && buf.charAt(prefixPos - 1) == escape) {
+        if (preserveEscapes) {
+          currentPos = prefixPos + prefixLen
+        } else {
+          buf.deleteCharAt(prefixPos - 1)
+          altered = true
+          currentPos = prefixPos - 1 + prefixLen // adjust position
+        }
+      } else {
+        // Find suffix
+        val suffixPos = buf.indexOf(suffix, prefixPos + prefixLen)
+        if (suffixPos < 0) {
+          currentPos = prefixPos + prefixLen
+        } else {
+          // Extract variable name and default
+          val varFull = buf.substring(prefixPos + prefixLen, suffixPos)
+          val index = varFull.indexOf(valueDelimiter)
+          val (varName, defaultValue) = if (index >= 0) {
+            (varFull.substring(0, index), Some(varFull.substring(index + 
valueDelimiter.length)))
+          } else {
+            (varFull, None)
+          }
+
+          // Check for cycle
+          if (priorVariables.contains(varName)) {
+            throw new IllegalStateException(
+              s"Infinite loop in property interpolation of $varName")
+          }
+
+          resolver.get(varName) match {
+            case Some(value) =>
+              var replacement = value.toString
+              if (enableSubstitutionInVariables) {
+                val newPrior = priorVariables.clone() += varName
+                val tempBuf = new StringBuilder(replacement)
+                if (substitute(tempBuf, 0, tempBuf.length, newPrior)) {
+                  replacement = tempBuf.toString
+                }
+              }
+              buf.replace(prefixPos, suffixPos + suffixLen, replacement)
+              // Adjust positions and recurse on the changed part
+              val changeInLength = replacement.length - (suffixPos + suffixLen 
- prefixPos)
+              val newLength = length + changeInLength
+              substitute(buf, prefixPos + replacement.length, newLength, 
priorVariables)
+              return true // restart scan after change
+
+            case None =>
+              defaultValue match {
+                case Some(value) =>
+                  buf.replace(prefixPos, suffixPos + suffixLen, value)
+                  val changeInLength = value.length - (suffixPos + suffixLen - 
prefixPos)
+                  val newLength = length + changeInLength
+                  substitute(buf, prefixPos + value.length, length, 
priorVariables)
+                  return true
+                case None =>
+                  if (enableUndefinedVariableException) {
+                    throw new IllegalArgumentException(s"Undefined variable: 
$varName")
+                  }
+              }
+          }
+          currentPos = suffixPos + suffixLen
+        }
+      }
+    }
+    altered
+  }
+}
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 663ad4225581..3f33db9746f0 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -319,6 +319,11 @@ This file is divided into 3 sections:
     <customMessage>Use Java String methods instead</customMessage>
   </check>
 
+  <check customId="commonstextstringsubstitutor" level="error" 
class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter 
name="regex">org\.apache\.commons\.text\.StringSubstitutor</parameter></parameters>
+    <customMessage>Use org.apache.spark.StringSubstitutor 
instead</customMessage>
+  </check>
+
   <check customId="uribuilder" level="error" 
class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter 
name="regex">UriBuilder\.fromUri</parameter></parameters>
     <customMessage>Use Utils.getUriBuilder instead.</customMessage>
diff --git a/sql/api/pom.xml b/sql/api/pom.xml
index 86a8b0adaff9..184d39c4b8ea 100644
--- a/sql/api/pom.xml
+++ b/sql/api/pom.xml
@@ -64,6 +64,10 @@
             <version>${project.version}</version>
             <scope>compile</scope>
         </dependency>
+        <dependency>
+           <groupId>org.apache.commons</groupId>
+           <artifactId>commons-lang3</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.json4s</groupId>
             <artifactId>json4s-jackson_${scala.binary.version}</artifactId>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to