This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 1c2d0620390e [SPARK-52990][CORE] Support `StringSubstitutor`
1c2d0620390e is described below
commit 1c2d0620390e97d13088503bc1e88d966a387d15
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Tue Jul 29 07:59:43 2025 -0700
[SPARK-52990][CORE] Support `StringSubstitutor`
### What changes were proposed in this pull request?
This PR aims to support `StringSubstitutor` in `common/utils` module. In
addition, new Scalastyle rule is added to ban
`org.apache.commons.text.StringSubstitutor`.
### Why are the changes needed?
- Improve String utility. New one is faster.
```scala
scala> import scala.jdk.CollectionConverters._
import scala.jdk.CollectionConverters._
scala> val s = "a" * 500_000_000 + "${a}"
scala> spark.time((new org.apache.commons.text.StringSubstitutor(Map("a" ->
"b").asJava)).replace(s)).length
Time taken: 706 ms
val res0: Int = 500000001
scala> spark.time((new org.apache.spark.StringSubstitutor(Map("a" ->
"b"))).replace(s)).length
Time taken: 312 ms
val res1: Int = 500000001
```
- Removes `commons-text` compile dependency from `common` modules.
- After this PR, `common/unsafe` module has the only `commons-text` test
dependency under `common` directory.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass the existing CIs because this is a replacement.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #51697 from dongjoon-hyun/SPARK-52990.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
common/network-shuffle/pom.xml | 5 +
common/utils/pom.xml | 4 -
.../org/apache/spark/ErrorClassesJSONReader.scala | 7 +-
.../scala/org/apache/spark/StringSubstitutor.scala | 126 +++++++++++++++++++++
scalastyle-config.xml | 5 +
sql/api/pom.xml | 4 +
6 files changed, 141 insertions(+), 10 deletions(-)
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index adfc55d28c35..60ad97157399 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -42,6 +42,11 @@
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ </dependency>
+
<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
diff --git a/common/utils/pom.xml b/common/utils/pom.xml
index 44771938439a..249add11c9d8 100644
--- a/common/utils/pom.xml
+++ b/common/utils/pom.xml
@@ -51,10 +51,6 @@
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
</dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-text</artifactId>
- </dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
diff --git
a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
index 85d460f618a7..2ff811298901 100644
--- a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
+++ b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala
@@ -19,13 +19,10 @@ package org.apache.spark
import java.net.URL
-import scala.jdk.CollectionConverters._
-
import com.fasterxml.jackson.annotation.JsonIgnore
import com.fasterxml.jackson.core.`type`.TypeReference
import com.fasterxml.jackson.databind.json.JsonMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
-import org.apache.commons.text.StringSubstitutor
import org.apache.spark.annotation.DeveloperApi
@@ -48,9 +45,7 @@ class ErrorClassesJsonReader(jsonFileURLs: Seq[URL]) {
case (key, null) => key -> "null"
case (key, value) => key -> value
}
- val sub = new StringSubstitutor(sanitizedParameters.asJava)
- sub.setEnableUndefinedVariableException(true)
- sub.setDisableSubstitutionInValues(true)
+ val sub = new StringSubstitutor(sanitizedParameters)
val errorMessage = try {
sub.replace(ErrorClassesJsonReader.TEMPLATE_REGEX.replaceAllIn(
messageTemplate, "\\$\\{$1\\}"))
diff --git
a/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala
b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala
new file mode 100644
index 000000000000..6513c7dc7e8e
--- /dev/null
+++ b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark
+
+import scala.collection.mutable.{ListBuffer, StringBuilder}
+
+class StringSubstitutor(
+ resolver: Map[String, Any],
+ prefix: String = "${",
+ suffix: String = "}",
+ escape: Char = '$',
+ valueDelimiter: String = ":-",
+ preserveEscapes: Boolean = true,
+ enableSubstitutionInVariables: Boolean = false,
+ enableUndefinedVariableException: Boolean = true) {
+
+ private val prefixLen = prefix.length
+ private val suffixLen = suffix.length
+
+ def replace(source: String): String = {
+ if (source == null) return null
+ val buf = new StringBuilder(source)
+ if (!substitute(buf, 0, buf.length, ListBuffer.empty[String])) {
+ source
+ } else {
+ buf.toString
+ }
+ }
+
+ private def substitute(
+ buf: StringBuilder,
+ offset: Int,
+ length: Int,
+ priorVariables: ListBuffer[String]): Boolean = {
+ var altered = false
+ var currentPos = offset
+ val endPos = offset + length
+
+ while (currentPos < endPos) {
+ // Find prefix
+ val prefixPos = buf.indexOf(prefix, currentPos)
+ if (prefixPos < 0 || prefixPos >= endPos) {
+ return altered
+ }
+
+ if (prefixPos > 0 && buf.charAt(prefixPos - 1) == escape) {
+ if (preserveEscapes) {
+ currentPos = prefixPos + prefixLen
+ } else {
+ buf.deleteCharAt(prefixPos - 1)
+ altered = true
+ currentPos = prefixPos - 1 + prefixLen // adjust position
+ }
+ } else {
+ // Find suffix
+ val suffixPos = buf.indexOf(suffix, prefixPos + prefixLen)
+ if (suffixPos < 0) {
+ currentPos = prefixPos + prefixLen
+ } else {
+ // Extract variable name and default
+ val varFull = buf.substring(prefixPos + prefixLen, suffixPos)
+ val index = varFull.indexOf(valueDelimiter)
+ val (varName, defaultValue) = if (index >= 0) {
+ (varFull.substring(0, index), Some(varFull.substring(index +
valueDelimiter.length)))
+ } else {
+ (varFull, None)
+ }
+
+ // Check for cycle
+ if (priorVariables.contains(varName)) {
+ throw new IllegalStateException(
+ s"Infinite loop in property interpolation of $varName")
+ }
+
+ resolver.get(varName) match {
+ case Some(value) =>
+ var replacement = value.toString
+ if (enableSubstitutionInVariables) {
+ val newPrior = priorVariables.clone() += varName
+ val tempBuf = new StringBuilder(replacement)
+ if (substitute(tempBuf, 0, tempBuf.length, newPrior)) {
+ replacement = tempBuf.toString
+ }
+ }
+ buf.replace(prefixPos, suffixPos + suffixLen, replacement)
+ // Adjust positions and recurse on the changed part
+ val changeInLength = replacement.length - (suffixPos + suffixLen
- prefixPos)
+ val newLength = length + changeInLength
+ substitute(buf, prefixPos + replacement.length, newLength,
priorVariables)
+ return true // restart scan after change
+
+ case None =>
+ defaultValue match {
+ case Some(value) =>
+ buf.replace(prefixPos, suffixPos + suffixLen, value)
+ val changeInLength = value.length - (suffixPos + suffixLen -
prefixPos)
+ val newLength = length + changeInLength
+ substitute(buf, prefixPos + value.length, length,
priorVariables)
+ return true
+ case None =>
+ if (enableUndefinedVariableException) {
+ throw new IllegalArgumentException(s"Undefined variable:
$varName")
+ }
+ }
+ }
+ currentPos = suffixPos + suffixLen
+ }
+ }
+ }
+ altered
+ }
+}
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 663ad4225581..3f33db9746f0 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -319,6 +319,11 @@ This file is divided into 3 sections:
<customMessage>Use Java String methods instead</customMessage>
</check>
+ <check customId="commonstextstringsubstitutor" level="error"
class="org.scalastyle.file.RegexChecker" enabled="true">
+ <parameters><parameter
name="regex">org\.apache\.commons\.text\.StringSubstitutor</parameter></parameters>
+ <customMessage>Use org.apache.spark.StringSubstitutor
instead</customMessage>
+ </check>
+
<check customId="uribuilder" level="error"
class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter
name="regex">UriBuilder\.fromUri</parameter></parameters>
<customMessage>Use Utils.getUriBuilder instead.</customMessage>
diff --git a/sql/api/pom.xml b/sql/api/pom.xml
index 86a8b0adaff9..184d39c4b8ea 100644
--- a/sql/api/pom.xml
+++ b/sql/api/pom.xml
@@ -64,6 +64,10 @@
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ </dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]