This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1c2d0620390e [SPARK-52990][CORE] Support `StringSubstitutor` 1c2d0620390e is described below commit 1c2d0620390e97d13088503bc1e88d966a387d15 Author: Dongjoon Hyun <dongj...@apache.org> AuthorDate: Tue Jul 29 07:59:43 2025 -0700 [SPARK-52990][CORE] Support `StringSubstitutor` ### What changes were proposed in this pull request? This PR aims to support `StringSubstitutor` in `common/utils` module. In addition, new Scalastyle rule is added to ban `org.apache.commons.text.StringSubstitutor`. ### Why are the changes needed? - Improve String utility. New one is faster. ```scala scala> import scala.jdk.CollectionConverters._ import scala.jdk.CollectionConverters._ scala> val s = "a" * 500_000_000 + "${a}" scala> spark.time((new org.apache.commons.text.StringSubstitutor(Map("a" -> "b").asJava)).replace(s)).length Time taken: 706 ms val res0: Int = 500000001 scala> spark.time((new org.apache.spark.StringSubstitutor(Map("a" -> "b"))).replace(s)).length Time taken: 312 ms val res1: Int = 500000001 ``` - Removes `commons-text` compile dependency from `common` modules. - After this PR, `common/unsafe` module has the only `commons-text` test dependency under `common` directory. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the existing CIs because this is a replacement. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51697 from dongjoon-hyun/SPARK-52990. Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- common/network-shuffle/pom.xml | 5 + common/utils/pom.xml | 4 - .../org/apache/spark/ErrorClassesJSONReader.scala | 7 +- .../scala/org/apache/spark/StringSubstitutor.scala | 126 +++++++++++++++++++++ scalastyle-config.xml | 5 + sql/api/pom.xml | 4 + 6 files changed, 141 insertions(+), 10 deletions(-) diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index adfc55d28c35..60ad97157399 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -42,6 +42,11 @@ <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> + <dependency> <groupId>io.dropwizard.metrics</groupId> <artifactId>metrics-core</artifactId> diff --git a/common/utils/pom.xml b/common/utils/pom.xml index 44771938439a..249add11c9d8 100644 --- a/common/utils/pom.xml +++ b/common/utils/pom.xml @@ -51,10 +51,6 @@ <groupId>com.fasterxml.jackson.module</groupId> <artifactId>jackson-module-scala_${scala.binary.version}</artifactId> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-text</artifactId> - </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> diff --git a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala index 85d460f618a7..2ff811298901 100644 --- a/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala +++ b/common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala @@ -19,13 +19,10 @@ package org.apache.spark import java.net.URL -import scala.jdk.CollectionConverters._ - import com.fasterxml.jackson.annotation.JsonIgnore import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.databind.json.JsonMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule -import org.apache.commons.text.StringSubstitutor import org.apache.spark.annotation.DeveloperApi @@ -48,9 +45,7 @@ class ErrorClassesJsonReader(jsonFileURLs: Seq[URL]) { case (key, null) => key -> "null" case (key, value) => key -> value } - val sub = new StringSubstitutor(sanitizedParameters.asJava) - sub.setEnableUndefinedVariableException(true) - sub.setDisableSubstitutionInValues(true) + val sub = new StringSubstitutor(sanitizedParameters) val errorMessage = try { sub.replace(ErrorClassesJsonReader.TEMPLATE_REGEX.replaceAllIn( messageTemplate, "\\$\\{$1\\}")) diff --git a/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala new file mode 100644 index 000000000000..6513c7dc7e8e --- /dev/null +++ b/common/utils/src/main/scala/org/apache/spark/StringSubstitutor.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark + +import scala.collection.mutable.{ListBuffer, StringBuilder} + +class StringSubstitutor( + resolver: Map[String, Any], + prefix: String = "${", + suffix: String = "}", + escape: Char = '$', + valueDelimiter: String = ":-", + preserveEscapes: Boolean = true, + enableSubstitutionInVariables: Boolean = false, + enableUndefinedVariableException: Boolean = true) { + + private val prefixLen = prefix.length + private val suffixLen = suffix.length + + def replace(source: String): String = { + if (source == null) return null + val buf = new StringBuilder(source) + if (!substitute(buf, 0, buf.length, ListBuffer.empty[String])) { + source + } else { + buf.toString + } + } + + private def substitute( + buf: StringBuilder, + offset: Int, + length: Int, + priorVariables: ListBuffer[String]): Boolean = { + var altered = false + var currentPos = offset + val endPos = offset + length + + while (currentPos < endPos) { + // Find prefix + val prefixPos = buf.indexOf(prefix, currentPos) + if (prefixPos < 0 || prefixPos >= endPos) { + return altered + } + + if (prefixPos > 0 && buf.charAt(prefixPos - 1) == escape) { + if (preserveEscapes) { + currentPos = prefixPos + prefixLen + } else { + buf.deleteCharAt(prefixPos - 1) + altered = true + currentPos = prefixPos - 1 + prefixLen // adjust position + } + } else { + // Find suffix + val suffixPos = buf.indexOf(suffix, prefixPos + prefixLen) + if (suffixPos < 0) { + currentPos = prefixPos + prefixLen + } else { + // Extract variable name and default + val varFull = buf.substring(prefixPos + prefixLen, suffixPos) + val index = varFull.indexOf(valueDelimiter) + val (varName, defaultValue) = if (index >= 0) { + (varFull.substring(0, index), Some(varFull.substring(index + valueDelimiter.length))) + } else { + (varFull, None) + } + + // Check for cycle + if (priorVariables.contains(varName)) { + throw new IllegalStateException( + s"Infinite loop in property interpolation of $varName") + } + + resolver.get(varName) match { + case Some(value) => + var replacement = value.toString + if (enableSubstitutionInVariables) { + val newPrior = priorVariables.clone() += varName + val tempBuf = new StringBuilder(replacement) + if (substitute(tempBuf, 0, tempBuf.length, newPrior)) { + replacement = tempBuf.toString + } + } + buf.replace(prefixPos, suffixPos + suffixLen, replacement) + // Adjust positions and recurse on the changed part + val changeInLength = replacement.length - (suffixPos + suffixLen - prefixPos) + val newLength = length + changeInLength + substitute(buf, prefixPos + replacement.length, newLength, priorVariables) + return true // restart scan after change + + case None => + defaultValue match { + case Some(value) => + buf.replace(prefixPos, suffixPos + suffixLen, value) + val changeInLength = value.length - (suffixPos + suffixLen - prefixPos) + val newLength = length + changeInLength + substitute(buf, prefixPos + value.length, length, priorVariables) + return true + case None => + if (enableUndefinedVariableException) { + throw new IllegalArgumentException(s"Undefined variable: $varName") + } + } + } + currentPos = suffixPos + suffixLen + } + } + } + altered + } +} diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 663ad4225581..3f33db9746f0 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -319,6 +319,11 @@ This file is divided into 3 sections: <customMessage>Use Java String methods instead</customMessage> </check> + <check customId="commonstextstringsubstitutor" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> + <parameters><parameter name="regex">org\.apache\.commons\.text\.StringSubstitutor</parameter></parameters> + <customMessage>Use org.apache.spark.StringSubstitutor instead</customMessage> + </check> + <check customId="uribuilder" level="error" class="org.scalastyle.file.RegexChecker" enabled="true"> <parameters><parameter name="regex">UriBuilder\.fromUri</parameter></parameters> <customMessage>Use Utils.getUriBuilder instead.</customMessage> diff --git a/sql/api/pom.xml b/sql/api/pom.xml index 86a8b0adaff9..184d39c4b8ea 100644 --- a/sql/api/pom.xml +++ b/sql/api/pom.xml @@ -64,6 +64,10 @@ <version>${project.version}</version> <scope>compile</scope> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + </dependency> <dependency> <groupId>org.json4s</groupId> <artifactId>json4s-jackson_${scala.binary.version}</artifactId> --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org