This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new d918477 [SPARK-31573][R] Apply fixed=TRUE as appropriate to regex
usage in R
d918477 is described below
commit d918477a9b96ac9fa8ca5116814d93737985e8c9
Author: Michael Chirico <[email protected]>
AuthorDate: Tue Apr 28 17:24:21 2020 +0900
[SPARK-31573][R] Apply fixed=TRUE as appropriate to regex usage in R
### What changes were proposed in this pull request?
For regex functions in base R (`gsub`, `grep`, `grepl`, `strsplit`,
`gregexpr`), supplying the `fixed=TRUE` option is more performant whenever the
pattern is a literal string rather than a regular expression.
### Why are the changes needed?
This is a minor performance improvement.
### Does this PR introduce any user-facing change?
No (although some internal code was treating literal strings as regular
expressions, which in some cases could technically have been over-broad and
matched unintended input)
### How was this patch tested?
Not tested.
Closes #28367 from MichaelChirico/r-regex-fixed.
Authored-by: Michael Chirico <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
(cherry picked from commit c011502ee3f60d2467e4c89d4e0863174592fff6)
Signed-off-by: HyukjinKwon <[email protected]>
---
R/pkg/R/DataFrame.R | 2 +-
R/pkg/R/SQLContext.R | 7 ++++---
R/pkg/R/client.R | 10 +++++-----
R/pkg/R/install.R | 4 ++--
R/pkg/R/schema.R | 2 +-
R/pkg/R/sparkR.R | 4 ++--
R/pkg/R/utils.R | 16 ++++++++++------
7 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a734804..f67f5fd 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2614,7 +2614,7 @@ setMethod("join",
"left", "leftouter", "left_outer",
"right", "rightouter", "right_outer",
"semi", "left_semi", "leftsemi", "anti", "left_anti",
"leftanti")) {
- joinType <- gsub("_", "", joinType)
+ joinType <- gsub("_", "", joinType, fixed = TRUE)
sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc,
joinType)
} else {
stop(paste("joinType must be one of the following types:",
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index c684291..1ef2641 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -110,10 +110,11 @@ sparkR.conf <- function(key, defaultValue) {
value <- if (missing(defaultValue)) {
tryCatch(callJMethod(conf, "get", key),
error = function(e) {
- if (any(grep("java.util.NoSuchElementException",
as.character(e)))) {
+ estr <- as.character(e)
+ if (any(grepl("java.util.NoSuchElementException", estr, fixed
= TRUE))) {
stop(paste0("Config '", key, "' is not set"))
} else {
- stop(paste0("Unknown error: ", as.character(e)))
+ stop(paste0("Unknown error: ", estr))
}
})
} else {
@@ -205,7 +206,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
# SPAKR-SQL does not support '.' in column name, so replace it with '_'
# TODO(davies): remove this once SPARK-2775 is fixed
names <- lapply(names, function(n) {
- nn <- gsub("[.]", "_", n)
+ nn <- gsub(".", "_", n, fixed = TRUE)
if (nn != n) {
warning(paste("Use", nn, "instead of", n, "as column name"))
}
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 2ff68ab..872b214 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -65,8 +65,8 @@ checkJavaVersion <- function() {
javaHome <- Sys.getenv("JAVA_HOME")
javaReqs <- utils::packageDescription(utils::packageName(), fields =
c("SystemRequirements"))
sparkJavaVersions <- strsplit(javaReqs, "[(,)]")[[1]]
- minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">=
")[[1]][[2]])
- maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "<
")[[1]][[2]])
+ minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ", fixed =
TRUE)[[1]][[2]])
+ maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ", fixed =
TRUE)[[1]][[2]])
if (javaHome != "") {
javaBin <- file.path(javaHome, "bin", javaBin)
}
@@ -89,13 +89,13 @@ checkJavaVersion <- function() {
})
javaVersionFilter <- Filter(
function(x) {
- grepl(" version", x)
+ grepl(" version", x, fixed = TRUE)
}, javaVersionOut)
- javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2]
+ javaVersionStr <- strsplit(javaVersionFilter[[1]], '"', fixed =
TRUE)[[1L]][2]
# javaVersionStr is of the form 1.8.0_92/9.0.x/11.0.x.
# We are using 8, 9, 10, 11 for sparkJavaVersion.
- versions <- strsplit(javaVersionStr, "[.]")[[1L]]
+ versions <- strsplit(javaVersionStr, ".", fixed = TRUE)[[1L]]
if ("1" == versions[1]) {
javaVersionNum <- as.integer(versions[2])
} else {
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index 6d1edf6..8c5355a 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -214,9 +214,9 @@ getPreferredMirror <- function(version, packageName) {
file.path("spark", version, packageName),
".tgz&as_json=1")
textLines <- readLines(jsonUrl, warn = FALSE)
- rowNum <- grep("\"preferred\"", textLines)
+ rowNum <- grep('"preferred"', textLines, fixed = TRUE)
linePreferred <- textLines[rowNum]
- matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred)
+ matchInfo <- regexpr('"[A-Za-z][A-Za-z0-9+-.]*://.+"', linePreferred)
if (matchInfo != -1) {
startPos <- matchInfo + 1
endPos <- matchInfo + attr(matchInfo, "match.length") - 2
diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
index 8d2d9a1..89d5c2c 100644
--- a/R/pkg/R/schema.R
+++ b/R/pkg/R/schema.R
@@ -182,7 +182,7 @@ checkType <- function(type) {
# strsplit does not return the final empty string, so check if
# the final char is ","
if (substr(fieldsString, nchar(fieldsString),
nchar(fieldsString)) != ",") {
- fields <- strsplit(fieldsString, ",")[[1]]
+ fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]]
for (field in fields) {
m <- regexec("^(.+):(.+)$", field)
matchedStrings <- regmatches(field, m)
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 2ece83a..9ba36ad 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -435,7 +435,7 @@ sparkR.session <- function(
# Check if version number of SparkSession matches version number of SparkR
package
jvmVersion <- callJMethod(sparkSession, "version")
# Remove -SNAPSHOT from jvm versions
- jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion)
+ jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE)
rPackageVersion <- paste0(packageVersion("SparkR"))
if (jvmVersionStrip != rPackageVersion) {
@@ -606,7 +606,7 @@ getClientModeSparkSubmitOpts <- function(submitOps,
sparkEnvirMap) {
# process only if --option is not already specified
if (!is.null(opsValue) &&
nchar(opsValue) > 1 &&
- !grepl(sparkConfToSubmitOps[[conf]], submitOps)) {
+ !grepl(sparkConfToSubmitOps[[conf]], submitOps, fixed = TRUE)) {
# put "" around value in case it has spaces
paste0(sparkConfToSubmitOps[[conf]], " \"", opsValue, "\" ")
} else {
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 9d7d1a4..c60e4db 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -818,7 +818,8 @@ captureJVMException <- function(e, method) {
}
# StreamingQueryException could wrap an IllegalArgumentException, so look
for that first
- if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ",
stacktrace))) {
+ if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ",
+ stacktrace, fixed = TRUE))) {
msg <- strsplit(stacktrace,
"org.apache.spark.sql.streaming.StreamingQueryException: ",
fixed = TRUE)[[1]]
# Extract "Error in ..." message.
@@ -826,14 +827,14 @@ captureJVMException <- function(e, method) {
# Extract the first message of JVM exception.
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE)
- } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace))) {
+ } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace,
fixed = TRUE))) {
msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed
= TRUE)[[1]]
# Extract "Error in ..." message.
rmsg <- msg[1]
# Extract the first message of JVM exception.
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE)
- } else if (any(grep("org.apache.spark.sql.AnalysisException: ",
stacktrace))) {
+ } else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace,
fixed = TRUE))) {
msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ",
fixed = TRUE)[[1]]
# Extract "Error in ..." message.
rmsg <- msg[1]
@@ -841,7 +842,8 @@ captureJVMException <- function(e, method) {
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
stop(paste0(rmsg, "analysis error - ", first), call. = FALSE)
} else
- if
(any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
stacktrace))) {
+ if
(any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
+ stacktrace, fixed = TRUE))) {
msg <- strsplit(stacktrace,
"org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
fixed = TRUE)[[1]]
# Extract "Error in ..." message.
@@ -850,7 +852,8 @@ captureJVMException <- function(e, method) {
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
stop(paste0(rmsg, "no such database - ", first), call. = FALSE)
} else
- if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException:
", stacktrace))) {
+ if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException:
",
+ stacktrace, fixed = TRUE))) {
msg <- strsplit(stacktrace,
"org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ",
fixed = TRUE)[[1]]
# Extract "Error in ..." message.
@@ -858,7 +861,8 @@ captureJVMException <- function(e, method) {
# Extract the first message of JVM exception.
first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
stop(paste0(rmsg, "no such table - ", first), call. = FALSE)
- } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ",
stacktrace))) {
+ } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ",
+ stacktrace, fixed = TRUE))) {
msg <- strsplit(stacktrace,
"org.apache.spark.sql.catalyst.parser.ParseException: ",
fixed = TRUE)[[1]]
# Extract "Error in ..." message.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]