[spark] branch branch-3.0 updated: [SPARK-31573][R] Apply fixed=TRUE as appropriate to regex usage in R

gurwls223 Tue, 28 Apr 2020 01:26:54 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new d918477  [SPARK-31573][R] Apply fixed=TRUE as appropriate to regex 
usage in R
d918477 is described below

commit d918477a9b96ac9fa8ca5116814d93737985e8c9
Author: Michael Chirico <[email protected]>
AuthorDate: Tue Apr 28 17:24:21 2020 +0900

    [SPARK-31573][R] Apply fixed=TRUE as appropriate to regex usage in R
    
    ### What changes were proposed in this pull request?
    
    For regex functions in base R (`gsub`, `grep`, `grepl`, `strsplit`, 
`gregexpr`), supply the `fixed=TRUE` option wherever the pattern is a literal 
string rather than a regular expression; fixed matching is more performant.
    
    ### Why are the changes needed?
    
    This is a minor performance improvement.
    
    ### Does this PR introduce any user-facing change?
    
    No (although some internal code was treating literal strings as regular 
expressions, which in some cases could technically have been over-broad and 
matched unintended text).
    
    ### How was this patch tested?
    
    Not tested.
    
    Closes #28367 from MichaelChirico/r-regex-fixed.
    
    Authored-by: Michael Chirico <[email protected]>
    Signed-off-by: HyukjinKwon <[email protected]>
    (cherry picked from commit c011502ee3f60d2467e4c89d4e0863174592fff6)
    Signed-off-by: HyukjinKwon <[email protected]>
---
 R/pkg/R/DataFrame.R  |  2 +-
 R/pkg/R/SQLContext.R |  7 ++++---
 R/pkg/R/client.R     | 10 +++++-----
 R/pkg/R/install.R    |  4 ++--
 R/pkg/R/schema.R     |  2 +-
 R/pkg/R/sparkR.R     |  4 ++--
 R/pkg/R/utils.R      | 16 ++++++++++------
 7 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a734804..f67f5fd 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2614,7 +2614,7 @@ setMethod("join",
                     "left", "leftouter", "left_outer",
                     "right", "rightouter", "right_outer",
                     "semi", "left_semi", "leftsemi", "anti", "left_anti", 
"leftanti")) {
-                  joinType <- gsub("_", "", joinType)
+                  joinType <- gsub("_", "", joinType, fixed = TRUE)
                   sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, 
joinType)
                 } else {
                   stop(paste("joinType must be one of the following types:",
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index c684291..1ef2641 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -110,10 +110,11 @@ sparkR.conf <- function(key, defaultValue) {
     value <- if (missing(defaultValue)) {
       tryCatch(callJMethod(conf, "get", key),
               error = function(e) {
-                if (any(grep("java.util.NoSuchElementException", 
as.character(e)))) {
+                estr <- as.character(e)
+                if (any(grepl("java.util.NoSuchElementException", estr, fixed 
= TRUE))) {
                   stop(paste0("Config '", key, "' is not set"))
                 } else {
-                  stop(paste0("Unknown error: ", as.character(e)))
+                  stop(paste0("Unknown error: ", estr))
                 }
               })
     } else {
@@ -205,7 +206,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
     # SPAKR-SQL does not support '.' in column name, so replace it with '_'
     # TODO(davies): remove this once SPARK-2775 is fixed
     names <- lapply(names, function(n) {
-      nn <- gsub("[.]", "_", n)
+      nn <- gsub(".", "_", n, fixed = TRUE)
       if (nn != n) {
         warning(paste("Use", nn, "instead of", n, "as column name"))
       }
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 2ff68ab..872b214 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -65,8 +65,8 @@ checkJavaVersion <- function() {
   javaHome <- Sys.getenv("JAVA_HOME")
   javaReqs <- utils::packageDescription(utils::packageName(), fields = 
c("SystemRequirements"))
   sparkJavaVersions <- strsplit(javaReqs, "[(,)]")[[1]]
-  minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= 
")[[1]][[2]])
-  maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< 
")[[1]][[2]])
+  minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ", fixed = 
TRUE)[[1]][[2]])
+  maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ", fixed = 
TRUE)[[1]][[2]])
   if (javaHome != "") {
     javaBin <- file.path(javaHome, "bin", javaBin)
   }
@@ -89,13 +89,13 @@ checkJavaVersion <- function() {
     })
   javaVersionFilter <- Filter(
       function(x) {
-        grepl(" version", x)
+        grepl(" version", x, fixed = TRUE)
       }, javaVersionOut)
 
-  javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2]
+  javaVersionStr <- strsplit(javaVersionFilter[[1]], '"', fixed = 
TRUE)[[1L]][2]
   # javaVersionStr is of the form 1.8.0_92/9.0.x/11.0.x.
   # We are using 8, 9, 10, 11 for sparkJavaVersion.
-  versions <- strsplit(javaVersionStr, "[.]")[[1L]]
+  versions <- strsplit(javaVersionStr, ".", fixed = TRUE)[[1L]]
   if ("1" == versions[1]) {
     javaVersionNum <- as.integer(versions[2])
   } else {
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index 6d1edf6..8c5355a 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -214,9 +214,9 @@ getPreferredMirror <- function(version, packageName) {
                         file.path("spark", version, packageName),
                         ".tgz&as_json=1")
   textLines <- readLines(jsonUrl, warn = FALSE)
-  rowNum <- grep("\"preferred\"", textLines)
+  rowNum <- grep('"preferred"', textLines, fixed = TRUE)
   linePreferred <- textLines[rowNum]
-  matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred)
+  matchInfo <- regexpr('"[A-Za-z][A-Za-z0-9+-.]*://.+"', linePreferred)
   if (matchInfo != -1) {
     startPos <- matchInfo + 1
     endPos <- matchInfo + attr(matchInfo, "match.length") - 2
diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
index 8d2d9a1..89d5c2c 100644
--- a/R/pkg/R/schema.R
+++ b/R/pkg/R/schema.R
@@ -182,7 +182,7 @@ checkType <- function(type) {
                 # strsplit does not return the final empty string, so check if
                 # the final char is ","
                 if (substr(fieldsString, nchar(fieldsString), 
nchar(fieldsString)) != ",") {
-                  fields <- strsplit(fieldsString, ",")[[1]]
+                  fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]]
                   for (field in fields) {
                     m <- regexec("^(.+):(.+)$", field)
                     matchedStrings <- regmatches(field, m)
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 2ece83a..9ba36ad 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -435,7 +435,7 @@ sparkR.session <- function(
   # Check if version number of SparkSession matches version number of SparkR 
package
   jvmVersion <- callJMethod(sparkSession, "version")
   # Remove -SNAPSHOT from jvm versions
-  jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion)
+  jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE)
   rPackageVersion <- paste0(packageVersion("SparkR"))
 
   if (jvmVersionStrip != rPackageVersion) {
@@ -606,7 +606,7 @@ getClientModeSparkSubmitOpts <- function(submitOps, 
sparkEnvirMap) {
     # process only if --option is not already specified
     if (!is.null(opsValue) &&
         nchar(opsValue) > 1 &&
-        !grepl(sparkConfToSubmitOps[[conf]], submitOps)) {
+        !grepl(sparkConfToSubmitOps[[conf]], submitOps, fixed = TRUE)) {
       # put "" around value in case it has spaces
       paste0(sparkConfToSubmitOps[[conf]], " \"", opsValue, "\" ")
     } else {
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 9d7d1a4..c60e4db 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -818,7 +818,8 @@ captureJVMException <- function(e, method) {
   }
 
   # StreamingQueryException could wrap an IllegalArgumentException, so look 
for that first
-  if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ", 
stacktrace))) {
+  if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ",
+               stacktrace, fixed = TRUE))) {
     msg <- strsplit(stacktrace, 
"org.apache.spark.sql.streaming.StreamingQueryException: ",
                     fixed = TRUE)[[1]]
     # Extract "Error in ..." message.
@@ -826,14 +827,14 @@ captureJVMException <- function(e, method) {
     # Extract the first message of JVM exception.
     first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
     stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE)
-  } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace))) {
+  } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace, 
fixed = TRUE))) {
     msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed 
= TRUE)[[1]]
     # Extract "Error in ..." message.
     rmsg <- msg[1]
     # Extract the first message of JVM exception.
     first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
     stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE)
-  } else if (any(grep("org.apache.spark.sql.AnalysisException: ", 
stacktrace))) {
+  } else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace, 
fixed = TRUE))) {
     msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ", 
fixed = TRUE)[[1]]
     # Extract "Error in ..." message.
     rmsg <- msg[1]
@@ -841,7 +842,8 @@ captureJVMException <- function(e, method) {
     first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
     stop(paste0(rmsg, "analysis error - ", first), call. = FALSE)
   } else
-    if 
(any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", 
stacktrace))) {
+    if 
(any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
+                 stacktrace, fixed = TRUE))) {
     msg <- strsplit(stacktrace, 
"org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ",
                     fixed = TRUE)[[1]]
     # Extract "Error in ..." message.
@@ -850,7 +852,8 @@ captureJVMException <- function(e, method) {
     first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
     stop(paste0(rmsg, "no such database - ", first), call. = FALSE)
   } else
-    if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: 
", stacktrace))) {
+    if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: 
",
+                 stacktrace, fixed = TRUE))) {
     msg <- strsplit(stacktrace, 
"org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ",
                     fixed = TRUE)[[1]]
     # Extract "Error in ..." message.
@@ -858,7 +861,8 @@ captureJVMException <- function(e, method) {
     # Extract the first message of JVM exception.
     first <- strsplit(msg[2], "\r?\n\tat")[[1]][1]
     stop(paste0(rmsg, "no such table - ", first), call. = FALSE)
-  } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ", 
stacktrace))) {
+  } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ",
+                      stacktrace, fixed = TRUE))) {
     msg <- strsplit(stacktrace, 
"org.apache.spark.sql.catalyst.parser.ParseException: ",
                     fixed = TRUE)[[1]]
     # Extract "Error in ..." message.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch branch-3.0 updated: [SPARK-31573][R] Apply fixed=TRUE as appropriate to regex usage in R

Reply via email to