Repository: spark Updated Branches: refs/heads/master 4a9c9d8f9 -> 55f36641f
[SPARK-25093][SQL] Avoid recompiling regexp for comments multiple times ## What changes were proposed in this pull request? The PR moves the compilation of the regexp for code formatting outside the method which is called for each code block when splitting expressions, in order to avoid recompiling the regexp every time. Credit should be given to Izek Greenfield. ## How was this patch tested? existing UTs Closes #22135 from mgaido91/SPARK-25093. Authored-by: Marco Gaido <marcogaid...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55f36641 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55f36641 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55f36641 Branch: refs/heads/master Commit: 55f36641ff20114b892795f100da7efb79b0cc32 Parents: 4a9c9d8 Author: Marco Gaido <marcogaid...@gmail.com> Authored: Wed Aug 22 14:31:51 2018 +0800 Committer: Wenchen Fan <wenc...@databricks.com> Committed: Wed Aug 22 14:31:51 2018 +0800 ---------------------------------------------------------------------- .../scala/org/apache/spark/deploy/worker/Worker.scala | 4 ++-- core/src/main/scala/org/apache/spark/util/Utils.scala | 11 ++++++----- .../apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala | 6 +++--- .../spark/sql/catalyst/catalog/SessionCatalog.scala | 3 ++- .../sql/catalyst/expressions/codegen/CodeFormatter.scala | 10 +++++----- .../main/scala/org/apache/spark/sql/types/DataType.scala | 3 ++- .../org/apache/spark/streaming/dstream/DStream.scala | 10 +++++----- 7 files changed, 25 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index ee1ca0b..cbd812a 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -758,6 +758,7 @@ private[deploy] class Worker( private[deploy] object Worker extends Logging { val SYSTEM_NAME = "sparkWorker" val ENDPOINT_NAME = "Worker" + private val SSL_NODE_LOCAL_CONFIG_PATTERN = """\-Dspark\.ssl\.useNodeLocalConf\=(.+)""".r def main(argStrings: Array[String]) { Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler( @@ -803,9 +804,8 @@ private[deploy] object Worker extends Logging { } def isUseLocalNodeSSLConfig(cmd: Command): Boolean = { - val pattern = """\-Dspark\.ssl\.useNodeLocalConf\=(.+)""".r val result = cmd.javaOpts.collectFirst { - case pattern(_result) => _result.toBoolean + case SSL_NODE_LOCAL_CONFIG_PATTERN(_result) => _result.toBoolean } result.getOrElse(false) } http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/core/src/main/scala/org/apache/spark/util/Utils.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7ec707d..e6646bd 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1409,13 +1409,14 @@ private[spark] object Utils extends Logging { } } + // A regular expression to match classes of the internal Spark API's + // that we want to skip when finding the call site of a method. + private val SPARK_CORE_CLASS_REGEX = + """^org\.apache\.spark(\.api\.java)?(\.util)?(\.rdd)?(\.broadcast)?\.[A-Z]""".r + private val SPARK_SQL_CLASS_REGEX = """^org\.apache\.spark\.sql.*""".r + /** Default filtering function for finding call sites using `getCallSite`. */ private def sparkInternalExclusionFunction(className: String): Boolean = { - // A regular expression to match classes of the internal Spark API's - // that we want to skip when finding the call site of a method. - val SPARK_CORE_CLASS_REGEX = - """^org\.apache\.spark(\.api\.java)?(\.util)?(\.rdd)?(\.broadcast)?\.[A-Z]""".r - val SPARK_SQL_CLASS_REGEX = """^org\.apache\.spark\.sql.*""".r val SCALA_CORE_CLASS_PREFIX = "scala" val isSparkClass = SPARK_CORE_CLASS_REGEX.findFirstIn(className).isDefined || SPARK_SQL_CLASS_REGEX.findFirstIn(className).isDefined http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala index 80d03ab..48485e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala @@ -59,13 +59,13 @@ private[r] class AFTSurvivalRegressionWrapper private ( private[r] object AFTSurvivalRegressionWrapper extends MLReadable[AFTSurvivalRegressionWrapper] { + private val FORMULA_REGEXP = """Surv\(([^,]+), ([^,]+)\) ~ (.+)""".r + private def formulaRewrite(formula: String): (String, String) = { var rewritedFormula: String = null var censorCol: String = null - - val regex = """Surv\(([^,]+), ([^,]+)\) ~ (.+)""".r try { - val regex(label, censor, features) = formula + val FORMULA_REGEXP(label, censor, features) = formula // TODO: Support dot operator. if (features.contains(".")) { throw new UnsupportedOperationException( http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index ee3932c..afb0f00 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -101,6 +101,8 @@ class SessionCatalog( @GuardedBy("this") protected var currentDb: String = formatDatabaseName(DEFAULT_DATABASE) + private val validNameFormat = "([\\w_]+)".r + /** * Checks if the given name conforms the Hive standard ("[a-zA-Z_0-9]+"), * i.e. if this name only contains characters, numbers, and _. @@ -109,7 +111,6 @@ class SessionCatalog( * org.apache.hadoop.hive.metastore.MetaStoreUtils.validateName. */ private def validateName(name: String): Unit = { - val validNameFormat = "([\\w_]+)".r if (!validNameFormat.pattern.matcher(name).matches()) { throw new AnalysisException(s"`$name` is not a valid name for tables/databases. " + "Valid names only contain alphabet characters, numbers and _.") http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala index 7b398f4..ea1bb87 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala @@ -27,6 +27,10 @@ import java.util.regex.Matcher */ object CodeFormatter { val commentHolder = """\/\*(.+?)\*\/""".r + val commentRegexp = + ("""([ |\t]*?\/\*[\s|\S]*?\*\/[ |\t]*?)|""" + // strip /*comment*/ + """([ |\t]*?\/\/[\s\S]*?\n)""").r // strip //comment + val extraNewLinesRegexp = """\n\s*\n""".r // strip extra newlines def format(code: CodeAndComment, maxLines: Int = -1): String = { val formatter = new CodeFormatter @@ -91,11 +95,7 @@ object CodeFormatter { } def stripExtraNewLinesAndComments(input: String): String = { - val commentReg = - ("""([ |\t]*?\/\*[\s|\S]*?\*\/[ |\t]*?)|""" + // strip /*comment*/ - """([ |\t]*?\/\/[\s\S]*?\n)""").r // strip //comment - val codeWithoutComment = commentReg.replaceAllIn(input, "") - codeWithoutComment.replaceAll("""\n\s*\n""", "\n") // strip ExtraNewLines + extraNewLinesRegexp.replaceAllIn(commentRegexp.replaceAllIn(input, ""), "\n") } } http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index 50f2a9d..e53628d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -114,6 +114,8 @@ abstract class DataType extends AbstractDataType { @InterfaceStability.Stable object DataType { + private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r + def fromDDL(ddl: String): DataType = { try { CatalystSqlParser.parseDataType(ddl) @@ -132,7 +134,6 @@ object DataType { /** Given the string representation of a type, return its DataType */ private def nameToType(name: String): DataType = { - val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r name match { case "decimal" => DecimalType.USER_DEFAULT case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt) http://git-wip-us.apache.org/repos/asf/spark/blob/55f36641/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala ---------------------------------------------------------------------- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index e23edfa..4a4d2c5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -940,6 +940,11 @@ abstract class DStream[T: ClassTag] ( object DStream { + private val SPARK_CLASS_REGEX = """^org\.apache\.spark""".r + private val SPARK_STREAMING_TESTCLASS_REGEX = """^org\.apache\.spark\.streaming\.test""".r + private val SPARK_EXAMPLES_CLASS_REGEX = """^org\.apache\.spark\.examples""".r + private val SCALA_CLASS_REGEX = """^scala""".r + // `toPairDStreamFunctions` was in SparkContext before 1.3 and users had to // `import StreamingContext._` to enable it. Now we move it here to make the compiler find // it automatically. However, we still keep the old function in StreamingContext for backward @@ -953,11 +958,6 @@ object DStream { /** Get the creation site of a DStream from the stack trace of when the DStream is created. */ private[streaming] def getCreationSite(): CallSite = { - val SPARK_CLASS_REGEX = """^org\.apache\.spark""".r - val SPARK_STREAMING_TESTCLASS_REGEX = """^org\.apache\.spark\.streaming\.test""".r - val SPARK_EXAMPLES_CLASS_REGEX = """^org\.apache\.spark\.examples""".r - val SCALA_CLASS_REGEX = """^scala""".r - /** Filtering function that excludes non-user classes for a streaming application */ def streamingExclustionFunction(className: String): Boolean = { def doesMatch(r: Regex): Boolean = r.findFirstIn(className).isDefined --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org