OOZIE-2984 Parse spark-defaults.conf values with spaces without needing the quotes (andras.piros via gezapeti)
(cherry picked from commit 1ae68908b26e7a36089c47ca64c3c683dc2f6099) Project: http://git-wip-us.apache.org/repos/asf/oozie/repo Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/0f293eaa Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/0f293eaa Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/0f293eaa Branch: refs/heads/branch-4.3 Commit: 0f293eaa5d5efa9e317db8fc606c59b4ff6dd99f Parents: 636e434 Author: Gezapeti Cseh <[email protected]> Authored: Fri Jul 28 13:33:40 2017 +0200 Committer: satishsaley <[email protected]> Committed: Fri Dec 8 16:34:55 2017 -0800 ---------------------------------------------------------------------- .../site/twiki/DG_SparkActionExtension.twiki | 11 +- release-log.txt | 1 + .../action/hadoop/SparkOptionsSplitter.java | 130 +++++++++++++++---- .../action/hadoop/TestSparkArgsExtractor.java | 37 ++++++ .../action/hadoop/TestSparkOptionsSplitter.java | 36 ++++- 5 files changed, 184 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/docs/src/site/twiki/DG_SparkActionExtension.twiki ---------------------------------------------------------------------- diff --git a/docs/src/site/twiki/DG_SparkActionExtension.twiki b/docs/src/site/twiki/DG_SparkActionExtension.twiki index 94175cd..a815869 100644 --- a/docs/src/site/twiki/DG_SparkActionExtension.twiki +++ b/docs/src/site/twiki/DG_SparkActionExtension.twiki @@ -97,9 +97,16 @@ The =class= element if present, indicates the spark's application main class. The =jar= element indicates a comma separated list of jars or python files. -The =spark-opts= element if present, contains a list of spark options that can be passed to spark driver. Spark configuration +The =spark-opts= element, if present, contains a list of spark options that can be passed to spark driver. Spark configuration options can be passed by specifying '--conf key=value' here, or from =oozie.service.SparkConfigurationService.spark.configurations= -in oozie-site.xml. Values containing whitespaces should be enclosed by double quotes. The =spark-opts= configs have priority. +in oozie-site.xml. Values containing whitespaces can be enclosed by double quotes. The =spark-opts= configs have priority. + +Some examples of the =spark-opts= element: + * '--conf key=value' + * '--conf key1=value1 value2' + * '--conf key1="value1 value2"' + * '--conf key1=value1 key2="value2 value3"' + * '--conf key=value --verbose' The =arg= element if present, contains arguments that can be passed to spark application. http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/release-log.txt ---------------------------------------------------------------------- diff --git a/release-log.txt b/release-log.txt index 0995972..a0946fe 100644 --- a/release-log.txt +++ b/release-log.txt @@ -1,5 +1,6 @@ -- Oozie 4.3.1 release +OOZIE-2984 Parse spark-defaults.conf values with spaces without needing the quotes (andras.piros via gezapeti) OOZIE-2825 Custom Authentication doc page is not well formatted (Jan Hentschel via rkanter) OOZIE-2747 README.txt is out of date (Jan Hentschel via rkanter) OOZIE-2923 Improve Spark options parsing (andras.piros via gezapeti) http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java ---------------------------------------------------------------------- diff --git a/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java b/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java index 30def6f..b614322 100644 --- a/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java +++ b/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java @@ -18,47 +18,133 @@ package org.apache.oozie.action.hadoop; +import org.apache.commons.lang.StringUtils; + import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; class SparkOptionsSplitter { /** - * Converts the options to be Spark-compatible. + * Matches an option key. + * <p/> + * Some examples: + * <ul> + * <li>{@code key1}</li> + * <li>{@code key2=}</li> + * <li>{@code key3a-key3b}</li> + * <li>{@code key4a-KEY4b=}</li> + * </ul> + */ + private static final Pattern OPTION_KEY_PREFIX = Pattern.compile("\\s*--[a-zA-Z0-9.]+[\\-a-zA-Z0-9.]*[=]?"); + + /** + * Matches an option key / value pair, where the whole value part is quoted. + * <p/> + * Some examples: + * <ul> + * <li>{@code spark.executor.extraJavaOptions="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp"}</li> + * <li>{@code spark.executor.extraJavaOptions="-XX:HeapDumpPath=/tmp"}</li> + * </ul> + */ + private static final String VALUE_HAS_QUOTES_AT_ENDS_REGEX = "[a-zA-Z0-9.]+=\".+\""; + + /** + * Matches an option key / value pair, where the value part has quotes in between. + * <p/> + * Some examples: * <ul> - * <li>Parameters are separated by whitespace and can be groupped using double quotes</li> - * <li>Quotes should be removed</li> - * <li>Adjacent whitespace separators are treated as one</li> + * <li>{@code spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp + * -Dlog4j.configuration="/some path/spark-log4j.properties"}</li> + * <li>{@code spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError + * -Dlog4j.configuration="/some path/spark-log4j.properties" -XX:HeapDumpPath=/tmp}</li> + * <li>{@code spark.executor.extraJavaOptions=-Dlog4j.configuration="/some path/spark-log4j.properties" + * -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp}</li> * </ul> + */ + private static final String VALUE_HAS_QUOTES_IN_BETWEEN_REGEX = + "[a-zA-Z0-9.]+=.*(\\w\\s+\"\\w+[\\s+\\w]*\"|\"\\w+[\\s+\\w]*\"\\s+\\w)+.*"; + + /** + * Converts the options to be Spark-compatible. + * <p/> + * Following will be considered when converting a single option: + * <ul> + * <li>parameter keys should begin with {@link #OPTION_KEY_PREFIX}</li> + * <li>parameter values should be separated by whitespace(s) and can be grouped using double quotes</li> + * <li>double quotes should be removed when the entire value part of a parameter key / value pair is double quoted, + * and no partial value is quoted. {@see #VALUE_HAS_QUOTES_AT_ENDS_REGEX} + * and {@see #VALUE_HAS_QUOTES_IN_BETWEEN_REGEX}</li> + * <li>double quotes should be kept when only a value part of a parameter key / value pair is double quoted</li> + * <li>adjacent whitespace separators are treated as one</li> + * </ul> + * <p/> + * Please have a look at the scenarios within {@code TestSparkOptionsSplitter} for details. * * @param sparkOpts the options for Spark - * @return the options parsed into a list + * @return the options parsed into a {@link List} */ static List<String> splitSparkOpts(final String sparkOpts) { - final List<String> result = new ArrayList<String>(); - final StringBuilder currentWord = new StringBuilder(); - - boolean insideQuote = false; - for (int i = 0; i < sparkOpts.length(); i++) { - final char c = sparkOpts.charAt(i); - if (c == '"') { - insideQuote = !insideQuote; - } - else if (Character.isWhitespace(c) && !insideQuote) { - if (currentWord.length() > 0) { - result.add(currentWord.toString()); - currentWord.setLength(0); + final List<String> result = new ArrayList<>(); + final Matcher matcher = OPTION_KEY_PREFIX.matcher(sparkOpts); + + int start = 0, end; + while (matcher.find()) { + end = matcher.start(); + + if (start > 0) { + final String maybeQuotedValue = sparkOpts.substring(start, end).trim(); + if (StringUtils.isNotEmpty(maybeQuotedValue)) { + result.add(unquoteEntirelyQuotedValue(maybeQuotedValue)); } } - else { - currentWord.append(c); + + String sparkOpt = matchSparkOpt(sparkOpts, matcher); + if (sparkOpt.endsWith("=")) { + sparkOpt = sparkOpt.replaceAll("=", ""); } + result.add(sparkOpt); + + start = matcher.end(); } - if (currentWord.length() > 0) { - result.add(currentWord.toString()); + final String maybeEntirelyQuotedValue = sparkOpts.substring(start).trim(); + if (StringUtils.isNotEmpty(maybeEntirelyQuotedValue)) { + result.add(unquoteEntirelyQuotedValue(maybeEntirelyQuotedValue)); } return result; } + + private static String matchSparkOpt(final String sparkOpts, final Matcher matcher) { + return sparkOpts.substring(matcher.start(), matcher.end()).trim(); + } + + /** + * Unquote an option value if and only if it has quotes at both ends, and doesn't have any quotes in between. + * <p/> + * Some examples: + * <ul> + * <li>{@code key=value1 value2}: remains unquoted (isn't quoted at all)</li> + * <li>{@code key=value1 "value2"}: remains quoted (has quotes in between)</li> + * <li>{@code key="value1 value2" "value3 value4"}: remains quoted (has quotes both ends, but has also some in between)</li> + * <li>{@code key="value1 value2 value3 value4"}: gets unquoted (has quotes both ends, and no quotes in between)</li> + * </ul> + * @param maybeEntirelyQuotedValue a {@code String} that is a parameter value but not necessarily quoted + * @return an unquoted version of the input {@String}, when {@code maybeEntirelyQuotedValue} had quotes at both ends, and didn't + * have any quotes in between. Else {@code maybeEntirelyQuotedValue} + */ + private static String unquoteEntirelyQuotedValue(final String maybeEntirelyQuotedValue) { + final boolean hasQuotesAtEnds = maybeEntirelyQuotedValue.matches(VALUE_HAS_QUOTES_AT_ENDS_REGEX); + final boolean hasQuotesInBetween = maybeEntirelyQuotedValue.matches(VALUE_HAS_QUOTES_IN_BETWEEN_REGEX); + final boolean isEntirelyQuoted = hasQuotesAtEnds && !hasQuotesInBetween; + + if (isEntirelyQuoted) { + return maybeEntirelyQuotedValue.replaceAll("\"", ""); + } + + return maybeEntirelyQuotedValue; + } } http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java ---------------------------------------------------------------------- diff --git a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java index 7db26a6..d0541ca 100644 --- a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java +++ b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java @@ -194,4 +194,41 @@ public class TestSparkArgsExtractor { "arg1"), sparkArgs); } + + @Test + public void testQuotedDriverAndExecutorExtraJavaOptionsParsing() throws Exception { + final Configuration actionConf = new Configuration(); + actionConf.set(SparkActionExecutor.SPARK_MASTER, "yarn"); + actionConf.set(SparkActionExecutor.SPARK_MODE, "client"); + actionConf.set(SparkActionExecutor.SPARK_CLASS, "org.apache.oozie.example.SparkFileCopy"); + actionConf.set(SparkActionExecutor.SPARK_JOB_NAME, "Spark Copy File"); + actionConf.set(SparkActionExecutor.SPARK_OPTS, + "--conf spark.executor.extraJavaOptions=\"-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp\"" + + "--conf spark.driver.extraJavaOptions=\"-Xmn2703m -XX:SurvivorRatio=2 -XX:ParallelGCThreads=20\""); + actionConf.set(SparkActionExecutor.SPARK_JAR, "/lib/test.jar"); + + final String[] mainArgs = {"arg0", "arg1"}; + final List<String> sparkArgs = new SparkArgsExtractor(actionConf).extract(mainArgs); + + assertEquals("Spark args mismatch", + Lists.newArrayList("--master", "yarn", + "--deploy-mode", "client", + "--name", "Spark Copy File", + "--class", "org.apache.oozie.example.SparkFileCopy", + "--conf", "spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp " + + "-Dlog4j.configuration=spark-log4j.properties", + "--conf", "spark.driver.extraJavaOptions=-Xmn2703m -XX:SurvivorRatio=2 -XX:ParallelGCThreads=20 " + + "-Dlog4j.configuration=spark-log4j.properties", + "--conf", "spark.executor.extraClassPath=$PWD/*", + "--conf", "spark.driver.extraClassPath=$PWD/*", + "--conf", "spark.yarn.security.tokens.hive.enabled=false", + "--conf", "spark.yarn.security.tokens.hbase.enabled=false", + "--files", "spark-log4j.properties,hive-site.xml", + "--conf", "spark.yarn.jar=null", + "--verbose", + "/lib/test.jar", + "arg0", + "arg1"), + sparkArgs); + } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java ---------------------------------------------------------------------- diff --git a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java index 02786a2..4be9777 100644 --- a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java +++ b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java @@ -34,13 +34,35 @@ public class TestSparkOptionsSplitter { @Parameterized.Parameters public static List<Object[]> params() { return Arrays.asList(new Object[][]{ - {"--option1 value1", Arrays.asList("--option1", "value1")}, - {"--option1 value1", Arrays.asList("--option1", "value1")}, - {" --option1 value1 ", Arrays.asList("--option1", "value1")}, - {"--conf special=value1", Arrays.asList("--conf", "special=value1")}, - {"--conf special=\"value1\"", Arrays.asList("--conf", "special=value1")}, - {"--conf special=\"value1 value2\"", Arrays.asList("--conf", "special=value1 value2")}, - {" --conf special=\"value1 value2\" ", Arrays.asList("--conf", "special=value1 value2")}, + {"--option1 value1", + Arrays.asList("--option1", "value1")}, + {"--option1 value1", + Arrays.asList("--option1", "value1")}, + {" --option1 value1 ", + Arrays.asList("--option1", "value1")}, + {"--conf special=value1", + Arrays.asList("--conf", "special=value1")}, + {"--conf special=\"value1\"", + Arrays.asList("--conf", "special=value1")}, + {"--conf special=\"value1 value2\"", + Arrays.asList("--conf", "special=value1 value2")}, + {" --conf special=\"value1 value2\" ", + Arrays.asList("--conf", "special=value1 value2")}, + {"--conf special=value1 value2 --conf value3", + Arrays.asList("--conf", "special=value1 value2", "--conf", "value3")}, + {"--conf special1=value1 special2=\"value2 value3\" --conf value4", + Arrays.asList("--conf", "special1=value1 special2=\"value2 value3\"", "--conf", "value4")}, + {"--conf special1=value1 value2 special2=value3 value4", + Arrays.asList("--conf", "special1=value1 value2 special2=value3 value4")}, + {"--option1 value1 --option2=value2", + Arrays.asList("--option1", "value1", "--option2", "value2")}, + {"--conf spark.executor.extraJavaOptions=\"-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp\"", + Arrays.asList("--conf", + "spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp")}, + {"--option1 value1 --verbose", + Arrays.asList("--option1", "value1", "--verbose")}, + {"--verbose --option1 value1", + Arrays.asList("--verbose", "--option1", "value1")} }); }
