OOZIE-2984 Parse spark-defaults.conf values with spaces without needing the 
quotes (andras.piros via gezapeti)

(cherry picked from commit 1ae68908b26e7a36089c47ca64c3c683dc2f6099)


Project: http://git-wip-us.apache.org/repos/asf/oozie/repo
Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/0f293eaa
Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/0f293eaa
Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/0f293eaa

Branch: refs/heads/branch-4.3
Commit: 0f293eaa5d5efa9e317db8fc606c59b4ff6dd99f
Parents: 636e434
Author: Gezapeti Cseh <[email protected]>
Authored: Fri Jul 28 13:33:40 2017 +0200
Committer: satishsaley <[email protected]>
Committed: Fri Dec 8 16:34:55 2017 -0800

----------------------------------------------------------------------
 .../site/twiki/DG_SparkActionExtension.twiki    |  11 +-
 release-log.txt                                 |   1 +
 .../action/hadoop/SparkOptionsSplitter.java     | 130 +++++++++++++++----
 .../action/hadoop/TestSparkArgsExtractor.java   |  37 ++++++
 .../action/hadoop/TestSparkOptionsSplitter.java |  36 ++++-
 5 files changed, 184 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/docs/src/site/twiki/DG_SparkActionExtension.twiki
----------------------------------------------------------------------
diff --git a/docs/src/site/twiki/DG_SparkActionExtension.twiki 
b/docs/src/site/twiki/DG_SparkActionExtension.twiki
index 94175cd..a815869 100644
--- a/docs/src/site/twiki/DG_SparkActionExtension.twiki
+++ b/docs/src/site/twiki/DG_SparkActionExtension.twiki
@@ -97,9 +97,16 @@ The =class= element if present, indicates the spark's 
application main class.
 
 The =jar= element indicates a comma separated list of jars or python files.
 
-The =spark-opts= element if present, contains a list of spark options that can 
be passed to spark driver. Spark configuration
+The =spark-opts= element, if present, contains a list of spark options that 
can be passed to spark driver. Spark configuration
 options can be passed by specifying '--conf key=value' here, or from 
=oozie.service.SparkConfigurationService.spark.configurations=
-in oozie-site.xml. Values containing whitespaces should be enclosed by double 
quotes. The =spark-opts= configs have priority.
+in oozie-site.xml. Values containing whitespaces can be enclosed by double 
quotes. The =spark-opts= configs have priority.
+
+Some examples of the =spark-opts= element:
+   * '--conf key=value'
+   * '--conf key1=value1 value2'
+   * '--conf key1="value1 value2"'
+   * '--conf key1=value1 key2="value2 value3"'
+   * '--conf key=value --verbose'
 
 The =arg= element if present, contains arguments that can be passed to spark 
application.
 

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/release-log.txt
----------------------------------------------------------------------
diff --git a/release-log.txt b/release-log.txt
index 0995972..a0946fe 100644
--- a/release-log.txt
+++ b/release-log.txt
@@ -1,5 +1,6 @@
 -- Oozie 4.3.1 release
 
+OOZIE-2984 Parse spark-defaults.conf values with spaces without needing the 
quotes (andras.piros via gezapeti)
 OOZIE-2825 Custom Authentication doc page is not well formatted (Jan Hentschel 
via rkanter)
 OOZIE-2747 README.txt is out of date (Jan Hentschel via rkanter)
 OOZIE-2923 Improve Spark options parsing (andras.piros via gezapeti)

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java
----------------------------------------------------------------------
diff --git 
a/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java
 
b/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java
index 30def6f..b614322 100644
--- 
a/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java
+++ 
b/sharelib/spark/src/main/java/org/apache/oozie/action/hadoop/SparkOptionsSplitter.java
@@ -18,47 +18,133 @@
 
 package org.apache.oozie.action.hadoop;
 
+import org.apache.commons.lang.StringUtils;
+
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 class SparkOptionsSplitter {
 
     /**
-     * Converts the options to be Spark-compatible.
+     * Matches an option key.
+     * <p/>
+     * Some examples:
+     * <ul>
+     *     <li>{@code key1}</li>
+     *     <li>{@code key2=}</li>
+     *     <li>{@code key3a-key3b}</li>
+     *     <li>{@code key4a-KEY4b=}</li>
+     * </ul>
+     */
+    private static final Pattern OPTION_KEY_PREFIX = 
Pattern.compile("\\s*--[a-zA-Z0-9.]+[\\-a-zA-Z0-9.]*[=]?");
+
+    /**
+     * Matches an option key / value pair, where the whole value part is 
quoted.
+     * <p/>
+     * Some examples:
+     * <ul>
+     *     <li>{@code 
spark.executor.extraJavaOptions="-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp"}</li>
+     *     <li>{@code 
spark.executor.extraJavaOptions="-XX:HeapDumpPath=/tmp"}</li>
+     * </ul>
+     */
+    private static final String VALUE_HAS_QUOTES_AT_ENDS_REGEX = 
"[a-zA-Z0-9.]+=\".+\"";
+
+    /**
+     * Matches an option key / value pair, where the value part has quotes in 
between.
+     * <p/>
+     * Some examples:
      * <ul>
-     * <li>Parameters are separated by whitespace and can be groupped using 
double quotes</li>
-     * <li>Quotes should be removed</li>
-     * <li>Adjacent whitespace separators are treated as one</li>
+     *     <li>{@code 
spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp
+     *     -Dlog4j.configuration="/some path/spark-log4j.properties"}</li>
+     *     <li>{@code 
spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError
+     *     -Dlog4j.configuration="/some path/spark-log4j.properties" 
-XX:HeapDumpPath=/tmp}</li>
+     *     <li>{@code 
spark.executor.extraJavaOptions=-Dlog4j.configuration="/some 
path/spark-log4j.properties"
+     *     -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp}</li>
      * </ul>
+     */
+    private static final String VALUE_HAS_QUOTES_IN_BETWEEN_REGEX =
+            
"[a-zA-Z0-9.]+=.*(\\w\\s+\"\\w+[\\s+\\w]*\"|\"\\w+[\\s+\\w]*\"\\s+\\w)+.*";
+
+    /**
+     * Converts the options to be Spark-compatible.
+     * <p/>
+     * Following will be considered when converting a single option:
+     *  <ul>
+     *      <li>parameter keys should begin with {@link 
#OPTION_KEY_PREFIX}</li>
+     *      <li>parameter values should be separated by whitespace(s) and can 
be grouped using double quotes</li>
+     *      <li>double quotes should be removed when the entire value part of 
a parameter key / value pair is double quoted,
+     *      and no partial value is quoted. {@link 
#VALUE_HAS_QUOTES_AT_ENDS_REGEX}
+     *      and {@link #VALUE_HAS_QUOTES_IN_BETWEEN_REGEX}</li>
+     *      <li>double quotes should be kept when only a value part of a 
parameter key / value pair is double quoted</li>
+     *      <li>adjacent whitespace separators are treated as one</li>
+     *  </ul>
+     * <p/>
+     * Please have a look at the scenarios within {@code 
TestSparkOptionsSplitter} for details.
      *
      * @param sparkOpts the options for Spark
-     * @return the options parsed into a list
+     * @return the options parsed into a {@link List}
      */
     static List<String> splitSparkOpts(final String sparkOpts) {
-        final List<String> result = new ArrayList<String>();
-        final StringBuilder currentWord = new StringBuilder();
-
-        boolean insideQuote = false;
-        for (int i = 0; i < sparkOpts.length(); i++) {
-            final char c = sparkOpts.charAt(i);
-            if (c == '"') {
-                insideQuote = !insideQuote;
-            }
-            else if (Character.isWhitespace(c) && !insideQuote) {
-                if (currentWord.length() > 0) {
-                    result.add(currentWord.toString());
-                    currentWord.setLength(0);
+        final List<String> result = new ArrayList<>();
+        final Matcher matcher = OPTION_KEY_PREFIX.matcher(sparkOpts);
+
+        int start = 0, end;
+        while (matcher.find()) {
+            end = matcher.start();
+
+            if (start > 0) {
+                final String maybeQuotedValue = sparkOpts.substring(start, 
end).trim();
+                if (StringUtils.isNotEmpty(maybeQuotedValue)) {
+                    result.add(unquoteEntirelyQuotedValue(maybeQuotedValue));
                 }
             }
-            else {
-                currentWord.append(c);
+
+            String sparkOpt = matchSparkOpt(sparkOpts, matcher);
+            if (sparkOpt.endsWith("=")) {
+                sparkOpt = sparkOpt.replaceAll("=", "");
             }
+            result.add(sparkOpt);
+
+            start = matcher.end();
         }
 
-        if (currentWord.length() > 0) {
-            result.add(currentWord.toString());
+        final String maybeEntirelyQuotedValue = 
sparkOpts.substring(start).trim();
+        if (StringUtils.isNotEmpty(maybeEntirelyQuotedValue)) {
+            result.add(unquoteEntirelyQuotedValue(maybeEntirelyQuotedValue));
         }
 
         return result;
     }
+
+    private static String matchSparkOpt(final String sparkOpts, final Matcher 
matcher) {
+        return sparkOpts.substring(matcher.start(), matcher.end()).trim();
+    }
+
+    /**
+     * Unquote an option value if and only if it has quotes at both ends, and 
doesn't have any quotes in between.
+     * <p/>
+     * Some examples:
+     * <ul>
+     *     <li>{@code key=value1 value2}: remains unquoted (isn't quoted at 
all)</li>
+     *     <li>{@code key=value1 "value2"}: remains quoted (has quotes in 
between)</li>
+     *     <li>{@code key="value1 value2" "value3 value4"}: remains quoted 
(has quotes both ends, but has also some in between)</li>
+     *     <li>{@code key="value1 value2 value3 value4"}: gets unquoted (has 
quotes both ends, and no quotes in between)</li>
+     * </ul>
+     * @param maybeEntirelyQuotedValue a {@code String} that is a parameter 
value but not necessarily quoted
+     * @return an unquoted version of the input {@code String}, when {@code 
maybeEntirelyQuotedValue} had quotes at both ends, and didn't
+     * have any quotes in between. Else {@code maybeEntirelyQuotedValue}
+     */
+    private static String unquoteEntirelyQuotedValue(final String 
maybeEntirelyQuotedValue) {
+        final boolean hasQuotesAtEnds = 
maybeEntirelyQuotedValue.matches(VALUE_HAS_QUOTES_AT_ENDS_REGEX);
+        final boolean hasQuotesInBetween = 
maybeEntirelyQuotedValue.matches(VALUE_HAS_QUOTES_IN_BETWEEN_REGEX);
+        final boolean isEntirelyQuoted = hasQuotesAtEnds && 
!hasQuotesInBetween;
+
+        if (isEntirelyQuoted) {
+            return maybeEntirelyQuotedValue.replaceAll("\"", "");
+        }
+
+        return maybeEntirelyQuotedValue;
+    }
 }

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java
----------------------------------------------------------------------
diff --git 
a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java
 
b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java
index 7db26a6..d0541ca 100644
--- 
a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java
+++ 
b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkArgsExtractor.java
@@ -194,4 +194,41 @@ public class TestSparkArgsExtractor {
                         "arg1"),
                 sparkArgs);
     }
+
+    @Test
+    public void testQuotedDriverAndExecutorExtraJavaOptionsParsing() throws 
Exception {
+        final Configuration actionConf = new Configuration();
+        actionConf.set(SparkActionExecutor.SPARK_MASTER, "yarn");
+        actionConf.set(SparkActionExecutor.SPARK_MODE, "client");
+        actionConf.set(SparkActionExecutor.SPARK_CLASS, 
"org.apache.oozie.example.SparkFileCopy");
+        actionConf.set(SparkActionExecutor.SPARK_JOB_NAME, "Spark Copy File");
+        actionConf.set(SparkActionExecutor.SPARK_OPTS,
+                "--conf 
spark.executor.extraJavaOptions=\"-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp\"" +
+                "--conf spark.driver.extraJavaOptions=\"-Xmn2703m 
-XX:SurvivorRatio=2 -XX:ParallelGCThreads=20\"");
+        actionConf.set(SparkActionExecutor.SPARK_JAR, "/lib/test.jar");
+
+        final String[] mainArgs = {"arg0", "arg1"};
+        final List<String> sparkArgs = new 
SparkArgsExtractor(actionConf).extract(mainArgs);
+
+        assertEquals("Spark args mismatch",
+                Lists.newArrayList("--master", "yarn",
+                        "--deploy-mode", "client",
+                        "--name", "Spark Copy File",
+                        "--class", "org.apache.oozie.example.SparkFileCopy",
+                        "--conf", 
"spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp " +
+                                "-Dlog4j.configuration=spark-log4j.properties",
+                        "--conf", "spark.driver.extraJavaOptions=-Xmn2703m 
-XX:SurvivorRatio=2 -XX:ParallelGCThreads=20 " +
+                                "-Dlog4j.configuration=spark-log4j.properties",
+                        "--conf", "spark.executor.extraClassPath=$PWD/*",
+                        "--conf", "spark.driver.extraClassPath=$PWD/*",
+                        "--conf", 
"spark.yarn.security.tokens.hive.enabled=false",
+                        "--conf", 
"spark.yarn.security.tokens.hbase.enabled=false",
+                        "--files", "spark-log4j.properties,hive-site.xml",
+                        "--conf", "spark.yarn.jar=null",
+                        "--verbose",
+                        "/lib/test.jar",
+                        "arg0",
+                        "arg1"),
+                sparkArgs);
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f293eaa/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java
----------------------------------------------------------------------
diff --git 
a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java
 
b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java
index 02786a2..4be9777 100644
--- 
a/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java
+++ 
b/sharelib/spark/src/test/java/org/apache/oozie/action/hadoop/TestSparkOptionsSplitter.java
@@ -34,13 +34,35 @@ public class TestSparkOptionsSplitter {
     @Parameterized.Parameters
     public static List<Object[]> params() {
         return Arrays.asList(new Object[][]{
-                {"--option1 value1", Arrays.asList("--option1", "value1")},
-                {"--option1   value1", Arrays.asList("--option1", "value1")},
-                {"   --option1 value1   ", Arrays.asList("--option1", 
"value1")},
-                {"--conf special=value1", Arrays.asList("--conf", 
"special=value1")},
-                {"--conf special=\"value1\"", Arrays.asList("--conf", 
"special=value1")},
-                {"--conf special=\"value1 value2\"", Arrays.asList("--conf", 
"special=value1 value2")},
-                {" --conf special=\"value1 value2\"  ", 
Arrays.asList("--conf", "special=value1 value2")},
+                {"--option1 value1",
+                        Arrays.asList("--option1", "value1")},
+                {"--option1   value1",
+                        Arrays.asList("--option1", "value1")},
+                {"   --option1 value1   ",
+                        Arrays.asList("--option1", "value1")},
+                {"--conf special=value1",
+                        Arrays.asList("--conf", "special=value1")},
+                {"--conf special=\"value1\"",
+                        Arrays.asList("--conf", "special=value1")},
+                {"--conf special=\"value1 value2\"",
+                        Arrays.asList("--conf", "special=value1 value2")},
+                {" --conf special=\"value1 value2\"  ",
+                        Arrays.asList("--conf", "special=value1 value2")},
+                {"--conf special=value1 value2 --conf value3",
+                        Arrays.asList("--conf", "special=value1 value2", 
"--conf", "value3")},
+                {"--conf special1=value1 special2=\"value2 value3\" --conf 
value4",
+                        Arrays.asList("--conf", "special1=value1 
special2=\"value2 value3\"", "--conf", "value4")},
+                {"--conf special1=value1 value2 special2=value3 value4",
+                        Arrays.asList("--conf", "special1=value1 value2 
special2=value3 value4")},
+                {"--option1 value1 --option2=value2",
+                        Arrays.asList("--option1", "value1", "--option2", 
"value2")},
+                {"--conf 
spark.executor.extraJavaOptions=\"-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp\"",
+                    Arrays.asList("--conf",
+                            
"spark.executor.extraJavaOptions=-XX:+HeapDumpOnOutOfMemoryError 
-XX:HeapDumpPath=/tmp")},
+                {"--option1 value1 --verbose",
+                        Arrays.asList("--option1", "value1", "--verbose")},
+                {"--verbose --option1 value1",
+                        Arrays.asList("--verbose", "--option1", "value1")}
         });
     }
 

Reply via email to