Repository: nifi
Updated Branches:
  refs/heads/master 3dc7a160c -> 4bf267c8b


NIFI-3101: This closes #1271. Improve Get/Modify/PutHTMLElement URL

- Added detailed description about how the URL property works with
  GetHTMLElement
- Added Expression support with URL
- Made URL property dynamic with ModifyHTMLElement and PutHTMLElement,
  since it is not used to alter HTML elements and need not be
  specified. Making it a dynamic property lets existing processor
  configurations stay valid


Project: http://git-wip-us.apache.org/repos/asf/nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/4bf267c8
Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/4bf267c8
Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/4bf267c8

Branch: refs/heads/master
Commit: 4bf267c8bb11939d715c02c3ca818dec372bec26
Parents: 3dc7a16
Author: Koji Kawamura <[email protected]>
Authored: Fri Nov 25 17:58:32 2016 +0900
Committer: joewitt <[email protected]>
Committed: Tue Feb 14 15:28:46 2017 -0500

----------------------------------------------------------------------
 .../org/apache/nifi/AbstractHTMLProcessor.java  | 16 +++-
 .../java/org/apache/nifi/GetHTMLElement.java    |  8 +-
 .../java/org/apache/nifi/ModifyHTMLElement.java | 11 ++-
 .../java/org/apache/nifi/PutHTMLElement.java    | 11 ++-
 .../org/apache/nifi/TestGetHTMLElement.java     | 80 ++++++++++++++++++++
 5 files changed, 121 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nifi/blob/4bf267c8/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java
----------------------------------------------------------------------
diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java
 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java
index 127f0d8..8ad6f8a 100644
--- 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java
+++ 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/AbstractHTMLProcessor.java
@@ -63,9 +63,12 @@ public abstract class AbstractHTMLProcessor extends 
AbstractProcessor {
 
     public static final PropertyDescriptor URL = new PropertyDescriptor
             .Builder().name("URL")
-            .description("Base URL for the HTML page being parsed.")
+            .description("Base URL for the HTML page being parsed." +
+                    " This URL will be used to resolve an absolute URL" +
+                    " when an attribute value is extracted from a HTML 
element.")
             .required(true)
             .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+            .expressionLanguageSupported(true)
             .build();
 
     public static final PropertyDescriptor CSS_SELECTOR = new 
PropertyDescriptor
@@ -120,11 +123,20 @@ public abstract class AbstractHTMLProcessor extends 
AbstractProcessor {
         session.read(inputFlowFile, new InputStreamCallback() {
             @Override
             public void process(InputStream inputStream) throws IOException {
+                final String baseUrl = getBaseUrl(inputFlowFile, context);
+                if (baseUrl == null || baseUrl.isEmpty()) {
+                    throw new RuntimeException("Base URL was empty.");
+                }
                 doc.set(Jsoup.parse(inputStream,
                         context.getProperty(HTML_CHARSET).getValue(),
-                        context.getProperty(URL).getValue()));
+                        baseUrl));
             }
         });
         return doc.get();
     }
+
+
+    protected String getBaseUrl(final FlowFile inputFlowFile, final 
ProcessContext context) {
+        return "http://localhost/";
+    }
 }

http://git-wip-us.apache.org/repos/asf/nifi/blob/4bf267c8/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
----------------------------------------------------------------------
diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
index 1d421a0..713fabd 100644
--- 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
+++ 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
@@ -92,7 +92,9 @@ public class GetHTMLElement
             .Builder().name("Attribute Name")
             .description(("When getting the value of a HTML element attribute 
this value is used as the key to determine" +
                     " which attribute on the selected element should be 
retrieved. This value is used when the \"Output Type\"" +
-                    " is set to \"" + ELEMENT_ATTRIBUTE + "\""))
+                    " is set to \"" + ELEMENT_ATTRIBUTE + "\"." +
+                    " If this value is prefixed with 'abs:', then the 
extracted attribute value will be converted into" +
+                    " an absolute URL form using the specified base URL."))
             .required(false)
             .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
             .expressionLanguageSupported(true)
@@ -238,4 +240,8 @@ public class GetHTMLElement
         }
     }
 
+    @Override
+    protected String getBaseUrl(FlowFile inputFlowFile, ProcessContext 
context) {
+        return 
context.getProperty(URL).evaluateAttributeExpressions(inputFlowFile).getValue();
+    }
 }

http://git-wip-us.apache.org/repos/asf/nifi/blob/4bf267c8/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
----------------------------------------------------------------------
diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
index e84d4ed..7f6e12e 100644
--- 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
+++ 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/ModifyHTMLElement.java
@@ -98,7 +98,6 @@ public class ModifyHTMLElement extends AbstractHTMLProcessor {
     @Override
     protected void init(final ProcessorInitializationContext context) {
         final List<PropertyDescriptor> descriptors = new ArrayList<>();
-        descriptors.add(URL);
         descriptors.add(CSS_SELECTOR);
         descriptors.add(HTML_CHARSET);
         descriptors.add(OUTPUT_TYPE);
@@ -124,6 +123,16 @@ public class ModifyHTMLElement extends 
AbstractHTMLProcessor {
         return descriptors;
     }
 
+    /**
+     * This processor used to support URL property, but it has been removed
+     * since it's not required when altering HTML elements.
+     * Support URL as dynamic property so that existing data flow can stay in 
valid state without modification.
+     */
+    @Override
+    protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final 
String propertyDescriptorName) {
+        return URL;
+    }
+
     @Override
     public void onTrigger(final ProcessContext context, final ProcessSession 
session) throws ProcessException {
         final FlowFile flowFile = session.get();

http://git-wip-us.apache.org/repos/asf/nifi/blob/4bf267c8/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
----------------------------------------------------------------------
diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
index 995fc99..bc9b70c 100644
--- 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
+++ 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/PutHTMLElement.java
@@ -88,7 +88,6 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
     @Override
     protected void init(final ProcessorInitializationContext context) {
         final List<PropertyDescriptor> descriptors = new 
ArrayList<PropertyDescriptor>();
-        descriptors.add(URL);
         descriptors.add(CSS_SELECTOR);
         descriptors.add(HTML_CHARSET);
         descriptors.add(PUT_LOCATION_TYPE);
@@ -113,6 +112,16 @@ public class PutHTMLElement extends AbstractHTMLProcessor {
         return descriptors;
     }
 
+    /**
+     * This processor used to support URL property, but it has been removed
+     * since it's not required when altering HTML elements.
+     * Support URL as dynamic property so that existing data flow can stay in 
valid state without modification.
+     */
+    @Override
+    protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final 
String propertyDescriptorName) {
+        return URL;
+    }
+
     @Override
     public void onTrigger(final ProcessContext context, final ProcessSession 
session) throws ProcessException {
         final FlowFile flowFile = session.get();

http://git-wip-us.apache.org/repos/asf/nifi/blob/4bf267c8/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java
----------------------------------------------------------------------
diff --git 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java
 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java
index 4b215fd..2b2706d 100644
--- 
a/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java
+++ 
b/nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/test/java/org/apache/nifi/TestGetHTMLElement.java
@@ -19,7 +19,9 @@ package org.apache.nifi;
 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.nifi.util.MockFlowFile;
 import org.apache.nifi.util.TestRunner;
@@ -233,6 +235,84 @@ public class TestGetHTMLElement extends AbstractHTMLTest {
     }
 
     @Test
+    public void testExtractAttributeFromElementRelativeUrl() throws Exception {
+        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
+        testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT);
+        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, 
GetHTMLElement.ELEMENT_ATTRIBUTE);
+        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src");
+
+        testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath());
+        testRunner.run();
+
+        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
+        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
+
+        List<MockFlowFile> ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
+        ffs.get(0).assertContentEquals("js/scripts.js");
+    }
+
+    @Test
+    public void testExtractAttributeFromElementAbsoluteUrl() throws Exception {
+        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
+        testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT);
+        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, 
GetHTMLElement.ELEMENT_ATTRIBUTE);
+        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
+
+        testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath());
+        testRunner.run();
+
+        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
+        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
+
+        List<MockFlowFile> ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
+        ffs.get(0).assertContentEquals("http://localhost/js/scripts.js");
+    }
+
+    @Test
+    public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws 
Exception {
+        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
+        testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT);
+        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, 
GetHTMLElement.ELEMENT_ATTRIBUTE);
+        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
+        testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
+
+        final Map<String, String> attributes = new HashMap<>();
+        attributes.put("contentUrl", "https://example.com/a/b/c/Weather.html");
+        testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath(), attributes);
+        testRunner.run();
+
+        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0);
+        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
+
+        List<MockFlowFile> ffs = 
testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS);
+        
ffs.get(0).assertContentEquals("https://example.com/a/b/c/js/scripts.js");
+    }
+
+    @Test
+    public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() 
throws Exception {
+        testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script");
+        testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT);
+        testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, 
GetHTMLElement.ELEMENT_ATTRIBUTE);
+        testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src");
+        // Expression Language returns empty string because flow-file doesn't 
have contentUrl attribute.
+        testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}");
+
+        testRunner.enqueue(new 
File("src/test/resources/Weather.html").toPath());
+        testRunner.run();
+
+        testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0);
+        testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1);
+        testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0);
+        testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0);
+    }
+
+    @Test
     public void testExtractTextFromElement() throws Exception {
         testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID);
         testRunner.setProperty(GetHTMLElement.DESTINATION, 
GetHTMLElement.DESTINATION_CONTENT);

Reply via email to