[
https://issues.apache.org/jira/browse/NIFI-1156?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15094250#comment-15094250
]
ASF GitHub Bot commented on NIFI-1156:
--------------------------------------
Github user markap14 commented on a diff in the pull request:
https://github.com/apache/nifi/pull/124#discussion_r49480077
--- Diff:
nifi-nar-bundles/nifi-html-bundle/nifi-html-processors/src/main/java/org/apache/nifi/GetHTMLElement.java
---
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.flowfile.FlowFile;
+import org.apache.nifi.processor.ProcessContext;
+import org.apache.nifi.processor.ProcessSession;
+import org.apache.nifi.processor.Relationship;
+import org.apache.nifi.processor.ProcessorInitializationContext;
+import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.documentation.CapabilityDescription;
+import org.apache.nifi.annotation.documentation.SeeAlso;
+import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.io.StreamCallback;
+import org.apache.nifi.processor.util.StandardValidators;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+
+@Tags({"get", "html", "dom", "css", "element"})
+@CapabilityDescription("Parses HTML input using CSS selector syntax and
creates a new flowfile containing the extracted" +
+ " element content for each matching CSS selector.")
+@SeeAlso({ModifyHTMLElement.class, PutHTMLElement.class})
+@WritesAttributes({@WritesAttribute(attribute="HTMLElement",
description="Flowfile attribute where the element result" +
+ " parsed from the HTML using the CSS selector syntax are placed if
the destination is a flowfile attribute.")})
+public class GetHTMLElement
+ extends AbstractHTMLProcessor {
+
+ public static final String HTML_ELEMENT_ATTRIBUTE_NAME = "HTMLElement";
+ public static final String DESTINATION_ATTRIBUTE =
"flowfile-attribute";
+ public static final String DESTINATION_CONTENT = "flowfile-content";
+
+ public static final PropertyDescriptor PREPEND_ELEMENT_VALUE = new
PropertyDescriptor
+ .Builder().name("Prepend Element value")
+ .description("Prepends the specified value to the resulting
Element")
+ .required(false)
+ .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+ .expressionLanguageSupported(true)
+ .build();
+
+ public static final PropertyDescriptor APPEND_ELEMENT_VALUE = new
PropertyDescriptor
+ .Builder().name("Append Element value")
+ .description("Appends the specified value to the resulting
Element")
+ .required(false)
+ .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+ .expressionLanguageSupported(true)
+ .build();
+
+ public static final PropertyDescriptor ATTRIBUTE_KEY = new
PropertyDescriptor
+ .Builder().name("Attribute Name")
+ .description(("When getting the value of an element attribute
this value is used as the key to determine" +
+ " which attribute on the selected element should be
retrieved."))
+ .required(false)
+ .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+ .expressionLanguageSupported(true)
+ .build();
+
+
+ public static final PropertyDescriptor OUTPUT_TYPE = new
PropertyDescriptor.Builder()
+ .name("Output Type")
+ .description("Controls the type of value that is retrieved
from the element. " +
+ ELEMENT_HTML + "," + ELEMENT_TEXT + ", " +
ELEMENT_ATTRIBUTE + " or " + ELEMENT_DATA)
+ .required(true)
+ .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+ .allowableValues(ELEMENT_HTML, ELEMENT_TEXT,
ELEMENT_ATTRIBUTE, ELEMENT_DATA)
+ .defaultValue(ELEMENT_HTML)
+ .build();
+
+ public static final PropertyDescriptor DESTINATION = new
PropertyDescriptor.Builder()
+ .name("Destination")
+ .description("Control if element extracted is written as a
flowfile attribute or " +
+ "as flowfile content.")
+ .required(true)
+ .allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
+ .defaultValue(DESTINATION_ATTRIBUTE)
+ .build();
+
+ private List<PropertyDescriptor> descriptors;
+
+ private Set<Relationship> relationships;
+
+ @Override
+ protected void init(final ProcessorInitializationContext context) {
+ final List<PropertyDescriptor> descriptors = new ArrayList<>();
+ descriptors.add(URL);
+ descriptors.add(CSS_SELECTOR);
+ descriptors.add(HTML_CHARSET);
+ descriptors.add(OUTPUT_TYPE);
+ descriptors.add(DESTINATION);
+ descriptors.add(PREPEND_ELEMENT_VALUE);
+ descriptors.add(APPEND_ELEMENT_VALUE);
+ descriptors.add(ATTRIBUTE_KEY);
+ this.descriptors = Collections.unmodifiableList(descriptors);
+
+ final Set<Relationship> relationships = new HashSet<>();
+ relationships.add(REL_ORIGINAL);
+ relationships.add(REL_SUCCESS);
+ relationships.add(REL_FAILURE);
+ relationships.add(REL_NOT_FOUND);
+ this.relationships = Collections.unmodifiableSet(relationships);
+ }
+
+ @Override
+ public Set<Relationship> getRelationships() {
+ return this.relationships;
+ }
+
+ @Override
+ public final List<PropertyDescriptor>
getSupportedPropertyDescriptors() {
+ return descriptors;
+ }
+
+ @Override
+ public void onTrigger(final ProcessContext context, final
ProcessSession session) throws ProcessException {
+ final FlowFile flowFile = session.get();
+ if ( flowFile == null ) {
+ return;
+ }
+
+ try {
+
+ final Document doc = parseHTMLDocumentFromFlowfile(flowFile,
context, session);
+ final Elements eles =
doc.select(context.getProperty(CSS_SELECTOR)
+ .evaluateAttributeExpressions().getValue());
+ final String prependValue =
context.getProperty(PREPEND_ELEMENT_VALUE)
+ .evaluateAttributeExpressions(flowFile).getValue();
+ final String appendValue =
context.getProperty(APPEND_ELEMENT_VALUE)
+ .evaluateAttributeExpressions(flowFile).getValue();
+
+ if (eles == null || eles.size() == 0) {
+ //No element found
+ session.transfer(flowFile, REL_NOT_FOUND);
+ } else {
+ for (final Element ele : eles) {
+ final FlowFile ff = session.create();
+
+ switch (context.getProperty(DESTINATION).getValue()) {
+ case DESTINATION_ATTRIBUTE:
+ final FlowFile atFlowfile =
session.putAttribute(ff, HTML_ELEMENT_ATTRIBUTE_NAME,
+ extractElementValue(
+ prependValue,
+
context.getProperty(OUTPUT_TYPE).getValue(),
+ appendValue,
+ ele,
+
context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions()
+ .getValue()));
+
session.getProvenanceReporter().create(atFlowfile);
+ session.transfer(atFlowfile, REL_SUCCESS);
+ break;
+ case DESTINATION_CONTENT:
+ final FlowFile conFlowfile = session.write(ff,
new StreamCallback() {
+ @Override
+ public void process(InputStream
inputStream, OutputStream outputStream) throws IOException {
+ try {
+
outputStream.write(extractElementValue(
+ prependValue,
+
context.getProperty(OUTPUT_TYPE).getValue(),
+ appendValue,
+ ele,
+
context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions()
+
.getValue()).getBytes());
+ } catch (Exception ex) {
+ session.transfer(ff, REL_FAILURE);
+ }
+ }
+ });
+
+
session.getProvenanceReporter().create(conFlowfile);
--- End diff --
Same point as above, should be a FORK, not CREATE, but it'll be taken care
of by the framework.
> HTML Parsing Processors Bundle
> ------------------------------
>
> Key: NIFI-1156
> URL: https://issues.apache.org/jira/browse/NIFI-1156
> Project: Apache NiFi
> Issue Type: New Feature
> Components: Core Framework
> Reporter: Jeremy Dyer
> Priority: Minor
>
> NiFi provides the ability to ingest HTML but lacks the convenience to easily
> interact with that HTML once it has entered the flow. There should be an HTML
> Processing Bundle that provides mechanisms for manipulating and interacting
> with HTML data once it has entered the flow. Jsoup (http://jsoup.org/) seems
> like a logical tool to use since it is mature and has an MIT license, which
> would allow it to be incorporated into NiFi.
> “GetHTMLElement” should use the CSS selector-syntax
> (http://www.w3schools.com/cssref/css_selectors.asp) built into Jsoup to
> extract 0-N HTML elements from the original HTML input. This processor should
> support a delimited string of selectors allowing the user to build compound
> HTML element output. Each HTML element (or compound element result) extracted
> will create a new Flowfile where the element will be in either the Flowfile
> content or an attribute depending on the user configuration.
> “ModifyHTMLElement” should provide the ability to modify the original input
> HTML and overwrite any existing element values. The HTML element that will be
> modified can be selected by using the CSS selector-syntax.
> “PutHTMLElement” should provide the ability to put a new HTML element
> anywhere in the original input HTML using CSS selector-syntax to indicate the
> position that the new HTML element should be placed.
> There seems to be a potential for adding more processors, but this seems like
> a good start. Since there is a dependency on Jsoup and a potential for more
> processors to come, I think it makes sense to add this logic as its own nar
> bundle, but I could be wrong.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)