Hi Stephane,
Here's something I hope can help. In the GetHTMLElement processor, instead of
using the CSS selector "table td", try "table tr" with an Output Type of "Text"
and a Destination of flowfile-content. This should create one flow file
for each row, containing the numeric text extracted from the td elements in
that row. From there you can use the ExecuteScript processor to trim the
whitespace, convert the text values into numbers and sum them. I was able
to get this to work with the JavaScript (ECMAScript) below, using the
example HTML you provided:
// ExecuteScript (ECMAScript / Nashorn) body.
// Replaces the content of the incoming flow file with the sum of the integers
// found in its whitespace-separated text, renames it to "<id>_count.txt" and
// routes it to REL_SUCCESS. `session` and `REL_SUCCESS` are provided by the
// ExecuteScript processor.
var flowFile = session.get();
if (flowFile != null) {
  var StreamCallback = Java.type("org.apache.nifi.processor.io.StreamCallback");
  var IOUtils = Java.type("org.apache.commons.io.IOUtils");
  var StandardCharsets = Java.type("java.nio.charset.StandardCharsets");

  flowFile = session.write(flowFile,
    new StreamCallback(function (inputStream, outputStream) {
      // Coerce to a script string so the String.prototype methods below apply.
      var text = String(IOUtils.toString(inputStream, StandardCharsets.UTF_8));

      // Split on any run of whitespace (spaces, tabs, newlines) rather than a
      // single space, so padded or multi-line row text does not yield junk tokens.
      var tokens = text.trim().split(/\s+/);

      var sum = 0;
      tokens.forEach(function (token) {
        var value = parseInt(token, 10);
        // NaN never compares equal to anything, so `value != NaN` is always
        // true; isNaN() is the correct test. Non-numeric tokens are skipped
        // instead of turning the whole sum into NaN.
        if (!isNaN(value)) {
          sum += value;
        }
      });

      outputStream.write(sum.toString().getBytes(StandardCharsets.UTF_8));
    }));

  flowFile = session.putAttribute(flowFile, "filename", flowFile.getId() + '_count.txt');
  session.transfer(flowFile, REL_SUCCESS);
}
I've attached the template I used to do this which hopefully can help as
well. Please let me know if you have any questions.
Yolanda
On Wed, Aug 31, 2016 at 3:52 AM, <[email protected]>
wrote:
> Hi All,
>
>
>
> I’m trying to extract data from an HTML table and perform a calculation on it with NiFi.
>
> The purpose of the test is to add up the TD values within each TR and
> output the result to a file.
>
> For this sample the result should be 23 and 43.
>
>
>
> My table looks like
>
>
>
> <table>
>
> <tr>
>
> <td>11</td>
>
> <td>12</td>
>
> </tr>
>
> <tr>
>
> <td>21</td>
>
> <td>22</td>
>
> </tr>
>
> </table>
>
> My NIFI workflow is
>
>
>
> InvokeHTTP > Response > GetHTMLElement > Success > PutFile
>
>
>
> The CSS Selector for GetHTMLElement is table td.
>
> I know that GetHTMLElement produces 0-N elements, but I don’t know how I can
> perform a calculation on them.
>
>
>
> Any help would be appreciated.
>
>
>
> Thanks
>
> Regards
>
> Stephane
>
>
>
> · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · · ·
> · ·
> *Stephane Tinseau*
>
> *Thomson Reuters*
> [email protected]
> thomsonreuters.com
>
>
>
> ------------------------------
>
> This e-mail is for the sole use of the intended recipient and contains
> information that may be privileged and/or confidential. If you are not an
> intended recipient, please notify the sender by return e-mail and delete
> this e-mail and any attachments. Certain required legal entity disclosures
> can be accessed on our website.
> <http://site.thomsonreuters.com/site/disclosures/>
>
--
--
[email protected]
@YolandaMDavis
<?xml version="1.0" ?>
<template encoding-version="1.0">
<description>Extract data elements from table rows, summarize the data and save</description>
<groupId>e09e08ae-0156-1000-2e13-64dd7900b7a3</groupId>
<name>ExtractAndSumDataRows</name>
<snippet>
<connections>
<id>dfaabf74-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold>
<backPressureObjectThreshold>10000</backPressureObjectThreshold>
<destination>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>dfa7cea7-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</destination>
<flowFileExpiration>0 sec</flowFileExpiration>
<labelIndex>1</labelIndex>
<name></name>
<selectedRelationships>success</selectedRelationships>
<source>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>dfaab0b2-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</source>
<zIndex>0</zIndex>
</connections>
<connections>
<id>e0cf364a-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold>
<backPressureObjectThreshold>10000</backPressureObjectThreshold>
<destination>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>e0cf07fc-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</destination>
<flowFileExpiration>0 sec</flowFileExpiration>
<labelIndex>1</labelIndex>
<name></name>
<selectedRelationships>success</selectedRelationships>
<source>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>dfa7cea7-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</source>
<zIndex>0</zIndex>
</connections>
<connections>
<id>e0d8d7f8-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold>
<backPressureObjectThreshold>10000</backPressureObjectThreshold>
<destination>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>dfb00bb8-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</destination>
<flowFileExpiration>0 sec</flowFileExpiration>
<labelIndex>1</labelIndex>
<name></name>
<selectedRelationships>success</selectedRelationships>
<source>
<groupId>e09e08ae-0156-1000-0000-000000000000</groupId>
<id>e0cf07fc-0156-1000-0000-000000000000</id>
<type>PROCESSOR</type>
</source>
<zIndex>0</zIndex>
</connections>
<processors>
<id>dfa7cea7-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<position>
<x>385.3220957351148</x>
<y>4.875043923781163</y>
</position>
<config>
<bulletinLevel>WARN</bulletinLevel>
<comments></comments>
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount>
<descriptors>
<entry>
<key>URL</key>
<value>
<name>URL</name>
</value>
</entry>
<entry>
<key>CSS Selector</key>
<value>
<name>CSS Selector</name>
</value>
</entry>
<entry>
<key>HTML Character Encoding</key>
<value>
<name>HTML Character Encoding</name>
</value>
</entry>
<entry>
<key>Output Type</key>
<value>
<name>Output Type</name>
</value>
</entry>
<entry>
<key>Destination</key>
<value>
<name>Destination</name>
</value>
</entry>
<entry>
<key>Prepend Element Value</key>
<value>
<name>Prepend Element Value</name>
</value>
</entry>
<entry>
<key>Append Element Value</key>
<value>
<name>Append Element Value</name>
</value>
</entry>
<entry>
<key>Attribute Name</key>
<value>
<name>Attribute Name</name>
</value>
</entry>
</descriptors>
<lossTolerant>false</lossTolerant>
<penaltyDuration>30 sec</penaltyDuration>
<properties>
<entry>
<key>URL</key>
<value>localhost</value>
</entry>
<entry>
<key>CSS Selector</key>
<value>table tr</value>
</entry>
<entry>
<key>HTML Character Encoding</key>
<value>UTF-8</value>
</entry>
<entry>
<key>Output Type</key>
<value>Text</value>
</entry>
<entry>
<key>Destination</key>
<value>flowfile-content</value>
</entry>
<entry>
<key>Prepend Element Value</key>
</entry>
<entry>
<key>Append Element Value</key>
</entry>
<entry>
<key>Attribute Name</key>
</entry>
</properties>
<runDurationMillis>0</runDurationMillis>
<schedulingPeriod>0 sec</schedulingPeriod>
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy>
<yieldDuration>1 sec</yieldDuration>
</config>
<name>GetHTMLElement</name>
<relationships>
<autoTerminate>true</autoTerminate>
<name>element not found</name>
</relationships>
<relationships>
<autoTerminate>true</autoTerminate>
<name>invalid html</name>
</relationships>
<relationships>
<autoTerminate>true</autoTerminate>
<name>original</name>
</relationships>
<relationships>
<autoTerminate>false</autoTerminate>
<name>success</name>
</relationships>
<style></style>
<type>org.apache.nifi.GetHTMLElement</type>
</processors>
<processors>
<id>dfaab0b2-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<position>
<x>0.0</x>
<y>356.2229093280936</y>
</position>
<config>
<bulletinLevel>WARN</bulletinLevel>
<comments></comments>
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount>
<descriptors>
<entry>
<key>Input Directory</key>
<value>
<name>Input Directory</name>
</value>
</entry>
<entry>
<key>File Filter</key>
<value>
<name>File Filter</name>
</value>
</entry>
<entry>
<key>Path Filter</key>
<value>
<name>Path Filter</name>
</value>
</entry>
<entry>
<key>Batch Size</key>
<value>
<name>Batch Size</name>
</value>
</entry>
<entry>
<key>Keep Source File</key>
<value>
<name>Keep Source File</name>
</value>
</entry>
<entry>
<key>Recurse Subdirectories</key>
<value>
<name>Recurse Subdirectories</name>
</value>
</entry>
<entry>
<key>Polling Interval</key>
<value>
<name>Polling Interval</name>
</value>
</entry>
<entry>
<key>Ignore Hidden Files</key>
<value>
<name>Ignore Hidden Files</name>
</value>
</entry>
<entry>
<key>Minimum File Age</key>
<value>
<name>Minimum File Age</name>
</value>
</entry>
<entry>
<key>Maximum File Age</key>
<value>
<name>Maximum File Age</name>
</value>
</entry>
<entry>
<key>Minimum File Size</key>
<value>
<name>Minimum File Size</name>
</value>
</entry>
<entry>
<key>Maximum File Size</key>
<value>
<name>Maximum File Size</name>
</value>
</entry>
</descriptors>
<lossTolerant>false</lossTolerant>
<penaltyDuration>30 sec</penaltyDuration>
<properties>
<entry>
<key>Input Directory</key>
<value>/Users/ydavis/dev/tools/html</value>
</entry>
<entry>
<key>File Filter</key>
<value>[^\.].*</value>
</entry>
<entry>
<key>Path Filter</key>
</entry>
<entry>
<key>Batch Size</key>
<value>10</value>
</entry>
<entry>
<key>Keep Source File</key>
<value>true</value>
</entry>
<entry>
<key>Recurse Subdirectories</key>
<value>true</value>
</entry>
<entry>
<key>Polling Interval</key>
<value>10 sec</value>
</entry>
<entry>
<key>Ignore Hidden Files</key>
<value>true</value>
</entry>
<entry>
<key>Minimum File Age</key>
<value>0 sec</value>
</entry>
<entry>
<key>Maximum File Age</key>
</entry>
<entry>
<key>Minimum File Size</key>
<value>0 B</value>
</entry>
<entry>
<key>Maximum File Size</key>
</entry>
</properties>
<runDurationMillis>0</runDurationMillis>
<schedulingPeriod>0 sec</schedulingPeriod>
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy>
<yieldDuration>1 sec</yieldDuration>
</config>
<name>GetFile</name>
<relationships>
<autoTerminate>false</autoTerminate>
<name>success</name>
</relationships>
<style></style>
<type>org.apache.nifi.processors.standard.GetFile</type>
</processors>
<processors>
<id>dfb00bb8-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<position>
<x>1093.511409068661</x>
<y>0.0</y>
</position>
<config>
<bulletinLevel>WARN</bulletinLevel>
<comments></comments>
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount>
<descriptors>
<entry>
<key>Directory</key>
<value>
<name>Directory</name>
</value>
</entry>
<entry>
<key>Conflict Resolution Strategy</key>
<value>
<name>Conflict Resolution Strategy</name>
</value>
</entry>
<entry>
<key>Create Missing Directories</key>
<value>
<name>Create Missing Directories</name>
</value>
</entry>
<entry>
<key>Maximum File Count</key>
<value>
<name>Maximum File Count</name>
</value>
</entry>
<entry>
<key>Last Modified Time</key>
<value>
<name>Last Modified Time</name>
</value>
</entry>
<entry>
<key>Permissions</key>
<value>
<name>Permissions</name>
</value>
</entry>
<entry>
<key>Owner</key>
<value>
<name>Owner</name>
</value>
</entry>
<entry>
<key>Group</key>
<value>
<name>Group</name>
</value>
</entry>
</descriptors>
<lossTolerant>false</lossTolerant>
<penaltyDuration>30 sec</penaltyDuration>
<properties>
<entry>
<key>Directory</key>
<value>/Users/ydavis/dev/tools/outhtml</value>
</entry>
<entry>
<key>Conflict Resolution Strategy</key>
<value>fail</value>
</entry>
<entry>
<key>Create Missing Directories</key>
<value>true</value>
</entry>
<entry>
<key>Maximum File Count</key>
</entry>
<entry>
<key>Last Modified Time</key>
</entry>
<entry>
<key>Permissions</key>
</entry>
<entry>
<key>Owner</key>
</entry>
<entry>
<key>Group</key>
</entry>
</properties>
<runDurationMillis>0</runDurationMillis>
<schedulingPeriod>0 sec</schedulingPeriod>
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy>
<yieldDuration>1 sec</yieldDuration>
</config>
<name>PutFile</name>
<relationships>
<autoTerminate>true</autoTerminate>
<name>failure</name>
</relationships>
<relationships>
<autoTerminate>true</autoTerminate>
<name>success</name>
</relationships>
<style></style>
<type>org.apache.nifi.processors.standard.PutFile</type>
</processors>
<processors>
<id>e0cf07fc-0156-1000-0000-000000000000</id>
<parentGroupId>e09e08ae-0156-1000-0000-000000000000</parentGroupId>
<position>
<x>768.3701294054474</x>
<y>410.42819846201985</y>
</position>
<config>
<bulletinLevel>WARN</bulletinLevel>
<comments></comments>
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount>
<descriptors>
<entry>
<key>Script Engine</key>
<value>
<name>Script Engine</name>
</value>
</entry>
<entry>
<key>Script File</key>
<value>
<name>Script File</name>
</value>
</entry>
<entry>
<key>Script Body</key>
<value>
<name>Script Body</name>
</value>
</entry>
<entry>
<key>Module Directory</key>
<value>
<name>Module Directory</name>
</value>
</entry>
</descriptors>
<lossTolerant>false</lossTolerant>
<penaltyDuration>30 sec</penaltyDuration>
<properties>
<entry>
<key>Script Engine</key>
<value>ECMAScript</value>
</entry>
<entry>
<key>Script File</key>
</entry>
<entry>
<key>Script Body</key>
<value>var flowFile = session.get();
if (flowFile != null) {
var StreamCallback = Java.type("org.apache.nifi.processor.io.StreamCallback")
var IOUtils = Java.type("org.apache.commons.io.IOUtils")
var StandardCharsets = Java.type("java.nio.charset.StandardCharsets")
flowFile = session.write(flowFile,
new StreamCallback(function(inputStream, outputStream) {
var text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
var res = text.split(" ");
var count = 0;
for(i in res){
if(parseInt(res[i]) != NaN){
count+=parseInt(res[i]);
}
}
outputStream.write(count.toString().getBytes(StandardCharsets.UTF_8))
}))
flowFile = session.putAttribute(flowFile, "filename", flowFile.getId() + '_count.txt');
session.transfer(flowFile, REL_SUCCESS)
}
</value>
</entry>
<entry>
<key>Module Directory</key>
</entry>
</properties>
<runDurationMillis>0</runDurationMillis>
<schedulingPeriod>0 sec</schedulingPeriod>
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy>
<yieldDuration>1 sec</yieldDuration>
</config>
<name>ExecuteScript</name>
<relationships>
<autoTerminate>true</autoTerminate>
<name>failure</name>
</relationships>
<relationships>
<autoTerminate>false</autoTerminate>
<name>success</name>
</relationships>
<style></style>
<type>org.apache.nifi.processors.script.ExecuteScript</type>
</processors>
</snippet>
<timestamp>08/31/2016 10:01:45 EDT</timestamp>
</template>