[ 
https://issues.apache.org/jira/browse/NIFI-2072?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17151064#comment-17151064
 ] 

Otto Fowler commented on NIFI-2072:
-----------------------------------

OK

I have a PR just about ready for this.  But just to get some feedback first:

After the PR there are implicitly two ways the processor works, based on the 
"enable named groups" property.
The old way if it is not enabled.

The new way.
The new way is different in that numeric indices are not added until the second 
set of matches (if you have that enabled).

The root attribute name is used for the 0 group -or- the whole match line if 
there are no groups specified.

For example:

{code:java}
@Test
    public void testFindAll() throws Exception {
        // Single named group with both named groups and repeating capture switched on.
        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
        runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");

        final String attributeKey = "regex.result";
        final String wordKey = attributeKey + ".W";
        runner.setProperty(attributeKey, "(?s)(?<W>\\w+)");

        final byte[] content = "This is my text".getBytes(StandardCharsets.UTF_8);
        runner.enqueue(content);
        runner.run();
        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);

        final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

        // The root attribute and the named attribute must be present, followed by
        // one numbered attribute for each additional match.
        flowFile.assertAttributeExists(attributeKey);
        flowFile.assertAttributeExists(wordKey);
        flowFile.assertAttributeExists(wordKey + ".1");
        flowFile.assertAttributeExists(wordKey + ".2");
        flowFile.assertAttributeExists(wordKey + ".3");

        // Expected values: the first word under the root and named keys, the
        // remaining words under the numbered suffixes in order of appearance.
        flowFile.assertAttributeEquals(attributeKey, "This");
        flowFile.assertAttributeEquals(wordKey, "This");
        flowFile.assertAttributeEquals(wordKey + ".1", "is");
        flowFile.assertAttributeEquals(wordKey + ".2", "my");
        flowFile.assertAttributeEquals(wordKey + ".3", "text");
    }

    @Test
    public void testFindAllPair() throws Exception {
        final TestRunner testRunner = TestRunners.newTestRunner(new 
ExtractText());
        testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
        testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, 
"true");
        final String attributeKey = "regex.result";
        testRunner.setProperty(attributeKey, "(?<LEFT>\\w+)=(?<RIGHT>\\d+)");
        testRunner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
        final MockFlowFile out = 
testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
        // Ensure the zero capture group is in the resultant attributes
        out.assertAttributeExists(attributeKey);
        out.assertAttributeExists(attributeKey + ".LEFT");
        out.assertAttributeExists(attributeKey + ".RIGHT");
        out.assertAttributeExists(attributeKey + ".LEFT.1");
        out.assertAttributeExists(attributeKey + ".RIGHT.1");
        out.assertAttributeExists(attributeKey + ".LEFT.2");
        out.assertAttributeExists(attributeKey + ".RIGHT.2");
        out.assertAttributeNotExists(attributeKey + ".LEFT.3"); // Ensure 
there's no more attributes
        out.assertAttributeNotExists(attributeKey + ".RIGHT.3"); // Ensure 
there's no more attributes
        out.assertAttributeEquals(attributeKey , "a=1");
        out.assertAttributeEquals(attributeKey + ".LEFT", "a");
        out.assertAttributeEquals(attributeKey + ".RIGHT", "1");
        out.assertAttributeEquals(attributeKey + ".LEFT.1", "b");
        out.assertAttributeEquals(attributeKey + ".RIGHT.1", "10");
        out.assertAttributeEquals(attributeKey + ".LEFT.2", "c");
        out.assertAttributeEquals(attributeKey + ".RIGHT.2", "100");
    }
{code}


> Support named captures in ExtractText
> -------------------------------------
>
>                 Key: NIFI-2072
>                 URL: https://issues.apache.org/jira/browse/NIFI-2072
>             Project: Apache NiFi
>          Issue Type: Improvement
>            Reporter: Joey Frazee
>            Assignee: Otto Fowler
>            Priority: Major
>
> ExtractText currently captures and creates attributes using numeric indices 
> (e.g, attribute.name.0, attribute.name.1, etc.) whether or not the capture 
> groups are named, i.e., patterns like (?<name>\w+).
> In addition to being more faithful to the provided regexes, named captures 
> could help simplify data flows because you wouldn't have to add superfluous 
> UpdateAttribute steps which are just renaming the indexed captures to more 
> interpretable names.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to