Github user markap14 commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/2371#discussion_r159697649
  
    --- Diff: 
nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/CountText.java
 ---
    @@ -0,0 +1,318 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nifi.processors.standard;
    +
    +import java.io.BufferedReader;
    +import java.io.InputStreamReader;
    +import java.nio.charset.Charset;
    +import java.nio.charset.StandardCharsets;
    +import java.text.DecimalFormat;
    +import java.util.ArrayList;
    +import java.util.Arrays;
    +import java.util.Collections;
    +import java.util.HashMap;
    +import java.util.HashSet;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Set;
    +import java.util.concurrent.atomic.AtomicBoolean;
    +import java.util.stream.Collectors;
    +import org.apache.nifi.annotation.behavior.EventDriven;
    +import org.apache.nifi.annotation.behavior.InputRequirement;
    +import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
    +import org.apache.nifi.annotation.behavior.SideEffectFree;
    +import org.apache.nifi.annotation.behavior.SupportsBatching;
    +import org.apache.nifi.annotation.behavior.WritesAttribute;
    +import org.apache.nifi.annotation.behavior.WritesAttributes;
    +import org.apache.nifi.annotation.documentation.CapabilityDescription;
    +import org.apache.nifi.annotation.documentation.SeeAlso;
    +import org.apache.nifi.annotation.documentation.Tags;
    +import org.apache.nifi.annotation.lifecycle.OnScheduled;
    +import org.apache.nifi.components.PropertyDescriptor;
    +import org.apache.nifi.flowfile.FlowFile;
    +import org.apache.nifi.processor.AbstractProcessor;
    +import org.apache.nifi.processor.ProcessContext;
    +import org.apache.nifi.processor.ProcessSession;
    +import org.apache.nifi.processor.Relationship;
    +import org.apache.nifi.processor.exception.ProcessException;
    +import org.apache.nifi.processor.util.StandardValidators;
    +import org.apache.nifi.util.StringUtils;
    +
    +@EventDriven
    +@SideEffectFree
    +@SupportsBatching
    +@Tags({"count", "text", "line", "word", "character"})
    +@InputRequirement(Requirement.INPUT_REQUIRED)
    +@CapabilityDescription("Counts various metrics on incoming text. The 
requested results will be recorded as attributes. "
    +        + "The resulting flowfile will not have its content modified.")
    +@WritesAttributes({
    +        @WritesAttribute(attribute = "text.line.count", description = "The 
number of lines of text present in the FlowFile content"),
    +        @WritesAttribute(attribute = "text.line.nonempty.count", 
description = "The number of lines of text (with at least one non-whitespace 
character) present in the original FlowFile"),
    +        @WritesAttribute(attribute = "text.word.count", description = "The 
number of words present in the original FlowFile"),
    +        @WritesAttribute(attribute = "text.character.count", description = 
"The number of characters (given the specified character encoding) present in 
the original FlowFile"),
    +})
    +@SeeAlso(SplitText.class)
    +public class CountText extends AbstractProcessor {
    +    private static final List<Charset> STANDARD_CHARSETS = Arrays.asList(
    +            StandardCharsets.UTF_8,
    +            StandardCharsets.US_ASCII,
    +            StandardCharsets.ISO_8859_1,
    +            StandardCharsets.UTF_16,
    +            StandardCharsets.UTF_16LE,
    +            StandardCharsets.UTF_16BE);
    +
    +    private static final String SYMBOL_REGEX = "[\\s-\\._]";
    +    private static final String WHITESPACE_ONLY_REGEX = "\\s";
    +
    +    // Attribute keys
    +    public static final String TEXT_LINE_COUNT = "text.line.count";
    +    public static final String TEXT_LINE_NONEMPTY_COUNT = 
"text.line.nonempty.count";
    +    public static final String TEXT_WORD_COUNT = "text.word.count";
    +    public static final String TEXT_CHARACTER_COUNT = 
"text.character.count";
    +
    +
    +    public static final PropertyDescriptor TEXT_LINE_COUNT_PD = new 
PropertyDescriptor.Builder()
    +            .name("text-line-count")
    +            .displayName("Text Line Count")
    --- End diff --
    
    Given that these are boolean properties, I wonder if it makes more sense to 
word them as "Count Lines", "Count Non-Empty Lines", "Count Words"?
    This is a minor point, and if you don't agree it's okay to keep it as-is, but I 
found the current wording less intuitive personally. Up to you.


---

Reply via email to