Github user kevdoran commented on a diff in the pull request: https://github.com/apache/nifi/pull/2371#discussion_r160242565 --- Diff: nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/CountText.java --- @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi.processors.standard; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.nifi.annotation.behavior.EventDriven; +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.InputRequirement.Requirement; +import org.apache.nifi.annotation.behavior.SideEffectFree; +import org.apache.nifi.annotation.behavior.SupportsBatching; +import org.apache.nifi.annotation.behavior.WritesAttribute; +import org.apache.nifi.annotation.behavior.WritesAttributes; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.SeeAlso; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.annotation.lifecycle.OnScheduled; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.AbstractProcessor; +import org.apache.nifi.processor.ProcessContext; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.processor.Relationship; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.util.StringUtils; + +@EventDriven +@SideEffectFree +@SupportsBatching +@Tags({"count", "text", "line", "word", "character"}) +@InputRequirement(Requirement.INPUT_REQUIRED) +@CapabilityDescription("Counts various metrics on incoming text. 
The requested results will be recorded as attributes. " + + "The resulting flowfile will not have its content modified.") +@WritesAttributes({ + @WritesAttribute(attribute = "text.line.count", description = "The number of lines of text present in the FlowFile content"), + @WritesAttribute(attribute = "text.line.nonempty.count", description = "The number of lines of text (with at least one non-whitespace character) present in the original FlowFile"), + @WritesAttribute(attribute = "text.word.count", description = "The number of words present in the original FlowFile"), + @WritesAttribute(attribute = "text.character.count", description = "The number of characters (given the specified character encoding) present in the original FlowFile"), +}) +@SeeAlso(SplitText.class) +public class CountText extends AbstractProcessor { + private static final List<Charset> STANDARD_CHARSETS = Arrays.asList( + StandardCharsets.UTF_8, + StandardCharsets.US_ASCII, + StandardCharsets.ISO_8859_1, + StandardCharsets.UTF_16, + StandardCharsets.UTF_16LE, + StandardCharsets.UTF_16BE); + + private static final Pattern SYMBOL_PATTERN = Pattern.compile("[\\s-\\._]"); + private static final Pattern WHITESPACE_ONLY_PATTERN = Pattern.compile("\\s"); + + // Attribute keys + public static final String TEXT_LINE_COUNT = "text.line.count"; + public static final String TEXT_LINE_NONEMPTY_COUNT = "text.line.nonempty.count"; + public static final String TEXT_WORD_COUNT = "text.word.count"; + public static final String TEXT_CHARACTER_COUNT = "text.character.count"; + + + public static final PropertyDescriptor TEXT_LINE_COUNT_PD = new PropertyDescriptor.Builder() + .name("text-line-count") + .displayName("Count Lines") + .description("If enabled, will count the number of lines present in the incoming text.") + .required(true) + .allowableValues("true", "false") + .defaultValue("true") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .build(); + public static final PropertyDescriptor 
TEXT_LINE_NONEMPTY_COUNT_PD = new PropertyDescriptor.Builder() + .name("text-line-nonempty-count") + .displayName("Count Non-Empty Lines") + .description("If enabled, will count the number of lines that contain a non-whitespace character present in the incoming text.") + .required(true) + .allowableValues("true", "false") + .defaultValue("false") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .build(); + public static final PropertyDescriptor TEXT_WORD_COUNT_PD = new PropertyDescriptor.Builder() + .name("text-word-count") + .displayName("Count Words") + .description("If enabled, will count the number of words (alphanumeric character groups bounded by whitespace)" + + " present in the incoming text. Common logical delimiters [_-.] do not bound a word unless 'Split Words on Symbols' is true.") + .required(true) + .allowableValues("true", "false") + .defaultValue("false") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .build(); + public static final PropertyDescriptor TEXT_CHARACTER_COUNT_PD = new PropertyDescriptor.Builder() + .name("text-character-count") + .displayName("Count Characters") + .description("If enabled, will count the number of characters (including whitespace and symbols, but not including newlines and carriage returns) present in the incoming text.") + .required(true) + .allowableValues("true", "false") + .defaultValue("false") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .build(); + public static final PropertyDescriptor SPLIT_WORDS_ON_SYMBOLS_PD = new PropertyDescriptor.Builder() + .name("split-words-on-symbols") + .displayName("Split Words on Symbols") + .description("If enabled, the word count will identify strings separated by common logical delimiters [_-.] as independent words (ex. split-words-on-symbols = 4 words).") --- End diff -- This is a minor issue, but I noticed that when rendered using a variable-width font in the tooltip, the character group `[_-.]` is difficult to read (see screenshot). 
It might be better to write this a different way for legibility. ![image](https://user-images.githubusercontent.com/5102332/34689958-548c1bbe-f485-11e7-9fc3-a32f7776c1ef.png)
---