[ https://issues.apache.org/jira/browse/NIFI-1280?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15962800#comment-15962800 ]
ASF GitHub Bot commented on NIFI-1280: -------------------------------------- Github user markap14 commented on a diff in the pull request: https://github.com/apache/nifi/pull/1652#discussion_r110646007 --- Diff: nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/grok/GrokReader.java --- @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nifi.grok; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.annotation.lifecycle.OnEnabled; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.controller.ConfigurationContext; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.logging.ComponentLog; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.RowRecordReaderFactory; +import org.apache.nifi.serialization.SchemaRegistryRecordReader; +import org.apache.nifi.serialization.record.RecordSchema; + +import io.thekraken.grok.api.Grok; +import io.thekraken.grok.api.exception.GrokException; + +@Tags({"grok", "logs", "logfiles", "parse", "unstructured", "text", "record", "reader", "regex", "pattern", "logstash"}) +@CapabilityDescription("Provides a mechanism for reading unstructured text data, such as log files, and structuring the data " + + "so that it can be processed. The service is configured using Grok patterns. " + + "The service reads from a stream of data and splits each message that it finds into a separate Record, each containing the fields that are configured. " + + "If a line in the input does not match the expected message pattern, the line of text is considered to be part of the previous " + + "message, with the exception of stack traces. A stack trace that is found at the end of a log message is considered to be part " + + "of the previous message but is added to the 'STACK_TRACE' field of the Record. If a record has no stack trace, it will have a NULL value " + + "for the STACK_TRACE field. All fields that are parsed are considered to be of type String by default. 
If there is need to change the type of a field, " + + "this can be accomplished by configuring the Schema Registry to use and adding the appropriate schema.") +public class GrokReader extends SchemaRegistryRecordReader implements RowRecordReaderFactory { + private volatile Grok grok; + private volatile boolean useSchemaRegistry; + + private static final String DEFAULT_PATTERN_NAME = "/default-grok-patterns.txt"; + + static final PropertyDescriptor PATTERN_FILE = new PropertyDescriptor.Builder() + .name("Grok Pattern File") + .description("Path to a file that contains Grok Patterns to use for parsing logs. If not specified, a built-in default Pattern file " + + "will be used. If specified, all patterns in the given pattern file will override the default patterns. See the Controller Service's " + + "Additional Details for a list of pre-defined patterns.") + .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR) + .expressionLanguageSupported(true) + .required(false) + .build(); + + static final PropertyDescriptor GROK_EXPRESSION = new PropertyDescriptor.Builder() + .name("Grok Expression") + .description("Specifies the format of a log line in Grok format. This allows the Record Reader to understand how to parse each log line. 
" + + "If a line in the log file does not match this pattern, the line will be assumed to belong to the previous log message.") + .addValidator(new GrokExpressionValidator()) + .required(true) + .build(); + + @Override + protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { + final List<PropertyDescriptor> properties = new ArrayList<>(super.getSupportedPropertyDescriptors()); + properties.add(PATTERN_FILE); + properties.add(GROK_EXPRESSION); + return properties; + } + + @OnEnabled + public void preCompile(final ConfigurationContext context) throws GrokException, IOException { + grok = new Grok(); + + try (final InputStream in = getClass().getResourceAsStream(DEFAULT_PATTERN_NAME); + final Reader reader = new InputStreamReader(in)) { + grok.addPatternFromReader(reader); + } + + if (context.getProperty(PATTERN_FILE).isSet()) { + grok.addPatternFromFile(context.getProperty(PATTERN_FILE).getValue()); --- End diff -- Good catch. Overlooked it. > Create QueryFlowFile Processor > ------------------------------ > > Key: NIFI-1280 > URL: https://issues.apache.org/jira/browse/NIFI-1280 > Project: Apache NiFi > Issue Type: Task > Components: Extensions > Reporter: Mark Payne > Assignee: Mark Payne > Fix For: 1.2.0 > > Attachments: QueryFlowFile_Record_Reader-Writer_Examples.xml > > > We should have a Processor that allows users to easily filter out specific > columns from CSV data. For instance, a user would configure two different > properties: "Columns of Interest" (a comma-separated list of column indexes) > and "Filtering Strategy" (Keep Only These Columns, Remove Only These Columns). > We can do this today with ReplaceText, but it is far more difficult than it > would be with this Processor, as the user has to use Regular Expressions, > etc. with ReplaceText. 
> Eventually a Custom UI could even be built that allows a user to upload a > Sample CSV and choose which columns to keep from there, similar to the way that Excel > works when importing CSV by dragging and selecting the desired columns. That > would certainly be a larger undertaking and would not need to be done for an > initial implementation. -- This message was sent by Atlassian JIRA (v6.3.15#6346)