Github user jagdeepsingh2 commented on a diff in the pull request:

    https://github.com/apache/metron/pull/1245#discussion_r234872641
  
    --- Diff: 
metron-platform/metron-parsers/src/main/java/org/apache/metron/parsers/regex/RegularExpressionsParser.java
 ---
    @@ -0,0 +1,427 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license
    + * agreements. See the NOTICE file distributed with this work for 
additional information regarding
    + * copyright ownership. The ASF licenses this file to you under the Apache 
License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the 
License. You may obtain a
    + * copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software 
distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 
ANY KIND, either express
    + * or implied. See the License for the specific language governing 
permissions and limitations under
    + * the License.
    + */
    +
    +package org.apache.metron.parsers.regex;
    +
    +import java.nio.charset.Charset;
    +import java.text.ParseException;
    +import java.util.ArrayList;
    +import java.util.Arrays;
    +import java.util.HashMap;
    +import java.util.HashSet;
    +import java.util.LinkedHashMap;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Optional;
    +import java.util.Set;
    +import java.util.TreeSet;
    +import java.util.regex.Matcher;
    +import java.util.regex.Pattern;
    +import java.util.stream.Collectors;
    +import org.apache.commons.lang3.StringUtils;
    +import org.apache.metron.common.Constants;
    +import org.apache.metron.parsers.BasicParser;
    +import org.apache.metron.common.Constants.ParserConfigConstants;
    +import org.json.simple.JSONObject;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
    +
    +//@formatter:off
    +/**
    + * General purpose class to parse an unstructured text message into a JSON 
object. This class parses
    + * the message according to the parser config supplied as part of the sensor config. 
Sensor parser config example:
    + *
    + * <pre>
    + * <code>
    + * "convertCamelCaseToUnderScore": true,
    + * "recordTypeRegex": 
"(?&lt;process&gt;(?&lt;=\\s)\\b(kernel|syslog)\\b(?=\\[|:))",
    + * "messageHeaderRegex": 
"(?&lt;syslogpriority&gt;(?&lt;=^&lt;)\\d{1,4}(?=&gt;)).*?(?&lt;timestamp&gt;(?&lt;=&gt;)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?&lt;syslogHost&gt;(?&lt;=\\s).*?(?=\\s))",
    + * "fields": [
    + * {
    + * "recordType": "kernel",
    + * "regex": ".*(?&lt;eventInfo&gt;(?&lt;=\\]|\\w\\:).*?(?=$))"
    + * },
    + * {
    + * "recordType": "syslog",
    + * "regex": 
".*(?&lt;processid&gt;(?&lt;=PID\\s=\\s).*?(?=\\sLine)).*(?&lt;filePath&gt;(?&lt;=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?&lt;fileName&gt;.*?(?=\")).*(?&lt;eventInfo&gt;(?&lt;=\").*?(?=$))"
    + * }
    + * ]
    + * </code>
    + * </pre>
    + *
    + * Note: messageHeaderRegex could be specified as lists also e.g.
    + *
    + * <pre>
    + * <code>
    + * "messageHeaderRegex": [
    + * "regular expression 1",
    + * "regular expression 2"
    + * ]
    + * </code>
    + * </pre>
    + *
    + * Where <strong>regular expression 1</strong> and <strong>regular expression 2</strong> are valid regular 
expressions that may contain named
    + * groups, which are extracted into fields. This list is 
evaluated in order until a
    + * matching regular expression is found.<br>
    + * <br>
    + *
    + * <strong>Configuration fields explanation</strong>
    + *
    + * <pre>
    + * recordTypeRegex : used to specify a regular expression to distinctly 
identify a record type.
    + * messageHeaderRegex :  used to specify a regular expression to extract 
fields from a message part which is common across all the messages.
    + * e.g. RHEL logs look like
    + * <code>
    + * &lt;7&gt;Jun 26 16:18:01 hostName kernel: SELinux: initialized (dev tmpfs, 
type tmpfs), uses transition SIDs
    + * </code>
    + * <br>
    + * </pre>
    + *
    + * Here the message structure (&lt;7&gt;Jun 26 16:18:01 hostName kernel) is common 
across all messages.
    + * Hence messageHeaderRegex can be used to extract fields from this part.
    + *
    + * fields : json list of objects containing recordType and regex. regex 
could be a further list e.g.
    + *
    + * <pre>
    + * <code>
    + * "regex":  [ "record type specific regular expression 1",
    + *             "record type specific regular expression 2"]
    + *
    + * </code>
    + * </pre>
    + *
    + * <strong>Limitation</strong> <br>
    + * Currently, named groups in Java regular expressions have a 
limitation: only the following
    + * characters can be used in a group name. A capturing group can 
also be assigned a "name",
    + * making it a named-capturing group, and then be back-referenced later by that 
"name". Group names are
    + * composed of the following characters. The first character must be a 
letter.
    + *
    + * <pre>
    + * <code>
    + * The uppercase letters 'A' through 'Z' ('\u0041' through '\u005a'),
    + * The lowercase letters 'a' through 'z' ('\u0061' through '\u007a'),
    + * The digits '0' through '9' ('\u0030' through '\u0039'),
    + * </code>
    + * </pre>
    + *
    + * This means that an _ (underscore) cannot be used as part of a named 
group name. E.g. this is an
    + * invalid regular expression 
<code>.*(?&lt;event_info&gt;(?&lt;=\\]|\\w\\:).*?(?=$))</code>
    + *
    + * However, this limitation can be easily overcome by adding a parser 
configuration setting.
    + *
    + * <code>
    + *  "convertCamelCaseToUnderScore": true,
    + * </code>
    + * If the above property is added to the sensor parser configuration, in the 
parserConfig object, this parser will automatically convert all the camel case 
property names to underscore separated.
    + * For example, the following conversions will automatically happen:
    --- End diff --
    
    Corrected. Thanks


---

Reply via email to