Github user justinleet commented on a diff in the pull request:
https://github.com/apache/metron/pull/1245#discussion_r232362818
--- Diff:
metron-platform/metron-parsers/src/main/java/org/apache/metron/parsers/regex/RegularExpressionsParser.java
---
@@ -0,0 +1,427 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
contributor license
+ * agreements. See the NOTICE file distributed with this work for
additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache
License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
ANY KIND, either express
+ * or implied. See the License for the specific language governing
permissions and limitations under
+ * the License.
+ */
+
+package org.apache.metron.parsers.regex;
+
+import java.nio.charset.Charset;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.metron.common.Constants;
+import org.apache.metron.parsers.BasicParser;
+import org.apache.metron.common.Constants.ParserConfigConstants;
+import org.json.simple.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+//@formatter:off
+/**
+ * General-purpose class that parses an unstructured text message into a JSON
object. This class parses
+ * the message according to the parser config supplied as part of the sensor config.
Sensor parser config example:
+ *
+ * <pre>
+ * <code>
+ * "convertCamelCaseToUnderScore": true,
+ * "recordTypeRegex":
"(?<process>(?<=\\s)\\b(kernel|syslog)\\b(?=\\[|:))",
+ * "messageHeaderRegex":
"(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestamp>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<syslogHost>(?<=\\s).*?(?=\\s))",
+ * "fields": [
+ * {
+ * "recordType": "kernel",
+ * "regex": ".*(?<eventInfo>(?<=\\]|\\w\\:).*?(?=$))"
+ * },
+ * {
+ * "recordType": "syslog",
+ * "regex":
".*(?<processid>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))"
+ * }
+ * ]
+ * </code>
+ * </pre>
+ *
+ * Note: messageHeaderRegex could be specified as lists also e.g.
+ *
+ * <pre>
+ * <code>
+ * "messageHeaderRegex": [
+ * "regular expression 1",
+ * "regular expression 2"
+ * ]
+ * </code>
+ * </pre>
+ *
+ * Where <strong>regular expression 1</strong> and <strong>regular expression 2</strong> are valid regular
expressions that may contain named
+ * groups, which are extracted into fields. This list is
evaluated in order until a
+ * matching regular expression is found.<br>
+ * <br>
+ *
+ * <strong>Configuration fields explanation</strong>
+ *
+ * <pre>
+ * recordTypeRegex : used to specify a regular expression to distinctly
identify a record type.
+ * messageHeaderRegex : used to specify a regular expression to extract
fields from a message part which is common across all the messages.
+ * e.g. rhel logs looks like
+ * <code>
+ * <7>Jun 26 16:18:01 hostName kernel: SELinux: initialized (dev tmpfs,
type tmpfs), uses transition SIDs
+ * </code>
+ * <br>
+ * </pre>
+ *
+ * Here message structure (<7>Jun 26 16:18:01 hostName kernel) is common
across all messages.
+ * Hence messageHeaderRegex could be used to extract fields from this part.
+ *
+ * fields : json list of objects containing recordType and regex. regex
could be a further list e.g.
+ *
+ * <pre>
+ * <code>
+ * "regex": [ "record type specific regular expression 1",
+ * "record type specific regular expression 2"]
+ *
+ * </code>
+ * </pre>
+ *
+ * <strong>Limitation</strong> <br>
+ * Currently, named groups in Java regular expressions have a
limitation: only the following
+ * characters can be used in a group name. A capturing group can
also be assigned a "name"
+ * (a named-capturing group) and then be back-referenced later by that
"name". Group names are
+ * composed of the following characters. The first character must be a
letter.
+ *
+ * <pre>
+ * <code>
+ * The uppercase letters 'A' through 'Z' ('\u0041' through '\u005a'),
+ * The lowercase letters 'a' through 'z' ('\u0061' through '\u007a'),
+ * The digits '0' through '9' ('\u0030' through '\u0039'),
+ * </code>
+ * </pre>
+ *
+ * This means that an _ (underscore) cannot be used as part of a
group name. E.g. this is an
+ * invalid regular expression:
<code>.*(?<event_info>(?<=\\]|\\w\\:).*?(?=$))</code>
+ *
+ * However, this limitation can be easily overcome by adding a parser
configuration setting.
+ *
+ * <code>
+ * "convertCamelCaseToUnderScore": true,
+ * </code>
+ * If the above property is added to the parserConfig object of the sensor
parser configuration, this parser will automatically convert all camel case
property names to underscore-separated names.
+ * For example, following convertions will automatically happen:
--- End diff --
Typo: "convertions" should be "conversions".
---