[
https://issues.apache.org/jira/browse/METRON-1795?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16692684#comment-16692684
]
ASF GitHub Bot commented on METRON-1795:
----------------------------------------
Github user jagdeepsingh2 commented on a diff in the pull request:
https://github.com/apache/metron/pull/1245#discussion_r234871255
--- Diff:
metron-platform/metron-parsers/src/main/java/org/apache/metron/parsers/regex/RegularExpressionsParser.java
---
@@ -0,0 +1,427 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
contributor license
+ * agreements. See the NOTICE file distributed with this work for
additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache
License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
ANY KIND, either express
+ * or implied. See the License for the specific language governing
permissions and limitations under
+ * the License.
+ */
+
+package org.apache.metron.parsers.regex;
+
+import java.nio.charset.Charset;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.metron.common.Constants;
+import org.apache.metron.parsers.BasicParser;
+import org.apache.metron.common.Constants.ParserConfigConstants;
+import org.json.simple.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+//@formatter:off
+/**
+ * General purpose class to parse unstructured text message into a json
object. This class parses
+ * the message as per supplied parser config as part of sensor config.
Sensor parser config example:
+ *
+ * <pre>
+ * <code>
+ * "convertCamelCaseToUnderScore": true,
+ * "recordTypeRegex":
"(?<process>(?<=\\s)\\b(kernel|syslog)\\b(?=\\[|:))",
+ * "messageHeaderRegex":
"(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestamp>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<syslogHost>(?<=\\s).*?(?=\\s))",
+ * "fields": [
+ * {
+ * "recordType": "kernel",
+ * "regex": ".*(?<eventInfo>(?<=\\]|\\w\\:).*?(?=$))"
+ * },
+ * {
+ * "recordType": "syslog",
+ * "regex":
".*(?<processid>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))"
+ * }
+ * ]
+ * </code>
+ * </pre>
+ *
+ * Note: messageHeaderRegex could be specified as lists also e.g.
+ *
+ * <pre>
+ * <code>
+ * "messageHeaderRegex": [
+ * "regular expression 1",
+ * "regular expression 2"
+ * ]
+ * </code>
+ * </pre>
+ *
+ * Where <strong>regular expression 1</strong> are valid regular
expressions and may have named
+ * groups, which would be extracted into fields. This list will be
evaluated in order until a
+ * matching regular expression is found.<br>
+ * <br>
+ *
+ * <strong>Configuration fields explanation</strong>
+ *
+ * <pre>
+ * recordTypeRegex : used to specify a regular expression to distinctly
identify a record type.
+ * messageHeaderRegex : used to specify a regular expression to extract
fields from a message part which is common across all the messages.
+ * e.g. rhel logs looks like
+ * <code>
+ * <7>Jun 26 16:18:01 hostName kernel: SELinux: initialized (dev tmpfs,
type tmpfs), uses transition SIDs
+ * </code>
+ * <br>
+ * </pre>
+ *
+ * Here message structure (<7>Jun 26 16:18:01 hostName kernel) is common
across all messages.
+ * Hence messageHeaderRegex could be used to extract fields from this part.
+ *
+ * fields : json list of objects containing recordType and regex. regex
could be a further list e.g.
+ *
+ * <pre>
+ * <code>
+ * "regex": [ "record type specific regular expression 1",
+ * "record type specific regular expression 2"]
+ *
+ * </code>
+ * </pre>
+ *
+ * <strong>Limitation</strong> <br>
+ * Currently the named groups in java regular expressions have a
limitation. Only following
+ * characters could be used to name a named group. A capturing group can
also be assigned a "name",
+ * a named-capturing group, and then be back-referenced later by the
"name". Group names are
+ * composed of the following characters. The first character must be a
letter.
+ *
+ * <pre>
+ * <code>
+ * The uppercase letters 'A' through 'Z' ('\u0041' through '\u005a'),
+ * The lowercase letters 'a' through 'z' ('\u0061' through '\u007a'),
+ * The digits '0' through '9' ('\u0030' through '\u0039'),
+ * </code>
+ * </pre>
+ *
+ * This means that an _ (underscore), cannot be used as part of a named
group name. E.g. this is an
+ * invalid regular expression
<code>.*(?<event_info>(?<=\\]|\\w\\:).*?(?=$))</code>
+ *
+ * However, this limitation can be easily overcome by adding a parser
configuration setting.
+ *
+ * <code>
+ * "convertCamelCaseToUnderScore": true,
+ * </code>
+ * If the above property is added to the parserConfig object of the sensor parser configuration,
+ * this parser will automatically convert all camel case property names to underscore separated names.
+ * For example, the following conversions will happen automatically:
+ *
+ * <code>
+ * ipSrcAddr -> ip_src_addr
+ * ipDstAddr -> ip_dst_addr
+ * ipSrcPort -> ip_src_port
+ * </code>
+ * etc.
+ */
+//@formatter:on
+public class RegularExpressionsParser extends BasicParser {
+
+ /** Class-wide logger; final so that it cannot be reassigned at runtime. */
+ private static final Logger LOG = LoggerFactory.getLogger(RegularExpressionsParser.class);
+
+ /** Charset used to decode the raw message bytes. */
+ private static final Charset UTF_8 = Charset.forName("UTF-8");
+
+ private List<Map<String, Object>> fields;
+ private Map<String, Object> parserConfig;
+ /** Finds named group declarations, e.g. {@code (?<name>}, inside a configured regular expression. */
+ private final Pattern namedGroupPattern = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");
+ /** Pattern for text that contains at least one upper case ASCII letter (A-Z). */
+ Pattern capitalLettersPattern = Pattern.compile("^.*[A-Z]+.*$");
+ private Pattern recordTypePattern;
+ private final Set<String> recordTypePatternNamedGroups = new HashSet<>();
+ private final Map<String, Map<Pattern, Set<String>>> recordTypePatternMap = new HashMap<>();
+ // LinkedHashMap rather than HashMap: the header expressions are documented to be evaluated in
+ // the configured order until one matches. Pattern does not override hashCode(), so a plain
+ // HashMap would iterate them in an arbitrary order.
+ private final Map<Pattern, Set<String>> syslogPatternsMap = new LinkedHashMap<>();
+
+ /**
+ * Parses an unstructured text message into a json object based upon the
regular expression
+ * configuration supplied.
+ *
+ * @param rawMessage incoming unstructured raw text.
+ *
+ * @return List of json parsed json objects. In this case list will have
a single element only.
+ */
+ @Override
+ public List<JSONObject> parse(byte[] rawMessage) {
+ String originalMessage = null;
+ try {
+ originalMessage = new String(rawMessage, UTF_8).trim();
+ LOG.debug(" raw message. {}", originalMessage);
+ if (originalMessage.isEmpty()) {
+ LOG.warn("Message is empty.");
+ return Arrays.asList(new JSONObject());
+ }
+ } catch (final Exception e) {
+ LOG.error("[Metron] Could not read raw message. {} " +
originalMessage, e);
+ throw new RuntimeException(e.getMessage(), e);
+ }
+
+ try {
+ final JSONObject parsedJson = new JSONObject();
+ if (syslogPatternsMap.size() > 0) {
+ parsedJson.putAll(extractHeaderFields(originalMessage));
+ }
+ parsedJson.putAll(parse(originalMessage));
+ parsedJson.put(Constants.Fields.ORIGINAL.getName(), originalMessage);
+ applyFieldTransformations(parsedJson);
+ return Arrays.asList(parsedJson);
+ } catch (final ParseException e) {
+ LOG.error("Error occured in parsing. original message : " +
originalMessage, e);
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+
+ private void applyFieldTransformations(JSONObject parsedJson) {
+ if (getParserConfig()
+
.get(ParserConfigConstants.CONVERT_CAMELCASE_TO_UNDERSCORE.getName()) != null
+ && (Boolean) getParserConfig()
+
.get(ParserConfigConstants.CONVERT_CAMELCASE_TO_UNDERSCORE.getName())) {
+ convertCamelCaseToUnderScore(parsedJson);
+ }
+
+ }
+
+ // @formatter:off
+ /**
+ * This method is called during the parser initialization. It parses the
parser
+ * configuration and configures the parser accordingly. It then
initializes
+ * instance variables.
+ *
+ * @param parserConfig ParserConfig(Map<String, Object>) supplied to the
sensor.
+ * @see
org.apache.metron.parsers.interfaces.Configurable#configure(java.util.Map)<br>
+ * <br>
+ */
+ // @formatter:on
+ @Override
+ public void configure(Map<String, Object> parserConfig) {
+ setParserConfig(parserConfig);
+ setFields(
+ (List<Map<String, Object>>)
getParserConfig().get(ParserConfigConstants.FIELDS.getName()));
+
+ setRecordTypePattern(
+ (String)
getParserConfig().get(ParserConfigConstants.RECORD_TYPE_REGEX.getName()));
+ recordTypePatternNamedGroups.addAll(getNamedGroups(
+ (String)
getParserConfig().get(ParserConfigConstants.RECORD_TYPE_REGEX.getName())));
+ final List<Map<String, Object>> fields =
+ (List<Map<String, Object>>)
getParserConfig().get(ParserConfigConstants.FIELDS.getName());
+
+ configureRecordTypePatterns(fields);
+
+ configureMessageHeaderPattern();
+
+ validateConfig();
+ }
+
+ /**
+ * Compiles the message header regular expression(s) found in the parser config and registers
+ * each compiled pattern, together with its named groups, in {@code syslogPatternsMap}. The
+ * config value may be a list of expressions or a single expression; a single blank expression
+ * and values of any other type are ignored.
+ */
+ private void configureMessageHeaderPattern() {
+   final Object headerConfig =
+       getParserConfig().get(ParserConfigConstants.MESSAGE_HEADER.getName());
+
+   if (headerConfig instanceof List) {
+     for (final String headerRegex : (List<String>) headerConfig) {
+       syslogPatternsMap.put(Pattern.compile(headerRegex), getNamedGroups(headerRegex));
+     }
+     return;
+   }
+
+   if (headerConfig instanceof String) {
+     final String headerRegex = (String) headerConfig;
+     if (StringUtils.isNotBlank(headerRegex)) {
+       syslogPatternsMap.put(Pattern.compile(headerRegex), getNamedGroups(headerRegex));
+     }
+   }
+ }
+
+ /**
+ * Builds the record type to pattern lookup. For every entry that has both a record type and a
+ * regex, the lower-cased record type is mapped to the compiled expression(s) and their named
+ * groups, kept in the order in which they are configured. Entries missing either value are
+ * skipped.
+ *
+ * @param fields the fields entries of the parser config.
+ */
+ private void configureRecordTypePatterns(List<Map<String, Object>> fields) {
+   for (final Map<String, Object> fieldConfig : fields) {
+     final Object recordTypeValue = fieldConfig.get(ParserConfigConstants.RECORD_TYPE.getName());
+     final Object regexValue = fieldConfig.get(ParserConfigConstants.REGEX.getName());
+     if (recordTypeValue == null || regexValue == null) {
+       continue;
+     }
+
+     final Map<Pattern, Set<String>> patterns = new LinkedHashMap<Pattern, Set<String>>();
+     recordTypePatternMap.put(((String) recordTypeValue).toLowerCase(), patterns);
+
+     if (regexValue instanceof List) {
+       for (final String regex : (List<String>) regexValue) {
+         patterns.put(Pattern.compile(regex), getNamedGroups(regex));
+       }
+     } else if (regexValue instanceof String) {
+       final String regex = (String) regexValue;
+       patterns.put(Pattern.compile(regex), getNamedGroups(regex));
+     }
+   }
+ }
+
+ /**
+ * Compiles the record type regular expression and stores it in {@code recordTypePattern}.
+ * A {@code null} expression leaves the current pattern untouched.
+ *
+ * @param recordTypeRegex regular expression identifying the record type, may be null.
+ */
+ private void setRecordTypePattern(String recordTypeRegex) {
+   if (recordTypeRegex == null) {
+     return;
+   }
+   recordTypePattern = Pattern.compile(recordTypeRegex);
+ }
+
+ /**
+ * Extracts the record type specific fields and the named groups of the record type expression
+ * from a message. The record type specific fields are added first, so a named group of the
+ * record type expression overrides a field with the same name.
+ *
+ * @param originalMessage trimmed message text.
+ * @return json object holding the extracted fields; empty when nothing matched.
+ * @throws ParseException declared for callers; not thrown by the code in this method.
+ */
+ private JSONObject parse(String originalMessage) throws ParseException {
+   final JSONObject result = new JSONObject();
+
+   // Record type specific fields, looked up by the text the record type expression matched.
+   getField(recordTypePattern, originalMessage)
+       .ifPresent(recordType -> extractNamedGroups(result, recordType, originalMessage));
+
+   // Fields (named groups) of the record type expression itself.
+   final Matcher recordTypeMatcher = recordTypePattern.matcher(originalMessage);
+   if (!recordTypeMatcher.find()) {
+     return result;
+   }
+   for (final String groupName : recordTypePatternNamedGroups) {
+     final String captured = recordTypeMatcher.group(groupName);
+     if (captured != null) {
+       result.put(groupName, captured.trim());
+     }
+   }
+   return result;
+ }
+
+ /**
+ * Copies the named groups of the first fully matching expression configured for a record type
+ * into the given json map. Expressions without named groups are skipped. A warning is logged
+ * when no expressions are configured for the record type.
+ *
+ * @param json target map receiving the trimmed group values.
+ * @param recordType record type identifier; it is looked up in lower case.
+ * @param originalMessage message text the expressions are matched against.
+ */
+ private void extractNamedGroups(Map<String, Object> json, String recordType,
+     String originalMessage) {
+   final Map<Pattern, Set<String>> candidates =
+       recordTypePatternMap.get(recordType.toLowerCase());
+   if (candidates == null) {
+     LOG.warn("No pattern found for record type : {}", recordType);
+     return;
+   }
+
+   for (final Map.Entry<Pattern, Set<String>> candidate : candidates.entrySet()) {
+     final Pattern regex = candidate.getKey();
+     final Set<String> groupNames = candidate.getValue();
+     if (regex == null || groupNames == null || groupNames.size() == 0) {
+       continue;
+     }
+
+     // matches() requires the whole message to match, not just a part of it.
+     final Matcher fullMatch = regex.matcher(originalMessage);
+     if (!fullMatch.matches()) {
+       continue;
+     }
+
+     LOG.debug("RecordType : {} Trying regex : {} for message : {} ", recordType,
+         regex.toString(), originalMessage);
+     for (final String groupName : groupNames) {
+       final String captured = fullMatch.group(groupName);
+       if (captured != null) {
+         json.put(groupName, captured.trim());
+       }
+     }
+     // Only the first matching expression contributes fields.
+     return;
+   }
+ }
+
+ /**
+ * Returns the text of the first occurrence of a pattern within a message.
+ *
+ * @param pattern pattern to search for.
+ * @param originalMessage message text to search in.
+ * @return the first matched text, or an empty Optional when the pattern does not occur.
+ */
+ public Optional<String> getField(Pattern pattern, String originalMessage) {
+   final Matcher fieldMatcher = pattern.matcher(originalMessage);
+   if (fieldMatcher.find()) {
+     return Optional.of(fieldMatcher.group());
+   }
+   return Optional.empty();
+ }
+
+ /**
+ * Collects the names of all named groups declared in a regular expression.
+ *
+ * @param regex regular expression source text.
+ * @return sorted set of the group names; empty when the expression declares none.
+ */
+ private Set<String> getNamedGroups(String regex) {
+   final Set<String> groupNames = new TreeSet<>();
+   for (final Matcher declaration = namedGroupPattern.matcher(regex); declaration.find();) {
+     groupNames.add(declaration.group(1));
+   }
+   return groupNames;
+ }
+
+ /**
+ * Extracts the header fields of a message. The configured header patterns are iterated and
+ * the first one found in the message supplies the fields; named groups whose value is blank
+ * are left out.
+ *
+ * @param originalMessage message text to extract the header fields from.
+ * @return map of named group to trimmed value; empty when no header pattern is found.
+ */
+ private Map<String, Object> extractHeaderFields(String originalMessage) {
+   final Map<String, Object> headerFields = new JSONObject();
+   for (final Map.Entry<Pattern, Set<String>> candidate : syslogPatternsMap.entrySet()) {
+     final Matcher headerMatcher = candidate.getKey().matcher(originalMessage);
+     if (!headerMatcher.find()) {
+       continue;
+     }
+     for (final String groupName : candidate.getValue()) {
+       final String captured = headerMatcher.group(groupName);
+       if (StringUtils.isNotBlank(captured)) {
+         headerFields.put(groupName, captured.trim());
+       }
+     }
+     // Later patterns are not consulted once one has been found.
+     return headerFields;
+   }
+   return headerFields;
+ }
+
+ /** Initialisation hook; this parser only logs that it has been initialised. */
+ @Override
+ public void init() {
+   LOG.info("RegularExpressions parser initialised.");
+ }
+
+ public void validateConfig() {
+ if (getFields() == null) {
+ LOG.error("Invalid config : fields is missing in parserConfig");
+ throw new IllegalStateException("Invalid config :fields is missing
in parserConfig");
+ }
+ if (recordTypePattern == null) {
+ LOG.error("Invalid config :recordTypeRegex is missing in
parserConfig");
+ throw new IllegalStateException("Invalid config :recordTypeRegex is
missing in parserConfig");
+ }
+ }
+
+ private void convertCamelCaseToUnderScore(Map<String, Object> json) {
--- End diff --
Thanks for pointing it out. I have refactored it to use a library function
from guava.
`CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, entry.getKey())`
> General Purpose Regex Parser
> ----------------------------
>
> Key: METRON-1795
> URL: https://issues.apache.org/jira/browse/METRON-1795
> Project: Metron
> Issue Type: New Feature
> Reporter: Jagdeep Singh
> Priority: Minor
>
> We have implemented a general purpose regex parser for Metron that we are
> interested in contributing back to the community.
>
> While the Metron Grok parser provides some regex based capability today, the
> intention of this general purpose regex parser is to:
> # Allow for more advanced parsing scenarios (specifically, dealing with
> multiple regex lines for devices that contain several log formats within them)
> # Give users and developers of Metron additional options for parsing
> # With the new parser chaining and regex routing feature available in
> Metron, this gives some additional flexibility to logically separate a flow
> by:
> # Regex routing to segregate logs at a device level and handle envelope
> unwrapping
> # This general purpose regex parser to parse an entire device type that
> contains multiple log formats within the single device (for example, RHEL
> logs)
> At the high-level control flow is like this:
> # Identify the record type of the incoming raw message.
> # Find and apply the regular expression of corresponding record type to
> extract the fields (using named groups).
> # Apply the message header regex to extract the fields in the header part of
> the message (using named groups).
>
> The parser config uses the following structure:
>
> {code:java}
> "recordTypeRegex": "(?<process>(?<=\\s)\\b(kernel|syslog)\\b(?=\\[|:))"
> "messageHeaderRegex":
> "(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestamp>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<syslogHost>(?<=\\s).*?(?=\\s))",
> "fields": [
> {
> "recordType": "kernel",
> "regex": ".*(?<eventInfo>(?<=\\]|\\w\\:).*?(?=$))"
> },
> {
> "recordType": "syslog",
> "regex":
> ".*(?<processid>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))"
> }
> ]
> {code}
>
> Where:
> * *recordTypeRegex* is used to distinctly identify a record type. It inputs
> a valid regular expression and may also have named groups, which would be
> extracted into fields.
> * *messageHeaderRegex* is used to specify a regular expression to extract
> fields from a message part which is common across all the messages (i.e,
> syslog fields, standard headers)
> * *fields*: json list of objects containing recordType and regex. The
> expression that is evaluated is based on the output of the recordTypeRegex
> * Note: *recordTypeRegex* and *messageHeaderRegex* could be specified as
> lists also (as a JSON array), where the list will be evaluated in order until
> a matching regular expression is found.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)