Github user jagdeepsingh2 commented on a diff in the pull request: https://github.com/apache/metron/pull/1245#discussion_r234871662 --- Diff: metron-platform/metron-parsers/src/main/java/org/apache/metron/parsers/regex/RegularExpressionsParser.java --- @@ -0,0 +1,427 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.metron.parsers.regex; + +import java.nio.charset.Charset; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.metron.common.Constants; +import org.apache.metron.parsers.BasicParser; +import org.apache.metron.common.Constants.ParserConfigConstants; +import org.json.simple.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +//@formatter:off +/** + * General purpose class to parse unstructured text message into a json object. 
This class parses + * the message as per supplied parser config as part of sensor config. Sensor parser config example: + * + * <pre> + * <code> + * "convertCamelCaseToUnderScore": true, + * "recordTypeRegex": "(?<process>(?<=\\s)\\b(kernel|syslog)\\b(?=\\[|:))", + * "messageHeaderRegex": "(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestamp>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<syslogHost>(?<=\\s).*?(?=\\s))", + * "fields": [ + * { + * "recordType": "kernel", + * "regex": ".*(?<eventInfo>(?<=\\]|\\w\\:).*?(?=$))" + * }, + * { + * "recordType": "syslog", + * "regex": ".*(?<processid>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))" + * } + * ] + * </code> + * </pre> + * + * Note: messageHeaderRegex could be specified as lists also e.g. + * + * <pre> + * <code> + * "messageHeaderRegex": [ + * "regular expression 1", + * "regular expression 2" + * ] + * </code> + * </pre> + * + * Where <strong>regular expression 1</strong> are valid regular expressions and may have named + * groups, which would be extracted into fields. This list will be evaluated in order until a + * matching regular expression is found.<br> + * <br> + * + * <strong>Configuration fields explanation</strong> + * + * <pre> + * recordTypeRegex : used to specify a regular expression to distinctly identify a record type. + * messageHeaderRegex : used to specify a regular expression to extract fields from a message part which is common across all the messages. + * e.g. rhel logs looks like + * <code> + * <7>Jun 26 16:18:01 hostName kernel: SELinux: initialized (dev tmpfs, type tmpfs), uses transition SIDs + * </code> + * <br> + * </pre> + * + * Here message structure (<7>Jun 26 16:18:01 hostName kernel) is common across all messages. + * Hence messageHeaderRegex could be used to extract fields from this part. + * + * fields : json list of objects containing recordType and regex. 
regex could be a further list e.g. + * + * <pre> + * <code> + * "regex": [ "record type specific regular expression 1", + * "record type specific regular expression 2"] + * + * </code> + * </pre> + * + * <strong>Limitation</strong> <br> + * Currently the named groups in java regular expressions have a limitation. Only following + * characters could be used to name a named group. A capturing group can also be assigned a "name", + * a named-capturing group, and then be back-referenced later by the "name". Group names are + * composed of the following characters. The first character must be a letter. + * + * <pre> + * <code> + * The uppercase letters 'A' through 'Z' ('\u0041' through '\u005a'), + * The lowercase letters 'a' through 'z' ('\u0061' through '\u007a'), + * The digits '0' through '9' ('\u0030' through '\u0039'), + * </code> + * </pre> + * + * This means that an _ (underscore), cannot be used as part of a named group name. E.g. this is an + * invalid regular expression <code>.*(?<event_info>(?<=\\]|\\w\\:).*?(?=$))</code> + * + * However, this limitation can be easily overcome by adding a parser configuration setting. + * + * <code> + * "convertCamelCaseToUnderScore": true, + * <code> + * If above property is added to the sensor parser configuration, in parserConfig object, this parser will automatically convert all the camel case property names to underscore seperated. + * For example, following convertions will automatically happen: + * + * <code> + * ipSrcAddr -> ip_src_addr + * ipDstAddr -> ip_dst_addr + * ipSrcPort -> ip_src_port + * <code> + * etc. 
+ */ +//@formatter:on +public class RegularExpressionsParser extends BasicParser { + + private static Logger LOG = LoggerFactory.getLogger(RegularExpressionsParser.class); + + private static final Charset UTF_8 = Charset.forName("UTF-8"); + + private List<Map<String, Object>> fields; + private Map<String, Object> parserConfig; + private final Pattern namedGroupPattern = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>"); + Pattern capitalLettersPattern = Pattern.compile("^.*[A-Z]+.*$"); + private Pattern recordTypePattern; + private final Set<String> recordTypePatternNamedGroups = new HashSet<>(); + private final Map<String, Map<Pattern, Set<String>>> recordTypePatternMap = new HashMap<>(); + private final Map<Pattern, Set<String>> syslogPatternsMap = new HashMap<>(); + + /** + * Parses an unstructured text message into a json object based upon the regular expression + * configuration supplied. + * + * @param rawMessage incoming unstructured raw text. + * + * @return List of json parsed json objects. In this case list will have a single element only. + */ + @Override + public List<JSONObject> parse(byte[] rawMessage) { + String originalMessage = null; + try { + originalMessage = new String(rawMessage, UTF_8).trim(); + LOG.debug(" raw message. {}", originalMessage); + if (originalMessage.isEmpty()) { + LOG.warn("Message is empty."); + return Arrays.asList(new JSONObject()); + } + } catch (final Exception e) { + LOG.error("[Metron] Could not read raw message. {} " + originalMessage, e); + throw new RuntimeException(e.getMessage(), e); + } + + try { + final JSONObject parsedJson = new JSONObject(); + if (syslogPatternsMap.size() > 0) { + parsedJson.putAll(extractHeaderFields(originalMessage)); + } + parsedJson.putAll(parse(originalMessage)); + parsedJson.put(Constants.Fields.ORIGINAL.getName(), originalMessage); + applyFieldTransformations(parsedJson); + return Arrays.asList(parsedJson); + } catch (final ParseException e) { + LOG.error("Error occured in parsing. 
original message : " + originalMessage, e); + throw new RuntimeException(e.getMessage(), e); + } + } + + private void applyFieldTransformations(JSONObject parsedJson) { + if (getParserConfig() + .get(ParserConfigConstants.CONVERT_CAMELCASE_TO_UNDERSCORE.getName()) != null + && (Boolean) getParserConfig() + .get(ParserConfigConstants.CONVERT_CAMELCASE_TO_UNDERSCORE.getName())) { + convertCamelCaseToUnderScore(parsedJson); + } + + } + + // @formatter:off + /** + * This method is called during the parser initialization. It parses the parser + * configuration and configures the parser accordingly. It then initializes + * instance variables. + * + * @param parserConfig ParserConfig(Map<String, Object>) supplied to the sensor. + * @see org.apache.metron.parsers.interfaces.Configurable#configure(java.util.Map)<br> + * <br> + */ + // @formatter:on + @Override + public void configure(Map<String, Object> parserConfig) { + setParserConfig(parserConfig); + setFields( + (List<Map<String, Object>>) getParserConfig().get(ParserConfigConstants.FIELDS.getName())); + + setRecordTypePattern( + (String) getParserConfig().get(ParserConfigConstants.RECORD_TYPE_REGEX.getName())); + recordTypePatternNamedGroups.addAll(getNamedGroups( + (String) getParserConfig().get(ParserConfigConstants.RECORD_TYPE_REGEX.getName()))); + final List<Map<String, Object>> fields = + (List<Map<String, Object>>) getParserConfig().get(ParserConfigConstants.FIELDS.getName()); + + configureRecordTypePatterns(fields); + + configureMessageHeaderPattern(); + + validateConfig(); + } + + private void configureMessageHeaderPattern() { + if (getParserConfig().get(ParserConfigConstants.MESSAGE_HEADER.getName()) != null) { + if (getParserConfig().get(ParserConfigConstants.MESSAGE_HEADER.getName()) instanceof List) { + final List<String> syslogPatternList = + (List<String>) getParserConfig().get(ParserConfigConstants.MESSAGE_HEADER.getName()); + for (final String syslogPatternStr : syslogPatternList) { + 
syslogPatternsMap.put(Pattern.compile(syslogPatternStr), + getNamedGroups(syslogPatternStr)); + } + } else if (getParserConfig() + .get(ParserConfigConstants.MESSAGE_HEADER.getName()) instanceof String) { + final String syslogPatternStr = + (String) getParserConfig().get(ParserConfigConstants.MESSAGE_HEADER.getName()); + if (StringUtils.isNotBlank(syslogPatternStr)) { + syslogPatternsMap.put(Pattern.compile(syslogPatternStr), + getNamedGroups(syslogPatternStr)); + } + } + } + } + + private void configureRecordTypePatterns(List<Map<String, Object>> fields) { + + for (final Map<String, Object> field : fields) { + if (field.get(ParserConfigConstants.RECORD_TYPE.getName()) != null + && field.get(ParserConfigConstants.REGEX.getName()) != null) { + final String recordType = + ((String) field.get(ParserConfigConstants.RECORD_TYPE.getName())).toLowerCase(); + recordTypePatternMap.put(recordType, new LinkedHashMap<Pattern, Set<String>>()); + if (field.get(ParserConfigConstants.REGEX.getName()) instanceof List) { + final List<String> regexList = + (List<String>) field.get(ParserConfigConstants.REGEX.getName()); + regexList.forEach(s -> { + recordTypePatternMap.get(recordType).put(Pattern.compile(s), getNamedGroups(s)); + }); + } else if (field.get(ParserConfigConstants.REGEX.getName()) instanceof String) { + recordTypePatternMap.get(recordType).put( + Pattern.compile((String) field.get(ParserConfigConstants.REGEX.getName())), + getNamedGroups((String) field.get(ParserConfigConstants.REGEX.getName()))); + } + } + } + } + + private void setRecordTypePattern(String recordTypeRegex) { + if (recordTypeRegex != null) { + recordTypePattern = Pattern.compile(recordTypeRegex); + } + } + + private JSONObject parse(String originalMessage) throws ParseException { + final JSONObject parsedJson = new JSONObject(); + final Optional<String> recordIdentifier = getField(recordTypePattern, originalMessage); + if (recordIdentifier.isPresent()) { + extractNamedGroups(parsedJson, 
recordIdentifier.get(), originalMessage); + } + /* + * Extract fields(named groups) from record type regular expression + */ + final Matcher matcher = recordTypePattern.matcher(originalMessage); + if (matcher.find()) { + for (final String namedGroup : recordTypePatternNamedGroups) { + if (matcher.group(namedGroup) != null) { + parsedJson.put(namedGroup, matcher.group(namedGroup).trim()); + } + } + } + return parsedJson; + } + + private void extractNamedGroups(Map<String, Object> json, String recordType, + String originalMessage) { + final Map<Pattern, Set<String>> patternMap = recordTypePatternMap.get(recordType.toLowerCase()); + if (patternMap != null) { + for (final Map.Entry<Pattern, Set<String>> entry : patternMap.entrySet()) { + final Pattern pattern = entry.getKey(); + final Set<String> namedGroups = entry.getValue(); + if (pattern != null && namedGroups != null && namedGroups.size() > 0) { + final Matcher m = pattern.matcher(originalMessage); + if (m.matches()) { + LOG.debug("RecordType : {} Trying regex : {} for message : {} ", recordType, + pattern.toString(), originalMessage); + for (final String namedGroup : namedGroups) { + if (m.group(namedGroup) != null) { + json.put(namedGroup, m.group(namedGroup).trim()); + } + } + break; + } + } + } + } else { + LOG.warn("No pattern found for record type : {}", recordType); + } + } + + public Optional<String> getField(Pattern pattern, String originalMessage) { + final Matcher matcher = pattern.matcher(originalMessage); + while (matcher.find()) { + return Optional.of(matcher.group()); + } + return Optional.empty(); + } + + private Set<String> getNamedGroups(String regex) { + final Set<String> namedGroups = new TreeSet<>(); + final Matcher matcher = namedGroupPattern.matcher(regex); + while (matcher.find()) { + namedGroups.add(matcher.group(1)); + } + return namedGroups; + } + + private Map<String, Object> extractHeaderFields(String originalMessage) { + final Map<String, Object> syslogJson = new JSONObject(); + for 
(final Map.Entry<Pattern, Set<String>> syslogPatternEntry : syslogPatternsMap.entrySet()) { + final Matcher m = syslogPatternEntry.getKey().matcher(originalMessage); + if (m.find()) { + for (final String namedGroup : syslogPatternEntry.getValue()) { + if (StringUtils.isNotBlank(m.group(namedGroup))) { + syslogJson.put(namedGroup, m.group(namedGroup).trim()); + } + } + break; + } + } + return syslogJson; + } + + @Override + public void init() { + LOG.info("RegularExpressions parser initialised."); + } + + public void validateConfig() { + if (getFields() == null) { + LOG.error("Invalid config : fields is missing in parserConfig"); + throw new IllegalStateException("Invalid config :fields is missing in parserConfig"); + } + if (recordTypePattern == null) { + LOG.error("Invalid config :recordTypeRegex is missing in parserConfig"); + throw new IllegalStateException("Invalid config :recordTypeRegex is missing in parserConfig"); + } + } + + private void convertCamelCaseToUnderScore(Map<String, Object> json) { --- End diff -- Thanks for pointing it out. I have now replaced it with a library function from Guava: `CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, entry.getKey())`
---