[ 
https://issues.apache.org/jira/browse/METRON-1795?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16703796#comment-16703796
 ] 

ASF GitHub Bot commented on METRON-1795:
----------------------------------------

Github user nickwallen commented on a diff in the pull request:

    https://github.com/apache/metron/pull/1245#discussion_r237651771
  
    --- Diff: 
metron-platform/metron-parsers/src/test/resources/config/RegularExpressionsInvalidParserConfig.json
 ---
    @@ -0,0 +1,208 @@
    +{
    +  "convertCamelCaseToUnderScore": true,
    +  "messageHeaderRegex": 
"(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestampDeviceOriginal>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<deviceName>(?<=\\s).*?(?=\\s))",
    +  "recordTypeRegex": 
"(?<dstProcessName>(?<=\\s)\\b(tch-replicant|audispd|syslog|ntpd|sendmail|pure-ftpd|usermod|useradd|anacron|unix_chkpwd|sudo|dovecot|postfix\\/smtpd|postfix\\/smtp|postfix\\/qmgr|klnagent|systemd|(?i)crond(?-i)|clamd|kesl|sshd|run-parts|automount|suexec|freshclam|kernel|vsftpd|ftpd|su)\\b(?=\\[|:))",
    +  "fields": [
    +    {
    +      "recordType": "syslog",
    +      "regex": 
".*(?<dstProcessId>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))"
    +    },
    +    {
    +      "recordType": "pure-ftpd",
    +      "regex": 
".*(?<srcUserId>(?<=\\:\\s\\().*?(?=\\)\\s)).*?(?<messageLevel>(?<=\\s\\[).*?(?=\\]\\s)).*?(?<eventInfo>(?<=\\]\\s).*?(?=$))"
    +    },
    +    {
    +      "recordType": "systemd",
    +      "regex": [
    +        
".*(?<eventInfo>(?<=\\ssystemd\\:\\s).*?(?=\\d+)).*?(?<sessionName>(?<=\\sSession\\s).*?(?=\\sof)).*?(?<srcUserId>(?<=\\suser\\s).*?(?=\\.)).*$",
    +        
".*(?<eventInfo>(?<=\\ssystemd\\:\\s).*?(?=\\sof)).*?(?<srcUserId>(?<=\\sof\\s).*?(?=\\.)).*$"
    +      ]
    +    },
    +    {
    +      "recordType": "kesl",
    +      "regex": ".*(?<eventInfo>(?<=\\:).*?(?=$))"
    +    },
    +    {
    +      "recordType": "dovecot",
    +      "regex": [
    +        
".*(?<subprocess>(?<=\\sdovecot:\\s).*?(?=\\:)).*?(?<eventInfo>(?<=\\:).*?(?=\\:\\suser)).*?(?<srcUserId>(?<=user\\=\\<).*?(?=\\>)).*?(?<rip>(?<=rip\\=).*?(?=,)).*?(?<lip>(?<=lip\\=).*?(?=,)).*?(?<connectionType>(?<=,\\s).*?(?=,)).*?(?<sessionName>(?<=session\\=\\<).*?(?=\\>)).*$",
    +        
".*(?<subprocess>(?<=\\sdovecot:\\s).*?(?=\\:)).*?(?<eventInfo>(?<=\\:).*?(?=\\:\\srip)).*?(?<rip>(?<=rip\\=).*?(?=,)).*?(?<lip>(?<=lip\\=).*?(?=,)).*?(?<connectionType>(?<=,\\s).*?(?=$))",
    +        
".*(?<subprocess>(?<=\\sdovecot:\\s).*?(?=\\:)).*?(?<eventInfo>(?<=\\:).*?(?=$))"
    +      ]
    +    },
    +    {
    +      "recordType": "postfix/smtpd",
    +      "regex": [
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\:).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\]:)).*?(?<eventInfo>(?<=\\:\\s)disconnect(?=\\sfrom)).*?(?<srcHost>(?<=from).*(?=\\[)).*?(?<ipSrcAddr>(?<=\\[).*(?=\\])).*$"
    +      ]
    +    },
    +    {
    +      "recordType": "postfix/smtp",
    +      "regex": [
    +        
".*(?<dstProcessId>(?<=smtp\\[).*?(?=\\]:)).*(?<toEmail>(?<=to=#\\<).*?(?=>,)).*(?<relay>(?<=relay=).*?(?=,)).*(?<delay>(?<=delay=).*?(?=,)).*(?<delays>(?<=delays=).*?(?=,)).*(?<dsn>(?<=dsn=).*?(?=,)).*(?<status>(?<=status=).*?(?=\\()).*?(?<dstHost>(?<=connect
 
to).*?(?=\\[)).*?(?<ipDstAddr>(?<=\\[).*?(?=\\])).*?(?<ipDstPort>(?<=\\]:).*?(?=:\\s)).*?(?<eventInfo>(?<=:\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=smtp\\[).*?(?=\\]:)).*?(?<dstHost>(?<=connect 
to).*?(?=\\[)).*?(?<ipDstAddr>(?<=\\[).*?(?=\\])).*(?<ipDstPort>(?<=:).*?(?=\\s)).*(?<eventInfo>(?<=\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\:).*?(?=$))"
    +      ]
    +    },
    +    {
    +      "recordType": "crond",
    +      "regex": [
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<srcUserId>(?<=\\]:\\s\\().*?(?=\\)\\s)).*?(?<commandLine>(?<=CMD\\s\\().*?(?=\\))).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<srcUserId>(?<=\\]:\\s\\().*?(?=\\)\\s)).*?(?<eventInfo>(?<=\\().*?(?=\\))).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<srcUserId>(?<=\\]:\\s\\().*?(?=\\)\\s)).*?(?<commandLine>(?<=CMD\\s\\().*?(?=\\))).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\:).*?(?=$))"
    +      ]
    +    },
    +    {
    +      "recordType": "clamd",
    +      "regex": [
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<subProcess>(?<=\\:\\s).*?(?=\\:)).*?(?<eventInfo>(?<=\\:).*?(?=$))",
    +        
".*(?<subProcess>(?<=\\:\\s).*?(?=\\:)).*?(?<eventInfo>(?<=\\:).*?(?=$))"
    +      ]
    +    },
    +    {
    +      "recordType": "run-parts",
    +      "regex": ".*(?<eventInfo>(?<=\\sparts).*?(?=$))"
    +    },
    +    {
    +      "recordType": "sshd",
    +      "regex": [
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<event_Info>(?<=\\]:\\s).*?(?=\\sfor)).*?(?<dstUserId>(?<=\\sfor\\s).*?(?=\\sfrom)).*?(?<ipSrcAddr>(?<=\\sfrom\\s).*?(?=\\sport)).*?(?<ipSrcPort>(?<=\\sport\\s).*?(?=\\s)).*?(?<appProtocol>(?<=port\\s\\d{1,5}\\s).*(?=:\\s)).*?(?<encryptionAlgorithm>(?<=:\\s).+?(?=\\s)).*(?<correlationId>(?<=\\s).+?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\]:\\s).*?(?=\\sfor)).*?(?<dstUserId>(?<=\\sfor\\s).*?(?=\\sfrom)).*?(?<ipSrcAddr>(?<=\\sfrom\\s).*?(?=\\sport)).*?(?<ipSrcPort>(?<=\\sport\\s).*?(?=\\s)).*?(?<appProtocol>(?<=port\\s\\d{1,5}\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<ipDstAddr>(?<=Remote:).*?(?=\\-)).*?(?<ipDstPort>(?<=\\-).*?(?=;)).*?(?<appProtocol>(?<=Protocol:).*?(?=;)).*?(?<sshClient>(?<=Client:).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<appProtocol>(?<=\\]:).*?(?=:)).*?(?<ipDstAddr>(?<=Remote:).*?(?=\\-)).*?(?<ipDstPort>(?<=\\-).*?(?=;)).*?(?<encryptionAlgorithm>(?<=Enc:\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<ipDstAddr>(?<=Remote:).*?(?=\\-)).*?(?<ipDstPort>(?<=\\-).*?(?=;)).*?(?<encryptionAlgorithm>(?<=Enc:\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=:).*?(?=for)).*?(?<dstUserId>(?<=for).*?(?=from)).*?(?<ipSrcAddr>(?<=from).*?(?=port)).*?(?<ipSrcPort>(?<=port).*?(?=\\s)).*?(?<appProtocol>(?<=\\s).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\]))]:\\s.*?(?<eventInfo>subsystem.*?(?=by\\suser)).*?(?<srcUserId>(?<=user).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<action>(?<=Received).*?(?=from)).*?(?<ipSrcAddr>(?<=from).*?(?=:)).*?(?<eventInfo>(?<=11:).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\]:\\s)Server\\slistening(?=\\s)).*?(?<ipSrcAddr>(?<=\\son\\s).*?(?=port)).*?(?<ipSrcPort>(?<=port\\s)\\d{1,6}(?=\\.)).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\]:\\s)Invalid 
user(?=\\s)).*?(?<dstUserId>(?<=\\s).*?(?=from)).*?(?<ipSrcAddr>(?<=from\\s).*(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=]:\\s)).*(?<subProcess>(?<=]:\\s).*\\)(?=:)).*(?<eventInfo>(?<=:\\s).*(?=;)).*(?<logname>(?<=logname=).*?(?=\\s)).*(?<dstUserId>(?<=uid=).*?(?=\\s)).*(?<effectiveUserId>(?<=euid=).*?(?=\\s)).*(?<sessionName>(?<=tty=).*?(?=\\s)).*(?<srcUserId>(?<=ruser=).*?(?=\\s)).*(?<ipSrcAddr>(?<=rhost=).*?(?=\\s)).*(?<userId>(?<=user=).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=]:\\s)).*(?<eventInfo>(?<=:\\s).*(?=;)).*(?<logname>(?<=logname=).*?(?=\\s)).*(?<dstUserId>(?<=uid=).*?(?=\\s)).*(?<effectiveUserId>(?<=euid=).*?(?=\\s)).*(?<sessionName>(?<=tty=).*?(?=\\s)).*(?<srcUserId>(?<=ruser=).*?(?=\\s)).*(?<ipSrcAddr>(?<=rhost=).*?(?=\\s)).*(?<userId>(?<=user=).*?(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=\\]:\\s).*?(?=for)).*?(?<dstUserId>(?<=\\sfor).*?(?=\\[)).*?(?<subProcess>(?<=\\[).*?(?=\\])).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=:\\s)Excess 
permission or bad ownership on 
file(?=\\s\\/)).*?(?<filePath>(?<=\\s).*(?=\\/)).*?(?<fileName>(?<=\\/).*(?=$))",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=:).*?(?=;)).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=:).*?(?=\\d)).*$",
    +        
".*(?<dstProcessId>(?<=\\[).*?(?=\\])).*?(?<eventInfo>(?<=:).*?(?=$))"
    --- End diff --
    
    Help me understand why you need 17 different regular expressions to parse 
SSHD records?  Is it just that you see it in 17 different forms?


> General Purpose Regex Parser
> ----------------------------
>
>                 Key: METRON-1795
>                 URL: https://issues.apache.org/jira/browse/METRON-1795
>             Project: Metron
>          Issue Type: New Feature
>            Reporter: Jagdeep Singh
>            Priority: Minor
>
> We have implemented a general purpose regex parser for Metron that we are 
> interested in contributing back to the community.
>  
> While the Metron Grok parser provides some regex-based capability today, the 
> intention of this general purpose regex parser is to:
>  # Allow for more advanced parsing scenarios (specifically, dealing with 
> multiple regex lines for devices that contain several log formats within them)
>  # Give users and developers of Metron additional options for parsing
>  # With the new parser chaining and regex routing feature available in 
> Metron, this gives some additional flexibility to logically separate a flow 
> by:
>  # Regex routing to segregate logs at a device level and handle envelope 
> unwrapping
>  # This general purpose regex parser to parse an entire device type that 
> contains multiple log formats within the single device (for example, RHEL 
> logs)
> At a high level, the control flow is as follows:
>  # Identify the record type of the incoming raw message.
>  # Find and apply the regular expression of the corresponding record type to 
> extract the fields (using named groups). 
>  # Apply the message header regex to extract the fields in the header part of 
> the message (using named groups).
>  
> The parser config uses the following structure:
>   
> {code:java}
> "recordTypeRegex": "(?<process>(?<=\\s)\\b(kernel|syslog)\\b(?=\\[|:))"  
>  "messageHeaderRegex": 
> "(?<syslogpriority>(?<=^<)\\d{1,4}(?=>)).*?(?<timestamp>(?<=>)[A-Za-z]{3}\\s{1,2}\\d{1,2}\\s\\d{1,2}:\\d{1,2}:\\d{1,2}(?=\\s)).*?(?<syslogHost>(?<=\\s).*?(?=\\s))",
>    "fields": [
>       {
>         "recordType": "kernel",
>         "regex": ".*(?<eventInfo>(?<=\\]|\\w\\:).*?(?=$))"
>       },
>       {
>         "recordType": "syslog",
>         "regex": 
> ".*(?<processid>(?<=PID\\s=\\s).*?(?=\\sLine)).*(?<filePath>(?<=64\\s)\/([A-Za-z0-9_-]+\/)+(?=\\w))(?<fileName>.*?(?=\")).*(?<eventInfo>(?<=\").*?(?=$))"
>       }
> ]
> {code}
>  
> Where:
>  * *recordTypeRegex* is used to distinctly identify a record type. It takes 
> a valid regular expression, which may also contain named groups; any named 
> groups are extracted into fields.
>  * *messageHeaderRegex* is used to specify a regular expression to extract 
> fields from the part of a message that is common across all messages (e.g., 
> syslog fields, standard headers)
>  * *fields*: a JSON list of objects, each containing a recordType and a regex. 
> The regex that is evaluated is selected based on the output of the recordTypeRegex
>  * Note: *recordTypeRegex* and *messageHeaderRegex* may also be specified as 
> lists (JSON arrays), in which case the list is evaluated in order until 
> a matching regular expression is found.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to