BryanDavis has uploaded a new change for review.
https://gerrit.wikimedia.org/r/185482
Change subject: logstash: remove support for most udp2log events
......................................................................
logstash: remove support for most udp2log events
Since I43cd2c3197838fbdd4837932f966e47414d27508 most log2udp traffic
from fluorine to logstash has been disabled. This change removes the
processing rules for the now removed events.
Change-Id: Iaf5e4557e6c6b85bf55b3ee116346b670d38331a
---
D files/logstash/filter-mw-via-udp2log.conf
A files/logstash/filter-udp2log.conf
M manifests/role/logstash.pp
3 files changed, 74 insertions(+), 524 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/82/185482/1
diff --git a/files/logstash/filter-mw-via-udp2log.conf
b/files/logstash/filter-mw-via-udp2log.conf
deleted file mode 100644
index ee902d8..0000000
--- a/files/logstash/filter-mw-via-udp2log.conf
+++ /dev/null
@@ -1,522 +0,0 @@
-# vim:set sw=2 ts=2 sts=2 et
-# Parse MediaWiki log output as sent by udp2log
-filter {
-
- if [type] == "udp2log" {
- # Parse a udp2log relay packet
- # Capture sequence_id and channel from packet and trim from message body
- grok {
- match => [
- "message",
- "^%{NUMBER:sequence_id} %{NOTSPACE:channel} %{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
-
- # Explode message body on newlines
- split {
- add_tag => [ "split" ]
- }
-
- # Change message type to be channel name and discard
- mutate {
- replace => [ "type", "%{channel}" ]
- add_tag => [ "udp2log" ]
- remove_field => [ "channel" ]
- }
-
- # Hang on to the UDP packet sender in case later rules figure out another
- # host to attribute the message to
- mutate {
- add_field => [ "udp_sender", "%{host}" ]
- }
-
- # NOTE: `add_tag => [ "es" ]` is not done here, so by default none of the
- # events created by this initial parse phase will be added to
- # Elasticsearch. Individual types should be tagged below. This is intended
- # to keep spammy events out of the serach index.
- } # end [type] == "udp2log"
-
-
- if "udp2log" in [tags] {
- # Common "<DATE> <HOST> <WIKI>: <MESSAGE>" message format
- # "api" excluded because it's too high-volume
- if [type] in [
- "api-feature-usage",
- "antispoof",
- "badpass",
- "bug46577",
- "Bug54847",
- "Bug58676",
- "captcha",
- "centralauth",
- "CirrusSearch",
- "CirrusSearch-failed",
- "CirrusSearch-slow",
- "collection",
- "dbperformance",
- "dnsblacklist",
- "exception",
- "exception-json",
- "exec",
- "external",
- "filebackend-ops",
- "generated-pp-node-count",
- "gettingstarted",
- "MassMessage",
- "memcached-serious",
- "mobile",
- "mwsearch",
- "poolcounter",
- "privatewiki-slow-parse",
- "recursion-guard",
- "redis",
- "resourceloader",
- "runJobs",
- "slow-parse",
- "spam",
- "sql-bagostuff",
- "squid",
- "swift-backend",
- "texvc",
- "thumbnail",
- "torblock",
- "wap",
- "zero"
- ] {
- # Join sequential lines into a single event
- multiline {
- pattern => "^2\d{3}-\d\d-\d\d \d\d:\d\d:\d\d "
- negate => true
- what => "previous"
- }
-
- # Separate logdate, host and wikidb from message
- grok {
- match => [
- "message",
- "^(?m)(?<logdate>2\d{3}-\d\d-\d\d \d\d:\d\d:\d\d) %{NOTSPACE:host}
%{NOTSPACE:wikidb}: %{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "YYYY-MM-dd HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- }
- } # end "<DATE> <HOST> <WIKI>:" logs
-
-
- if [type] == "apache2" {
- grok {
- match => [
- "message",
- "^(?<logdate>%{MONTH}\s+%{MONTHDAY} %{TIME})
%{NOTSPACE:host}:%{SPACE}%{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- # Discard last message repeated messages
- if [message] =~ "last message repeated \d+ times$" {
- drop {}
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "MMM dd HH:mm:ss", "MMM d HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Capture message severity
- grok {
- match => [
- "message",
- "^\[%{WORD:level}\] (\[%{WORD} %{IP:clientip}\] )?"
- ]
- match => [
- "message",
- "^PHP (?<level>[\w\s]+): "
- ]
- }
- }
- } # end [type] == "apache2"
-
-
- # excluded above because it's too high-volume
- if [type] == "api" {
- # Remove `API` from message and capture HTTP verb, user name and
- # remote_ip
- grok {
- match => [
- "message",
- "^(?m)API (?<message>%{WORD:verb} %{NOTSPACE:user}
%{NOTSPACE:remote_ip} %{GREEDYDATA})$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- # Copy key=value pairs out of the message into a collection named "args"
- kv {
- target => "args"
- }
- } # end [type] == "api"
-
-
- if [type] == "api-feature-usage" {
- grok {
- match => [
- "message",
- "^(?m)%{QS:feature} %{QS:username} %{QS:ip} %{QS:referer}
%{QS:agent}$"
- ]
- named_captures_only => true
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Unquote ('"foo \"bar\""' to 'foo "bar"')
- mutate {
- # Strip outer quotes
- gsub => [
- "feature", '^"|"$', "",
- "username", '^"|"$', "",
- "ip", '^"|"$', "",
- "referer", '^"|"$', "",
- "agent", '^"|"$', ""
- ]
- }
- mutate {
- # Strip backslash escape characters
- gsub => [
- "feature", '\\(.)', '\1',
- "username", '\\(.)', '\1',
- "ip", '\\(.)', '\1',
- "referer", '\\(.)', '\1',
- "agent", '\\(.)', '\1'
- ]
- }
-
- mutate {
- replace => [ "message", "%{feature}" ]
- }
-
- urldecode {
- field => "username"
- }
-
- useragent {
- source => "agent"
- prefix => "ua_"
- }
-
- # Ignore this one for now, too many hits
- if [feature] == "action=query&!rawcontinue&!continue" {
- drop {}
- }
- }
- } # end [type] == "api-feature-usage"
-
-
- if [type] == "dberror" {
- # Join sequential lines into a single event
- multiline {
- pattern => "^\w{3} \w{3} \d"
- negate => true
- what => "previous"
- }
- # Separate logdate, host and wikidb from message
- grok {
- match => [
- "message",
- "^(?<logdate>%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ}
%{YEAR})\t%{NOTSPACE:host}\t%{NOTSPACE:wikidb}\t%{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "EEE MMM d H:mm:ss z YYYY" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- }
- } # end [type] == "dberror"
-
-
- if [type] == "exception" {
- grok {
- match => [
- "message",
- "^(?m)\[(?<exception_id>\w+)\] %{NOTSPACE:url}\s+\w+ from line
%{NUMBER:line} of %{NOTSPACE:file}:
%{GREEDYDATA:message}\n%{GREEDYDATA:backtrace}$"
- ]
- overwrite => [ "message" ]
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Add a checksum value based on message + file + line
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}|%{file}|%{line}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- }
- } # end [type] == "exception"
-
-
- if [type] == "exception-json" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json" ]
- }
- # Rename the `id` field to `exeception_id`
- mutate {
- rename => [ "id", "exception_id" ]
- }
- # Add a checksum value based on message + file + line
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}|%{file}|%{line}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- } # end [type] == "exception-json"
-
-
- if [type] == "fatal" {
- # Parse wmerrors php extension output
- # Join sequential lines into a single event
- multiline {
- pattern => "^\[\d\d-\w"
- negate => true
- what => "previous"
- }
- # Separate logdate, host and other data from message
- grok {
- match => [
- "message",
- "^(?m)\[(?<logdate>\d\d-\w{3}-\d{4} \d\d:\d\d:\d\d)\]
%{GREEDYDATA:message}\nServer: %{NOTSPACE:host}\n(Method:
%{NOTSPACE:verb}\n)?URL: %{NOTSPACE:url}\n(Cookie:
%{GREEDYDATA:cookie}\n)?Backtrace:\n%{GREEDYDATA:backtrace}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "dd-MMM-YYYY HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Add a checksum value based on message
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- # Split cookie line
- kv {
- source => "cookie"
- target => "cookies"
- remove_field => [ "cookie" ]
- }
- }
- } # end [type] == "fatal"
-
-
- if [type] == "iegreview" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "iegreview"
-
-
- if [type] == "scap" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "scap"
-
-
- if [type] == "scholarships" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "scholarships"
-
-
- if [type] == "wap" {
- # Copy the user-agent from the message
- grok {
- match => [
- "message",
- "^User-agent: '(?<ua_raw>.*'(?:, |$))"
- ]
- named_captures_only => true
- }
- if !("_grokparsefailure" in [tags]) {
- # Analize ua using BrowserScope rules
- useragent {
- source => "ua_raw"
- prefix => "ua_"
- remove_field => [ "ua_raw" ]
- }
- }
- } # end [type] == "wap"
-
-
- if [type] == "xff" {
- # Separate logdate from message
- grok {
- match => [
- "message",
- "^(?<logdate>%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME}
%{ISO8601_TIMEZONE})\t%{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "EEE, dd MMM YYYY HH:mm:ss Z" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Copy XFF addresses from message
- grok {
- match => [
- "message",
- "^%{URI:url}\t(?:, )?(?<xff>(?:%{IP}(?:, )?)+)\t"
- ]
- named_captures_only => true
- }
- # Turn comma separated list of XFF addresses into a real list
- mutate {
- split => [ "xff", ", " ]
- }
- }
- } # end [type] == "xff"
-
-
- if [type] in [ "web", "cli" ] {
- # wfDebug logs from beta
- multiline {
- pattern => "^\w+-[0-9a-f]{8}: "
- negate => true
- what => "previous"
- }
- grok {
- match => [
- "message",
-
"^(?m)(?<wikidb>\w+)-(?<request_id>[0-9a-f]{8}):%{SPACE}%{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- } # end [type] in [ "web", "cli" ]
-
- # Fatalmonitor work-alike support
- if (
- [type] == "apache2" and
- (
- [message] =~ /PHP/ or
- [message] =~ /Segmentation fault/
- )
- ) {
- mutate {
- # Tag messages that are traditionally counted by the fatalmonitor
script
- add_tag => [ "fatalmonitor" ]
- # Create a copy of message field that can be normalized
- add_field => [ "normalized_message", "%{message}" ]
- }
- # Normalize the message
- # sed -r 's/\[notice\] child pid [0-9]+ exit signal //g'
- mutate {
- gsub => [
- "normalized_message",
- "\[notice\] child pid \d+ exit signal ",
- ""
- ]
- }
- # sed 's/, referer.*$//g'
- mutate {
- gsub => [
- "normalized_message",
- ", referer.*$",
- ""
- ]
- }
- # Remove failed allocation size noise
- mutate {
- gsub => [
- "normalized_message",
- "\(tried to allocate \d+ bytes\) ",
- ""
- ]
- }
- # Remove function anchor tags
- mutate {
- gsub => [
- "normalized_message",
- " \[<a href='function\..*'>function\..*</a>\]",
- ""
- ]
- }
- # Add a checksum value based on normalized message
- mutate {
- add_field => [ "message_checksum", "%{normalized_message}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- # Trim the normalized_message to a maximum of 255 characters
- # This is done because our Elasticsearch schema doesn't store raw fields
- # for strings longer than 255 characters and we want something to show
- # in terms queries even if it's shortened.
- grok {
- match => [
- "normalized_message",
- "^(?<normalized_message>.{255}).*$"
- ]
- overwrite => [ "normalized_message" ]
- named_captures_only => true
- add_tag => [ "normalized_message_trimmed" ]
- tag_on_failure => [ "normalized_message_untrimmed" ]
- }
- } # end fatalmonitor work-alike support
-
- } # end "udp2log" in [tags]
-
-}
diff --git a/files/logstash/filter-udp2log.conf
b/files/logstash/filter-udp2log.conf
new file mode 100644
index 0000000..b2e4d40
--- /dev/null
+++ b/files/logstash/filter-udp2log.conf
@@ -0,0 +1,72 @@
+# vim:set sw=2 ts=2 sts=2 et
+# Parse log events relayed using the udp2log/log2udp protocol
+filter {
+
+ if [type] == "udp2log" {
+ # Parse a udp2log relay packet
+ # Capture sequence_id and channel from packet and trim from message body
+ grok {
+ match => [
+ "message",
+ "^%{NUMBER:sequence_id} %{NOTSPACE:channel} %{GREEDYDATA:message}$"
+ ]
+ overwrite => [ "message" ]
+ named_captures_only => true
+ }
+
+ # Explode message body on newlines
+ split {
+ add_tag => [ "split" ]
+ }
+
+ # Change message type to be channel name and discard
+ mutate {
+ replace => [ "type", "%{channel}" ]
+ add_tag => [ "udp2log" ]
+ remove_field => [ "channel" ]
+ }
+
+ # Hang on to the UDP packet sender in case later rules figure out another
+ # host to attribute the message to
+ mutate {
+ add_field => [ "udp_sender", "%{host}" ]
+ }
+
+ # NOTE: `add_tag => [ "es" ]` is not done here, so by default none of the
+ # events created by this initial parse phase will be added to
+ # Elasticsearch. Individual types should be tagged below. This is intended
+  # to keep spammy events out of the search index.
+ } # end [type] == "udp2log"
+
+
+ if "udp2log" in [tags] {
+
+ if [type] == "iegreview" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "iegreview"
+
+
+ if [type] == "scap" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "scap"
+
+
+ if [type] == "scholarships" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "scholarships"
+
+ } # end "udp2log" in [tags]
+
+}
diff --git a/manifests/role/logstash.pp b/manifests/role/logstash.pp
index 73fd1e6..8528ed2 100644
--- a/manifests/role/logstash.pp
+++ b/manifests/role/logstash.pp
@@ -88,8 +88,8 @@
priority => 50,
}
- logstash::conf { 'filter_mw_via_udp2log':
- source => 'puppet:///files/logstash/filter-mw-via-udp2log.conf',
+    logstash::conf { 'filter-udp2log':
+ source => 'puppet:///files/logstash/filter-udp2log.conf',
priority => 50,
}
--
To view, visit https://gerrit.wikimedia.org/r/185482
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaf5e4557e6c6b85bf55b3ee116346b670d38331a
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BryanDavis <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits