BryanDavis has uploaded a new change for review.
https://gerrit.wikimedia.org/r/185482
Change subject: logstash: remove support for most udp2log events
......................................................................
logstash: remove support for most udp2log events
Since I43cd2c3197838fbdd4837932f966e47414d27508 most log2udp traffic
from fluorine to logstash has been disabled. This change removes the
processing rules for the now removed events.
Change-Id: Iaf5e4557e6c6b85bf55b3ee116346b670d38331a
---
D files/logstash/filter-mw-via-udp2log.conf
A files/logstash/filter-udp2log.conf
M manifests/role/logstash.pp
3 files changed, 74 insertions(+), 524 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/82/185482/1
diff --git a/files/logstash/filter-mw-via-udp2log.conf
b/files/logstash/filter-mw-via-udp2log.conf
deleted file mode 100644
index ee902d8..0000000
--- a/files/logstash/filter-mw-via-udp2log.conf
+++ /dev/null
@@ -1,522 +0,0 @@
-# vim:set sw=2 ts=2 sts=2 et
-# Parse MediaWiki log output as sent by udp2log
-filter {
-
- if [type] == "udp2log" {
- # Parse a udp2log relay packet
- # Capture sequence_id and channel from packet and trim from message body
- grok {
- match => [
- "message",
- "^%{NUMBER:sequence_id} %{NOTSPACE:channel} %{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
-
- # Explode message body on newlines
- split {
- add_tag => [ "split" ]
- }
-
- # Change message type to be channel name and discard
- mutate {
- replace => [ "type", "%{channel}" ]
- add_tag => [ "udp2log" ]
- remove_field => [ "channel" ]
- }
-
- # Hang on to the UDP packet sender in case later rules figure out another
- # host to attribute the message to
- mutate {
- add_field => [ "udp_sender", "%{host}" ]
- }
-
- # NOTE: `add_tag => [ "es" ]` is not done here, so by default none of the
- # events created by this initial parse phase will be added to
- # Elasticsearch. Individual types should be tagged below. This is intended
- # to keep spammy events out of the serach index.
- } # end [type] == "udp2log"
-
-
- if "udp2log" in [tags] {
- # Common "<DATE> <HOST> <WIKI>: <MESSAGE>" message format
- # "api" excluded because it's too high-volume
- if [type] in [
- "api-feature-usage",
- "antispoof",
- "badpass",
- "bug46577",
- "Bug54847",
- "Bug58676",
- "captcha",
- "centralauth",
- "CirrusSearch",
- "CirrusSearch-failed",
- "CirrusSearch-slow",
- "collection",
- "dbperformance",
- "dnsblacklist",
- "exception",
- "exception-json",
- "exec",
- "external",
- "filebackend-ops",
- "generated-pp-node-count",
- "gettingstarted",
- "MassMessage",
- "memcached-serious",
- "mobile",
- "mwsearch",
- "poolcounter",
- "privatewiki-slow-parse",
- "recursion-guard",
- "redis",
- "resourceloader",
- "runJobs",
- "slow-parse",
- "spam",
- "sql-bagostuff",
- "squid",
- "swift-backend",
- "texvc",
- "thumbnail",
- "torblock",
- "wap",
- "zero"
- ] {
- # Join sequential lines into a single event
- multiline {
- pattern => "^2\d{3}-\d\d-\d\d \d\d:\d\d:\d\d "
- negate => true
- what => "previous"
- }
-
- # Separate logdate, host and wikidb from message
- grok {
- match => [
- "message",
- "^(?m)(?<logdate>2\d{3}-\d\d-\d\d \d\d:\d\d:\d\d) %{NOTSPACE:host}
%{NOTSPACE:wikidb}: %{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "YYYY-MM-dd HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- }
- } # end "<DATE> <HOST> <WIKI>:" logs
-
-
- if [type] == "apache2" {
- grok {
- match => [
- "message",
- "^(?<logdate>%{MONTH}\s+%{MONTHDAY} %{TIME})
%{NOTSPACE:host}:%{SPACE}%{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- # Discard last message repeated messages
- if [message] =~ "last message repeated \d+ times$" {
- drop {}
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "MMM dd HH:mm:ss", "MMM d HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Capture message severity
- grok {
- match => [
- "message",
- "^\[%{WORD:level}\] (\[%{WORD} %{IP:clientip}\] )?"
- ]
- match => [
- "message",
- "^PHP (?<level>[\w\s]+): "
- ]
- }
- }
- } # end [type] == "apache2"
-
-
- # excluded above because it's too high-volume
- if [type] == "api" {
- # Remove `API` from message and capture HTTP verb, user name and
- # remote_ip
- grok {
- match => [
- "message",
- "^(?m)API (?<message>%{WORD:verb} %{NOTSPACE:user}
%{NOTSPACE:remote_ip} %{GREEDYDATA})$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- # Copy key=value pairs out of the message into a collection named "args"
- kv {
- target => "args"
- }
- } # end [type] == "api"
-
-
- if [type] == "api-feature-usage" {
- grok {
- match => [
- "message",
- "^(?m)%{QS:feature} %{QS:username} %{QS:ip} %{QS:referer}
%{QS:agent}$"
- ]
- named_captures_only => true
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Unquote ('"foo \"bar\""' to 'foo "bar"')
- mutate {
- # Strip outer quotes
- gsub => [
- "feature", '^"|"$', "",
- "username", '^"|"$', "",
- "ip", '^"|"$', "",
- "referer", '^"|"$', "",
- "agent", '^"|"$', ""
- ]
- }
- mutate {
- # Strip backslash escape characters
- gsub => [
- "feature", '\\(.)', '\1',
- "username", '\\(.)', '\1',
- "ip", '\\(.)', '\1',
- "referer", '\\(.)', '\1',
- "agent", '\\(.)', '\1'
- ]
- }
-
- mutate {
- replace => [ "message", "%{feature}" ]
- }
-
- urldecode {
- field => "username"
- }
-
- useragent {
- source => "agent"
- prefix => "ua_"
- }
-
- # Ignore this one for now, too many hits
- if [feature] == "action=query&!rawcontinue&!continue" {
- drop {}
- }
- }
- } # end [type] == "api-feature-usage"
-
-
- if [type] == "dberror" {
- # Join sequential lines into a single event
- multiline {
- pattern => "^\w{3} \w{3} \d"
- negate => true
- what => "previous"
- }
- # Separate logdate, host and wikidb from message
- grok {
- match => [
- "message",
- "^(?<logdate>%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ}
%{YEAR})\t%{NOTSPACE:host}\t%{NOTSPACE:wikidb}\t%{GREEDYDATA:message}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "EEE MMM d H:mm:ss z YYYY" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- }
- } # end [type] == "dberror"
-
-
- if [type] == "exception" {
- grok {
- match => [
- "message",
- "^(?m)\[(?<exception_id>\w+)\] %{NOTSPACE:url}\s+\w+ from line
%{NUMBER:line} of %{NOTSPACE:file}:
%{GREEDYDATA:message}\n%{GREEDYDATA:backtrace}$"
- ]
- overwrite => [ "message" ]
- }
-
- if !("_grokparsefailure" in [tags]) {
- # Add a checksum value based on message + file + line
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}|%{file}|%{line}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- }
- } # end [type] == "exception"
-
-
- if [type] == "exception-json" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json" ]
- }
- # Rename the `id` field to `exeception_id`
- mutate {
- rename => [ "id", "exception_id" ]
- }
- # Add a checksum value based on message + file + line
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}|%{file}|%{line}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- } # end [type] == "exception-json"
-
-
- if [type] == "fatal" {
- # Parse wmerrors php extension output
- # Join sequential lines into a single event
- multiline {
- pattern => "^\[\d\d-\w"
- negate => true
- what => "previous"
- }
- # Separate logdate, host and other data from message
- grok {
- match => [
- "message",
- "^(?m)\[(?<logdate>\d\d-\w{3}-\d{4} \d\d:\d\d:\d\d)\]
%{GREEDYDATA:message}\nServer: %{NOTSPACE:host}\n(Method:
%{NOTSPACE:verb}\n)?URL: %{NOTSPACE:url}\n(Cookie:
%{GREEDYDATA:cookie}\n)?Backtrace:\n%{GREEDYDATA:backtrace}$"
- ]
- overwrite => [ "host", "message" ]
- named_captures_only => true
- add_tag => [ "es" ]
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "dd-MMM-YYYY HH:mm:ss" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Add a checksum value based on message
- # This should help us write tools to count duplicates
- # Note: The "checksum" filter is deprecated.
- # See https://logstash.jira.com/browse/LOGSTASH-557
- mutate {
- add_field => [ "message_checksum", "%{message}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- # Split cookie line
- kv {
- source => "cookie"
- target => "cookies"
- remove_field => [ "cookie" ]
- }
- }
- } # end [type] == "fatal"
-
-
- if [type] == "iegreview" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "iegreview"
-
-
- if [type] == "scap" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "scap"
-
-
- if [type] == "scholarships" {
- # Parse message as json and put elements in event
- json {
- source => "message"
- add_tag => [ "json", "es" ]
- }
- } # end [type] == "scholarships"
-
-
- if [type] == "wap" {
- # Copy the user-agent from the message
- grok {
- match => [
- "message",
- "^User-agent: '(?<ua_raw>.*'(?:, |$))"
- ]
- named_captures_only => true
- }
- if !("_grokparsefailure" in [tags]) {
- # Analize ua using BrowserScope rules
- useragent {
- source => "ua_raw"
- prefix => "ua_"
- remove_field => [ "ua_raw" ]
- }
- }
- } # end [type] == "wap"
-
-
- if [type] == "xff" {
- # Separate logdate from message
- grok {
- match => [
- "message",
- "^(?<logdate>%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME}
%{ISO8601_TIMEZONE})\t%{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- if !("_grokparsefailure" in [tags]) {
- # Use the parsed timestamp as canonical for the event
- date {
- match => [ "logdate", "EEE, dd MMM YYYY HH:mm:ss Z" ]
- remove_field => [ "logdate" ]
- add_tag => [ "logdate" ]
- }
- # Copy XFF addresses from message
- grok {
- match => [
- "message",
- "^%{URI:url}\t(?:, )?(?<xff>(?:%{IP}(?:, )?)+)\t"
- ]
- named_captures_only => true
- }
- # Turn comma separated list of XFF addresses into a real list
- mutate {
- split => [ "xff", ", " ]
- }
- }
- } # end [type] == "xff"
-
-
- if [type] in [ "web", "cli" ] {
- # wfDebug logs from beta
- multiline {
- pattern => "^\w+-[0-9a-f]{8}: "
- negate => true
- what => "previous"
- }
- grok {
- match => [
- "message",
-
"^(?m)(?<wikidb>\w+)-(?<request_id>[0-9a-f]{8}):%{SPACE}%{GREEDYDATA:message}$"
- ]
- overwrite => [ "message" ]
- named_captures_only => true
- }
- } # end [type] in [ "web", "cli" ]
-
- # Fatalmonitor work-alike support
- if (
- [type] == "apache2" and
- (
- [message] =~ /PHP/ or
- [message] =~ /Segmentation fault/
- )
- ) {
- mutate {
- # Tag messages that are traditionally counted by the fatalmonitor
script
- add_tag => [ "fatalmonitor" ]
- # Create a copy of message field that can be normalized
- add_field => [ "normalized_message", "%{message}" ]
- }
- # Normalize the message
- # sed -r 's/\[notice\] child pid [0-9]+ exit signal //g'
- mutate {
- gsub => [
- "normalized_message",
- "\[notice\] child pid \d+ exit signal ",
- ""
- ]
- }
- # sed 's/, referer.*$//g'
- mutate {
- gsub => [
- "normalized_message",
- ", referer.*$",
- ""
- ]
- }
- # Remove failed allocation size noise
- mutate {
- gsub => [
- "normalized_message",
- "\(tried to allocate \d+ bytes\) ",
- ""
- ]
- }
- # Remove function anchor tags
- mutate {
- gsub => [
- "normalized_message",
- " \[<a href='function\..*'>function\..*</a>\]",
- ""
- ]
- }
- # Add a checksum value based on normalized message
- mutate {
- add_field => [ "message_checksum", "%{normalized_message}" ]
- }
- anonymize {
- fields => [ "message_checksum" ]
- algorithm => "MD5"
- key => "boringsalt"
- }
- # Trim the normalized_message to a maximum of 255 characters
- # This is done because our Elasticsearch schema doesn't store raw fields
- # for strings longer than 255 characters and we want something to show
- # in terms queries even if it's shortened.
- grok {
- match => [
- "normalized_message",
- "^(?<normalized_message>.{255}).*$"
- ]
- overwrite => [ "normalized_message" ]
- named_captures_only => true
- add_tag => [ "normalized_message_trimmed" ]
- tag_on_failure => [ "normalized_message_untrimmed" ]
- }
- } # end fatalmonitor work-alike support
-
- } # end "udp2log" in [tags]
-
-}
diff --git a/files/logstash/filter-udp2log.conf
b/files/logstash/filter-udp2log.conf
new file mode 100644
index 0000000..b2e4d40
--- /dev/null
+++ b/files/logstash/filter-udp2log.conf
@@ -0,0 +1,72 @@
+# vim:set sw=2 ts=2 sts=2 et
+# Parse log events relayed using the udp2log/log2udp protocol
+filter {
+
+ if [type] == "udp2log" {
+ # Parse a udp2log relay packet
+ # Capture sequence_id and channel from packet and trim from message body
+ grok {
+ match => [
+ "message",
+ "^%{NUMBER:sequence_id} %{NOTSPACE:channel} %{GREEDYDATA:message}$"
+ ]
+ overwrite => [ "message" ]
+ named_captures_only => true
+ }
+
+ # Explode message body on newlines
+ split {
+ add_tag => [ "split" ]
+ }
+
+ # Change message type to be channel name and discard
+ mutate {
+ replace => [ "type", "%{channel}" ]
+ add_tag => [ "udp2log" ]
+ remove_field => [ "channel" ]
+ }
+
+ # Hang on to the UDP packet sender in case later rules figure out another
+ # host to attribute the message to
+ mutate {
+ add_field => [ "udp_sender", "%{host}" ]
+ }
+
+ # NOTE: `add_tag => [ "es" ]` is not done here, so by default none of the
+ # events created by this initial parse phase will be added to
+ # Elasticsearch. Individual types should be tagged below. This is intended
+  # to keep spammy events out of the search index.
+ } # end [type] == "udp2log"
+
+
+ if "udp2log" in [tags] {
+
+ if [type] == "iegreview" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "iegreview"
+
+
+ if [type] == "scap" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "scap"
+
+
+ if [type] == "scholarships" {
+ # Parse message as json and put elements in event
+ json {
+ source => "message"
+ add_tag => [ "json", "es" ]
+ }
+ } # end [type] == "scholarships"
+
+ } # end "udp2log" in [tags]
+
+}
diff --git a/manifests/role/logstash.pp b/manifests/role/logstash.pp
index 73fd1e6..8528ed2 100644
--- a/manifests/role/logstash.pp
+++ b/manifests/role/logstash.pp
@@ -88,8 +88,8 @@
priority => 50,
}
- logstash::conf { 'filter_mw_via_udp2log':
- source => 'puppet:///files/logstash/filter-mw-via-udp2log.conf',
+    logstash::conf { 'filter-udp2log':
+ source => 'puppet:///files/logstash/filter-udp2log.conf',
priority => 50,
}
--
To view, visit https://gerrit.wikimedia.org/r/185482
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaf5e4557e6c6b85bf55b3ee116346b670d38331a
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BryanDavis <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits