Fdans has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/350234 )
Change subject: Flag requests sent by spiders/bots using AutomatedRequest schema
......................................................................
Flag requests sent by spiders/bots using AutomatedRequest schema
Bug: T67508
Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
---
M bin/eventlogging-processor
M eventlogging/event.py
M eventlogging/parse.py
M eventlogging/schema.py
M eventlogging/service.py
M eventlogging/utils.py
M tests/test_parser.py
7 files changed, 55 insertions(+), 6 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/eventlogging refs/changes/34/350234/1
diff --git a/bin/eventlogging-processor b/bin/eventlogging-processor
index dd0c420..660cb07 100755
--- a/bin/eventlogging-processor
+++ b/bin/eventlogging-processor
@@ -36,10 +36,11 @@
import argparse
import logging
+import json
from eventlogging import (capsule_uuid, create_event_error, LogParser,
get_reader, get_writer, validate, setup_logging,
- uri_force_raw, uri_append_query_items
+ uri_force_raw, uri_append_query_items, AUTOMATED_SCID
)
from jsonschema import ValidationError
@@ -121,12 +122,25 @@
logging.error('Unable to create EventError object: %s' % e.message)
+def generate_automated_event(event):
+ event['schema'] = AUTOMATED_SCID[0]
+ event['revision'] = AUTOMATED_SCID[1]
+ spider_name = json.loads(event['userAgent'])['browser_family']
+ event['event'] = {
+ 'spiderName': spider_name
+ }
+ return event
+
+
for raw_event in get_reader(args.input):
event = None
try:
event = parser.parse(raw_event)
event.pop('clientValidated', None)
event.pop('isTruncated', None)
+ if 'isAutomated' in event:
+ event.pop('isAutomated')
+ event = generate_automated_event(event)
validate(event)
event['uuid'] = capsule_uuid(event)
diff --git a/eventlogging/event.py b/eventlogging/event.py
index 8c59a36..faed4e7 100644
--- a/eventlogging/event.py
+++ b/eventlogging/event.py
@@ -15,8 +15,8 @@
from .compat import json, string_types
from .schema import (
- get_schema, CAPSULE_SCID, ERROR_SCID, url_from_scid, scid_from_uri,
- SCHEMA_RE_PATTERN
+ get_schema, CAPSULE_SCID, ERROR_SCID, url_from_scid,
+ scid_from_uri, SCHEMA_RE_PATTERN
)
from .topic import TopicNotFound
from .utils import datetime_from_timestamp
diff --git a/eventlogging/parse.py b/eventlogging/parse.py
index 982cfa8..894ac91 100644
--- a/eventlogging/parse.py
+++ b/eventlogging/parse.py
@@ -42,7 +42,7 @@
from .compat import json, unquote_plus, uuid5
from .event import Event
-from .utils import parse_ua
+from .utils import parse_ua, flag_if_automated
__all__ = (
'LogParser', 'ncsa_to_unix',
@@ -154,6 +154,7 @@
caster_key_pairs = [pair for pair in zip(self.casters, keys)
if pair[0]]
event = {k: f(match.group(k)) for f, k in caster_key_pairs}
+ event = flag_if_automated(event)
event.update(event.pop('capsule'))
event['uuid'] = capsule_uuid(event)
if ('userAgent' in event) and event['userAgent']:
diff --git a/eventlogging/schema.py b/eventlogging/schema.py
index e2334c2..b77efaa 100644
--- a/eventlogging/schema.py
+++ b/eventlogging/schema.py
@@ -24,7 +24,7 @@
__all__ = (
'cache_schema', 'get_schema', 'validate', 'init_schema_cache',
'is_schema_cached', 'get_latest_schema_revision', 'CAPSULE_SCID',
- 'ERROR_SCID', 'SCHEMA_RE_PATTERN', 'get_schema_cache',
+ 'ERROR_SCID', 'SCHEMA_RE_PATTERN', 'AUTOMATED_SCID', 'get_schema_cache',
'schema_uri_from_scid', 'get_cached_scids', 'get_cached_schema_uris'
)
@@ -69,6 +69,8 @@
# TODO: Make new meta style EventError on meta.
ERROR_SCID = ('EventError', 14035058)
+AUTOMATED_SCID = ('AutomatedRequest', 16638244)
+
# Schemas retrieved via HTTP or files are cached in this dictionary.
schema_cache = {}
diff --git a/eventlogging/service.py b/eventlogging/service.py
index 57d7566..f2d6bc7 100644
--- a/eventlogging/service.py
+++ b/eventlogging/service.py
@@ -468,8 +468,9 @@
Returns the value you should use for the response Content-Type.
This assumes the default response is json.
"""
+ headers = ['application/x-yaml', 'application/yaml']
if ('Accept' in request_headers and
- request_headers['Accept'] in ['application/x-yaml', 'application/yaml']):
+ request_headers['Accept'] in headers):
return 'application/x-yaml; charset=UTF-8'
else:
return 'application/json; charset=UTF-8'
diff --git a/eventlogging/utils.py b/eventlogging/utils.py
index 482c03a..a0d4ab9 100644
--- a/eventlogging/utils.py
+++ b/eventlogging/utils.py
@@ -295,6 +295,26 @@
logging.getLogger("kazoo").setLevel(logging.INFO)
+def flag_if_automated(event):
+ if 'userAgent' not in event:
+ return event
+ pattern = re.compile('(.*(bot|spider|WordPress|AppEngine|AppleDictionarySe'
+ 'rvice|Python-urllib|python-requests|Google-HTTP-Java'
+ '-Client|[Ff]acebook|[Yy]ahoo|RockPeaks|http).*|(goo '
+ 'wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java|c'
+ 'url|PHP|Faraday|HTTPC|Ruby|\.NET|Python|Apache|Scrap'
+ 'y|PycURL|libwww|Zend|wget|nodemw|WinHttpRaw|Twisted|'
+ 'com\.eusoft|Lagotto|Peggo|Recuweb|check_http|Magnus|'
+ 'MLD|Jakarta|find-link|J\. River|projectplan9|ADmantX'
+ '|httpunit|LWP|iNaturalist|WikiDemo|FSResearchIt|live'
+ 'door|Microsoft Monitoring|MediaWiki|User:|User_talk:'
+ '|github|tools.wmflabs.org|Blackboard Safeassign|Damn'
+ ' Small XSS|\S+@\S+\.[a-zA-Z]{2,3}).*)$')
+ if pattern.match(event['userAgent']):
+ event['isAutomated'] = True
+ return event
+
+
def parse_ua(user_agent):
"""
Returns a json string containing the parsed User Agent data
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 67128d5..077b2c3 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -98,6 +98,17 @@
}
self.assertEqual(parser.parse(raw), parsed)
+ def test_parser_automated_requests(self):
+ parser = eventlogging.LogParser(
+ '%q %{recvFrom}s %{seqId}d %t %o %{userAgent}i')
+ raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic'
+ '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2'
+ '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7'
+ 'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3022.esa'
+ 'ms.wikimedia.org 132073 2013-01-19T23:16:38 - '
'AppEngine-Google; (+http://code.google.com/appengine; appid: webetrex)')
+ self.assertEqual(parser.parse(raw)['isAutomated'], True)
+
def test_parse_failure(self):
"""Parse failure raises ValueError exception."""
parser = eventlogging.LogParser('%q %{recvFrom}s %t')
--
To view, visit https://gerrit.wikimedia.org/r/350234
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
Gerrit-PatchSet: 1
Gerrit-Project: eventlogging
Gerrit-Branch: master
Gerrit-Owner: Fdans <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits