Ottomata has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/350234 )
Change subject: Mark events as bots if they self-identify
......................................................................
Mark events as bots if they self-identify
Flags requests sent by spiders/bots, adding `is_bot` field to
user agent object in the event capsule.
Bug: T67508
Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
---
M eventlogging/parse.py
M eventlogging/service.py
M eventlogging/utils.py
M tests/test_parser.py
M tests/test_utils.py
5 files changed, 75 insertions(+), 7 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
diff --git a/eventlogging/parse.py b/eventlogging/parse.py
index 982cfa8..9e8a382 100644
--- a/eventlogging/parse.py
+++ b/eventlogging/parse.py
@@ -157,8 +157,7 @@
event.update(event.pop('capsule'))
event['uuid'] = capsule_uuid(event)
if ('userAgent' in event) and event['userAgent']:
- parsed_ua = parse_ua(event['userAgent'])
- event['userAgent'] = parsed_ua
+ event['userAgent'] = parse_ua(event['userAgent'])
return Event(event)
def __repr__(self):
diff --git a/eventlogging/service.py b/eventlogging/service.py
index b8f9b35..86fac9b 100644
--- a/eventlogging/service.py
+++ b/eventlogging/service.py
@@ -468,8 +468,9 @@
Returns the value you should use for the response Content-Type.
This assumes the default response is json.
"""
+ headers = ['application/x-yaml', 'application/yaml']
if ('Accept' in request_headers and
- request_headers['Accept'] in ['application/x-yaml',
- 'application/yaml']):
+ request_headers['Accept'] in headers):
return 'application/x-yaml; charset=UTF-8'
else:
return 'application/json; charset=UTF-8'
diff --git a/eventlogging/utils.py b/eventlogging/utils.py
index 482c03a..163747f 100644
--- a/eventlogging/utils.py
+++ b/eventlogging/utils.py
@@ -36,6 +36,22 @@
'uri_delete_query_item', 'uri_append_query_items', 'uri_force_raw',
'parse_etcd_uri', 'datetime_from_uuid1', 'datetime_from_timestamp')
+# Regex extending uaparser's bot/spider detection, comes from
+# Webrequest.java in refinery-source/core
+bot_ua_pattern = re.compile('(.*(bot|spider|WordPress|AppEngine|AppleDictionar'
+ 'yService|Python-urllib|python-requests|Google-HTT'
+ 'P-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks|http'
+ ').*|(goo wikipedia|MediaWikiCrawler-Google|wikiwi'
+ 'x-bot|Java|curl|PHP|Faraday|HTTPC|Ruby|\.NET|Pyth'
+ 'on|Apache|Scrapy|PycURL|libwww|Zend|wget|nodemw|W'
+ 'inHttpRaw|Twisted|com\.eusoft|Lagotto|Peggo|Recuw'
+ 'eb|check_http|Magnus|MLD|Jakarta|find-link|J\. Ri'
+ 'ver|projectplan9|ADmantX|httpunit|LWP|iNaturalist'
+ '|WikiDemo|FSResearchIt|livedoor|Microsoft Monitor'
+ 'ing|MediaWiki|User:|User_talk:|github|tools.wmfla'
+ 'bs.org|Blackboard Safeassign|Damn Small XSS|\S+@'
+ '\S+\.[a-zA-Z]{2,3}).*)$')
+
class PeriodicThread(threading.Thread):
"""Represents a threaded job that runs repeatedly at a regular interval."""
@@ -327,6 +343,8 @@
formatted_ua['os_minor'] = parsed_ua['os']['minor']
# default wmf_app_version is '-'
formatted_ua['wmf_app_version'] = '-'
+ # is request a bot/spider?
+ formatted_ua['is_bot'] = is_bot(formatted_ua['device_family'], user_agent)
app_ua = 'WikipediaApp/'
if app_ua in user_agent:
@@ -337,3 +355,16 @@
# escape json so it doesn't cause problems when validating
# to string (per capsule definition)
return json.dumps(formatted_ua)
+
+
+def is_bot(device_family, user_agent):
+ """
+ Tests the raw user agent string against a bot regular expression
+ if uaparser isn't already marking it as a spider
+ """
+ if device_family == 'Spider':
+ return True
+ elif device_family == 'Other':
+ ua_string = user_agent.strip('"')
+ return bool(bot_ua_pattern.match(ua_string))
+ return False
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 67128d5..faf7f3d 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -50,7 +50,8 @@
'browser_major': '10',
'browser_minor': '0',
'browser_family': 'Firefox',
- 'wmf_app_version': '-'
+ 'wmf_app_version': '-',
+ 'is_bot': False
})
parsed = {
'uuid': '799341a01ba957c79b15dc4d2d950864',
@@ -98,6 +99,39 @@
}
self.assertEqual(parser.parse(raw), parsed)
+ def test_parser_bot_requests(self):
+ parser = eventlogging.LogParser(
+ '%q %{recvFrom}s %{seqId}d %t %o %{userAgent}i')
+ # Bot - recognised by uaparser
+ raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic'
+ '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2'
+ '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7'
+ 'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3022.esa'
+ 'ms.wikimedia.org 132073 2013-01-19T23:16:38 - '
+ 'AppEngine-Google; (+http://code.google.com/appengine; appid'
+ ': webetrex)')
+ ua_map = json.loads(parser.parse(raw)['userAgent'])
+ self.assertEqual(ua_map['is_bot'], True)
+ # Bot - not recognised by uaparser
+ raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22G'
+ 'eneric%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22artic'
+ 'leId%22%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizond'
+ 'o%22%7D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp30'
+ '22.esams.wikimedia.org 132073 2013-01-19T23:16:38 - '
+ 'WikiDemo/10.2.0;')
+ ua_map = json.loads(parser.parse(raw)['userAgent'])
+ self.assertEqual(ua_map['is_bot'], True)
+ # Regular browser
+ raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22'
+ 'Generic%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22arti'
+ 'cleId%22%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizon'
+ 'do%22%7D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3'
+ '022.esams.wikimedia.org 132073 2013-01-19T23:16:38 - '
+ 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0)'
+ ' Gecko/20100101 Firefox/10.0')
+ ua_map = json.loads(parser.parse(raw)['userAgent'])
+ self.assertEqual(ua_map['is_bot'], False)
+
def test_parse_failure(self):
"""Parse failure raises ValueError exception."""
parser = eventlogging.LogParser('%q %{recvFrom}s %t')
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 131d23e..45e6a03 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -154,7 +154,8 @@
'browser_major': None,
'browser_minor': None,
'browser_family': 'Other',
- 'wmf_app_version': '5.3.3.1038'
+ 'wmf_app_version': '5.3.3.1038',
+ 'is_bot': False
})
self.assertEqual(json.loads(parsed),
json.loads(eventlogging.utils.parse_ua(ios_ua)))
@@ -169,7 +170,8 @@
'browser_family': 'Android',
'browser_minor': '4',
'browser_major': '4',
- 'os_minor': '4'
+ 'os_minor': '4',
+ 'is_bot': False
})
self.assertEqual(json.loads(parsed),
json.loads(eventlogging.utils.parse_ua(android_ua)))
@@ -184,7 +186,8 @@
'browser_major': None,
'browser_minor': None,
'browser_family': 'Other',
- 'wmf_app_version': '-'
+ 'wmf_app_version': '-',
+ 'is_bot': False
})
self.assertEqual(json.loads(parsed),
json.loads(eventlogging.utils.parse_ua(ua)))
--
To view, visit https://gerrit.wikimedia.org/r/350234
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
Gerrit-PatchSet: 14
Gerrit-Project: eventlogging
Gerrit-Branch: master
Gerrit-Owner: Fdans <[email protected]>
Gerrit-Reviewer: Fdans <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits