Ottomata has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/350234 )

Change subject: Mark events as bots if they self-identify
......................................................................


Mark events as bots if they self-identify

Flags requests sent by spiders/bots, adding `is_bot` field to
user agent object in the event capsule.

Bug: T67508
Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
---
M eventlogging/parse.py
M eventlogging/service.py
M eventlogging/utils.py
M tests/test_parser.py
M tests/test_utils.py
5 files changed, 75 insertions(+), 7 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/eventlogging/parse.py b/eventlogging/parse.py
index 982cfa8..9e8a382 100644
--- a/eventlogging/parse.py
+++ b/eventlogging/parse.py
@@ -157,8 +157,7 @@
         event.update(event.pop('capsule'))
         event['uuid'] = capsule_uuid(event)
         if ('userAgent' in event) and event['userAgent']:
-            parsed_ua = parse_ua(event['userAgent'])
-            event['userAgent'] = parsed_ua
+            event['userAgent'] = parse_ua(event['userAgent'])
         return Event(event)
 
     def __repr__(self):
diff --git a/eventlogging/service.py b/eventlogging/service.py
index b8f9b35..86fac9b 100644
--- a/eventlogging/service.py
+++ b/eventlogging/service.py
@@ -468,8 +468,9 @@
     Returns the value you should use for the response Content-Type.
     This assumes the default response is json.
     """
+    headers = ['application/x-yaml', 'application/yaml']
     if ('Accept' in request_headers and
-        request_headers['Accept'] in ['application/x-yaml', 'application/yaml']):
+            request_headers['Accept'] in headers):
         return 'application/x-yaml; charset=UTF-8'
     else:
         return 'application/json; charset=UTF-8'
diff --git a/eventlogging/utils.py b/eventlogging/utils.py
index 482c03a..163747f 100644
--- a/eventlogging/utils.py
+++ b/eventlogging/utils.py
@@ -36,6 +36,22 @@
            'uri_delete_query_item', 'uri_append_query_items', 'uri_force_raw',
            'parse_etcd_uri', 'datetime_from_uuid1', 'datetime_from_timestamp')
 
+# Regex extending uaparser's bot/spider detection, comes from
+# Webrequest.java in refinery-source/core
+bot_ua_pattern = re.compile('(.*(bot|spider|WordPress|AppEngine|AppleDictionar'
+                            'yService|Python-urllib|python-requests|Google-HTT'
+                            'P-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks|http'
+                            ').*|(goo wikipedia|MediaWikiCrawler-Google|wikiwi'
+                            'x-bot|Java|curl|PHP|Faraday|HTTPC|Ruby|\.NET|Pyth'
+                            'on|Apache|Scrapy|PycURL|libwww|Zend|wget|nodemw|W'
+                            'inHttpRaw|Twisted|com\.eusoft|Lagotto|Peggo|Recuw'
+                            'eb|check_http|Magnus|MLD|Jakarta|find-link|J\. Ri'
+                            'ver|projectplan9|ADmantX|httpunit|LWP|iNaturalist'
+                            '|WikiDemo|FSResearchIt|livedoor|Microsoft Monitor'
+                            'ing|MediaWiki|User:|User_talk:|github|tools.wmfla'
+                            'bs.org|Blackboard Safeassign|Damn Small XSS|\S+@'
+                            '\S+\.[a-zA-Z]{2,3}).*)$')
+
 
 class PeriodicThread(threading.Thread):
     """Represents a threaded job that runs repeatedly at a regular interval."""
@@ -327,6 +343,8 @@
     formatted_ua['os_minor'] = parsed_ua['os']['minor']
     # default wmf_app_version is '-'
     formatted_ua['wmf_app_version'] = '-'
+    # is request a bot/spider?
+    formatted_ua['is_bot'] = is_bot(formatted_ua['device_family'], user_agent)
     app_ua = 'WikipediaApp/'
 
     if app_ua in user_agent:
@@ -337,3 +355,16 @@
     # escape json so it doesn't cause problems when validating
     # to string (per capsule definition)
     return json.dumps(formatted_ua)
+
+
+def is_bot(device_family, user_agent):
+    """
+    Tests the raw user agent string against a bot regular expression
+    if uaparser isn't already marking it as a spider
+    """
+    if device_family == 'Spider':
+        return True
+    elif device_family == 'Other':
+        ua_string = user_agent.strip('"')
+        return bool(bot_ua_pattern.match(ua_string))
+    return False
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 67128d5..faf7f3d 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -50,7 +50,8 @@
                 'browser_major': '10',
                 'browser_minor': '0',
                 'browser_family': 'Firefox',
-                'wmf_app_version': '-'
+                'wmf_app_version': '-',
+                'is_bot': False
             })
         parsed = {
             'uuid': '799341a01ba957c79b15dc4d2d950864',
@@ -98,6 +99,39 @@
         }
         self.assertEqual(parser.parse(raw), parsed)
 
+    def test_parser_bot_requests(self):
+        parser = eventlogging.LogParser(
+            '%q %{recvFrom}s %{seqId}d %t %o %{userAgent}i')
+        # Bot - recognised by uaparser
+        raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic'
+               '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2'
+               '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7'
+               'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3022.esa'
+               'ms.wikimedia.org 132073 2013-01-19T23:16:38 - '
+               'AppEngine-Google; (+http://code.google.com/appengine; appid'
+               ': webetrex)')
+        ua_map = json.loads(parser.parse(raw)['userAgent'])
+        self.assertEqual(ua_map['is_bot'], True)
+        # Bot - not recognised by uaparser
+        raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22G'
+               'eneric%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22artic'
+               'leId%22%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizond'
+               'o%22%7D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp30'
+               '22.esams.wikimedia.org 132073 2013-01-19T23:16:38 - '
+               'WikiDemo/10.2.0;')
+        ua_map = json.loads(parser.parse(raw)['userAgent'])
+        self.assertEqual(ua_map['is_bot'], True)
+        # Regular browser
+        raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22'
+               'Generic%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22arti'
+               'cleId%22%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizon'
+               'do%22%7D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3'
+               '022.esams.wikimedia.org 132073 2013-01-19T23:16:38 - '
+               'Mozilla/5.0 (X11; Linux x86_64; rv:10.0)'
+               ' Gecko/20100101 Firefox/10.0')
+        ua_map = json.loads(parser.parse(raw)['userAgent'])
+        self.assertEqual(ua_map['is_bot'], False)
+
     def test_parse_failure(self):
         """Parse failure raises ValueError exception."""
         parser = eventlogging.LogParser('%q %{recvFrom}s %t')
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 131d23e..45e6a03 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -154,7 +154,8 @@
             'browser_major': None,
             'browser_minor': None,
             'browser_family': 'Other',
-            'wmf_app_version': '5.3.3.1038'
+            'wmf_app_version': '5.3.3.1038',
+            'is_bot': False
         })
         self.assertEqual(json.loads(parsed),
                          json.loads(eventlogging.utils.parse_ua(ios_ua)))
@@ -169,7 +170,8 @@
             'browser_family': 'Android',
             'browser_minor': '4',
             'browser_major': '4',
-            'os_minor': '4'
+            'os_minor': '4',
+            'is_bot': False
         })
         self.assertEqual(json.loads(parsed),
                          json.loads(eventlogging.utils.parse_ua(android_ua)))
@@ -184,7 +186,8 @@
             'browser_major': None,
             'browser_minor': None,
             'browser_family': 'Other',
-            'wmf_app_version': '-'
+            'wmf_app_version': '-',
+            'is_bot': False
         })
         self.assertEqual(json.loads(parsed),
                          json.loads(eventlogging.utils.parse_ua(ua)))

-- 
To view, visit https://gerrit.wikimedia.org/r/350234
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
Gerrit-PatchSet: 14
Gerrit-Project: eventlogging
Gerrit-Branch: master
Gerrit-Owner: Fdans <[email protected]>
Gerrit-Reviewer: Fdans <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to