Fdans has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/350234 )

Change subject: Flag requests sent by spiders/bots using AutomatedRequest schema
......................................................................

Flag requests sent by spiders/bots using AutomatedRequest schema

Bug: T67508
Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
---
M bin/eventlogging-processor
M eventlogging/event.py
M eventlogging/parse.py
M eventlogging/schema.py
M eventlogging/service.py
M eventlogging/utils.py
M tests/test_parser.py
7 files changed, 55 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/eventlogging refs/changes/34/350234/1

diff --git a/bin/eventlogging-processor b/bin/eventlogging-processor
index dd0c420..660cb07 100755
--- a/bin/eventlogging-processor
+++ b/bin/eventlogging-processor
@@ -36,10 +36,11 @@
 
 import argparse
 import logging
+import json
 
 from eventlogging import (capsule_uuid, create_event_error, LogParser,
                           get_reader, get_writer, validate, setup_logging,
-                          uri_force_raw, uri_append_query_items
+                          uri_force_raw, uri_append_query_items, AUTOMATED_SCID
                           )
 
 from jsonschema import ValidationError
@@ -121,12 +122,25 @@
         logging.error('Unable to create EventError object: %s' % e.message)
 
 
+def generate_automated_event(event):
+    event['schema'] = AUTOMATED_SCID[0]
+    event['revision'] = AUTOMATED_SCID[1]
+    spider_name = json.loads(event['userAgent'])['browser_family']
+    event['event'] = {
+        'spiderName': spider_name
+    }
+    return event
+
+
 for raw_event in get_reader(args.input):
     event = None
     try:
         event = parser.parse(raw_event)
         event.pop('clientValidated', None)
         event.pop('isTruncated', None)
+        if 'isAutomated' in event:
+            event.pop('isAutomated')
+            event = generate_automated_event(event)
         validate(event)
         event['uuid'] = capsule_uuid(event)
 
diff --git a/eventlogging/event.py b/eventlogging/event.py
index 8c59a36..faed4e7 100644
--- a/eventlogging/event.py
+++ b/eventlogging/event.py
@@ -15,8 +15,8 @@
 
 from .compat import json, string_types
 from .schema import (
-    get_schema, CAPSULE_SCID, ERROR_SCID, url_from_scid, scid_from_uri,
-    SCHEMA_RE_PATTERN
+    get_schema, CAPSULE_SCID, ERROR_SCID, url_from_scid,
+    scid_from_uri, SCHEMA_RE_PATTERN
 )
 from .topic import TopicNotFound
 from .utils import datetime_from_timestamp
diff --git a/eventlogging/parse.py b/eventlogging/parse.py
index 982cfa8..894ac91 100644
--- a/eventlogging/parse.py
+++ b/eventlogging/parse.py
@@ -42,7 +42,7 @@
 
 from .compat import json, unquote_plus, uuid5
 from .event import Event
-from .utils import parse_ua
+from .utils import parse_ua, flag_if_automated
 
 __all__ = (
     'LogParser', 'ncsa_to_unix',
@@ -154,6 +154,7 @@
         caster_key_pairs = [pair for pair in zip(self.casters, keys)
                             if pair[0]]
         event = {k: f(match.group(k)) for f, k in caster_key_pairs}
+        event = flag_if_automated(event)
         event.update(event.pop('capsule'))
         event['uuid'] = capsule_uuid(event)
         if ('userAgent' in event) and event['userAgent']:
diff --git a/eventlogging/schema.py b/eventlogging/schema.py
index e2334c2..b77efaa 100644
--- a/eventlogging/schema.py
+++ b/eventlogging/schema.py
@@ -24,7 +24,7 @@
 __all__ = (
     'cache_schema', 'get_schema', 'validate', 'init_schema_cache',
     'is_schema_cached', 'get_latest_schema_revision', 'CAPSULE_SCID',
-    'ERROR_SCID', 'SCHEMA_RE_PATTERN', 'get_schema_cache',
+    'ERROR_SCID', 'SCHEMA_RE_PATTERN', 'AUTOMATED_SCID', 'get_schema_cache',
     'schema_uri_from_scid', 'get_cached_scids', 'get_cached_schema_uris'
 )
 
@@ -69,6 +69,8 @@
 # TODO: Make new meta style EventError on meta.
 ERROR_SCID = ('EventError', 14035058)
 
+AUTOMATED_SCID = ('AutomatedRequest', 16638244)
+
 # Schemas retrieved via HTTP or files are cached in this dictionary.
 schema_cache = {}
 
diff --git a/eventlogging/service.py b/eventlogging/service.py
index 57d7566..f2d6bc7 100644
--- a/eventlogging/service.py
+++ b/eventlogging/service.py
@@ -468,8 +468,9 @@
     Returns the value you should use for the response Content-Type.
     This assumes the default response is json.
     """
+    headers = ['application/x-yaml', 'application/yaml']
     if ('Accept' in request_headers and
-        request_headers['Accept'] in ['application/x-yaml', 'application/yaml']):
+        request_headers['Accept'] in headers):
         return 'application/x-yaml; charset=UTF-8'
     else:
         return 'application/json; charset=UTF-8'
diff --git a/eventlogging/utils.py b/eventlogging/utils.py
index 482c03a..a0d4ab9 100644
--- a/eventlogging/utils.py
+++ b/eventlogging/utils.py
@@ -295,6 +295,26 @@
         logging.getLogger("kazoo").setLevel(logging.INFO)
 
 
+def flag_if_automated(event):
+    if 'userAgent' not in event:
+        return event
+    pattern = re.compile('(.*(bot|spider|WordPress|AppEngine|AppleDictionarySe'
+                         'rvice|Python-urllib|python-requests|Google-HTTP-Java'
+                         '-Client|[Ff]acebook|[Yy]ahoo|RockPeaks|http).*|(goo '
+                         'wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java|c'
+                         'url|PHP|Faraday|HTTPC|Ruby|\.NET|Python|Apache|Scrap'
+                         'y|PycURL|libwww|Zend|wget|nodemw|WinHttpRaw|Twisted|'
+                         'com\.eusoft|Lagotto|Peggo|Recuweb|check_http|Magnus|'
+                         'MLD|Jakarta|find-link|J\. River|projectplan9|ADmantX'
+                         '|httpunit|LWP|iNaturalist|WikiDemo|FSResearchIt|live'
+                         'door|Microsoft Monitoring|MediaWiki|User:|User_talk:'
+                         '|github|tools.wmflabs.org|Blackboard Safeassign|Damn'
+                         ' Small XSS|\S+@\S+\.[a-zA-Z]{2,3}).*)$')
+    if pattern.match(event['userAgent']):
+        event['isAutomated'] = True
+    return event
+
+
 def parse_ua(user_agent):
     """
     Returns a json string containing the parsed User Agent data
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 67128d5..077b2c3 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -98,6 +98,17 @@
         }
         self.assertEqual(parser.parse(raw), parsed)
 
+    def test_parser_automated_requests(self):
+        parser = eventlogging.LogParser(
+            '%q %{recvFrom}s %{seqId}d %t %o %{userAgent}i')
+        raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic'
+               '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2'
+               '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7'
+               'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3022.esa'
+               'ms.wikimedia.org 132073 2013-01-19T23:16:38 - '
+               'AppEngine-Google; (+http://code.google.com/appengine; appid: webetrex)')
+        self.assertEqual(parser.parse(raw)['isAutomated'], True)
+
     def test_parse_failure(self):
         """Parse failure raises ValueError exception."""
         parser = eventlogging.LogParser('%q %{recvFrom}s %t')

-- 
To view, visit https://gerrit.wikimedia.org/r/350234
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0320ad999d97b29f1b6aacdfa0c135ece34aaeff
Gerrit-PatchSet: 1
Gerrit-Project: eventlogging
Gerrit-Branch: master
Gerrit-Owner: Fdans <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to