Milimetric has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/393613 )
Change subject: Prefer event capsule defined fields over those found in raw log line ...................................................................... Prefer event capsule defined fields over those found in raw log line Currently, if a an event producer sets capsule level fields AND those capsule level fields have format specifiers that are used to parse that field out of the raw log line separate from the incoming event data, the incoming event data will overwrite whatever was parsed from the raw non-event data and casted into that field. This caused a bug with the Popups schema and userAgent fields. The Popups EventLogging extension code would set userAgent sometimes: https://github.com/wikimedia/mediawiki-extensions-EventLogging/blob/28f871d38c0f32e7b641fa9364e58f4b7666633e/includes/EventLogging.php#L71 With the code before this patch, this means that the raw userAgent would be used, rather than the parsed one. After this patch, the submitted event capsule's userAgent will be parsed by the parse_ua caster function. I hate the capsule. Bug: T178440 Change-Id: I37ca9e9bfce7c1c8dae8d8788b8f5c9d20b8a309 --- M eventlogging/parse.py M tests/test_parser.py 2 files changed, 123 insertions(+), 13 deletions(-) Approvals: Milimetric: Verified; Looks good to me, approved diff --git a/eventlogging/parse.py b/eventlogging/parse.py index b40c293..40fa2d2 100644 --- a/eventlogging/parse.py +++ b/eventlogging/parse.py @@ -179,16 +179,69 @@ """Parse a log line into a map of field names / values.""" match = self.re.match(line) if match is None: - raise ValueError(self.re, line) - keys = sorted(match.groupdict(), key=match.start) - # Filter out the caster-key pairs where caster is None - caster_key_pairs = [pair for pair in zip(self.casters, keys) - if pair[0]] - event = {k: f(match.group(k)) for f, k in caster_key_pairs} - event.update(event.pop('capsule')) - event['uuid'] = capsule_uuid(event) + raise ValueError(self.re.pattern, line) - return Event(event) + # Dict of capture group name to matched value, e.g userAgent: "..." + matches = match.groupdict() + + if 'capsule' not in matches.keys(): + raise ValueError( + '\'capsule\' was not matched in line, but it is required', + (self.re.pattern, line) + ) + + # Just the matched capture group names. + keys = sorted(matches, key=match.start) + + # Build a dict of capture group names to caster functions. + # This works because self.casters is a list of functions in the + # same order of keys returned by match.groupdict. + # Also filter out the casters where caster is None, so we omit it. + caster_dict = dict([pair for pair in zip(keys, self.casters) + if pair[1]]) + + # 'capsule' is a required format specifier. + # Parse it out now as the main event. + capsule = caster_dict['capsule'](matches['capsule']) + # capsule at this point MUST be a dict + if not isinstance(capsule, dict): + raise ValueError( + 'capsule was successfully parsed, but not as an object.', + capsule + ) + + # Apply caster functions to event 'capsule' + # level fields that also have casters defined. + for k in (set(capsule.keys()) & set(caster_dict.keys())): + capsule[k] = caster_dict[k](capsule[k]) + + # For other fields that have been parsed out of the raw event log line + # Apply their specifier 'caster' functions. Only use the data from + # the raw line if and only if it is not already present in the + # capsule data. + # This is how we keep user submitted event data in the capsule even if + # there is data for that field in the raw line. E.g. if the user sent + # an event capsule with a 'userAgent' field already in it, we will use + # the user sent userAgent, not the one that might be parsed from the + # raw log line. Note that in this example, the userAgent sent in + # the event capsule data will only be 'cast' (parsed) if the %u + # speficier was used, and the caster to a 'userAgent' field is + # defined. This is true for any field. If it is not in the format + # string, it will not be 'cast', but just used as is. + parsed_fields_from_line = { + k: f(matches[k]) for k, f in caster_dict.items() + # skip the 'capsule' caster, since we already did it, + # and also skip any casters that are already in the + # capsule, since we already did those too. + if k != 'capsule' and k not in capsule.keys() + } + # Add the parsed fields to the event capsule. + capsule.update(parsed_fields_from_line) + + # Add a uuid for this event. + capsule['uuid'] = capsule_uuid(capsule) + + return Event(capsule) def __repr__(self): return '<LogParser(\'%s\')>' % self.format diff --git a/tests/test_parser.py b/tests/test_parser.py index a27d339..9c1ac81 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -37,8 +37,8 @@ raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic' '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2' '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7' - 'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; cp3022.esa' - 'ms.wikimedia.org 132073 2013-01-19T23:16:38 - ' + 'D%2C%22webHost%22%3A%22test.wikipedia.org%22%7D; srv0.example.org ' + '132073 2013-01-19T23:16:38 - ' 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0)' ' Gecko/20100101 Firefox/10.0') ua = { @@ -54,8 +54,8 @@ 'is_mediawiki': False } parsed = { - 'uuid': '5202f558f6aa5894978062d7aa039486', - 'recvFrom': 'cp3022.esams.wikimedia.org', + 'uuid': 'e6b17e1f661155bd8523d14c87d0600b', + 'recvFrom': 'srv0.example.org', 'wiki': 'testwiki', 'webHost': 'test.wikipedia.org', 'seqId': 132073, @@ -77,6 +77,63 @@ self.assertEqual(fromParser[key], parsed[key]) + + def test_parse_capsule_user_agent(self): + """ + Parser test: client-side events with userAgent in submitted capsule. + If a capsule has a field that is also parsed from the raw event line, + the capsule's field should be preferred. + """ + parser = eventlogging.LogParser( + '%q %{recvFrom}s %{seqId}d %D %o %u') + raw = ('?%7B%22wiki%22%3A%22testwiki%22%2C%22schema%22%3A%22Generic' + '%22%2C%22revision%22%3A13%2C%22event%22%3A%7B%22articleId%2' + '2%3A1%2C%22articleTitle%22%3A%22H%C3%A9ctor%20Elizondo%22%7' + 'D%2C%22webHost%22%3A%22test.wikipedia.org%22%2C' + '%22userAgent%22%3A%22Mozilla%2F5.0%5Cu0020%28Windows%5C' + 'u0020NT%5Cu00206.1%3B%5Cu0020WOW64%29%5Cu0020AppleWebKit' + '%2F537.36%5Cu0020%28KHTML%2C%5Cu0020like%5Cu0020Gecko%29' + '%5Cu0020Chrome%2F61.0.3163.100%5Cu0020Safari%2F537.36%22' + '%7D; srv0.example.org 132073 2013-01-19T23:16:38 - ' + 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0)' + ' Gecko/20100101 Firefox/10.0') + # This ua is the parsed userAgent from inside the event capsule, NOT + # the last field in the log line. + ua = { + 'browser_family': 'Chrome', + 'browser_major': '61', + 'browser_minor': '0', + 'device_family': 'Other', + 'is_bot': False, + 'is_mediawiki': False, + 'os_family': 'Windows 7', + 'os_major': None, + 'os_minor': None, + 'wmf_app_version': '-' + } + parsed = { + 'uuid': 'e6b17e1f661155bd8523d14c87d0600b', + 'recvFrom': 'srv0.example.org', + 'wiki': 'testwiki', + 'webHost': 'test.wikipedia.org', + 'seqId': 132073, + 'dt': '2013-01-19T23:16:38', + 'schema': 'Generic', + 'revision': 13, + 'userAgent': ua, + 'event': { + 'articleTitle': 'Héctor Elizondo', + 'articleId': 1 + } + } + fromParser = parser.parse(raw) + for key in parsed: + if key == 'userAgent': + self.assertEqual(parsed[key], + fromParser[key]) + else: + self.assertEqual(fromParser[key], parsed[key]) + def test_parser_bot_requests(self): parser = eventlogging.LogParser( '%q %{recvFrom}s %{seqId}d %D %o %u') -- To view, visit https://gerrit.wikimedia.org/r/393613 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I37ca9e9bfce7c1c8dae8d8788b8f5c9d20b8a309 Gerrit-PatchSet: 7 Gerrit-Project: eventlogging Gerrit-Branch: master Gerrit-Owner: Ottomata <[email protected]> Gerrit-Reviewer: Krinkle <[email protected]> Gerrit-Reviewer: Milimetric <[email protected]> Gerrit-Reviewer: Ottomata <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
