Madhuvishy has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/271728

Change subject: [WIP] Allow sensitive fields like clientIP to be collected only 
by schemas that need it
......................................................................

[WIP] Allow sensitive fields like clientIP to be collected only by schemas that 
need it

Bug: T126366
Change-Id: I1fcc8becfe4a41018b9986e5fb8a978344d5952b
---
M eventlogging/parse.py
1 file changed, 26 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/eventlogging 
refs/changes/28/271728/1

diff --git a/eventlogging/parse.py b/eventlogging/parse.py
index cb61d6e..4b4bc10 100644
--- a/eventlogging/parse.py
+++ b/eventlogging/parse.py
@@ -69,6 +69,13 @@
 # used to anonymize IP addresses.
 KEY_LIFESPAN = datetime.timedelta(days=90)
 
+# Define special sensitive fields that may be included in the schema, and
+# their mapping to keys extracted from raw Eventlogging data. These
+# fields are only stored with the event if they are specified
+# explicitly in the schema. Any information that is already present in those
+# fields will be overridden by eventlogging. So far only clientIp is supported.
+SENSITIVE_FIELDS_MAP = {'__clientIp': 'clientIp'}
+
 
 def capsule_uuid(capsule):
     """Generate a UUID for a capsule object.
@@ -158,6 +165,24 @@
         self.casters.append(caster)
         return matcher
 
+    def drop_sensitive_fields(self, event):
+        """Sensitive fields like ip are only included if the special __ip
+        like field name is present in the Schema. Drop them if the special
+        fields listed in SENSITIVE_FIELDS_MAP are absent in the JSON schema.
+        """
+        for field, key in SENSITIVE_FIELDS_MAP.iteritems():
+            # If the special field is present, pop it - the actual value
+            # associated with the key is left untouched. E.g., if __clientIp
+            # is present, pop __clientIp. clientIp is already extracted and
+            # remains in the event
+            if field in event.schema():
+                event.pop(field)
+            # If the field is not present remove the actual key. e.g clientIp
+            # is popped if __clientIp is not present in the schema.
+            else:
+                event.pop(key)
+        return event
+
     def parse(self, line):
         """Parse a log line into a map of field names / values."""
         match = self.re.match(line)
@@ -167,7 +192,7 @@
         event = {k: f(match.group(k)) for f, k in zip(self.casters, keys)}
         event.update(event.pop('capsule'))
         event['uuid'] = capsule_uuid(event)
-        return Event(event)
+        return self.drop_sensitive_fields(Event(event))
 
     def __repr__(self):
         return '<LogParser(\'%s\')>' % self.format

-- 
To view, visit https://gerrit.wikimedia.org/r/271728
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1fcc8becfe4a41018b9986e5fb8a978344d5952b
Gerrit-PatchSet: 1
Gerrit-Project: eventlogging
Gerrit-Branch: master
Gerrit-Owner: Madhuvishy <mviswanat...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to