Adamw has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/53279


Change subject: barebones banner impression log parser
......................................................................

barebones banner impression log parser

Change-Id: Id9df44bda7942d86c9d5492c73dda3f988c96081
---
A banner_logs/config.py.example
A banner_logs/db.py
A banner_logs/impressions.py
A banner_logs/squid.py
4 files changed, 134 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools refs/changes/79/53279/1

diff --git a/banner_logs/config.py.example b/banner_logs/config.py.example
new file mode 100644
index 0000000..a7a495b
--- /dev/null
+++ b/banner_logs/config.py.example
@@ -0,0 +1,7 @@
+# Keyword arguments that db.get_db() expands into db.Connection(...).
+# Fill in the empty values for the target database.
+db_params = dict(
+    host='localhost',
+    db='',
+    user='',
+    passwd='',
+    debug=False,
+)
diff --git a/banner_logs/db.py b/banner_logs/db.py
new file mode 100644
index 0000000..11d226a
--- /dev/null
+++ b/banner_logs/db.py
@@ -0,0 +1,33 @@
+'''
+Mysql wrapper
+'''
+import MySQLdb as Dbi
+
+class Connection(object):
+    '''
+    Thin wrapper around a single MySQLdb connection.
+
+    host, user, passwd and db are passed straight to MySQLdb.connect();
+    when debug is true, every statement is printed before it is run.
+    '''
+    def __init__(self, host=None, user=None, passwd=None, db=None, debug=False):
+        self.db_conn = Dbi.connect(host=host, user=user, passwd=passwd, db=db)
+        self.debug = debug
+
+    def close(self):
+        '''Currently a no-op: the commit call below is commented out.'''
+        #self.db_conn.commit()
+        pass
+
+    def execute(self, sql, params=None):
+        '''
+        Run one statement right away and return an iterator over its rows.
+
+        sql    -- statement text using MySQLdb placeholders
+        params -- values for the placeholders (sequence or mapping);
+                  defaults to an empty mapping
+
+        Each row is a dict keyed by column name.  The statement is executed
+        before this method returns, so callers that ignore the result (an
+        INSERT, for example) still reach the server instead of silently
+        doing nothing until the result is iterated.
+        '''
+        if params is None:
+            # Fresh mapping per call rather than one shared default object.
+            params = {}
+        cursor = self.db_conn.cursor(cursorclass=Dbi.cursors.DictCursor)
+        try:
+            if self.debug:
+                print sql
+            cursor.execute(sql, params)
+            rows = cursor.fetchall()
+        finally:
+            # Release the cursor even when the statement fails.
+            cursor.close()
+        # Still an iterator, so existing "for row in ..." callers are unaffected.
+        return iter(rows)
+
+
+# Shared connection; stays False until get_db() creates it.
+db_conn = False
+
+def get_db():
+    '''
+    Return the module-wide Connection, building it from config.db_params
+    the first time it is requested.
+    '''
+    global db_conn
+    import config
+    if db_conn:
+        return db_conn
+    db_conn = Connection(**config.db_params)
+    return db_conn
diff --git a/banner_logs/impressions.py b/banner_logs/impressions.py
new file mode 100755
index 0000000..bb25afa
--- /dev/null
+++ b/banner_logs/impressions.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+import sys
+import re
+from urlparse import urlparse, parse_qs
+from gzip import GzipFile
+
+from squid import squidline
+import db
+import config
+
+def import_log(filename):
+    '''
+    Read a gzip-compressed squid log and record every line that parses.
+
+    filename -- path of the gzipped log file
+
+    Lines that do not match squidline are reported through log() and
+    skipped.
+    '''
+    # TODO: begin transaction
+    infile = GzipFile(filename, "r")
+    try:
+        for line in infile:
+            # squidline is already compiled, so call its match() directly.
+            matches = squidline.match(line)
+            if matches:
+                record_line(parse_line(matches.groupdict()))
+            else:
+                log("Irregular line: %s" % line)
+    finally:
+        # Close the file even if parsing or the database write raises.
+        infile.close()
+
+def parse_line(squid):
+    '''
+    Reduce the named groups of one squid log line to the fields we store.
+
+    squid -- dict of squidline groups; 'timestamp', 'referrer' and 'url'
+             are read
+
+    Returns a dict with 'timestamp', 'secure' (True when the referrer
+    starts with https://) and whichever of the legal query parameters
+    appear in the request URL, each as a plain string.
+    '''
+    out = {
+        'timestamp': squid['timestamp'],
+        'secure': squid['referrer'].startswith('https://'),
+    }
+    query_params = parse_qs(urlparse(squid['url']).query)
+    legal_params = (
+        'userlang', 'country', 'db', 'project', 'reason', 'result', 'bucket',
+    )
+    for key in legal_params:
+        if key in query_params:
+            # parse_qs maps every name to a *list* of values; keep the first
+            # so the value can be bound as a single column value.
+            out[key] = query_params[key][0]
+
+    return out
+
+def record_line(terms):
+    '''
+    Insert one parsed impression into the banner_impressions table.
+
+    terms -- dict mapping column name to value, as built by parse_line()
+    '''
+    names = terms.keys()
+    # One "column = %s" pair per term: the doubled %% leaves a literal %s
+    # placeholder in the statement, so the driver binds each value instead
+    # of the statement ending in a bare "column = ".
+    columns = ", ".join(["%s = %%s" % k for k in names])
+    params = [terms[k] for k in names]
+
+    sql = """
+INSERT INTO banner_impressions SET %s
+""" % columns
+    # execute() hands back an iterator that may be lazy; drain it so the
+    # INSERT is certain to be sent.
+    for _ in db.get_db().execute(sql, params):
+        pass
+
+def log(msg):
+    '''Print msg on standard output.'''
+    print(msg)
+
+if __name__ == '__main__':
+    # Every command-line argument is the path of one log file to import.
+    paths = sys.argv[1:]
+    if not paths:
+        log("NEED ARGS")
+    for path in paths:
+        import_log(path)
diff --git a/banner_logs/squid.py b/banner_logs/squid.py
new file mode 100644
index 0000000..a7b9008
--- /dev/null
+++ b/banner_logs/squid.py
@@ -0,0 +1,40 @@
+import re
+
+# Regex based on http://wikitech.wikimedia.org/view/Squid_log_format
+#
+# Compiled with re.VERBOSE, so the whitespace and "#" comments inside the
+# pattern are ignored.  Each log field is captured as a named group, and
+# consecutive fields are separated by one whitespace character (with
+# optional dashes allowed before the sequence number).
+# NOTE(review): the last three groups (useragent, acceptlanguage, xcarrier)
+# use [\S\s]+, which also matches whitespace, so where one ends and the
+# next begins is settled by greedy backtracking -- confirm against real
+# log lines.
+squidline = re.compile(
+    r"""
+        ^
+        (?P<squid>[\S]+) # Name of the squid server
+        \s[-]*
+        (?P<sequence>[0-9]+) # Sequence ID from the squid server
+        \s
+        (?P<timestamp>[0-9-]+T[0-9:.]+) # Timestamp
+        \s
+        (?P<servicetime>[0-9.]+) # Request service time
+        \s
+        (?P<client>[\S]+) # Client IP address
+        \s
+        (?P<squidstatus>[\S]+) # Squid request status and HTTP status code
+        \s
+        (?P<reply>[0-9]+) # Reply size including HTTP headers
+        \s
+        (?P<request>[\S]+) # Request type
+        \s
+        (?P<url>[\S]+) # Request URL
+        \s
+        (?P<squidhierarchy>[\S]+) # Squid hierarchy status, peer IP
+        \s
+        (?P<mime>[\S]+) # MIME content type
+        \s
+        (?P<referrer>[\S]+) # Referer header
+        \s
+        (?P<xff>[\S]+) # X-Forwarded-For header
+        \s
+        (?P<useragent>[\S\s]+) # User-Agent header
+        \s
+        (?P<acceptlanguage>[\S\s]+) # Accept-Language header
+        \s
+        (?P<xcarrier>[\S\s]+) # X-carrier header
+        $
+    """, re.VERBOSE
+)

-- 
To view, visit https://gerrit.wikimedia.org/r/53279
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id9df44bda7942d86c9d5492c73dda3f988c96081
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Adamw <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to