Adamw has uploaded a new change for review.
https://gerrit.wikimedia.org/r/53279
Change subject: barebones banner impression log parser
......................................................................
barebones banner impression log parser
Change-Id: Id9df44bda7942d86c9d5492c73dda3f988c96081
---
A banner_logs/config.py.example
A banner_logs/db.py
A banner_logs/impressions.py
A banner_logs/squid.py
4 files changed, 134 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools
refs/changes/79/53279/1
diff --git a/banner_logs/config.py.example b/banner_logs/config.py.example
new file mode 100644
index 0000000..a7a495b
--- /dev/null
+++ b/banner_logs/config.py.example
@@ -0,0 +1,7 @@
+# Connection settings; db.get_db() passes these as keyword arguments to
+# db.Connection. Fill in 'db', 'user' and 'passwd' before use.
+# With debug=True, Connection.execute prints each SQL statement it runs.
+db_params = dict(
+    host='localhost',
+    db='',
+    user='',
+    passwd='',
+    debug=False,
+)
diff --git a/banner_logs/db.py b/banner_logs/db.py
new file mode 100644
index 0000000..11d226a
--- /dev/null
+++ b/banner_logs/db.py
@@ -0,0 +1,33 @@
+'''
+Mysql wrapper
+'''
+import MySQLdb as Dbi
+
+class Connection(object):
+    '''
+    Thin wrapper around a single MySQLdb connection.
+
+    host, user, passwd and db are handed straight to MySQLdb.connect().
+    When debug is true, every statement passed to execute() is printed
+    before it is run.
+    '''
+
+    def __init__(self, host=None, user=None, passwd=None, db=None,
+                 debug=False):
+        self.db_conn = Dbi.connect(host=host, user=user, passwd=passwd, db=db)
+        self.debug = debug
+
+    def close(self):
+        '''
+        Currently a no-op: nothing is committed and the connection stays open.
+        '''
+        # NOTE(review): the commit below is disabled in the original change;
+        # confirm whether pending writes should be committed here.
+        #self.db_conn.commit()
+        pass
+
+    def execute(self, sql, params=None):
+        '''
+        Run sql immediately and return all result rows as dicts.
+
+        sql    -- statement text, using the driver's placeholders
+        params -- values bound to the placeholders (defaults to none)
+
+        The statement is executed eagerly, so callers that ignore the
+        return value (e.g. for an INSERT) still have their statement run.
+        The returned rows can be iterated exactly like the previous
+        generator-based result.
+        '''
+        if params is None:
+            # Fresh container per call instead of a shared mutable default.
+            params = {}
+        cursor = self.db_conn.cursor(cursorclass=Dbi.cursors.DictCursor)
+        try:
+            if self.debug:
+                print(sql)
+            cursor.execute(sql, params)
+            return cursor.fetchall()
+        finally:
+            # Release the cursor even when execution or fetching fails.
+            cursor.close()
+
+
+# Module-wide connection cache; stays False until get_db() first succeeds.
+db_conn = False
+
+def get_db():
+    '''
+    Convenience accessor: return the shared Connection, creating it from
+    config.db_params the first time it is requested.
+    '''
+    global db_conn
+    import config
+    if db_conn:
+        return db_conn
+    db_conn = Connection(**config.db_params)
+    return db_conn
diff --git a/banner_logs/impressions.py b/banner_logs/impressions.py
new file mode 100755
index 0000000..bb25afa
--- /dev/null
+++ b/banner_logs/impressions.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+import sys
+import re
+from urlparse import urlparse, parse_qs
+from gzip import GzipFile
+
+from squid import squidline
+import db
+import config
+
+def import_log(filename):
+    '''
+    Read a gzip-compressed squid log and record every parseable line.
+
+    Each line matching squidline is converted with parse_line() and stored
+    with record_line(); any other line is reported through log().
+
+    filename -- path of the gzipped log file
+    '''
+    # TODO: begin transaction (nothing here starts or commits one yet)
+    infile = GzipFile(filename, "r")
+    try:
+        for line in infile:
+            matches = squidline.match(line)
+            if matches:
+                parsed = parse_line(matches.groupdict())
+                record_line(parsed)
+            else:
+                log("Irregular line: %s" % line)
+    finally:
+        # Always release the file handle, even if a line fails to import.
+        infile.close()
+
+def parse_line(squid):
+    '''
+    Reduce the named groups of one squid log line to the values to store.
+
+    squid -- dict with at least the 'timestamp', 'referrer' and 'url' keys
+
+    Returns a dict holding 'timestamp', a boolean 'secure' (true when the
+    referrer starts with https://), and one plain string for each
+    whitelisted query parameter present in the request URL.
+    '''
+    out = {
+        'timestamp': squid['timestamp'],
+        'secure': squid['referrer'].startswith('https://'),
+    }
+    query_params = parse_qs(urlparse(squid['url']).query)
+    legal_params = (
+        'userlang', 'country', 'db', 'project', 'reason', 'result', 'bucket',
+    )
+    for key in legal_params:
+        if key in query_params:
+            # parse_qs maps every name to a list of values; store the first
+            # one so the column receives a scalar rather than a list.
+            out[key] = query_params[key][0]
+
+    return out
+
+def record_line(terms):
+    '''
+    Insert one row into banner_impressions.
+
+    terms -- dict mapping column name to value, as built by parse_line().
+             Column names are interpolated into the statement text, so they
+             must come from a trusted, fixed set; values are bound as
+             parameters.
+    '''
+    names = list(terms.keys())
+    # Each column needs its own placeholder ("col = %s"); the doubled %%
+    # survives this formatting step as the driver's %s marker.
+    columns = ", ".join(["%s = %%s" % k for k in names])
+    params = [terms[k] for k in names]
+
+    sql = """
+INSERT INTO banner_impressions SET %s
+""" % columns
+    # Consume the result so the statement really runs even when execute()
+    # is lazy (a generator body only executes once it is iterated).
+    for _row in db.get_db().execute(sql, params):
+        pass
+
+def log(msg):
+    '''Print msg on standard output.'''
+    print(msg)
+
+if __name__ == '__main__':
+    # Every command-line argument is the path of one log file to import.
+    paths = sys.argv[1:]
+    if paths:
+        for path in paths:
+            import_log(path)
+    else:
+        log("NEED ARGS")
diff --git a/banner_logs/squid.py b/banner_logs/squid.py
new file mode 100644
index 0000000..a7b9008
--- /dev/null
+++ b/banner_logs/squid.py
@@ -0,0 +1,40 @@
+import re
+
+# Regex based on http://wikitech.wikimedia.org/view/Squid_log_format
+#
+# Compiled pattern for one whole squid log line. Every field is captured
+# as a named group, so a successful match's groupdict() yields a dict keyed
+# by field name (squid, sequence, timestamp, servicetime, client,
+# squidstatus, reply, request, url, squidhierarchy, mime, referrer, xff,
+# useragent, acceptlanguage, xcarrier).
+#
+# The pattern is compiled with re.VERBOSE, so the whitespace and the
+# trailing "# ..." notes inside the pattern text are ignored by the regex
+# engine. Fields are separated by a single \s; the last three groups use
+# [\S\s]+ and may therefore contain whitespace themselves.
+squidline = re.compile(
+ r"""
+ ^
+ (?P<squid>[\S]+) # Name of the squid server
+ \s[-]*
+ (?P<sequence>[0-9]+) # Sequence ID from the squid server
+ \s
+ (?P<timestamp>[0-9-]+T[0-9:.]+) # Timestamp
+ \s
+ (?P<servicetime>[0-9.]+) # Request service time
+ \s
+ (?P<client>[\S]+) # Client IP address
+ \s
+ (?P<squidstatus>[\S]+) # Squid request status and HTTP status code
+ \s
+ (?P<reply>[0-9]+) # Reply size including HTTP headers
+ \s
+ (?P<request>[\S]+) # Request type
+ \s
+ (?P<url>[\S]+) # Request URL
+ \s
+ (?P<squidhierarchy>[\S]+) # Squid hierarchy status, peer IP
+ \s
+ (?P<mime>[\S]+) # MIME content type
+ \s
+ (?P<referrer>[\S]+) # Referer header
+ \s
+ (?P<xff>[\S]+) # X-Forwarded-For header
+ \s
+ (?P<useragent>[\S\s]+) # User-Agent header
+ \s
+ (?P<acceptlanguage>[\S\s]+) # Accept-Language header
+ \s
+ (?P<xcarrier>[\S\s]+) # X-carrier header
+ $
+ """, re.VERBOSE
+)
--
To view, visit https://gerrit.wikimedia.org/r/53279
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Id9df44bda7942d86c9d5492c73dda3f988c96081
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Adamw <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits