Author: Karsten Loesing <karsten.loes...@gmx.net> Date: Wed, 20 Oct 2010 11:33:36 +0200 Subject: Add visitor.py written by Kiyoto Tamura. Commit: 3884f121e6cfeb623acad27f2c5f30f81fa7a299
# visitor/visitor.py
#
# Reconstructed from a whitespace-collapsed git patch that created this file
# (diff --git a/visitor/visitor.py b/visitor/visitor.py, new file mode 100644,
# index 0000000..c87222d).
#
# author: Kiyoto Tamura <owenes...@gmail.com>
#
# A Python port of Karsten Loesing's VisiTor.
#
# The code below sticks to constructs that run on both Python 2.6+ and
# Python 3.

import re
import sys
import os
import doctest
from datetime import datetime, date, timedelta
import bisect
from time import strptime  # datetime.strptime does not exist for version < 2.5

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

# regexes used in the script
IP_RE = re.compile(r'(\d+\.){3}\d+')
# The UTC offset may be positive or negative (e.g. "+0200" or "-0500").
# Only the part before the offset is captured.
APACHE_DATETIME = re.compile(r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}) [+-]\d{4}\]')
# (label, pattern) pairs; the first pattern that matches a user agent wins.
TOR_USERAGENTS = [('torbutton1_2_0rc1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
                                                   r'en-US; rv\:1\.8\.1\.14\) '
                                                   r'Gecko/20080404 Firefox/2\.0\.0\.14')),
                  ('torbutton1_2_0', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
                                                r'[a-z]{2}-[A-Z]{2}; rv\:1\.8\.1\.16\) '
                                                r'Gecko/20080702 Firefox/2\.0\.0\.16')),
                  ('torbutton1_2_1', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 5\.1; '
                                                r'en-US; rv\:1\.9\.0\.7\) '
                                                r'Gecko/2009021910 Firefox/3\.0\.7')),
                  ('torbutton1_2_5', re.compile(r'Mozilla/5\.0 \(Windows; U; Windows NT 6\.1; '
                                                r'[a-z]{2}-[A-Z]{2}; rv:1\.9\.2\.3\) '
                                                r'Gecko/20100401 Firefox/3\.6\.3')),
                  ]


class ApacheParseError(Exception):
    """Raised when an access-log line lacks a leading IP or a timestamp."""


class NoExitListAvailableError(Exception):
    """Declared for callers; not raised anywhere in this module."""


def get_exitlist(exitlist_filepath):
    """
    Returns a dictionary keyed by ip address. The value is a sorted list of
    datetime objects, one per 'ExitAddress' line on which that exit address
    was recorded.

    Every file below exitlist_filepath is read. Files that cannot be opened
    or read are reported on stderr and skipped.
    """
    exitlist = {}
    for dirpath, _, filenames in os.walk(exitlist_filepath, topdown=False):
        for filename in filenames:
            fn = os.path.join(dirpath, filename)
            try:
                with open(fn) as f:
                    for line in f:
                        if not line.startswith('ExitAddress'):
                            continue
                        _, ip, dt = line.split(' ', 2)
                        # strip() removes any surrounding whitespace; the old
                        # rstrip('\s\n') left trailing blanks or '\r' in place
                        # and strptime then rejected the line.
                        fields = strptime(dt.strip(), '%Y-%m-%d %H:%M:%S')
                        # maintain ordered list
                        bisect.insort(exitlist.setdefault(ip, []),
                                      datetime(*fields[:6]))
            except IOError:
                sys.stderr.write('could not open %s. Skipping it.\n' % fn)

    return exitlist


def apache_time2datetime(time_str):
    """
    Transforms an Apache timestamp such as '20/Oct/2010:11:33:36' (without
    the UTC offset) to a naive Python datetime object.
    """
    return datetime(*strptime(time_str, '%d/%b/%Y:%H:%M:%S')[:6])


def parse_apache_line(log_line):
    """
    Parses one line of Apache access log. It assumes that it is in the
    "combined" format.

    Returns the tuple (ip, user_agent, apache_datetime). user_agent is the
    text after the last '" ' separator, so the double quotes around the field
    are kept. The UTC offset of the timestamp is matched but not applied.
    Raises ApacheParseError if the line does not start with an IP address or
    contains no timestamp.
    """
    ip = IP_RE.match(log_line)  # the IP address should occur at the beginning
    if ip is None:
        raise ApacheParseError("Could not match the IP address at the beginning of the line for %s" % log_line)
    ip = ip.group(0)

    apache_datetime = APACHE_DATETIME.search(log_line)
    if apache_datetime is None:
        raise ApacheParseError("Could not match the datetime for the line %s" % log_line)
    apache_datetime = apache_time2datetime(apache_datetime.group(1))

    user_agent = log_line.split('" ')[-1].rstrip('\n')

    return ip, user_agent, apache_datetime


def is_tor(apache_ip, apache_time, exitlist):
    """
    Returns True if apache_ip has a recorded exit-list timestamp that is at
    or after apache_time and at most one day later, False otherwise.

    exitlist is the dictionary built by get_exitlist (sorted timestamps per
    ip address).
    """
    if apache_ip not in exitlist:
        return False
    timestamps = exitlist[apache_ip]
    # first recorded timestamp that is not earlier than the request
    pos = bisect.bisect_left(timestamps, apache_time)
    if pos >= len(timestamps):
        return False
    return timestamps[pos] - apache_time <= timedelta(1)


def analyze(apache_log_path, exitlist_path, output=sys.stdout):
    """
    The main script. It reads the exit list, goes through the Apache access
    log line by line, and checks if each request is a Tor request.
    TODO: filter out the bots.

    One CSV row is produced per calendar day between the first and the last
    day seen in the log; days without any request get 'N/A' in every counter
    column. If output is sys.stdout the CSV is printed, otherwise output is
    used as the path of the file to write.

    Raises IOError if the access log cannot be opened or the results cannot
    be written, and ApacheParseError if a log line cannot be parsed.
    """
    exitlist = get_exitlist(exitlist_path)

    tor_stats = {}
    tor_ua = TOR_USERAGENTS

    try:
        apache_log_file = open(apache_log_path)
    except IOError:
        raise IOError('Could not open %s. Please check the path to the access log again' % apache_log_path)

    with apache_log_file:
        for apache_line in apache_log_file:
            ip, user_agent, apache_datetime = parse_apache_line(apache_line)
            apache_date = apache_datetime.date()
            stats = tor_stats.get(apache_date)
            if stats is None:
                stats = {'date': apache_date, 'tor': 0, 'nottor': 0}
                for tor_type, _ in tor_ua:
                    stats[tor_type] = 0
                tor_stats[apache_date] = stats

            if is_tor(ip, apache_datetime, exitlist):
                stats['tor'] += 1
                # count the request for the first matching user agent only
                for tor_type, tor_re in tor_ua:
                    if tor_re.search(user_agent):
                        stats[tor_type] += 1
                        break
            else:
                stats['nottor'] += 1

    if not tor_stats:
        sys.stderr.write('No data to be written. Exiting\n')
        return

    # writing to a buffer
    csv_buffer = StringIO()
    col_list = ['date', 'tor', 'nottor'] + [tor_type for tor_type, _ in tor_ua]
    csv_buffer.write(','.join(col_list) + '\n')
    curr_apache_date = min(tor_stats)
    last_apache_date = max(tor_stats)
    one_day = timedelta(1)

    while curr_apache_date <= last_apache_date:
        # days missing from the log still get a row; their counters are 'N/A'
        stats = tor_stats.get(curr_apache_date, {'date': curr_apache_date})
        csv_buffer.write(','.join([str(stats.get(col, 'N/A')) for col in col_list]) + '\n')
        curr_apache_date += one_day

    csv_text = csv_buffer.getvalue()
    if output != sys.stdout:
        try:
            with open(output, 'w') as ofile:
                ofile.write(csv_text)
        except (IOError, OSError):
            raise IOError("Could not write results to %s. Exiting witout writing" % output)
    else:
        print(csv_text)


if __name__ == '__main__':
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print("""
usage: python visitor.py <access_log path> <exit list path> (<output file path>)
        """)
    else:
        access_log_path = sys.argv[1]
        exitlist_path = sys.argv[2]
        if argc == 4:
            # optional third argument: write the CSV to this file
            analyze(access_log_path, exitlist_path, sys.argv[3])
        else:
            analyze(access_log_path, exitlist_path)