I posted the first part of this before as
http://lists.canonical.org/pipermail/kragen-hacks/2002-January/000310.html
--- this renders it useful. I wrote it tonight, starting from that post.
Obviously it could be faster, prettier, and more flexible, and it
contains a routine called "fwiffle", but it works under Python 1.5 and
2.1, and it's useful. My sample data file was an HTTP log.
Here's the CGI script:
#!/usr/bin/python
# CGI entry point: summarise the HTTP log in /tmp/lists-log with oerlapcgi.
import sys
# oerlapcgi lives outside the default module path, so extend the path
# before importing it.
sys.path.insert(0, '/home/kragen/devel/oerlap')
import oerlapcgi, string
oerlapcgi.debug = 1
# One name per whitespace-separated field of a log line, in order.
columns = string.split('client user group date timezone method '
                       'URL version result nbytes referer')
# Columns left out of the table unless the request asks for them.
hidden_by_default = string.split('user group timezone version')
oerlapcgi.oerlapcgi('/tmp/lists-log', columns,
                    datatitle="HTTP logs",
                    defaulthide=hidden_by_default)
This imports oerlapcgi.py, which follows.
# TO DO:
# add output size limits by default, plus ways of increasing them.
# more hierarchical stuff: /~kragen, /~kragen/sw, etc.
# add gzip content-transfer-encoding for clients that support it. It would
# improve response time by about a factor of 20.
# add an obvious way to remove restrictions, i.e. widen the filter.
# add an obvious way to hide columns.
# add a way to hide columns that aren't hidden by default.
# make ... clickable to see the full extent of the hideousness
# make the 'more' indicator clickable.
# add pretty colors
# add pretty fonts
# add ability to sort by some columns
# add totals to all-numeric or mostly-numeric columns
import oerlap, cgi, string, sys, os, urllib
# When true, fwiffle() also writes the parsed CGI parameter dict into the page.
debug = 0
def row(items, celltype='td'):
    """Render one HTML table row.

    items: sequence of cell contents.  Each one is inserted verbatim (no
        escaping happens here), so callers pass ready-made HTML.
    celltype: what goes inside the angle brackets of each cell's start
        tag.  It may carry attributes, e.g. 'td valign="top"'.

    Returns the row as one string: '<tr>', the cells, '</tr>' and a
    trailing newline.
    """
    # An end tag may only contain the element name, so drop any attributes
    # that came along in celltype ('td valign="top"' closes with '</td>').
    words = celltype.split()
    if words:
        endtag = words[0]
    else:
        endtag = celltype
    cells = ["<%s>%s</%s>" % (celltype, item, endtag) for item in items]
    return '<tr>' + ''.join(cells) + '</tr>\n'
def userval(somestring):
    """Return a data value made safe and short enough to show in the page.

    The value is converted with str(), cut to its first 40 characters
    followed by '...' when it is longer than that, and then HTML-escaped
    (double quotes included).
    """
    limit = 40
    text = str(somestring)
    # Truncate before escaping so an entity such as '&amp;' is never cut
    # in half.
    if len(text) > limit:
        text = "%s..." % text[:limit]
    return cgi.escape(text, 1)
def andlist(alist):
    """Join a non-empty list of strings the way English lists things.

    One item is returned unchanged, two become 'a and b', and three or
    more become 'a, b, and c'.  An empty list trips the assertion.
    """
    assert len(alist) > 0
    count = len(alist)
    if count > 2:
        leading = ", ".join(alist[:-1])
        return "%s, and %s" % (leading, alist[-1])
    if count == 2:
        return "%s and %s" % tuple(alist)
    return alist[0]
def title(datatitle, colnames, paramdict):
    """Build the page title: the data title plus any break-out columns.

    datatitle is HTML-escaped; the names of the columns currently broken
    out by (per breakoutby) are appended as 'by a, b, and c' without
    further escaping.
    """
    heading = cgi.escape(datatitle)
    brokenout = [name for name in colnames if breakoutby(name, paramdict)]
    if not brokenout:
        return heading
    return "%s by %s" % (heading, andlist(brokenout))
def describe_filters(colnames, paramdict):
    """Return a sentence saying which input-row filters are in effect.

    A column counts as filtered when filterby() returns a true value for
    it; each such column contributes '<name> is <repr of the escaped,
    shortened value>'.
    """
    clauses = []
    for name in colnames:
        value = filterby(name, paramdict)
        if value:
            clauses.append("%s is %s" % (name, repr(userval(value))))
    if not clauses:
        return "All input rows selected."
    return "Only input rows where %s are selected." % andlist(clauses)
def describe_hides(colnames, paramdict, defaulthides):
    """Return a sentence listing the hidden columns, each linked to unhide it.

    colnames: all column names, in display order.
    paramdict: parsed request parameters (name -> list of values).
    defaulthides: names of the columns that are hidden by default.

    Returns 'All columns displayed.' when nothing is hidden; otherwise a
    sentence whose column names are links to the URL from unhideurl().
    """
    # This list must not be called 'hidden': assigning that name inside the
    # function makes it local and hides the module-level hidden() predicate
    # that is called here.
    hidden_names = [name for name in colnames
                    if hidden(name, paramdict, defaulthides)]
    if not hidden_names:
        return "All columns displayed."
    links = ['<a href="%s">%s</a>' % (unhideurl(name, paramdict), name)
             for name in hidden_names]
    return "The following columns are hidden: %s." % andlist(links)
def render_nrows(nrows):
    """Spell out a row count: 'No rows', 'One row', or '<n> rows'."""
    if nrows == 0:
        return "No rows"
    if nrows == 1:
        return "One row"
    return "%d rows" % nrows
def breakoutbyurl(colname, paramdict):
    """Return the URL that toggles breaking out by colname.

    Works on a copy of paramdict: the column's 'bb_' flag is removed if it
    is currently '1' and set to ['1'] otherwise.  The column's 'show_' and
    'f_' parameters are dropped either way.
    """
    params = paramdict.copy()
    toggle = 'bb_%s' % colname
    if '1' in params.get(toggle, []):
        del params[toggle]
    else:
        params[toggle] = ['1']
    # The author was dubious about the next step: it means clicking a
    # column header turns off filtering by that field, and clicking twice
    # hides a field that is hidden by default.  That is neither obvious
    # nor unsurprising.
    for stale in ('show_' + colname, 'f_' + colname):
        try:
            del params[stale]
        except KeyError:
            pass
    return urlencode(params)
def urlencode(paramdict):
    """Return this script's URL with paramdict appended as a query string.

    paramdict maps each parameter name to a list of values; every value
    yields its own 'name=value' pair.  Names and values are passed through
    str() and urllib.quote_plus().
    """
    # urllib.urlencode is not used: in Python 1.5 it doesn't accept a list
    # of tuples, so it can't express several values for one name.
    pairs = []
    for name in paramdict.keys():
        quoted_name = urllib.quote_plus(str(name))
        for item in paramdict[name]:
            pairs.append("%s=%s" % (quoted_name, urllib.quote_plus(str(item))))
    return "%s?%s" % (cgiurl(), "&".join(pairs))
def breakoutby(colname, paramdict):
    """Tell whether the request asks to break out by colname.

    True when '1' is among the values of the 'bb_<colname>' parameter.
    """
    flags = paramdict.get('bb_%s' % colname, [])
    return '1' in flags
def filterurl(fields, fieldnames, values, paramdict):
    """Return the URL that filters on the given values ("zoom").

    fields holds column indexes into fieldnames and values holds the value
    to filter each of those columns by, position for position.  On a copy
    of paramdict every such column gets an 'f_' parameter and loses its
    'bb_' and 'show_' parameters.
    """
    params = paramdict.copy()
    for position in range(len(fields)):
        name = fieldnames[fields[position]]
        params['f_%s' % name] = [values[position]]
        for stale in ('bb_' + name, 'show_' + name):
            try:
                del params[stale]
            except KeyError:
                pass
    return urlencode(params)
def filterby(colname, paramdict):
    """Return the value colname is filtered by, or None if it isn't.

    Only the first value of the 'f_<colname>' parameter is used.
    """
    key = 'f_%s' % colname
    values = paramdict.get(key, [None])
    return values[0]
def unhideurl(colname, paramdict):
    """Return the URL that shows colname, leaving other parameters alone."""
    params = paramdict.copy()
    params.update({'show_' + colname: [1]})
    return urlencode(params)
def hidden(colname, paramdict, defaulthides):
    """Tell whether colname should be left out of the table.

    A column is hidden only if it is in defaulthides, the request has no
    'show_<colname>' value of '1', and it is neither filtered on nor
    broken out by.
    """
    # hiding things you're filtering or breaking out by is confusing.
    return not (colname not in defaulthides or
                '1' in paramdict.get("show_%s" % colname, []) or
                filterby(colname, paramdict) or
                breakoutby(colname, paramdict))
def cgiurl():
    """Return a URL likely to refer to this script, without parameters.

    The URL is assembled from the CGI environment: SERVER_PORT and HTTPS
    pick the scheme and port, HTTP_HOST (falling back to SERVER_NAME, then
    'localhost') gives the host, and SCRIPT_NAME (falling back to
    sys.argv[0]) gives the path.  The port is omitted when it is the
    default for the scheme.

    Raises ValueError if SERVER_PORT is not an integer.
    """
    # logic copied from CGI.pm 2.46, minus some features and bugs
    getenv = os.environ.get
    port = int(getenv('SERVER_PORT', '80'))  # default is for debugging
    # Compare case-insensitively: servers such as Apache's mod_ssl export
    # the flag in lowercase (HTTPS=on).
    if getenv('HTTPS', '').upper() == 'ON' or port == 443:
        protocol = "https"
    else:
        protocol = "http"
    if ((protocol == 'http' and port == 80) or
            (protocol == 'https' and port == 443)):
        portstr = ""
    else:
        portstr = ":" + str(port)
    hostname = getenv('HTTP_HOST', getenv('SERVER_NAME', 'localhost'))
    # A Host header carries the port whenever it is not the default one, so
    # strip a trailing ':<digits>' to avoid producing 'host:8080:8080'.
    # Bracketed IPv6 literals without a port end in ']' and are untouched.
    colon = hostname.rfind(':')
    if colon >= 0 and hostname[colon + 1:].isdigit():
        hostname = hostname[:colon]
    path = getenv('SCRIPT_NAME', sys.argv[0])
    return protocol + "://" + hostname + portstr + path
def fwiffle(filename, fieldtitles, output, paramdict, datatitle="data",
            defaulthide=[]):
    """Write a complete CGI response summarising a data file as an HTML table.

    filename: path of the data file; it is read through oerlap.filelines.
    fieldtitles: sequence of column names, one per field, in field order.
    output: callable taking one string; every piece of the response
        (header included) is passed to it.
    paramdict: request parameters, mapping each name to a list of values.
        'bb_<col>' breaks out by a column, 'f_<col>' filters on a value,
        and 'show_<col>' reveals a column hidden by default.
    datatitle: human-readable name of the data set, used in the title.
    defaulthide: names of the columns hidden unless asked for.  It is only
        read, never modified.

    Each table row is one combination of break-out values: its count, and
    for every visible column the three most frequent values with their
    counts.  Nothing is returned.
    """
    mytitle = title(datatitle, fieldtitles, paramdict)
    output(('Content-Type: text/html\n\n<html><head><title>%s</title>\n' +
            '</head><body><h1>%s</h1>\n') % (mytitle, mytitle))
    # paramdict comes straight from the request, so escape it before it
    # goes into the page.
    if debug: output(cgi.escape(str(paramdict)))
    infile = None
    try:
        bocols = []      # indexes of the columns to break out by
        filters = []     # input row selection criteria
        hiddencols = []  # indexes of the columns left out of the table
        for ii in range(len(fieldtitles)):
            colname = fieldtitles[ii]
            if breakoutby(colname, paramdict):
                bocols.append(ii)
            wanted = filterby(colname, paramdict)
            if wanted is not None:
                filters.append((ii, wanted))
            if hidden(colname, paramdict, defaulthide):
                hiddencols.append(ii)
        infile = open(filename)
        datasrc = oerlap.filterdata(oerlap.filelines(infile), filters)
        results, freqs, n = oerlap.oerlap(datasrc, bocols)
        output('<p>%s %s</p>' % (describe_filters(fieldtitles, paramdict),
                                 describe_hides(fieldtitles, paramdict,
                                                defaulthide)))
        output('<p>%s selected from input. %s in this summary.</p>\n' %
               (render_nrows(n), render_nrows(len(results))))
        output('<table border>\n')
        # Every visible column header is a link that toggles breaking out
        # by that column.
        headers = ['<a href="%s">%s</a>' % (breakoutbyurl(name, paramdict),
                                            cgi.escape(name))
                   for name in fieldtitles
                   if not hidden(name, paramdict, defaulthide)]
        output(row(['N'] + headers, 'th'))
        maxn = 3  # values listed per cell before the "(N more)" line
        for eachkey in oerlap.sort(freqs):
            # With a single summary row there is nothing to zoom in on.
            if len(results) == 1:
                zoom = ''
            else:
                zoom = ('<a href="%s">(zoom)</a>' %
                        filterurl(bocols, fieldtitles, eachkey, paramdict))
            cells = ['<p align="center">%s<br />%s</p>'
                     % (str(freqs[eachkey]), zoom)]
            for ii in range(len(results[eachkey])):
                if ii in hiddencols: continue
                frequencies = results[eachkey][ii]
                mystr = ['<table width="100%">']
                for eachitem in oerlap.sort(frequencies)[:maxn]:
                    mystr.append(('<tr><td>%s</td>' +
                                  '<td align="right">%s</td></tr>\n') %
                                 (userval(eachitem), frequencies[eachitem]))
                if len(frequencies) > maxn:
                    mystr.append('<tr><td colspan="2" align="center">' +
                                 '(%d more)</td></tr>\n' %
                                 (len(frequencies) - maxn))
                cells.append(''.join(mystr + ['</table>']))
            output(row(cells, 'td valign="top"'))
    finally:
        # Release the data file first so it is closed even if writing the
        # closing tags fails.
        if infile is not None:
            infile.close()
        output('</table></body></html>\n')
def oerlapcgi(filename, fieldtitles, datatitle="data", defaulthide=[]):
    """Provides a cgi veneer over fwiffle.

    Parses the request with cgi.parse() and has fwiffle write the page to
    sys.stdout.  When GATEWAY_INTERFACE is set in the environment and the
    cgitb module can be imported, its excepthook is installed first.
    """
    running_as_cgi = os.environ.get("GATEWAY_INTERFACE") is not None
    if running_as_cgi:
        try:
            import cgitb
        except ImportError:
            # cgitb isn't available (see http://web.lfw.org/python/);
            # carry on with the default excepthook.
            pass
        else:
            sys.excepthook = cgitb.excepthook
    fwiffle(filename, fieldtitles, sys.stdout.write, cgi.parse(),
            datatitle=datatitle, defaulthide=defaulthide)
That imports oerlap.py, which follows:
# incredibly powerful secret web log analysis tool
import string
def oerlap(datasrc, breakoutby):
    """Analyze data.

    datasrc: object whose .next() returns the next row as a tuple (or
        other sequence of hashable values), or None when there are no
        more rows.
    breakoutby: sequence of field indexes to break out by.

    Returns a 3-tuple (results, freqs, nn):
      results -- dict mapping each tuple of break-out values to a list
                 with one dict per field; each of those dicts maps a
                 value seen in that field to how often it was seen.
      freqs   -- dict mapping each tuple of break-out values to the
                 number of rows that had it.
      nn      -- total number of rows read.

    Rows may differ in length.  A row shorter than an earlier row with the
    same key is counted as None in the fields it lacks.  A row too short
    to contain a break-out field raises IndexError.
    """
    results = {}
    freqs = {}
    nn = 0
    while 1:
        line = datasrc.next()
        if line is None:
            return results, freqs, nn
        nn = nn + 1
        key = tuple([line[f] for f in breakoutby])
        columns = results.get(key)
        if columns is None:
            columns = results[key] = []
        freqs[key] = freqs.get(key, 0) + 1
        # Grow one fresh dict at a time.  '[{}] * k' would put k references
        # to a single dict in the list, merging the counts of those fields.
        while len(columns) < len(line):
            columns.append({})
        for index in range(len(columns)):
            if index < len(line):
                value = line[index]
            else:
                value = None
            counts = columns[index]
            counts[value] = counts.get(value, 0) + 1
def sort(freqs):
    """Return the keys of a dict, ordered by descending value.

    Keys with equal values come out in descending key order.  Useful for
    the freqs result of oerlap and for the per-field dicts inside its
    results.
    """
    pairs = []
    for key, count in freqs.items():
        pairs.append((count, key))
    pairs.sort()
    pairs.reverse()
    return [key for count, key in pairs]
class filterdata:
    """Return only data items matching a filter.

    Wraps another data source.  filter is a sequence of (field index,
    value) pairs; a row passes only if every listed field equals its
    value.
    """

    def __init__(self, datasource, filter):
        self.datasource = datasource
        self.filter = filter

    def next(self):
        "Return the next matching row, or None once the source runs out."
        while 1:
            candidate = self.datasource.next()
            if candidate is None:
                return None
            accepted = 1
            for field, wanted in self.filter:
                if candidate[field] != wanted:
                    accepted = 0
                    break
            if accepted:
                return candidate
class filelines:
    """Return lines from a file.

    Each line is handed out as a tuple of its whitespace-separated words.
    """

    def __init__(self, somefile):
        self.file = somefile

    def next(self):
        "Return the next line's words as a tuple, or None at end of file."
        line = self.file.readline()
        if line == "":
            return None
        # Words are interned, so repeated values share one string object.
        words = [intern(word) for word in line.split()]
        return tuple(words)
class arrayitems:
    """For testing. Return tuples from an array.

    Hands out the array's items one per next() call, then None.
    """

    def __init__(self, somearray):
        self.array = somearray
        self.ii = 0

    def next(self):
        "Return the item at the current position and advance, or None."
        position = self.ii
        if position == len(self.array):
            return None
        try:
            return self.array[position]
        finally:
            self.ii = position + 1
# Sample rows for test(): five rows of three fields each.
testdata = [('a', 1, 32),
('a', 1, 33),
('b', 1, 31),
('c', 2, 30),
('a', 0, 30)]
def test(bb=[]):
    "Run oerlap over testdata, breaking out by the field indexes in bb."
    datasrc = arrayitems(testdata)
    return oerlap(datasrc, bb)
--
<[EMAIL PROTECTED]> Kragen Sitaker <http://www.pobox.com/~kragen/>
The sages do not believe that making no mistakes is a blessing. They believe,
rather, that the great virtue of man lies in his ability to correct his
mistakes and continually make a new man of himself. -- Wang Yang-Ming