http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84667
Revision: 84667
Author: rfaulk
Date: 2011-03-24 06:57:35 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
Added reporting for arbitrary interval sizes over fundraiser data. Report
drafts queries of minutely snapshots of the data and plots the results.
Modified Paths:
--------------
trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
Modified: trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
2011-03-24 04:28:38 UTC (rev 84666)
+++ trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
2011-03-24 06:57:35 UTC (rev 84667)
@@ -21,6 +21,7 @@
import MySQLdb
import pylab
import HTML
+import math
import query_store as qs
import miner_help as mh
@@ -30,16 +31,67 @@
"""
- Takes a list of timestamps as input and converts it to a set of
hours counting back from 0
+ Takes a list of timestamps as input and converts it to a set of
days,hours, or minutes counting back from 0
- time_lists - a list of timestamp lists
- time_norm - a dictionary of normalized times
+ time_lists - a list of datetime objects
+ time_unit - an integer indicating what unit to
measure time in (0 = day, 1 = hour, 2 = minute)
+
+ RETURN:
+ time_norm - a dictionary of normalized times
"""
- def normalize_timestamps(self, time_lists):
+ def normalize_timestamps(self, time_lists, count_back, time_unit):
- # Convert lists into dictionaries before processing
- # it is assumed that lists are composed of only simple types
+ time_lists, isList = self.timestamps_to_dict(time_lists)
+
+
+ """ Depending on args set the start date """
+ if count_back:
+ start_date_obj =
self.find_latest_date_in_list(time_lists)
+ else:
+ start_date_obj =
self.find_earliest_date_in_list(time_lists)
+
+ start_day = start_date_obj.day
+ start_hr = start_date_obj.hour
+ start_mte = start_date_obj.minute
+
+ # Normalize dates
+ time_norm = mh.AutoVivification()
+ for key in time_lists.keys():
+ for date_obj in time_lists[key]:
+
+ day = date_obj.day
+ hr = date_obj.hour
+ mte = date_obj.minute
+
+ if time_unit == 0:
+ elem = (day - start_day)
+ elif time_unit == 1:
+ elem = (day - start_day) * 24 + (hr -
start_hr)
+ elif time_unit == 2:
+ elem = (day - start_day) * 24 * 60 +
(hr - start_hr) * 60 + (mte - start_mte)
+
+ try:
+ time_norm[key].append(elem)
+ except:
+ time_norm[key] = list()
+ time_norm[key].append(elem)
+
+ # If the original argument was a list put it back in that form
+ if isList:
+ time_norm = time_norm[key]
+
+ return time_norm
+
+ """
+
+ HELPER METHOD for normalize_timestamps
+
+ Convert lists into dictionaries before processing it is assumed
that lists are composed of only simple types
+
+ """
+ def timestamps_to_dict(self, time_lists):
+
isList = 0
if type(time_lists) is list:
isList = 1
@@ -52,47 +104,53 @@
for i in range(len(old_list)):
time_lists[key].append(old_list[i])
+
+ return [time_lists, isList]
+
+ """
+
+ HELPER METHOD for normalize_timestamps
+ Find the latest time stamp in a list
+
+ """
+ def find_latest_date_in_list(self, time_lists):
- # Find the earliest date
- max_i = 0
+ date_max = datetime.datetime(1000,1,1,0,0,0)
for key in time_lists.keys():
- for date_str in time_lists[key]:
- day_int = int(date_str[8:10])
- hr_int = int(date_str[11:13])
- date_int =
int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
- if date_int > max_i:
- max_i = date_int
- max_day = day_int
- max_hr = hr_int
+ for date_obj in time_lists[key]:
+ if date_int > date_min:
+ date_min = date_obj
+
+ return date_max
+
+ """
+
+ HELPER METHOD for normalize_timestamps
+ Find the earliest time stamp in a list
+
+ """
+ def find_earliest_date_in_list(self, time_lists):
- # Normalize dates
- time_norm = mh.AutoVivification()
+ date_min = datetime.datetime(3000,1,1,0,0,0)
+
for key in time_lists.keys():
- for date_str in time_lists[key]:
- day = int(date_str[8:10])
- hr = int(date_str[11:13])
- # date_int =
int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
- elem = (day - max_day) * 24 + (hr - max_hr)
- try:
- time_norm[key].append(elem)
- except:
- time_norm[key] = list()
- time_norm[key].append(elem)
-
- # If the original argument was a list put it back in that form
- if isList:
- time_norm = time_norm[key]
-
- return time_norm
-
+ for date_obj in time_lists[key]:
+ if date_obj < date_min:
+ date_min = date_obj
+
+ return date_min
+
"""
+ Takes datetime objects and converts them to a string format
YYYYMMDDHHmmSS for SQL processing by the hour
- input - python datetime object
- returns - formatted datetime strings
+ now - datetime object
+ hours_back - number of hours between start and end time
+ returns formatted datetime start_time and end_time strings
+
"""
def gen_date_strings_hr(self, now, hours_back):
@@ -138,10 +196,13 @@
return [start_time, end_time]
"""
+ Takes datetime objects and converts them to a string format
YYYYMMDDHHmmSS for SQL processing by day
- input - python datetime object
- returns - formatted datetime strings
+ now - datetime object
+ days_back - number of days between start and end time
+ returns formatted datetime start_time and end_time strings
+
"""
def gen_date_strings_day(self, now, days_back):
@@ -179,17 +240,17 @@
"""
- Formats date string to match the form of
civicrm.civicrm_contribution.recieve_date
+ Given a datetime object produce a timestamp a number of hours in the
past and according to a particular format
format 1 - 20080101000606
format 2 - 2008-01-01 00:06:06
input:
- now - python datetime object
+ now - datetime object
hours_back - the amount of time the
format - the format of the returned timestamp strings
- resolution - the
+ resolution - the resolution detail of the timestamp (e.g. down to the
minute, down to the hour, ...)
returns - formatted datetime strings
@@ -212,21 +273,10 @@
"""
- Formats date string to match the form of
civicrm.civicrm_contribution.recieve_date
+ Convert datetime objects to a timestamp of a given format
+
+ HELPER METHOD for gen_date_strings
- format 1 - 20080101000606
- format 2 - 2008-01-01 00:06:06
-
- input:
-
- time_obj - python datetime object
- hours_back - the amount of time the
- format - the format of the returned timestamp strings
- resolution -
-
-
- returns - formatted datetime strings
-
"""
def timestamp_from_obj(self, time_obj, format, resolution):
@@ -277,19 +327,83 @@
return timestamp
+ """
+
+ Convert timestamp to a datetime object of a given format
+
+ """
+ def timestamp_to_obj(self, timestamp, format):
+
+ if format == 1:
+ time_obj = datetime.datetime(int(timestamp[0:4]),
int(timestamp[4:6]), int(timestamp[6:8]), \
+
int(timestamp[8:10]), int(timestamp[10:12]), int(timestamp[12:14]))
+
+ elif format == 2:
+ time_obj = datetime.datetime(int(timestamp[0:4]),
int(timestamp[5:7]), int(timestamp[8:10]), \
+
int(timestamp[11:13]), int(timestamp[14:16]), int(timestamp[17:19]))
+
+ return time_obj
+
+
+ """
+
+ Inserts missing interval points into the time and metric lists
+
+ Assumptions:
+ _metrics_ and _times_ are lists of the same length
+ there must be a data point at each interval
+ Some data points may be missed
+ where there is no metric data the metric takes on the
value 0.0
+
+ e.g. when _interval_ = 10
+ times = [0 10 30 50], metrics = [1 1 1 1] ==> [0 10 30 40 50],
[1 1 0 1 0 1]
+
+ """
+ def normalize_intervals(self, times, metrics, interval):
+
+ current_time = 0.0
+ index = 0
+ iterations = 0
+ max_elems = math.ceil((times[-1] - times[0]) / interval) #
there should be no more elements in the list than this
+
+ new_times = list()
+ new_metrics = list()
+
+ """ Iterate through the time list """
+ while index < len(times):
+
+ """ TEMPORARY SOLUTION: break out of the loop if more
than the maximum number of elements is reached """
+ if iterations > max_elems:
+ break;
+
+ new_times.append(current_time)
+
+ """ If the current time is not in the current list then
add it and a metric value of 0.0
+ otherwise add the existing elements to the new
lists """
+ if current_time != times[index]:
+ new_metrics.append(0.0)
+
+ else:
+ new_metrics.append(metrics[index])
+ index = index + 1
+
+ current_time = current_time + interval
+
+ iterations = iterations + 1
+
+ return [new_times, new_metrics]
+
"""
CLASS :: ^FundraiserReporting^
Base class for reporting fundraiser analytics. Methods that are intended to
be extended in derived classes include:
-run_query()
+run_query() - format and execute the query to obtain data
+gen_plot() - plots the results of the report
+write_to_html_table() - writes the results to an HTML table
run()
-gen_plot()
-publish_google_sheet()
-write_to_html_table()
-
"""
class FundraiserReporting(TimestampProcesser):
@@ -300,8 +414,8 @@
def init_db(self):
""" Establish connection """
#db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk',
db='faulkner')
- #self.db = MySQLdb.connect(host='127.0.0.1', user='rfaulk',
db='faulkner', port=3307)
- self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet',
user='rfaulk', db='faulkner')
+ self.db = MySQLdb.connect(host='127.0.0.1', user='rfaulk',
db='faulkner', port=3307)
+ #self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet',
user='rfaulk', db='faulkner')
""" Create cursor """
self.cur = self.db.cursor()
@@ -1051,43 +1165,41 @@
"""
-INCOMPLETE -
-CLASS :: ^ConfidenceReporting^
+CLASS :: ^IntervalReporting^
-To be called primarily for reporting
+Performs queries that take timestamps, query, and an interval as arguments.
Data for a single metric
+is generated for each time interval in the time period defined by the start
and end timestamps.
+Types of queries supported:
+
+report_banner_metrics_minutely
+report_LP_metrics_minutely
+
"""
-class ConfidenceReporting(FundraiserReporting):
+class IntervalReporting(FundraiserReporting):
- def __init__(self, query_name, cmpgn_1, cmpgn_2, item_1, item_2,
start_time , end_time, metric):
- self.query_name = query_name
- self.cmpgn_1 = cmpgn_1
- self.cmpgn_2 = cmpgn_2
- self.item_1 = item_1
- self.item_2 = item_2
- self.start_time = start_time
- self.end_time = end_time
- self.metric = metric
-
-
- def run_query(self):
+ def run_query(self, start_time, end_time, interval, query_name,
metric_name, campaign):
self.init_db()
+
query_obj = qs.query_store()
+
+ metrics = mh.AutoVivification()
+ times = mh.AutoVivification()
+ times_norm = mh.AutoVivification()
- metric_list_1 = mh.AutoVivification()
- metric_list_2 = mh.AutoVivification()
- time_list = mh.AutoVivification()
-
# Load the SQL File & Format
- filename = './sql/' + self.query_name + '.sql'
+ filename = './sql/' + query_name + '.sql'
sql_stmnt = mh.read_sql(filename)
- sql_stmnt = query_obj.format_query(self.query_name, sql_stmnt,
[self.start_time, self.end_time, self.cmpgn_1, self.item_1])
+ sql_stmnt = query_obj.format_query(query_name, sql_stmnt,
[start_time, end_time, campaign, interval])
+ #print sql_stmnt
+ # Get Indexes into Query
+ key_index = query_obj.get_banner_index(query_name)
+ metric_index = query_obj.get_metric_index(query_name,
metric_name)
time_index = query_obj.get_time_index(query_name)
- metric_index = query_obj.get_metric_index(query_name,
metric_name)
# Composes the data for each banner
try:
@@ -1097,29 +1209,119 @@
results = self.cur.fetchall()
for row in results:
-
+
key_name = row[key_index]
-
+ time_obj =
self.timestamp_to_obj(row[time_index], 1) # format = 1, 14-digit TS
+
+ """ For each new dictionary index by key name
start a new list if its not already there """
try:
-
metric_lists[key_name].append(row[metric_index])
-
time_lists[key_name].append(row[time_index])
+
metrics[key_name].append(row[metric_index])
+ times[key_name].append(time_obj)
except:
- metric_lists[key_name] = list()
- time_lists[key_name] = list()
-
-
metric_lists[key_name].append(row[metric_index])
-
time_lists[key_name].append(row[time_index])
-
+ metrics[key_name] = list()
+ times[key_name] = list()
+
+
metrics[key_name].append(row[metric_index])
+ times[key_name].append(time_obj)
+
except:
self.db.rollback()
sys.exit("Database Interface Exception:\n" + err_msg)
+ """ Convert Times to Integers that indicate relative times AND
normalize the intervals in case any are missing """
+ for key in times.keys():
+ times_norm[key] = self.normalize_timestamps(times[key],
False, 2)
+ times_norm[key], metrics[key] =
self.normalize_intervals(times_norm[key], metrics[key], interval)
+
self.close_db()
- def gen_plot(self,counts, times, title, xlabel, ylabel, ranges,
subplot_index, fname):
- return
+ return [metrics, times_norm]
+
- def run(self):
- self.run_query()
+ def gen_plot(self, metrics, times, title, xlabel, ylabel, ranges,
subplot_index, fname):
+
+ pylab.subplot(subplot_index)
+ pylab.figure(num=None,figsize=[26,14])
+
+ line_types =
['b-o','g-o','r-o','c-o','m-o','k-o','y-o','b--d','g--d','r--d','c--d','m--d','k--d','y--d','b-.s','g-.s','r-.s','c-.s','m-.s','k-.s','y-.s']
+
+ count = 0
+ for key in metrics.keys():
+ pylab.plot(times[key], metrics[key], line_types[count])
+ count = count + 1
+
+ pylab.grid()
+ pylab.xlim(ranges[0], ranges[1])
+ pylab.legend(metrics.keys(),loc=2)
+
+ pylab.xlabel(xlabel)
+ pylab.ylabel(ylabel)
+
+ pylab.title(title)
+ pylab.savefig(fname, format='png')
+
+
+
+ """
+
+ start_time, end_time -- start and end timestamps
+
+ """
+ def run(self, start_time, end_time, interval, query_name, metric_name,
campaign):
+
+ query_obj = qs.query_store()
+
+ print '\nGenerating ' + query_name +', start and end times are:
' + start_time + ' - ' + end_time +' ... \n'
+
+ # Run Query
+ return_val = self.run_query(start_time, end_time, interval,
query_name, metric_name, campaign)
+ counts = return_val[0]
+ times = return_val[1]
+
+ # Normalize times
+ min_time = min(times)
+ ranges = [min_time, 0]
+
+ xlabel = 'Hours'
+ subplot_index = 111
+ fname = query_name + '.png'
+
+ title = query_obj.get_plot_title(query_name)
+ title = title + ' -- ' + start_time + ' - ' + end_time
+ ylabel = query_obj.get_plot_ylabel(query_name)
+
+ # Convert counts to float (from Decimal) to prevent exception
when bar plotting
+ # Bbox::update_numerix_xy expected numerix array
+ for key in counts.keys():
+ counts_new = list()
+ for i in range(len(counts[key])):
+ counts_new.append(float(counts[key][i]))
+ counts[key] = counts_new
+
+ """ Determine List maximums """
+ times_max = 0
+ metrics_max = 0
+
+ for key in counts.keys():
+ list_max = max(counts[key])
+ if list_max > metrics_max:
+ metrics_max = list_max
+
+ for key in times.keys():
+ list_max = max(times[key])
+ if list_max > times_max:
+ times_max = list_max
+
+ print times_max
+ print metrics_max
+ ranges = list()
+ ranges.append(0)
+ ranges.append(times_max * 1.1)
+ ranges.append(0)
+ ranges.append(metrics_max * 1.1)
+
+
+ # Generate Histogram
self.gen_plot(counts, times, title, xlabel, ylabel, ranges,
subplot_index, fname)
-
\ No newline at end of file
+
+
Modified: trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
2011-03-24 04:28:38 UTC (rev 84666)
+++ trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
2011-03-24 06:57:35 UTC (rev 84667)
@@ -126,7 +126,22 @@
elif query_name == 'report_ecomm_by_contact':
where_str = args[0]
- sql_stmnt = sql_stmnt % ('%', '%', '%', '%',
where_str)
+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', where_str)
+
+ elif query_name == 'report_banner_metrics_minutely':
+ start_time = args[0]
+ end_time = args[1]
+ campaign = args[2]
+ interval = args[3]
+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', interval,
interval, start_time, end_time, '%', '%', '%', '%', interval, interval,
start_time, end_time, campaign, \
+ '%',
'%', '%', '%', interval, interval, start_time, end_time, campaign)
+
+ elif query_name == 'report_LP_metrics_minutely':
+ start_time = args[0]
+ end_time = args[1]
+ campaign = args[2]
+ interval = args[3]
+ sql_stmnt = sql_stmnt % ('%', '%', '%', '%', interval,
interval, start_time, end_time, campaign, '%', '%', '%', '%', interval,
interval, start_time, end_time, campaign)
else:
return 'no such table\n'
@@ -151,6 +166,13 @@
else:
return 'no such table'
+ """ Returns the index of the key for the query data """
+ def get_key_index(self, query_name):
+ if query_name == 'report_banner_metrics_minutely':
+ return 1
+ elif query_name == 'report_LP_metrics_minutely':
+ return 1
+
def get_count_index(self, query_name):
if query_name == 'report_lp_views_by_hour':
return 1
@@ -176,6 +198,10 @@
return 0
elif query_name == 'report_lp_views_by_hour':
return 0
+ elif query_name == 'report_banner_metrics_minutely':
+ return 0
+ elif query_name == 'report_LP_metrics_minutely':
+ return 0
else:
return -1
@@ -202,6 +228,8 @@
return 1
elif query_name == 'report_bannerLP_metrics':
return 1
+ elif query_name == 'report_banner_metrics_minutely':
+ return 1
else:
return -1
@@ -216,6 +244,8 @@
return 2
elif query_name == 'report_bannerLP_metrics':
return 1
+ elif query_name == 'report_LP_metrics_minutely':
+ return 1
else:
return -1
@@ -317,9 +347,24 @@
return 15
else:
return -1
+ elif query_name == 'report_LP_metrics_minutely':
+ if metric_name == 'don_per_view':
+ return 7
+ elif metric_name == 'amt50_per_view':
+ return 9
+ else:
+ return -1
+ elif query_name == 'report_banner_metrics_minutely':
+ if metric_name == 'don_per_imp':
+ return 10
+ elif metric_name == 'amt50_per_imp':
+ return 12
+ else:
+ return -1
else:
return 'no such table'
+
def get_plot_title(self, query_name):
if query_name == 'report_banner_impressions_by_hour':
return 'Banner Impressions Over the Past 24 Hours'
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs