http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88804
Revision: 88804
Author: rfaulk
Date: 2011-05-25 18:06:04 +0000 (Wed, 25 May 2011)
Log Message:
-----------
Added state to DataLoader class: type of query, raw query results, data handler
object, columns of the results
Added state to IntervalReportingLoader class: summary data that stores the
interval data combined over the time intervals
Modified IntervalReportingLoader::combine_rows to populate the summary data
Removed arg references to query type since it is now a state variable
Modified Paths:
--------------
trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
Modified: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
===================================================================
--- trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
2011-05-25 18:02:01 UTC (rev 88803)
+++ trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
2011-05-25 18:06:04 UTC (rev 88804)
@@ -26,6 +26,7 @@
import Fundraiser_Tools.classes.QueryData as QD
import Fundraiser_Tools.classes.TimestampProcessor as TP
import Fundraiser_Tools.classes.Helper as Hlp
+import Fundraiser_Tools.classes.FundraiserDataHandler as FDH
"""
@@ -48,9 +49,16 @@
""" Database and Cursor objects """
_db_ = None
_cur_ = None
+
_sql_path_ = '../sql/' # Relative path for SQL files to be processed
+
_query_names_ = dict()
+ _data_handler_ = None # class that will define how to process the query
fields
+ _query_type_ = '' # Stores the query type (dependent on the data
handler definition)
+ _results_ = None
+ _col_names_ = None
+
def init_db(self):
""" Establish connection """
@@ -170,10 +178,10 @@
query_name -
"""
- def get_sql_filename_for_query(self, query_type):
+ def get_sql_filename_for_query(self):
try:
- return self._query_names_[query_type]
+ return self._query_names_[self._query_type_]
except KeyError:
print >> sys.stderr, 'Could not find a query for type: ' +
query_type
sys.exit(2)
@@ -183,15 +191,22 @@
class IntervalReportingLoader(DataLoader):
-
- def __init__(self):
+
+ _summary_data_ = None
+
+ def __init__(self, query_type):
self._query_names_['banner'] = 'report_banner_metrics_minutely'
self._query_names_['LP'] = 'report_LP_metrics_minutely'
self._query_names_['campaign'] = 'report_campaign_metrics_minutely'
self._query_names_['campaign_total'] =
'report_campaign_metrics_minutely_total'
+
+ self._query_type_ = query_type
+
+ """ hardcode the data handler for now """
+ self._data_handler_ = FDH
"""
- <DESCRIPTION>
+ Executes the query which generates interval metrics and sets _results_
and _col_names_
INPUT:
start_time - start timestamp for reporting
@@ -205,15 +220,17 @@
RETURN:
metrics - dict containing metric measure for each time
index for each donation pipeline handle (e.g. banner names)
times - dict containing time index for each donation
pipeline handle (e.g. banner names)
+ _results_ - list containing the rows generated by the
query
"""
- def run_query(self, start_time, end_time, interval, query_type,
metric_name, campaign):
+ def run_query(self, start_time, end_time, interval, metric_name, campaign):
self.init_db()
- query_name = self.get_sql_filename_for_query(query_type)
+ query_name = self.get_sql_filename_for_query()
metrics = Hlp.AutoVivification()
times = Hlp.AutoVivification()
+ self._col_names_ = list()
""" Compose datetime objects to represent the first and last intervals
"""
start_time_obj = TP.timestamp_to_obj(start_time, 1)
@@ -243,11 +260,14 @@
# err_msg = sql_stmnt
self._cur_.execute(sql_stmnt)
- results = self._cur_.fetchall()
+ for i in self._cur_.description:
+ self._col_names_.append(i[0])
+
+ self._results_ = self._cur_.fetchall()
final_time = dict() # stores
the last timestamp seen
interval_obj = datetime.timedelta(minutes=interval) #
timedelta object used to shift times by _interval_ minutes
- for row in results:
+ for row in self._results_:
key_name = row[key_index]
time_obj = TP.timestamp_to_obj(row[time_index], 1) # format =
1, 14-digit TS
@@ -306,10 +326,11 @@
metrics_new.append(float(metrics[key][i]))
metrics[key] = metrics_new
- return [metrics, times, results]
+ return [metrics, times, self._results_]
+
"""
- Post process raw data from query. Combines data rows according to
column type definitions.
+    Post process raw data from query. Combines data rows according to
column type definitions. This must be run *after* run_query(), which
populates _results_ and _col_names_.
INPUT:
data - a list of rows
@@ -318,18 +339,23 @@
RETURN:
the dictionary of combined rows (note that there must be a key
column)
"""
- def combine_rows(self, data, data_handler, query_type):
+ def combine_rows(self):
- query_name = self.get_sql_filename_for_query(query_type)
+ query_name = self.get_sql_filename_for_query()
- col_types = data_handler.get_col_types(query_type)
+ col_types = self._data_handler_.get_col_types(self._query_type_)
key_index = QD.get_key_index(query_name)
data_dict = dict()
- num_rows = len(data)
+ num_rows = len(self._results_)
+ """ Check that there are columns defined for the query type """
+ if len(col_types) == 0:
+ print >> sys.stderr, 'No metric columns defined for this query
type\n'
+ return 0
+
""" Combine the rows of data according to the column type definition
for the given query """
- for row in data:
+ for row in self._results_:
key = row[key_index]
@@ -343,31 +369,32 @@
col_type = col_types[i]
field = row[i]
- if col_type == data_handler._COLTYPE_RATE_:
+ if col_type == self._data_handler_._COLTYPE_RATE_:
try:
- data_dict[key][i.__str__()] =
data_dict[key][i.__str__()] + float(field)
+ data_dict[key][self._col_names_[i]] =
data_dict[key][self._col_names_[i]] + float(field)
except KeyError as e:
- data_dict[key][i.__str__()] = float(field)
+ data_dict[key][self._col_names_[i]] = float(field)
- elif col_type == data_handler._COLTYPE_AMOUNT_:
+ elif col_type == self._data_handler_._COLTYPE_AMOUNT_:
try:
- data_dict[key][i.__str__()] =
data_dict[key][i.__str__()] + float(field)
+ data_dict[key][self._col_names_[i]] =
data_dict[key][self._col_names_[i]] + float(field)
except KeyError as e:
- data_dict[key][i.__str__()] = float(field)
+ data_dict[key][self._col_names_[i]] = float(field)
""" !! MODIFY --- this could cause issues in the case of missing data
"""
- num_rows = len(data) / len(data_dict.keys())
+ num_rows = len(self._results_) / len(data_dict.keys())
""" POST PROCESSING
Normalize rate columns """
for i in range(len(col_types)):
- if col_types[i] == data_handler._COLTYPE_RATE_:
+ if col_types[i] == self._data_handler_._COLTYPE_RATE_:
for key in data_dict.keys():
- data_dict[key][i.__str__()] = data_dict[key][i.__str__()]
/ num_rows
+ data_dict[key][self._col_names_[i]] =
data_dict[key][self._col_names_[i]] / num_rows
- return data_dict
+ self._summary_data_ = data_dict
+
"""
@@ -375,9 +402,6 @@
"""
class CampaignIntervalReportingLoader(IntervalReportingLoader):
-
- def __init__(self):
- IntervalReportingLoader.__init__(self)
"""
<DESCRIPTION>
@@ -395,18 +419,17 @@
metrics - dict containing metric measure for each time
index for each donation pipeline handle (e.g. banner names)
times - dict containing time index for each donation
pipeline handle (e.g. banner names)
"""
- def run_query(self, start_time, end_time, interval, query_type,
metric_name, campaign):
+ def run_query(self, start_time, end_time, interval, metric_name, campaign):
- query_type_1 = 'campaign'
- query_type_2 = 'campaign_total'
-
""" Execute the standard interval reporting query """
- data = IntervalReportingLoader.run_query(self, start_time, end_time,
interval, query_type_1, metric_name, campaign)
+ ir = IntervalReportingLoader('campaign')
+ data = ir.run_query(start_time, end_time, interval, metric_name,
campaign)
metrics = data[0]
times = data[1]
""" Get the totals for campaign views and donations """
- data = IntervalReportingLoader.run_query(self, start_time, end_time,
interval, query_type_2, metric_name, campaign)
+ ir = IntervalReportingLoader('campaign_total')
+ data = ir.run_query(start_time, end_time, interval, metric_name,
campaign)
metrics_total = data[0]
times_total = data[1]
@@ -432,12 +455,14 @@
"""
class BannerLPReportingLoader(DataLoader):
- def __init__(self):
+ def __init__(self, query_type):
self._query_names_['LP'] = 'report_LP_metrics'
self._query_names_['BAN'] = 'report_banner_metrics'
self._query_names_['BAN-TEST'] = 'report_banner_metrics'
self._query_names_['LP-TEST'] = 'report_LP_metrics'
+ self._query_type_ = query_type
+
"""
<description>
@@ -446,7 +471,7 @@
RETURN:
"""
- def run_query(self,start_time, end_time, campaign, query_name,
metric_name):
+ def run_query(self,start_time, end_time, campaign, metric_name):
self.init_db()
@@ -665,12 +690,14 @@
"""
class CampaignReportingLoader(DataLoader):
- def __init__(self):
+ def __init__(self, query_type):
self._query_names_['totals'] = 'report_campaign_totals'
self._query_names_['times'] = 'report_campaign_times'
- self._query_names_['banners'] = 'report_campaign_banners'
- self._query_names_['lps'] = 'report_campaign_lps'
+ self._query_names_[FDH._TESTTYPE_BANNER_] = 'report_campaign_banners'
+ self._query_names_[FDH._TESTTYPE_LP_] = 'report_campaign_lps'
+ self._query_type_ = query_type
+
"""
!! MODIFY -- use python reflection !! ... maybe
@@ -679,16 +706,16 @@
delegates the processing to different methods
"""
- def run_query(self, query_type, params):
+ def run_query(self, params):
self.init_db()
data = None
- if query_type == 'totals':
- data = self.query_totals(query_type, params)
- elif query_type == 'banners' or query_type == 'lps':
- data = self.query_artifacts(query_type, params)
+ if self._query_type_ == 'totals':
+ data = self.query_totals(params)
+ elif self._query_type_ == FDH._TESTTYPE_BANNER_ or self._query_type_
== FDH._TESTTYPE_LP_:
+ data = self.query_artifacts(params)
self.close_db()
@@ -701,14 +728,14 @@
Gets metric totals for campaigns
"""
- def query_totals(self, query_type, params):
+ def query_totals(self, params):
""" Resolve parameters """
metric_name = params['metric_name']
start_time = params['start_time']
end_time = params['end_time']
- query_name = self.get_sql_filename_for_query(query_type)
+ query_name = self.get_sql_filename_for_query()
""" Load the SQL File & Format """
filename = self._sql_path_+ query_name + '.sql'
@@ -753,14 +780,14 @@
Gets a list of banners and landing pages running on the campaign in a
time frame
"""
- def query_artifacts(self, query_type, params):
+ def query_artifacts(self, params):
""" Resolve parameters """
utm_campaign = params['utm_campaign']
start_time = params['start_time']
end_time = params['end_time']
- query_name = self.get_sql_filename_for_query(query_type)
+ query_name = self.get_sql_filename_for_query()
""" Load the SQL File & Format """
filename = self._sql_path_+ query_name + '.sql'
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs