Ejegg has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/394433 )
Change subject: WIP refactor dedupe
......................................................................
WIP refactor dedupe
Big changes here. Use our 'ON DUPLICATE KEY UPDATE' trick to avoid
putting duplicate email addresses in the staging table in the first
place.
This means we don't need the extra ID column and can use email as a
primary key, so we don't need a separate index. We do need to join
to civicrm_email more times while generating statistics, since we
don't have all of the contact IDs in the staging table.
Change-Id: I4a18013e68fc4517c257e670b9ff25365d275706
---
M silverpop_export/update_table.sql
1 file changed, 28 insertions(+), 77 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools
refs/changes/33/394433/1
diff --git a/silverpop_export/update_table.sql
b/silverpop_export/update_table.sql
index 5f9c12d..4c83015 100755
--- a/silverpop_export/update_table.sql
+++ b/silverpop_export/update_table.sql
@@ -15,15 +15,13 @@
DROP TABLE IF EXISTS silverpop_export_address;
CREATE TABLE IF NOT EXISTS silverpop_export_staging(
- id int unsigned PRIMARY KEY, -- This is actually civicrm_email.id
-
-- General information about the contact
contact_id int unsigned,
contact_hash varchar(32),
first_name varchar(128),
last_name varchar(128),
preferred_language varchar(12),
- email varchar(255),
+ email varchar(255) PRIMARY KEY,
opted_out tinyint(1),
-- Lifetime contribution statistics
@@ -50,8 +48,6 @@
postal_code varchar(128),
timezone varchar(8),
- INDEX spex_contact_id (contact_id),
- INDEX spex_email (email),
INDEX spex_country (country),
INDEX spex_opted_out (opted_out)
) COLLATE 'utf8_unicode_ci';
@@ -66,12 +62,12 @@
) COLLATE 'utf8_unicode_ci';
-- Populate, or append to, the storage table all contacts that
--- have an email address. ID is civicrm_email.id.
+-- have an email address.
-- (15 minutes)
INSERT INTO silverpop_export_staging
- (id, contact_id, contact_hash, email, first_name, last_name,
preferred_language, opted_out)
+ (contact_id, contact_hash, email, first_name, last_name, preferred_language,
opted_out)
SELECT
- e.id, e.contact_id, c.hash, e.email, c.first_name, c.last_name,
+ e.contact_id, c.hash, e.email, c.first_name, c.last_name,
REPLACE(c.preferred_language, '_', '-'),
(c.is_opt_out OR c.do_not_email OR e.on_hold OR COALESCE(d.do_not_solicit,
0))
FROM civicrm.civicrm_email e
@@ -80,7 +76,11 @@
WHERE
e.email IS NOT NULL AND e.email != ''
AND c.is_deleted = 0
- AND e.is_primary = 1;
+ AND e.is_primary = 1
+ ORDER BY e.id DESC
+ ON DUPLICATE KEY UPDATE
+ preferred_language =
COALESCE(silverpop_export_staging.preferred_language,
REPLACE(c.preferred_language, '_', '-')),
+ opted_out = (opted_out OR c.is_opt_out OR c.do_not_email OR e.on_hold OR
COALESCE(d.do_not_solicit, 0));
-- Find the latest donation for each email address. Ordering by
-- receive_date and total_amount descending should always insert
@@ -99,7 +99,8 @@
ct.total_amount,
ct.receive_date
FROM
- silverpop_export_staging e
+ civicrm.civicrm_email e FORCE INDEX(UI_email)
+ INNER JOIN silverpop_export_staging exs ON e.email=exs.email
INNER JOIN civicrm.civicrm_contribution ct
ON ct.contact_id = e.contact_id
INNER JOIN civicrm.wmf_contribution_extra ex
@@ -131,78 +132,34 @@
ct.total_amount,
ct.receive_date
FROM
- silverpop_export_staging e,
- civicrm.civicrm_contribution ct,
- civicrm.wmf_contribution_extra ex
+ civicrm.civicrm_email e FORCE INDEX(UI_email)
+ INNER JOIN silverpop_export_staging exs ON e.email=exs.email
+ INNER JOIN civicrm.civicrm_contribution ct ON ct.contact_id = e.contact_id
+ INNER JOIN civicrm.wmf_contribution_extra ex ON ex.entity_id = ct.id
WHERE
- e.contact_id = ct.contact_id AND
- ex.entity_id = ct.id AND
ct.receive_date IS NOT NULL AND
ct.total_amount > 0 AND -- Refunds don't count
- ct.contribution_status_id = 1 -- 'Completed'
+ ct.contribution_status_id = 1 AND -- 'Completed'
+ exs.opted_out = 0
ORDER BY
ct.total_amount DESC,
ct.receive_date DESC
ON DUPLICATE KEY UPDATE highest_native_currency =
silverpop_export_highest.highest_native_currency;
--- Deduplicate rows that have the same email address, we will
--- have to merge in more data later, but this is ~1.5M rows we're
--- getting rid of here which is more better than taking them all the way
--- through.
-CREATE TABLE silverpop_export_dedupe_email (
- id INT PRIMARY KEY AUTO_INCREMENT,
- email varchar(255),
- maxid int,
- preferred_language varchar(12),
- opted_out tinyint(1),
-
- INDEX spexde_email (email)
-) COLLATE 'utf8_unicode_ci';
-
-INSERT INTO silverpop_export_dedupe_email (email, maxid, opted_out)
- SELECT email, max(id) maxid, max(opted_out) opted_out
- FROM silverpop_export_staging
- FORCE INDEX (spex_email)
- GROUP BY email
- HAVING count(*) > 1;
-
--- We pull in language from the parent table so that we
--- can preserve it and not propagate nulls
-UPDATE silverpop_export_dedupe_email exde, silverpop_export_staging ex
- SET
- exde.preferred_language = ex.preferred_language
- WHERE
- ex.email = exde.email AND
- ex.preferred_language IS NOT NULL;
-
-DELETE silverpop_export_staging FROM silverpop_export_staging,
silverpop_export_dedupe_email
- WHERE
- silverpop_export_staging.email = silverpop_export_dedupe_email.email AND
- silverpop_export_staging.id != silverpop_export_dedupe_email.maxid;
-
-UPDATE silverpop_export_staging ex, silverpop_export_dedupe_email exde
- SET
- ex.opted_out = exde.opted_out,
- ex.preferred_language = exde.preferred_language
- WHERE
- exde.maxid = ex.id;
-
-- Create an aggregate table from a full contribution table scan
CREATE TABLE silverpop_export_stat (
email varchar(255) PRIMARY KEY,
- exid INT,
has_recurred_donation tinyint(1),
total_usd decimal(20,2),
cnt_total int unsigned,
- first_donation_date datetime,
- INDEX stat_exid (exid)
+ first_donation_date datetime
) COLLATE 'utf8_unicode_ci';
INSERT INTO silverpop_export_stat
- (email, exid, total_usd, cnt_total, has_recurred_donation,
first_donation_date)
+ (email, total_usd, cnt_total, has_recurred_donation, first_donation_date)
SELECT
- e.email, MAX(ex.id), SUM(ct.total_amount), COUNT(*),
+ e.email, SUM(ct.total_amount), COUNT(*),
MAX(IF(SUBSTRING(ct.trxn_id, 1, 9) = 'RECURRING', 1, 0)),
MIN(ct.receive_date)
FROM civicrm.civicrm_email e FORCE INDEX(UI_email)
@@ -210,7 +167,8 @@
JOIN civicrm.civicrm_contribution ct ON e.contact_id=ct.contact_id
WHERE ct.receive_date IS NOT NULL AND
ct.total_amount > 0 AND -- Refunds don't count
- ct.contribution_status_id = 1 -- Only completed status
+ ct.contribution_status_id = 1 AND -- Only completed status
+ ex.opted_out = 0
GROUP BY e.email;
-- Postal addresses by email
@@ -253,12 +211,12 @@
JOIN civicrm.civicrm_country ctry
ON ct.country = ctry.iso_code # filter out invalid c_t countries
WHERE ex.opted_out = 0
- ORDER BY cc.id DESC
+ ORDER BY cc.id DESC
ON DUPLICATE KEY UPDATE email = e.email;
-- Pull in address and latest/greatest/cumulative stats from intermediate
tables
UPDATE silverpop_export_staging ex
- LEFT JOIN silverpop_export_stat exs ON ex.id = exs.exid
+ LEFT JOIN silverpop_export_stat exs ON ex.email = exs.email
LEFT JOIN silverpop_export_latest lt ON ex.email = lt.email
LEFT JOIN silverpop_export_highest hg ON ex.email = hg.email
LEFT JOIN silverpop_export_address addr ON ex.email = addr.email
@@ -335,15 +293,13 @@
DROP TABLE IF EXISTS silverpop_export;
CREATE TABLE IF NOT EXISTS silverpop_export(
- id int unsigned PRIMARY KEY, -- This is actually civicrm_email.id
-
-- General information about the contact
contact_id int unsigned,
contact_hash varchar(32),
first_name varchar(128),
last_name varchar(128),
preferred_language varchar(12),
- email varchar(255),
+ email varchar(255) PRIMARY KEY,
-- Lifetime contribution statistics
has_recurred_donation tinyint(1),
@@ -369,23 +325,18 @@
postal_code varchar(128),
timezone varchar(8),
- INDEX rspex_contact_id (contact_id),
- INDEX rspex_email (email),
- INDEX rspex_city (city),
- INDEX rspex_country (country),
- INDEX rspex_postal (postal_code),
- CONSTRAINT sp_email UNIQUE (email)
+ UNIQUE (contact_id)
) COLLATE 'utf8_unicode_ci';
-- Move the data from the staging table into the persistent one
-- (12 minutes)
INSERT INTO silverpop_export (
- id,contact_id,contact_hash,first_name,last_name,preferred_language,email,
+ contact_id,contact_hash,first_name,last_name,preferred_language,email,
has_recurred_donation,highest_usd_amount,highest_native_amount,
highest_native_currency,highest_donation_date,lifetime_usd_total,donation_count,
latest_currency,latest_currency_symbol,latest_native_amount,latest_usd_amount,
latest_donation, first_donation_date,city,country,state,postal_code,timezone
)
-SELECT
id,contact_id,contact_hash,first_name,last_name,preferred_language,email,
+SELECT contact_id,contact_hash,first_name,last_name,preferred_language,email,
has_recurred_donation,highest_usd_amount,highest_native_amount,
highest_native_currency,highest_donation_date,lifetime_usd_total,donation_count,
latest_currency,latest_currency_symbol,latest_native_amount,latest_usd_amount,
--
To view, visit https://gerrit.wikimedia.org/r/394433
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4a18013e68fc4517c257e670b9ff25365d275706
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Ejegg <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits