Awight has uploaded a new change for review. https://gerrit.wikimedia.org/r/314476
Change subject: Remove unused dedupe module ...................................................................... Remove unused dedupe module DEPLOYMENT NOTE: It's possible that a cronjob is still running quick_autoreview on production. This was a prototype dedupe system and is not doing anything helpful at the moment. Remove to avoid confusion. Change-Id: I3ce402b49cc8dc8fbd5eaec84f72dae9cc8d4d03 --- D dedupe/.gitignore D dedupe/README D dedupe/__init__.py D dedupe/action.py D dedupe/autoreview.py D dedupe/config.yaml.example D dedupe/contact_cache.py D dedupe/fuzzy_text_matching.py D dedupe/match.py D dedupe/quick_autoreview.py D dedupe/review_job.py D dedupe/review_queue.py D dedupe/tests/test_autoreview.py 13 files changed, 0 insertions(+), 544 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools refs/changes/76/314476/1 diff --git a/dedupe/.gitignore b/dedupe/.gitignore deleted file mode 100644 index 5b6b072..0000000 --- a/dedupe/.gitignore +++ /dev/null @@ -1 +0,0 @@ -config.yaml diff --git a/dedupe/README b/dedupe/README deleted file mode 100644 index f624cd4..0000000 --- a/dedupe/README +++ /dev/null @@ -1 +0,0 @@ -apt-get install python-yaml python-Levenshtein python-mysqldb diff --git a/dedupe/__init__.py b/dedupe/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/dedupe/__init__.py +++ /dev/null diff --git a/dedupe/action.py b/dedupe/action.py deleted file mode 100644 index e5b1576..0000000 --- a/dedupe/action.py +++ /dev/null @@ -1,21 +0,0 @@ -from process.globals import config -from database import db - -class Action(object): - cache = {} - - @staticmethod - def get(name): - if name not in Action.cache: - Action.cache[name] = Action(name) - - return Action.cache[name] - - def __init__(self, name): - self.name = name - - sql = "SELECT id FROM donor_review_action WHERE name = %s" - results = list(db.get_db(config.drupal_schema).execute(sql, (name, ))) - if not results: - raise RuntimeError("Db schema missing action: " + name) - self.id = results[0]['id'] diff --git a/dedupe/autoreview.py b/dedupe/autoreview.py deleted file mode 100644 index 2c9293e..0000000 --- a/dedupe/autoreview.py +++ /dev/null @@ -1,137 +0,0 @@ -from process.globals import config - -from civicrm.tag import Tag -from contact_cache import TaggedGroup -from dedupe.action import Action -from fuzzy_text_matching import FuzzyTextMatching - -class Autoreview(object): - IDENTICAL = 'I' - SIMILAR = 'S' - UNRELATED = 'U' - - REVIEW = Tag.get('Review') - AUTOREVIEWED = Tag.get('Autoreviewed - Unique') - - REC_KEEP = Action.get('Autoreview - Recommend keep') - REC_SPAM = Action.get('Autoreview - Recommend spamblock') - REC_DUP = Action.get('Autoreview - Recommend is duplicate') - REC_NEWER = Action.get('Autoreview - Recommend update contact') - REC_CONFLICT = Action.get('Autoreview - Recommend conflict resolution') - - EMAIL_EDIT_THRESHOLD = 2 - NAME_EDIT_THRESHOLD = 2 - ADDRESS_NUMBERS_EDIT_THRESHOLD = 2 - - actionLookup = { - #'III': AUTOREVIEWED & REC_DUP - 'IIS': REC_DUP, - 'IIU': REC_NEWER, - 'ISI': REC_DUP, - 'ISS': REC_DUP, - 'ISU': REC_CONFLICT, - 'IUI': REC_NEWER, - 'IUS': REC_NEWER, - 'IUU': REC_CONFLICT, - 'SII': REC_DUP, - 'SIS': REC_DUP, - 'SIU': REC_CONFLICT, - 'SSI': REC_DUP, - 'SSS': REC_CONFLICT, - 'SSU': REC_CONFLICT, - 'SUI': REC_CONFLICT, - 'SUS': REC_CONFLICT, - 'SUU': AUTOREVIEWED, # & REC_KEEP - #U**: AUTOREVIEWED & REC_KEEP - } - - def __init__(self): - self.contactCache = TaggedGroup(Autoreview.REVIEW) - - def reviewBatch(self): - self.contactCache.fetch(config.autoreview_job_size) - - # for ALL contacts, - #self.review( - - def review(self, contact): - for other in self.contactCache: - result = {} - result['other'] = other - result['name'] = Autoreview.compareNames(contact['name'], other['name']) - result['email'] = Autoreview.compareEmails(contact['email'], other['email']) - result['address'] = Autoreview.compareAddresses(contact['address'], other['address']) - #TODO action = self.determineAction(result) - - @staticmethod - def compareNames(a, b): - if a == b: - return Autoreview.IDENTICAL - - # TODO: initials - - if FuzzyTextMatching.levenshteinDistance(a, b) <= Autoreview.NAME_EDIT_THRESHOLD: - return Autoreview.SIMILAR - - return Autoreview.UNRELATED - - @staticmethod - def compareEmails(a, b): - if a == b: - return Autoreview.IDENTICAL - - if FuzzyTextMatching.levenshteinDistance(a, b) <= Autoreview.EMAIL_EDIT_THRESHOLD: - return Autoreview.SIMILAR - - return Autoreview.UNRELATED - - @staticmethod - def compareAddresses(a, b): - a['street_numbers'] = FuzzyTextMatching.extractNumbers(a['street_address']) - b['street_numbers'] = FuzzyTextMatching.extractNumbers(b['street_address']) - - identical_hits = 0 - components = [ - 'street_numbers', - 'street_address', - 'city', - 'postal_code', - 'country', - 'state', - ] - for key in components: - if a[key] == b[key]: - identical_hits += 1 - - if identical_hits == len(components): - return Autoreview.IDENTICAL - - # same postal code or closer - if identical_hits >= 4: - return Autoreview.SIMILAR - - if FuzzyTextMatching.levenshteinDistance(a['street_numbers'], b['street_numbers']) <= Autoreview.ADDRESS_NUMBERS_EDIT_THRESHOLD: - return Autoreview.SIMILAR - - #if identical_hits == 0: - return Autoreview.UNRELATED - - def determineAction(self, results): - queue = None - tag = None - - concatKey = results['name'] + results['email'] + results['address'] - if results['name'] == Autoreview.IDENTICAL and results['email'] == Autoreview.IDENTICAL and results['address'] == Autoreview.IDENTICAL: - queue = Autoreview.AUTOREVIEWED - tag = Autoreview.REC_DUP - elif results['name'] == 'U': - queue = Autoreview.AUTOREVIEWED - tag = Autoreview.REC_KEEP - else: - queue = Autoreview.REVIEW - tag = Autoreview.actionLookup[concatKey] - - return { - 'queue': queue, - 'tag': tag, - } diff --git a/dedupe/config.yaml.example b/dedupe/config.yaml.example deleted file mode 100644 index c7dba0e..0000000 --- a/dedupe/config.yaml.example +++ /dev/null @@ -1,12 +0,0 @@ -civicrm_schema: - wmf_civicrm -drupal_schema: - wmf_drupal -db_params: - host: localhost - db: wmf_civicrm - user: USE - passwd: A PASS - debug: 0 -contact_cache_size: - 10000 diff --git a/dedupe/contact_cache.py b/dedupe/contact_cache.py deleted file mode 100644 index c239f57..0000000 --- a/dedupe/contact_cache.py +++ /dev/null @@ -1,122 +0,0 @@ -'''Optimized retrieval and in-memory storage of a small amount of information across many contacts.''' - -from process.logging import Logger as log -from process.globals import config -from database import db - -class ContactCache(object): - def __init__(self, require_email=False, **kw): - self.columns = [] - self.contacts = [] - self.require_email = require_email - - def isEmpty(self): - return not self.contacts.empty() - - def fetch(self): - '''Load a batch of contacts into the cache''' - query = self.buildQuery() - - self.contacts = [] - result = db.get_db().execute(query) - for row in result: - name_components = [] - keys = ['first_name', 'middle_name', 'last_name', 'organization_name'] - - for key in keys: - if key in row and row[key]: - name_components.append(row[key]) - - #TODO: consider some flatter structure: - #self.contacts.append([ - # row['id'], - # " ".join(name_components), - # row['email'], - #]) - self.contacts.append({ - 'id': row['id'], - 'name': " ".join(name_components), - 'email': row['email'], - }) - - def buildQuery(self): - query = db.Query() - query.columns.extend([ - "contact.id", - "contact.first_name", - "contact.middle_name", - "contact.last_name", - "email.email", - "address.street_address", - "address.city", - "address.postal_code", - "state.abbreviation", - "country.iso_code", - ]) - email_clause = "civicrm_email email ON contact.id = email.contact_id" - if self.require_email: - email_clause += " AND email.email IS NOT NULL" - query.tables = [ - "civicrm_contact contact", - email_clause, - "civicrm_address address ON contact.id = address.contact_id", - "civicrm_country country ON address.country_id = country.id", - "civicrm_state_province state ON address.state_province_id = state.id", - ] - query.group_by = [ - "contact.id", - ] - query.order_by = [ - "contact.id", - ] - return query - -class PagedGroup(ContactCache): - pagesize = config.contact_cache_size - - def __init__(self, **kw): - super(PagedGroup, self).__init__(**kw) - self.offset = 0 - - def buildQuery(self): - query = super(PagedGroup, self).buildQuery() - log.info("Limiting batch contact retrieval to {num} records.".format(num=self.pagesize)) - query.limit = self.pagesize - query.offset = self.offset - return query - - def next(self): - #TODO: - #query.offset += self.pagesize - #self.fetch() - raise Exception("unimplemented") - -class TaggedGroup(PagedGroup): - """Select contacts based on included and excluded tags.""" - - def __init__(self, tag, excludetag=None, **kw): - super(TaggedGroup, self).__init__(**kw) - self.tag = tag - self.excludetag = excludetag - - def buildQuery(self): - query = super(TaggedGroup, self).buildQuery() - query.tables.extend([ - "civicrm_entity_tag entity_tag ON entity_tag.entity_id = contact.id AND entity_tag.tag_id = %(tag_id)s AND entity_tag.entity_table = 'civicrm_contact'", - ]) - query.params.update({ - 'tag_id': self.tag.id - }) - - if self.excludetag: - query.tables.extend([ - "civicrm_entity_tag entity_tag_not ON entity_tag_not.entity_id = contact.id AND entity_tag_not.tag_id = %(excludetag_id)s AND entity_tag_not.entity_table = 'civicrm_contact'", - ]) - query.where.extend([ - "entity_tag_not.id IS NULL" - ]) - query.params.update({ - 'excludetag_id': self.excludetag.id - }) - - return query diff --git a/dedupe/fuzzy_text_matching.py b/dedupe/fuzzy_text_matching.py deleted file mode 100644 index 1843540..0000000 --- a/dedupe/fuzzy_text_matching.py +++ /dev/null @@ -1,15 +0,0 @@ -import re - -from Levenshtein import distance - -class FuzzyTextMatching(object): - @staticmethod - def levenshteinDistance(string_a, string_b): - return distance(string_a, string_b) - - @staticmethod - def extractNumbers(address): - return re.sub(r'/[^0-9 ]/', '', address).strip(); - - #overkill: static function stripTrivial($address) { - # See https://www.usps.com/send/official-abbreviations.htm -> Street suffixes, and Secondary units diff --git a/dedupe/match.py b/dedupe/match.py deleted file mode 100644 index b19e410..0000000 --- a/dedupe/match.py +++ /dev/null @@ -1,25 +0,0 @@ -import json - -# TODO: elaborate - -class Match(object): - def __init__(self): - self.address = None - self.email = None - self.name = None - - def json(self): - return json.dumps({ - "address": self.address, - "email": self.email, - "name": self.name, - }) - -class EmailMatch(Match): - def __init__(self, matchDescription): - self.email = matchDescription - - def json(self): - return json.dumps({ - "email": self.email, - }) diff --git a/dedupe/quick_autoreview.py b/dedupe/quick_autoreview.py deleted file mode 100755 index 69f5400..0000000 --- a/dedupe/quick_autoreview.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python - -'''Find low-hanging dupe fruits and mark them for the manual review queue''' - -from process.logging import Logger as log -from process.globals import load_config -load_config("dedupe") -import process.lock as lock - -from autoreview import Autoreview -from civicrm.tag import Tag -from contact_cache import TaggedGroup -from database import db -from match import EmailMatch -from review_job import ReviewJob -from review_queue import ReviewQueue - -class QuickAutoreview(object): - QUICK_REVIEWED = Tag.get("Quick autoreviewed") - - def __init__(self): - self.contactCache = TaggedGroup( - tag=Autoreview.REVIEW, - excludetag=QuickAutoreview.QUICK_REVIEWED, - require_email=True - ) - job = ReviewJob("Quick autoreview") - self.job_id = job.id - - def reviewBatch(self): - '''For each new contact, find the oldest contact with the same email address.''' - - matchDescription = EmailMatch("Exact match").json() - - matched = 0 - self.contactCache.fetch() - for contact in self.contactCache.contacts: - if contact['email']: - query = db.Query() - query.columns = [ - 'MIN(contact_id) AS contact_id', - ] - query.tables = [ - 'civicrm_email', - ] - query.where.extend([ - 'email = %(email)s', - 'contact_id < %(new_id)s', - ]) - query.group_by.extend([ - 'email', - ]) - query.params = { - 'new_id': contact['id'], - 'email': contact['email'], - } - result = db.get_db().execute(query) - - if result: - for row in result: - ReviewQueue.addMatch(self.job_id, row['contact_id'], contact['id'], Autoreview.REC_DUP, matchDescription) - matched += 1 - - ReviewQueue.tag(contact['id'], QuickAutoreview.QUICK_REVIEWED) - - if not self.contactCache.contacts: - log.warn("Searched an empty batch of contacts!") - else: - last_seen = self.contactCache.contacts[-1]['id'] - log.info("End of batch. Last contact scanned was ID {id}".format(id=last_seen)) - log.info("Marked {matched} contacts as potential duplicates.".format(matched=matched)) - - -if __name__ == '__main__': - log.info("Begin quick_autoreview deduper") - lock.begin() - - job = QuickAutoreview() - job.reviewBatch() - ReviewQueue.commit() - - lock.end() - log.info("End quick_autoreview deduper") diff --git a/dedupe/review_job.py b/dedupe/review_job.py deleted file mode 100644 index df9abcb..0000000 --- a/dedupe/review_job.py +++ /dev/null @@ -1,13 +0,0 @@ -from process.logging import Logger as log -from process.globals import config -from database import db - -class ReviewJob(object): - def __init__(self, name): - self.name = name - - sql = "INSERT INTO donor_autoreview_job SET name = %s" - dbc = db.get_db(config.drupal_schema) - dbc.execute(sql, (name, )) - self.id = dbc.last_insert_id() - log.info("This job has ID %d" % self.id) diff --git a/dedupe/review_queue.py b/dedupe/review_queue.py deleted file mode 100644 index 52c9d66..0000000 --- a/dedupe/review_queue.py +++ /dev/null @@ -1,69 +0,0 @@ -from process.logging import Logger as log -from process.globals import config -from database import db - -class ReviewQueue(object): - cached_tagging = True - cached_tags = {} - - @staticmethod - def addMatch(job_id, oldId, newId, action, match): - #log.info("Found a match: {old} -> {new} : {match}".format(old=oldId, new=newId, match=match)) - db.get_db(config.drupal_schema).execute(""" - INSERT INTO donor_review_queue - SET - job_id = %(job_id)s, - old_id = %(old_id)s, - new_id = %(new_id)s, - action_id = %(action_id)s, - match_description = %(match)s - """, { - 'job_id': job_id, - 'old_id': oldId, - 'new_id': newId, - 'action_id': action.id, - 'match': match, - }) - - @staticmethod - def tag(contact_id, tag): - if ReviewQueue.cached_tagging: - if tag not in ReviewQueue.cached_tags: - ReviewQueue.cached_tags[tag] = [] - - ReviewQueue.cached_tags[tag].append(contact_id) - else: - ReviewQueue.tag_single(contact_id, tag) - - @staticmethod - def commit(): - log.info("Committing tags...") - for tag, contacts in ReviewQueue.cached_tags.items(): - log.info("Bulk tagging {num} contacts with tag <{tag}>".format(num=len(contacts), tag=tag.name)) - ReviewQueue.tag_many(contacts, tag) - - @staticmethod - def tag_many(contacts, tag): - sets = [ "('civicrm_contact', {contact_id}, {tag_id})".format(contact_id=contact_id, tag_id=tag.id) - for contact_id in contacts ] - values = ", ".join(sets) - - db.get_db(config.civicrm_schema).execute(""" - INSERT IGNORE INTO civicrm_entity_tag - (entity_table, entity_id, tag_id) - VALUES - %s - """ % values) - - @staticmethod - def tag_single(contact_id, tag): - db.get_db(config.civicrm_schema).execute(""" - INSERT IGNORE INTO civicrm_entity_tag - SET - entity_table = 'civicrm_contact', - entity_id = %(contact_id)s, - tag_id = %(tag_id)s - """, { - 'contact_id': contact_id, - 'tag_id': tag.id, - }) diff --git a/dedupe/tests/test_autoreview.py b/dedupe/tests/test_autoreview.py deleted file mode 100644 index 24d393c..0000000 --- a/dedupe/tests/test_autoreview.py +++ /dev/null @@ -1,45 +0,0 @@ -import unittest -from dedupe.autoreview import Autoreview - -class TestAutoreview(unittest.TestCase): - def setUp(self): - self.reviewer = Autoreview() - - def test_compareNames(self): - self.assertEqual(Autoreview.IDENTICAL, Autoreview.compareNames('Bar Foo', 'Bar Foo')) - self.assertEqual(Autoreview.SIMILAR, Autoreview.compareNames('Bra Foo', 'Bar Foo')) - self.assertEqual(Autoreview.UNRELATED, Autoreview.compareNames('Arctostaphylos', 'Bar Foo')) - - def test_compareEmails(self): - self.assertEqual(Autoreview.IDENTICAL, Autoreview.compareEmails('foo@bar', 'foo@bar')) - self.assertEqual(Autoreview.SIMILAR, Autoreview.compareEmails('foo2@bar', 'foo@bar')) - self.assertEqual(Autoreview.UNRELATED, Autoreview.compareEmails('elem@ant', 'foo@bar')) - - def test_compareAddresses(self): - oldAddress = { - 'street_address': '1701 Flightless Bird', - 'postal_code': '112233 BFF', - 'city': 'Dent', - 'country': 'UK', - 'state': 'Eastside', - } - nearAddress = { - 'street_address': '1710 F. Bd.', - 'postal_code': '112233 BFF', - 'city': 'Dent', - 'country': 'UK', - 'state': 'Eastside', - } - otherAddress = { - 'street_address': '1 Uptown', - 'postal_code': '323232', - 'city': 'Dent', - 'country': 'UK', - 'state': 'Eastside', - } - self.assertEqual(Autoreview.IDENTICAL, Autoreview.compareAddresses(oldAddress, oldAddress)) - self.assertEqual(Autoreview.SIMILAR, Autoreview.compareAddresses(nearAddress, oldAddress)) - self.assertEqual(Autoreview.UNRELATED, Autoreview.compareAddresses(otherAddress, oldAddress)) - -if __name__ == '__main__': - unittest.main() -- To view, visit https://gerrit.wikimedia.org/r/314476 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3ce402b49cc8dc8fbd5eaec84f72dae9cc8d4d03 Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/fundraising/tools Gerrit-Branch: master Gerrit-Owner: Awight <awi...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits