Awight has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/314476

Change subject: Remove unused dedupe module
......................................................................

Remove unused dedupe module

DEPLOYMENT NOTE: A cron job may still be running quick_autoreview in
production.

This was a prototype dedupe system, and it is not doing anything useful at the
moment.  Remove it to avoid confusion.

Change-Id: I3ce402b49cc8dc8fbd5eaec84f72dae9cc8d4d03
---
D dedupe/.gitignore
D dedupe/README
D dedupe/__init__.py
D dedupe/action.py
D dedupe/autoreview.py
D dedupe/config.yaml.example
D dedupe/contact_cache.py
D dedupe/fuzzy_text_matching.py
D dedupe/match.py
D dedupe/quick_autoreview.py
D dedupe/review_job.py
D dedupe/review_queue.py
D dedupe/tests/test_autoreview.py
13 files changed, 0 insertions(+), 544 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools refs/changes/76/314476/1

diff --git a/dedupe/.gitignore b/dedupe/.gitignore
deleted file mode 100644
index 5b6b072..0000000
--- a/dedupe/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-config.yaml
diff --git a/dedupe/README b/dedupe/README
deleted file mode 100644
index f624cd4..0000000
--- a/dedupe/README
+++ /dev/null
@@ -1 +0,0 @@
-apt-get install python-yaml python-Levenshtein python-mysqldb
diff --git a/dedupe/__init__.py b/dedupe/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/dedupe/__init__.py
+++ /dev/null
diff --git a/dedupe/action.py b/dedupe/action.py
deleted file mode 100644
index e5b1576..0000000
--- a/dedupe/action.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from process.globals import config
-from database import db
-
-class Action(object):
-    cache = {}
-
-    @staticmethod
-    def get(name):
-        if name not in Action.cache:
-            Action.cache[name] = Action(name)
-
-        return Action.cache[name]
-
-    def __init__(self, name):
-        self.name = name
-
-        sql = "SELECT id FROM donor_review_action WHERE name = %s"
-        results = list(db.get_db(config.drupal_schema).execute(sql, (name, )))
-        if not results:
-            raise RuntimeError("Db schema missing action: " + name)
-        self.id = results[0]['id']
diff --git a/dedupe/autoreview.py b/dedupe/autoreview.py
deleted file mode 100644
index 2c9293e..0000000
--- a/dedupe/autoreview.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from process.globals import config
-
-from civicrm.tag import Tag
-from contact_cache import TaggedGroup
-from dedupe.action import Action
-from fuzzy_text_matching import FuzzyTextMatching
-
-class Autoreview(object):
-    IDENTICAL = 'I'
-    SIMILAR = 'S'
-    UNRELATED = 'U'
-
-    REVIEW = Tag.get('Review')
-    AUTOREVIEWED = Tag.get('Autoreviewed - Unique')
-
-    REC_KEEP = Action.get('Autoreview - Recommend keep')
-    REC_SPAM = Action.get('Autoreview - Recommend spamblock')
-    REC_DUP = Action.get('Autoreview - Recommend is duplicate')
-    REC_NEWER = Action.get('Autoreview - Recommend update contact')
-    REC_CONFLICT = Action.get('Autoreview - Recommend conflict resolution')
-
-    EMAIL_EDIT_THRESHOLD = 2
-    NAME_EDIT_THRESHOLD = 2
-    ADDRESS_NUMBERS_EDIT_THRESHOLD = 2
-
-    actionLookup = {
-        #'III': AUTOREVIEWED & REC_DUP
-        'IIS': REC_DUP,
-        'IIU': REC_NEWER,
-        'ISI': REC_DUP,
-        'ISS': REC_DUP,
-        'ISU': REC_CONFLICT,
-        'IUI': REC_NEWER,
-        'IUS': REC_NEWER,
-        'IUU': REC_CONFLICT,
-        'SII': REC_DUP,
-        'SIS': REC_DUP,
-        'SIU': REC_CONFLICT,
-        'SSI': REC_DUP,
-        'SSS': REC_CONFLICT,
-        'SSU': REC_CONFLICT,
-        'SUI': REC_CONFLICT,
-        'SUS': REC_CONFLICT,
-        'SUU': AUTOREVIEWED, # & REC_KEEP
-        #U**: AUTOREVIEWED & REC_KEEP
-    }
-
-    def __init__(self):
-        self.contactCache = TaggedGroup(Autoreview.REVIEW)
-
-    def reviewBatch(self):
-        self.contactCache.fetch(config.autoreview_job_size)
-
-        # for ALL contacts,
-        #self.review(
-
-    def review(self, contact):
-        for other in self.contactCache:
-            result = {}
-            result['other'] = other
-            result['name'] = Autoreview.compareNames(contact['name'], 
other['name'])
-            result['email'] = Autoreview.compareEmails(contact['email'], 
other['email'])
-            result['address'] = 
Autoreview.compareAddresses(contact['address'], other['address'])
-            #TODO action = self.determineAction(result)
-
-    @staticmethod
-    def compareNames(a, b):
-        if a == b:
-            return Autoreview.IDENTICAL
-
-        # TODO: initials
-
-        if FuzzyTextMatching.levenshteinDistance(a, b) <= 
Autoreview.NAME_EDIT_THRESHOLD:
-            return Autoreview.SIMILAR
-
-        return Autoreview.UNRELATED
-
-    @staticmethod
-    def compareEmails(a, b):
-        if a == b:
-            return Autoreview.IDENTICAL
-
-        if FuzzyTextMatching.levenshteinDistance(a, b) <= 
Autoreview.EMAIL_EDIT_THRESHOLD:
-            return Autoreview.SIMILAR
-
-        return Autoreview.UNRELATED
-
-    @staticmethod
-    def compareAddresses(a, b):
-        a['street_numbers'] = 
FuzzyTextMatching.extractNumbers(a['street_address'])
-        b['street_numbers'] = 
FuzzyTextMatching.extractNumbers(b['street_address'])
-
-        identical_hits = 0
-        components = [
-            'street_numbers',
-            'street_address',
-            'city',
-            'postal_code',
-            'country',
-            'state',
-        ]
-        for key in components:
-            if a[key] == b[key]:
-                identical_hits += 1
-
-        if identical_hits == len(components):
-            return Autoreview.IDENTICAL
-
-        # same postal code or closer
-        if identical_hits >= 4:
-            return Autoreview.SIMILAR
-
-        if FuzzyTextMatching.levenshteinDistance(a['street_numbers'], 
b['street_numbers']) <= Autoreview.ADDRESS_NUMBERS_EDIT_THRESHOLD:
-            return Autoreview.SIMILAR
-
-        #if identical_hits == 0:
-        return Autoreview.UNRELATED
-
-    def determineAction(self, results):
-        queue = None
-        tag = None
-
-        concatKey = results['name'] + results['email'] + results['address']
-        if results['name'] == Autoreview.IDENTICAL and results['email'] == 
Autoreview.IDENTICAL and results['address'] == Autoreview.IDENTICAL:
-            queue = Autoreview.AUTOREVIEWED
-            tag = Autoreview.REC_DUP
-        elif results['name'] == 'U':
-            queue = Autoreview.AUTOREVIEWED
-            tag = Autoreview.REC_KEEP
-        else:
-            queue = Autoreview.REVIEW
-            tag = Autoreview.actionLookup[concatKey]
-
-        return {
-            'queue': queue,
-            'tag': tag,
-        }
diff --git a/dedupe/config.yaml.example b/dedupe/config.yaml.example
deleted file mode 100644
index c7dba0e..0000000
--- a/dedupe/config.yaml.example
+++ /dev/null
@@ -1,12 +0,0 @@
-civicrm_schema:
-    wmf_civicrm
-drupal_schema:
-    wmf_drupal
-db_params:
-    host: localhost
-    db: wmf_civicrm
-    user: USE
-    passwd: A PASS
-    debug: 0
-contact_cache_size:
-    10000
diff --git a/dedupe/contact_cache.py b/dedupe/contact_cache.py
deleted file mode 100644
index c239f57..0000000
--- a/dedupe/contact_cache.py
+++ /dev/null
@@ -1,122 +0,0 @@
-'''Optimized retrieval and in-memory storage of a small amount of information 
across many contacts.'''
-
-from process.logging import Logger as log
-from process.globals import config
-from database import db
-
-class ContactCache(object):
-    def __init__(self, require_email=False, **kw):
-        self.columns = []
-        self.contacts = []
-        self.require_email = require_email
-
-    def isEmpty(self):
-        return not self.contacts.empty()
-
-    def fetch(self):
-        '''Load a batch of contacts into the cache'''
-        query = self.buildQuery()
-
-        self.contacts = []
-        result = db.get_db().execute(query)
-        for row in result:
-            name_components = []
-            keys = ['first_name', 'middle_name', 'last_name', 
'organization_name']
-
-            for key in keys:
-                if key in row and row[key]:
-                    name_components.append(row[key])
-
-            #TODO: consider some flatter structure:
-            #self.contacts.append([
-            #  row['id'],
-            #  " ".join(name_components),
-            #  row['email'],
-            #])
-            self.contacts.append({
-                'id': row['id'],
-                'name': " ".join(name_components),
-                'email': row['email'],
-            })
-
-    def buildQuery(self):
-        query = db.Query()
-        query.columns.extend([
-            "contact.id",
-            "contact.first_name",
-            "contact.middle_name",
-            "contact.last_name",
-            "email.email",
-            "address.street_address",
-            "address.city",
-            "address.postal_code",
-            "state.abbreviation",
-            "country.iso_code",
-        ])
-        email_clause = "civicrm_email email ON contact.id = email.contact_id"
-        if self.require_email:
-            email_clause += " AND email.email IS NOT NULL"
-        query.tables = [
-            "civicrm_contact contact",
-            email_clause,
-            "civicrm_address address ON contact.id = address.contact_id",
-            "civicrm_country country ON address.country_id = country.id",
-            "civicrm_state_province state ON address.state_province_id = 
state.id",
-        ]
-        query.group_by = [
-            "contact.id",
-        ]
-        query.order_by = [
-            "contact.id",
-        ]
-        return query
-
-class PagedGroup(ContactCache):
-    pagesize = config.contact_cache_size
-
-    def __init__(self, **kw):
-        super(PagedGroup, self).__init__(**kw)
-        self.offset = 0
-
-    def buildQuery(self):
-        query = super(PagedGroup, self).buildQuery()
-        log.info("Limiting batch contact retrieval to {num} 
records.".format(num=self.pagesize))
-        query.limit = self.pagesize
-        query.offset = self.offset
-        return query
-
-    def next(self):
-        #TODO:
-        #query.offset += self.pagesize
-        #self.fetch()
-        raise Exception("unimplemented")
-
-class TaggedGroup(PagedGroup):
-    """Select contacts based on included and excluded tags."""
-
-    def __init__(self, tag, excludetag=None, **kw):
-        super(TaggedGroup, self).__init__(**kw)
-        self.tag = tag
-        self.excludetag = excludetag
-
-    def buildQuery(self):
-        query = super(TaggedGroup, self).buildQuery()
-        query.tables.extend([
-            "civicrm_entity_tag entity_tag ON entity_tag.entity_id = 
contact.id AND entity_tag.tag_id = %(tag_id)s AND entity_tag.entity_table = 
'civicrm_contact'",
-        ])
-        query.params.update({
-            'tag_id': self.tag.id
-        })
-
-        if self.excludetag:
-            query.tables.extend([
-                "civicrm_entity_tag entity_tag_not ON entity_tag_not.entity_id 
= contact.id AND entity_tag_not.tag_id = %(excludetag_id)s AND 
entity_tag_not.entity_table = 'civicrm_contact'",
-            ])
-            query.where.extend([
-                "entity_tag_not.id IS NULL"
-            ])
-            query.params.update({
-                'excludetag_id': self.excludetag.id
-            })
-
-        return query
diff --git a/dedupe/fuzzy_text_matching.py b/dedupe/fuzzy_text_matching.py
deleted file mode 100644
index 1843540..0000000
--- a/dedupe/fuzzy_text_matching.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import re
-
-from Levenshtein import distance
-
-class FuzzyTextMatching(object):
-    @staticmethod
-    def levenshteinDistance(string_a, string_b):
-        return distance(string_a, string_b)
-
-    @staticmethod
-    def extractNumbers(address):
-        return re.sub(r'/[^0-9 ]/', '', address).strip();
-
-    #overkill: static function stripTrivial($address) {
-    # See https://www.usps.com/send/official-abbreviations.htm -> Street 
suffixes, and Secondary units
diff --git a/dedupe/match.py b/dedupe/match.py
deleted file mode 100644
index b19e410..0000000
--- a/dedupe/match.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import json
-
-# TODO: elaborate
-
-class Match(object):
-    def __init__(self):
-        self.address = None
-        self.email = None
-        self.name = None
-
-    def json(self):
-        return json.dumps({
-            "address": self.address,
-            "email": self.email,
-            "name": self.name,
-        })
-
-class EmailMatch(Match):
-    def __init__(self, matchDescription):
-        self.email = matchDescription
-        
-    def json(self):
-        return json.dumps({
-            "email": self.email,
-        })
diff --git a/dedupe/quick_autoreview.py b/dedupe/quick_autoreview.py
deleted file mode 100755
index 69f5400..0000000
--- a/dedupe/quick_autoreview.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-
-'''Find low-hanging dupe fruits and mark them for the manual review queue'''
-
-from process.logging import Logger as log
-from process.globals import load_config
-load_config("dedupe")
-import process.lock as lock
-
-from autoreview import Autoreview
-from civicrm.tag import Tag
-from contact_cache import TaggedGroup
-from database import db
-from match import EmailMatch
-from review_job import ReviewJob
-from review_queue import ReviewQueue
-
-class QuickAutoreview(object):
-    QUICK_REVIEWED = Tag.get("Quick autoreviewed")
-
-    def __init__(self):
-        self.contactCache = TaggedGroup(
-            tag=Autoreview.REVIEW,
-            excludetag=QuickAutoreview.QUICK_REVIEWED,
-            require_email=True
-        )
-        job = ReviewJob("Quick autoreview")
-        self.job_id = job.id
-
-    def reviewBatch(self):
-        '''For each new contact, find the oldest contact with the same email 
address.'''
-
-        matchDescription = EmailMatch("Exact match").json()
-
-        matched = 0
-        self.contactCache.fetch()
-        for contact in self.contactCache.contacts:
-            if contact['email']:
-                query = db.Query()
-                query.columns = [
-                    'MIN(contact_id) AS contact_id',
-                ]
-                query.tables = [
-                    'civicrm_email',
-                ]
-                query.where.extend([
-                    'email = %(email)s',
-                    'contact_id < %(new_id)s',
-                ])
-                query.group_by.extend([
-                    'email',
-                ])
-                query.params = {
-                    'new_id': contact['id'],
-                    'email': contact['email'],
-                }
-                result = db.get_db().execute(query)
-
-                if result:
-                    for row in result:
-                        ReviewQueue.addMatch(self.job_id, row['contact_id'], 
contact['id'], Autoreview.REC_DUP, matchDescription)
-                        matched += 1
-
-            ReviewQueue.tag(contact['id'], QuickAutoreview.QUICK_REVIEWED)
-
-        if not self.contactCache.contacts:
-            log.warn("Searched an empty batch of contacts!")
-        else:
-            last_seen = self.contactCache.contacts[-1]['id']
-            log.info("End of batch.  Last contact scanned was ID 
{id}".format(id=last_seen))
-            log.info("Marked {matched} contacts as potential 
duplicates.".format(matched=matched))
-
-
-if __name__ == '__main__':
-    log.info("Begin quick_autoreview deduper")
-    lock.begin()
-
-    job = QuickAutoreview()
-    job.reviewBatch()
-    ReviewQueue.commit()
-
-    lock.end()
-    log.info("End quick_autoreview deduper")
diff --git a/dedupe/review_job.py b/dedupe/review_job.py
deleted file mode 100644
index df9abcb..0000000
--- a/dedupe/review_job.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from process.logging import Logger as log
-from process.globals import config
-from database import db
-
-class ReviewJob(object):
-    def __init__(self, name):
-        self.name = name
-
-        sql = "INSERT INTO donor_autoreview_job SET name = %s"
-        dbc = db.get_db(config.drupal_schema)
-        dbc.execute(sql, (name, ))
-        self.id = dbc.last_insert_id()
-        log.info("This job has ID %d" % self.id)
diff --git a/dedupe/review_queue.py b/dedupe/review_queue.py
deleted file mode 100644
index 52c9d66..0000000
--- a/dedupe/review_queue.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from process.logging import Logger as log
-from process.globals import config
-from database import db
-
-class ReviewQueue(object):
-    cached_tagging = True
-    cached_tags = {}
-
-    @staticmethod
-    def addMatch(job_id, oldId, newId, action, match):
-        #log.info("Found a match: {old} -> {new} : {match}".format(old=oldId, 
new=newId, match=match))
-        db.get_db(config.drupal_schema).execute("""
-            INSERT INTO donor_review_queue
-                SET
-                    job_id = %(job_id)s,
-                    old_id = %(old_id)s,
-                    new_id = %(new_id)s,
-                    action_id = %(action_id)s,
-                    match_description = %(match)s
-            """, {
-                'job_id': job_id,
-                'old_id': oldId,
-                'new_id': newId,
-                'action_id': action.id,
-                'match': match,
-            })
-
-    @staticmethod
-    def tag(contact_id, tag):
-        if ReviewQueue.cached_tagging:
-            if tag not in ReviewQueue.cached_tags:
-                ReviewQueue.cached_tags[tag] = []
-
-            ReviewQueue.cached_tags[tag].append(contact_id)
-        else:
-            ReviewQueue.tag_single(contact_id, tag)
-
-    @staticmethod
-    def commit():
-        log.info("Committing tags...")
-        for tag, contacts in ReviewQueue.cached_tags.items():
-            log.info("Bulk tagging {num} contacts with tag 
<{tag}>".format(num=len(contacts), tag=tag.name))
-            ReviewQueue.tag_many(contacts, tag)
-
-    @staticmethod
-    def tag_many(contacts, tag):
-        sets = [ "('civicrm_contact', {contact_id}, 
{tag_id})".format(contact_id=contact_id, tag_id=tag.id)
-            for contact_id in contacts ]
-        values = ", ".join(sets)
-
-        db.get_db(config.civicrm_schema).execute("""
-            INSERT IGNORE INTO civicrm_entity_tag
-                (entity_table, entity_id, tag_id)
-            VALUES
-                %s
-        """ % values)
-
-    @staticmethod
-    def tag_single(contact_id, tag):
-        db.get_db(config.civicrm_schema).execute("""
-            INSERT IGNORE INTO civicrm_entity_tag
-                SET
-                    entity_table = 'civicrm_contact',
-                    entity_id = %(contact_id)s,
-                    tag_id = %(tag_id)s
-            """, {
-                'contact_id': contact_id,
-                'tag_id': tag.id,
-            })
diff --git a/dedupe/tests/test_autoreview.py b/dedupe/tests/test_autoreview.py
deleted file mode 100644
index 24d393c..0000000
--- a/dedupe/tests/test_autoreview.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import unittest
-from dedupe.autoreview import Autoreview
-
-class TestAutoreview(unittest.TestCase):
-    def setUp(self):
-        self.reviewer = Autoreview()
-
-    def test_compareNames(self):
-        self.assertEqual(Autoreview.IDENTICAL, Autoreview.compareNames('Bar 
Foo', 'Bar Foo'))
-        self.assertEqual(Autoreview.SIMILAR, Autoreview.compareNames('Bra 
Foo', 'Bar Foo'))
-        self.assertEqual(Autoreview.UNRELATED, 
Autoreview.compareNames('Arctostaphylos', 'Bar Foo'))
-
-    def test_compareEmails(self):
-        self.assertEqual(Autoreview.IDENTICAL, 
Autoreview.compareEmails('foo@bar', 'foo@bar'))
-        self.assertEqual(Autoreview.SIMILAR, 
Autoreview.compareEmails('foo2@bar', 'foo@bar'))
-        self.assertEqual(Autoreview.UNRELATED, 
Autoreview.compareEmails('elem@ant', 'foo@bar'))
-
-    def test_compareAddresses(self):
-        oldAddress = {
-            'street_address': '1701 Flightless Bird',
-            'postal_code': '112233 BFF',
-            'city': 'Dent',
-            'country': 'UK',
-            'state': 'Eastside',
-        }
-        nearAddress = {
-            'street_address': '1710 F. Bd.',
-            'postal_code': '112233 BFF',
-            'city': 'Dent',
-            'country': 'UK',
-            'state': 'Eastside',
-        }
-        otherAddress = {
-            'street_address': '1 Uptown',
-            'postal_code': '323232',
-            'city': 'Dent',
-            'country': 'UK',
-            'state': 'Eastside',
-        }
-        self.assertEqual(Autoreview.IDENTICAL, 
Autoreview.compareAddresses(oldAddress, oldAddress))
-        self.assertEqual(Autoreview.SIMILAR, 
Autoreview.compareAddresses(nearAddress, oldAddress))
-        self.assertEqual(Autoreview.UNRELATED, 
Autoreview.compareAddresses(otherAddress, oldAddress))
-
-if __name__ == '__main__':
-    unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/314476
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3ce402b49cc8dc8fbd5eaec84f72dae9cc8d4d03
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Awight <awi...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to