Jeroen T. Vermeulen has proposed merging 
lp:~jtv/launchpad/bug-994650-scrub-in-batches into lp:launchpad with 
lp:~jtv/launchpad/bug-994650-scrub-faster as a prerequisite.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #994650 in Launchpad itself: "POFileTranslator falls out of date"
  https://bugs.launchpad.net/launchpad/+bug/994650

For more details, see:
https://code.launchpad.net/~jtv/launchpad/bug-994650-scrub-in-batches/+merge/105189

This is one of two further optimizations for POFileTranslator scrubbing as 
mentioned in the preceding MP: 
https://code.launchpad.net/~jtv/launchpad/bug-994650-scrub-faster

Here you see two parts of the scrubbing process separated further: finding 
which POFiles in a batch need their POFileTranslator entries fixed, and 
actually doing so.  The benefit is in a new, intervening step: bulk-load all 
objects needed for doing this work.  Disappointingly, about half of the 
relevant object graph is needed for log output.  But we probably should have it 
anyway, because tweaking such processes without helpful logs can be highly 
demotivating.

As an added bonus, it turns out we don't really need to load full 
POFileTranslator records, let alone cache them across these steps.  That's one 
big memory load off my mind.  I deliberately didn't make the scrubber check and 
correct dates on existing POFileTranslator records; a bit of imprecision is 
fine.

As a next step, which should further reduce memory load as well as DB querying, 
we can cache sets of POTMsgSet ids per template across items in a batch.  (But 
not outside batches, since they may change between transactions).

The tests don't go into the new components.  They already cover the aggregate 
behaviour of fix_pofile() and the main loop; there'd be a lot of duplication 
and I'd hate to lose integration test coverage as a side effect of moving 
things into more fine-grained unit tests.  Also I'm trying to correct for a 
past of focusing too much on fine-grained tests.  But, dear reviewer, if you 
see anything that you do want tested then please say so.


Jeroen
-- 
https://code.launchpad.net/~jtv/launchpad/bug-994650-scrub-in-batches/+merge/105189
Your team Launchpad code reviewers is requested to review the proposed merge of 
lp:~jtv/launchpad/bug-994650-scrub-in-batches into lp:launchpad.
=== modified file 'lib/lp/translations/scripts/scrub_pofiletranslator.py'
--- lib/lp/translations/scripts/scrub_pofiletranslator.py	2012-05-09 11:13:31 +0000
+++ lib/lp/translations/scripts/scrub_pofiletranslator.py	2012-05-09 11:13:34 +0000
@@ -8,14 +8,25 @@
     'ScrubPOFileTranslator',
     ]
 
+from collections import namedtuple
+
 from storm.expr import (
     Coalesce,
     Desc,
     )
 import transaction
 
+from lp.registry.model.distribution import Distribution
+from lp.registry.model.distroseries import DistroSeries
+from lp.registry.model.product import Product
+from lp.registry.model.productseries import ProductSeries
+from lp.services.database.bulk import (
+    load,
+    load_related,
+    )
 from lp.services.database.lpstorm import IStore
 from lp.services.looptuner import TunableLoop
+from lp.services.worlddata.model.language import Language
 from lp.translations.model.pofile import POFile
 from lp.translations.model.pofiletranslator import POFileTranslator
 from lp.translations.model.potemplate import POTemplate
@@ -40,9 +51,12 @@
     return query.order_by(POTemplate.name, POFile.languageID)
 
 
-def get_pofile_details(pofile_ids):
+def summarize_pofiles(pofile_ids):
     """Retrieve relevant parts of `POFile`s with given ids.
 
+    This gets just enough information to determine whether any of the
+    `POFile`s need their `POFileTranslator` records fixed.
+
     :param pofile_ids: Iterable of `POFile` ids.
     :return: Dict mapping each id in `pofile_ids` to a duple of
         `POTemplate` id and `Language` id for the associated `POFile`.
@@ -113,15 +127,14 @@
 
 
 def get_pofiletranslators(pofile_id):
-    """Get `POFileTranslator` entries for a `POFile`.
+    """Get `Person` ids from `POFileTranslator` entries for a `POFile`.
 
-    Returns a dict mapping each contributor's person id to their
-    `POFileTranslator` record.
+    Returns a `set` of `Person` ids.
     """
     store = IStore(POFileTranslator)
-    pofts = store.find(
-        POFileTranslator, POFileTranslator.pofileID == pofile_id)
-    return dict((poft.personID, poft) for poft in pofts)
+    return set(store.find(
+        POFileTranslator.personID,
+        POFileTranslator.pofileID == pofile_id))
 
 
 def remove_pofiletranslators(logger, pofile, person_ids):
@@ -139,14 +152,14 @@
 
 def remove_unwarranted_pofiletranslators(logger, pofile, pofts, contribs):
     """Delete `POFileTranslator` records that shouldn't be there."""
-    excess = set(pofts) - set(contribs)
+    excess = pofts - set(contribs)
     if len(excess) > 0:
         remove_pofiletranslators(logger, pofile, excess)
 
 
 def create_missing_pofiletranslators(logger, pofile, pofts, contribs):
     """Create `POFileTranslator` records that were missing."""
-    shortage = set(contribs) - set(pofts)
+    shortage = set(contribs) - pofts
     if len(shortage) == 0:
         return
     logger.debug(
@@ -159,9 +172,8 @@
             date_last_touched=contribs[missing_contributor]))
 
 
-def fix_pofile(logger, pofile_id, potmsgset_ids, pofiletranslators):
+def fix_pofile(logger, pofile, potmsgset_ids, pofiletranslators):
     """This `POFile` needs fixing.  Load its data & fix it."""
-    pofile = IStore(POFile).get(POFile, pofile_id)
     contribs = get_contributions(pofile, potmsgset_ids)
     remove_unwarranted_pofiletranslators(
         logger, pofile, pofiletranslators, contribs)
@@ -169,17 +181,73 @@
         logger, pofile, pofiletranslators, contribs)
 
 
-def scrub_pofile(logger, pofile_id, template_id, language_id):
-    """Scrub `POFileTranslator` entries for one `POFile`.
+def needs_fixing(template_id, language_id, potmsgset_ids, pofiletranslators):
+    """Does the `POFile` with given details need `POFileTranslator` changes?
 
-    Removes inappropriate entries and adds missing ones.
+    :param template_id: id of the `POTemplate` for the `POFile`.
+    :param language_id: id of the `Language` the `POFile` translates to.
+    :param potmsgset_ids: ids of the `POTMsgSet` items participating in the
+        template.
+    :param pofiletranslators: `POFileTranslator` objects for the `POFile`.
+    :return: Bool: does the existing set of `POFileTranslator` need fixing?
     """
-    pofiletranslators = get_pofiletranslators(pofile_id)
-    potmsgset_ids = get_potmsgset_ids(template_id)
     contributors = summarize_contributors(
         template_id, language_id, potmsgset_ids)
-    if set(pofiletranslators) != set(contributors):
-        fix_pofile(logger, pofile_id, potmsgset_ids, pofiletranslators)
+    return pofiletranslators != set(contributors)
+
+
+# A tuple describing a POFile that needs its POFileTranslators fixed.
+WorkItem = namedtuple("WorkItem", [
+    'pofile_id',
+    'potmsgset_ids',
+    'pofiletranslators',
+    ])
+
+
+def gather_work_items(pofile_ids):
+    """Produce `WorkItem`s for those `POFile`s that need fixing.
+
+    :param pofile_ids: An iterable of `POFile` ids to check.
+    :param pofile_summaries: Dict as returned by `summarize_pofiles`.
+    :return: A sequence of `WorkItem`s for those `POFile`s that need fixing.
+    """
+    pofile_summaries = summarize_pofiles(pofile_ids)
+    work_items = []
+    for pofile_id in pofile_ids:
+        template_id, language_id = pofile_summaries[pofile_id]
+        potmsgset_ids = get_potmsgset_ids(template_id)
+        pofts = get_pofiletranslators(pofile_id)
+        if needs_fixing(template_id, language_id, potmsgset_ids, pofts):
+            work_items.append(WorkItem(pofile_id, potmsgset_ids, pofts))
+
+    return work_items
+
+
+def preload_work_items(work_items):
+    """Bulk load data that will be needed to process `work_items`.
+
+    :param work_items: A sequence of `WorkItem` records.
+    :return: A dict mapping `POFile` ids from `work_items` to their
+        respective `POFile` objects.
+    """
+    pofiles = load(POFile, [work_item.pofile_id for work_item in work_items])
+    load_related(Language, pofiles, ['languageID'])
+    templates = load_related(POTemplate, pofiles, ['potemplateID'])
+    distroseries = load_related(DistroSeries, templates, ['distroseriesID'])
+    load_related(Distribution, distroseries, ['distributionID'])
+    productseries = load_related(
+        ProductSeries, templates, ['productseriesID'])
+    load_related(Product, productseries, ['productID'])
+    return dict((pofile.id, pofile) for pofile in pofiles)
+
+
+def process_work_items(logger, work_items, pofiles):
+    """Fix the `POFileTranslator` records covered by `work_items`."""
+    for work_item in work_items:
+        pofile = pofiles[work_item.pofile_id]
+        fix_pofile(
+            logger, pofile, work_item.potmsgset_ids,
+            work_item.pofiletranslators)
 
 
 class ScrubPOFileTranslator(TunableLoop):
@@ -199,13 +267,11 @@
         batch = self.pofile_ids[start_offset:self.next_offset]
         if len(batch) == 0:
             self.next_offset = None
-            return
-
-        pofile_details = get_pofile_details(batch)
-        for pofile_id in batch:
-            template_id, language_id = pofile_details[pofile_id]
-            scrub_pofile(self.log, pofile_id, template_id, language_id)
-        transaction.commit()
+        else:
+            work_items = gather_work_items(batch)
+            pofiles = preload_work_items(work_items)
+            process_work_items(self.log, work_items, pofiles)
+            transaction.commit()
 
     def isDone(self):
         """See `ITunableLoop`."""

=== modified file 'lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py'
--- lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py	2012-05-09 11:13:31 +0000
+++ lib/lp/translations/scripts/tests/test_scrub_pofiletranslator.py	2012-05-09 11:13:34 +0000
@@ -20,14 +20,14 @@
 from lp.testing.layers import ZopelessDatabaseLayer
 from lp.translations.model.pofiletranslator import POFileTranslator
 from lp.translations.scripts.scrub_pofiletranslator import (
+    fix_pofile,
     get_contributions,
-    get_pofile_details,
     get_pofile_ids,
     get_pofiletranslators,
     get_potmsgset_ids,
-    scrub_pofile,
     ScrubPOFileTranslator,
     summarize_contributors,
+    summarize_pofiles,
     )
 
 
@@ -128,11 +128,11 @@
             self.assertEqual(
                 1, size_distance(ordering, pofiles[0].id, pofiles[1].id))
 
-    def test_get_pofile_details_maps_id_to_template_and_language_ids(self):
+    def test_summarize_pofiles_maps_id_to_template_and_language_ids(self):
         pofile = self.factory.makePOFile()
         self.assertEqual(
             {pofile.id: (pofile.potemplate.id, pofile.language.id)},
-            get_pofile_details([pofile.id]))
+            summarize_pofiles([pofile.id]))
 
     def test_get_potmsgset_ids_returns_potmsgset_ids(self):
         pofile = self.factory.makePOFile()
@@ -232,41 +232,37 @@
         potmsgset_ids = get_potmsgset_ids(pofile.potemplate.id)
         self.assertEqual({}, get_contributions(pofile, potmsgset_ids))
 
-    def test_get_pofiletranslators_gets_pofiletranslators_for_pofile(self):
+    def test_get_pofiletranslators_gets_translators_for_pofile(self):
         pofile = self.factory.makePOFile()
         tm = self.make_message_with_pofiletranslator(pofile)
-        pofts = get_pofiletranslators(pofile.id)
-        self.assertContentEqual([tm.submitter.id], pofts.keys())
-        poft = pofts[tm.submitter.id]
-        self.assertEqual(pofile, poft.pofile)
+        self.assertContentEqual(
+            [tm.submitter.id], get_pofiletranslators(pofile.id))
 
-    def test_scrub_pofile_leaves_good_pofiletranslator_in_place(self):
+    def test_fix_pofile_leaves_good_pofiletranslator_in_place(self):
         pofile = self.factory.makePOFile()
         tm = self.make_message_with_pofiletranslator(pofile)
         old_poft = self.query_pofiletranslator(pofile, tm.submitter).one()
 
-        scrub_pofile(
-            fake_logger, pofile.id, pofile.potemplate.id, pofile.language.id)
+        fix_pofile(
+            fake_logger, pofile, [tm.potmsgset.id], set([tm.submitter.id]))
 
         new_poft = self.query_pofiletranslator(pofile, tm.submitter).one()
         self.assertEqual(old_poft, new_poft)
 
-    def test_scrub_pofile_deletes_unwarranted_entries(self):
+    def test_fix_pofile_deletes_unwarranted_entries(self):
         # Deleting POFileTranslator records is not something the app
         # server ever does, so it requires special privileges.
         self.becomeDbUser('postgres')
         poft = self.make_pofiletranslator_without_message()
         (pofile, person) = (poft.pofile, poft.person)
-        scrub_pofile(
-            fake_logger, pofile.id, pofile.potemplate.id, pofile.language.id)
+        fix_pofile(fake_logger, pofile, [], set([person.id]))
         self.assertIsNone(self.query_pofiletranslator(pofile, person).one())
 
-    def test_scrub_pofile_adds_missing_entries(self):
+    def test_fix_pofile_adds_missing_entries(self):
         pofile = self.factory.makePOFile()
         tm = self.make_message_without_pofiletranslator(pofile)
 
-        scrub_pofile(
-            fake_logger, pofile.id, pofile.potemplate.id, pofile.language.id)
+        fix_pofile(fake_logger, pofile, [tm.potmsgset.id], set())
 
         new_poft = self.query_pofiletranslator(pofile, tm.submitter).one()
         self.assertEqual(tm.submitter, new_poft.person)
@@ -284,9 +280,9 @@
         # Try to break the loop if it failed to commit its changes.
         transaction.abort()
 
-        # The unwarranted POFileTranslator record has been deleted.
+        # An unwarranted POFileTranslator record has been deleted.
         self.assertIsNotNone(
             self.query_pofiletranslator(pofile, tm.submitter).one())
-        # The missing POFileTranslator has been created.
+        # A missing POFileTranslator has been created.
         self.assertIsNone(
             self.query_pofiletranslator(pofile, noncontributor).one())

_______________________________________________
Mailing list: https://launchpad.net/~launchpad-reviewers
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~launchpad-reviewers
More help   : https://help.launchpad.net/ListHelp

Reply via email to