This has been a bugaboo of mine for a long time.  Merging sorted
sequences was the fundamental data processing framework of the COBOL
era, and again of the MapReduce era.  But I always found myself writing
clumsy, special-case-filled merge code for it.

Python has a very nice generic lazy iterator protocol that abstracts
over, among other things, lines of files, and a very nice "generator"
feature that allows you to conveniently construct branching pipelines of
iterators built on top of each other.  This, plus some cleverness on the
part of Emile van Sebille, finally gives me a generic merge that I'm
reasonably happy with.

As with everything posted to kragen-hacks without a notice to the
contrary, I dedicate this code to the public domain, abandoning my
copyright interest in it.

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Generic code for merging sorted sequences.

This is useful in a number of circumstances.

The underlying approach is based on the following elegant code by
Emile van Sebille:

    a = [1, 1, 2, 4, 6, 8, 9]
    b = [1, 3, 4, 5, 6, 7, 8]

    def popwhile(l,t):
       r = []
       while l[:1] == t:
         r.append(l.pop(0))
       return r

    while bool(a or b):
       m = min(a[:1],b[:1]) or max(a[:1],b[:1])
       print popwhile(a,m), popwhile(b,m)

"""

def mergeall(aa, bb, key=lambda x: x):
    "Generate in sorted order all items in sorted iterables aa and bb."
    for aas, bbs in merge(aa, bb, key=key):
        for item in aas:
            yield item
        for item in bbs:
            yield item

def intersect(aa, bb, key=lambda x: x):
    "Generate multiset intersection of sorted iterables aa and bb."
    for aas, bbs in merge(aa, bb, key=key):
        for item in min(aas, bbs):
            yield item

def union(aa, bb, key=lambda x: x):
    "Generate multiset union of sorted iterables aa and bb."
    for aas, bbs in merge(aa, bb, key=key):
        for item in max(aas, bbs):
            yield item

def set_difference(aa, bb, key=lambda x: x):
    "Generate multiset set difference of sorted iterables aa and bb."
    for aas, bbs in merge(aa, bb, key=key):
        if len(aas) > len(bbs):
            for item in aas[:len(aas)-len(bbs)]:
                yield item

def merge(aa, bb, key=lambda x: x):
    """General-purpose lazy merge of sorted iterables aa and bb.

    Note that this is not fully lazy: it’s eager with respect to equal
    items in either sequence.

    Generates tuples of (items_from_a, items_from_b), one per unique
    key (as determined by the key function) found in either iterable.

    """
    for ax, bx in fully_lazy_merge(aa, bb, key=key):
        yield (list(ax), list(bx))

def fully_lazy_merge(aa, bb, key=lambda x: x):
    """Fully lazy version of merge, yielding tuples of generators, not lists.

    Note that this can be dangerous!  It’s crucially important for
    correctness that each of these generators be fully consumed before
    resuming fully_lazy_merge.

    """
    ai, bi = Iterator(aa), Iterator(bb)
    while ai.peek() or bi.peek():
        an, bn = map(key, ai.peek()), map(key, bi.peek())
        m = min(an, bn) or max(an, bn)  # min returns [] if either is []
        yield (ai.popwhile(m, key), bi.popwhile(m, key))

class Iterator:
    "Implements a more suitable Iterator interface on top of Python’s."
    def __init__(self, ii):
        self._ii = iter(ii)
        self._hasNext = True
        self._goose()

    def peek(self):
        "Return a list containing the next item, or an empty list if none."
        return [self._next] if self._hasNext else []

    def _goose(self):
        try:
            self._next = self._ii.next()
        except StopIteration:
            self._hasNext = False
            del self._next
            
    def popwhile(self, needlekey, key):
        """Lazily iterate over the next N items matching needlekey."""
        while map(key, self.peek()) == needlekey:
            yield self._next
            self._goose()

def _test():
    a = [1, 1, 2, 4, 6, 6, 8, 9]
    b = [1, 3, 4, 5, 6, 6, 6, 7, 8]
    _ok(list(merge(a, b)),
       [ ([1, 1], [1]),
         ([2], []),
         ([], [3]),
         ([4], [4]),
         ([], [5]),
         ([6, 6], [6, 6, 6]),
         ([], [7]),
         ([8], [8]),
         ([9], []),
         ])
    _ok(list(mergeall(a, b)),
        [1, 1, 1, 2, 3, 4, 4, 5, 6, 6, 6, 6, 6, 7, 8, 8, 9])
    _ok(list(intersect(a, b)), [1, 4, 6, 6, 8])
    _ok(list(union(a, b)), [1, 1, 2, 3, 4, 5, 6, 6, 6, 7, 8, 9])
    _ok(list(set_difference(a, b)), [1, 2, 9])

    # Example of a more complicated use.  XXX this suggests that there
    # should be different key functions for the two sequences, since
    # in this sort of use, you might want to merge sequences whose
    # items are of different types!
    def update_balances(balances, transaction_log):
        for balance, transactions in fully_lazy_merge(balances,
                                                      transaction_log,
                                                      lambda (kk, vv): kk):
            ((kk, vv),) = balance      # enforces unique account names
            for _, dv in transactions:
                vv += dv
            yield kk, vv

    _ok(list(update_balances([('a', 37), ('b', 137), ('c', 237)],
                             [('b', -10), ('c', -20), ('c', 3)])),
        [('a', 37), ('b', 127), ('c', 220)])

def _ok(a, b): assert a == b, (a, b)

_test()
-- 
To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks

Reply via email to