six ways to do repr() with limited resource usage

Kragen Javier Sitaker Mon, 05 Feb 2007 12:17:18 -0800

#!/usr/bin/python
# Various approaches to a "safe" repr that never generates too much text.
# Demonstrates various approaches to laziness in a Python setting.
# In order to be more comprehensible, doesn't cover every Python type,
# leaving out new-style classes in particular; and it doesn't even try
# to be object-oriented.


# I want to emphasize that this code is not part of any production
# code, and several of the implementations use a number of techniques
# that make it unnecessarily hard to understand.  There may be places
# where these techniques are called for, but they should be used very
# sparingly!

# (If you're programming in Haskell, you're probably using these
# techniques without even knowing it.  They cause less trouble in
# Haskell due to being implicit in the language.)

# I am writing this warning because in my early days as a programmer,
# I was enthusiastic about learning advanced new algorithms and ways
# of structuring programs, and I would often use them in contexts
# where they weren't really helpful.  In this case, the entire point
# of this module is to demonstrate different approaches to structuring
# a problem and understand their strengths and weaknesses; most
# programs have some more practical purpose that includes
# maintainability, which conflicts with complexity and unfamiliar
# program structures.

# If you need something like this module in Python in real life, you
# might try the TextRepr class inside pydoc.  It works reasonably well
# for most things, even though it's not too hard to get it to produce
# uselessly voluminous output:
# >>> yy = []
# >>> for ii in range(4): yy = [yy] * 20
# ... 
# >>> xx = pydoc.TextRepr().repr(yy)
# Or take several hours to run; just change the 4 to a larger number,
# such as 7.

import types

class UnhandledType(Exception): pass

### Dumb simple version: generate whole thing, then truncate it.  22 lines.
# Doesn't "work" in the sense of "never generating too much text", and
# it's trivial to make it painfully slow; for example:
# >>> x = 'x' * 10485760
# >>> for ii in range(100): x = [x]
# ... 
# >>> safe_repr.safe_repr_1(x, 200)
# 
"[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[['xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# It takes a few seconds to generate that result, as does repr(x)[:200].
# All the other implementations (except for safe_repr_2, which doesn't handle
# strings) generate it instantly.

def _safe_repr_1(val):
    if isinstance(val, types.ListType):
        return '[%s]' % ', '.join(map(_safe_repr_1, val))
    elif isinstance(val, types.TupleType):
        return '(%s)' % ', '.join(map(_safe_repr_1, val))
    elif isinstance(val, types.IntType):
        return '%d' % val
    elif isinstance(val, types.DictType):
        return '{%s}' % ', '.join(['%s: %s' % (_safe_repr_1(k), _safe_repr_1(v))
                                   for k, v in val.items()])
    elif isinstance(val, types.InstanceType):
        # note that this is very different behavior from the standard
        # Python repr!
        return '%s %s' % (val.__class__.__name__, _safe_repr_1(val.__dict__))
    elif isinstance(val, types.StringType):
        return repr(val)  # I'm not going to bother writing out how that works
    else: 
        raise UnhandledType(val)

def safe_repr_1(val, length):
    "A dumb version that generates it all, then truncates.  For comparison."
    return _safe_repr_1(val)[:length]

### Badly programmed version: never append more than you
# need to.  24 lines, incomplete.

def safe_repr_2(val, length):
    """A partial version that never appends more than it needs to.  
    Too painful to complete."""
    rv = []
    if isinstance(val, types.ListType):
        if length > 0:
            rv.append('[')
            length -= 1
        for ii in range(len(val)):
            item = val[ii]
            if length <= 0: break
            rv.append(safe_repr_2(item, length))
            length -= len(rv[-1])
            if ii < len(val) - 1:
                rv.append(', '[:length])
                length -= len(rv[-1])
        if length > 0:
            rv.append(']')
            length -= 1
    elif isinstance(val, types.IntType):
        rv.append(('%d' % val)[:length])
    else:
        raise UnhandledType(val)
    return ''.join(rv)

### Refactored version: never append more than you need to, and use an
# exception to quit when you've generated enough.  54 lines.

def _safe_repr_3_items(xform, items, buf):
    for ii in range(len(items)):
        xform(items[ii], buf)
        if ii < len(items) - 1: buf.append(', ')

def _safe_repr_3_dict_item((k, v), buf):
    _safe_repr_3(k, buf)
    buf.append(': ')
    _safe_repr_3(v, buf)

def _safe_repr_3(val, buf):
    if isinstance(val, types.ListType):
        buf.append('[')
        _safe_repr_3_items(_safe_repr_3, val, buf)
        buf.append(']')
    elif isinstance(val, types.TupleType):
        buf.append('(')
        _safe_repr_3_items(_safe_repr_3, val, buf)
        buf.append(')')
    elif isinstance(val, types.IntType):
        buf.append('%d' % val)
    elif isinstance(val, types.DictType):
        buf.append('{')
        _safe_repr_3_items(_safe_repr_3_dict_item, val.items(), buf)
        buf.append('}')
    elif isinstance(val, types.InstanceType):
        buf.append(val.__class__.__name__ + ' ')
        _safe_repr_3(val.__dict__, buf)
    elif isinstance(val, types.StringType):
        buf.append(repr(val))
    else:
        raise UnhandledType(val)

class BufferFull(Exception): pass
class Buf:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.buf = []
        self.curlen = 0
    def append(self, astr):
        astr = astr[:self.maxlen - self.curlen]
        self.buf.append(astr)
        self.curlen += len(astr)
        if self.curlen == self.maxlen:
            raise BufferFull()
    def contents(self):
        return ''.join(self.buf)

def safe_repr_3(val, length):
    "A version that uses an exception to break out of recursion."
    buf = Buf(length)
    try: _safe_repr_3(val, buf)
    except BufferFull: pass
    return buf.contents()

### Recursive Generator Version, in which we simply stop iterating
# when we have enough.  49 lines.

# This one is able to handle deeply nested data structures, as long as
# you don't ask it to traverse the whole thing; on my Python, that
# means you're safe as long as you don't want more than 499
# characters.

def _safe_repr_4_items(xform, items):
    for ii in range(len(items)):
        for item in xform(items[ii]): yield item
        if ii < len(items) - 1: yield (', ')

def _safe_repr_4_dict_item((k, v)):
    for item in _safe_repr_4(k): yield item
    yield ': '
    for item in _safe_repr_4(v): yield item

def _safe_repr_4(val):
    if isinstance(val, types.ListType):
        yield '['
        for item in _safe_repr_4_items(_safe_repr_4, val): yield item
        yield ']'
    elif isinstance(val, types.TupleType):
        yield '('
        for item in _safe_repr_4_items(_safe_repr_4, val): yield item
        yield ')'
    elif isinstance(val, types.IntType):
        yield '%d' % val
    elif isinstance(val, types.DictType):
        yield '{'
        for item in _safe_repr_4_items(_safe_repr_4_dict_item, val.items()):
            yield item
        yield '}'
    elif isinstance(val, types.InstanceType):
        yield val.__class__.__name__ + ' '
        for item in _safe_repr_4(val.__dict__): yield item
    elif isinstance(val, types.StringType):
        # This time I will bother to write it out.
        yield "'"
        for each_char in val:
            if each_char == '\\': yield '\\\\'
            elif each_char == "'": yield "\\'"
            else: yield each_char
        yield "'"
    else:
        raise UnhandledType(val)

def safe_repr_4(val, length):
    "A version using recursive generators."
    rv = []
    for chunk in _safe_repr_4(val):
        chunk = chunk[:length]
        length -= len(chunk)
        rv.append(chunk)
        if length == 0: break
    return ''.join(rv)

### Stream version without generators.  64 lines, plus 11 lines of streams.
# Here we see how we could do the same thing more painfully in a
# language with closures but without generators.  Surprisingly, this
# is only about 20% more code, if you discount the first four
# functions whose sole reason for being is to manipulate streams.
# However, it did have several bugs in it at first, so I think it's
# clearly harder to understand.

# At present, it dies with a stack overflow on deeply nested data
# structures, even when it shouldn't have to be traversing all the
# levels in order to produce the requested number of characters.  In
# that sense, it's "more eager" and "less lazy" than the generator
# version.  It's possible to make it lazier to solve that problem, but
# I haven't figured out how yet.

def stream_append(a, b):
    afirst, arest = a
    if afirst is None: return b
    return (afirst, lambda: stream_append(arest(), b))
empty_stream = (None, None)
def one_item_stream(x): return (x, lambda: empty_stream)
def stream_append_n(streams):
    streams.reverse()
    rv = empty_stream
    for stream in streams: rv = stream_append(stream, rv)
    return rv

def _safe_repr_5_map(xform, val, ii):
    if ii == len(val): return empty_stream
    else:
        return (xform(val[ii]), lambda: _safe_repr_5_map(xform, val, ii+1))

def _safe_repr_5_items(xform, val, ii):
    if ii == len(val): return empty_stream
    elif ii == len(val) - 1: return xform(val[ii])
    else:
        # we explicitly cons up the second stream here to maintain laziness
        # in case there are a lot of items
        return stream_append(xform(val[ii]),
            (", ", lambda: _safe_repr_5_items(xform, val, ii+1)))

def _safe_repr_5_dict_item((k, v)):
    return stream_append_n([_safe_repr_5(k), one_item_stream(": "),
                            _safe_repr_5(v)])

def _safe_repr_5(val):
    "A stream is a (first_chunk, stream) pair, or (None, dontcare) for empty."
    if isinstance(val, types.ListType):
        return stream_append_n([one_item_stream('['),
                                _safe_repr_5_items(_safe_repr_5, val, 0),
                                one_item_stream(']')])
    elif isinstance(val, types.TupleType):
        return stream_append_n([one_item_stream('('),
                                _safe_repr_5_items(_safe_repr_5, val, 0),
                                one_item_stream(')')])
    elif isinstance(val, types.IntType):
        return one_item_stream('%d' % val)
    elif isinstance(val, types.DictType):
        return stream_append_n([
                one_item_stream('{'),
                # note that we may lose some laziness here building a big list
                _safe_repr_5_items(_safe_repr_5_dict_item, val.items(), 0),
                one_item_stream('}')])
    elif isinstance(val, types.InstanceType):
        return stream_append(one_item_stream(val.__class__.__name__ + ' '),
                             _safe_repr_5(val.__dict__))
    elif isinstance(val, types.StringType):
        def chartrans(each_char):
            if each_char == '\\': return '\\\\'
            elif each_char == "'": return "\\'"
            else: return each_char
        return stream_append_n([one_item_stream("'"),
            _safe_repr_5_map(chartrans, val, 0),
            one_item_stream("'")])                              
        return one_item_stream(repr(val))
    else:
        raise UnhandledType(val)

def safe_repr_5(val, length):
    "A version using lazy streams."
    rv = []
    stream = _safe_repr_5(val)
    while 1:
        chunk, tail = stream
        if chunk is None: break
        chunk = chunk[:length]
        length -= len(chunk)
        rv.append(chunk)
        if length == 0: break
        stream = tail()
    return ''.join(rv)

### Explicit stack version, eschewing recursion and even functions.  61 lines.
# Unsurprisingly, this is nearly the longest version of all, although
# it had surprisingly few bugs.
# In a sense, it's "safer" than the other implementations because it 
# can handle data structures that are nested thousands of levels deep
# without trouble.

def safe_repr_6(val, length):
    "Explicit stack version, eschewing recursion and functions entirely."
    stack = [('val', val)]
    rv = []
    while stack:
        tag, val = stack.pop()
        to_add = None
        if tag == 'val':
            if isinstance(val, types.ListType):
                to_add = '['
                stack.append(('literal', ']'))
                stack.append(('list_contents', (val, 0)))
            elif isinstance(val, types.TupleType):
                to_add = '('
                stack.append(('literal', ')'))
                stack.append(('list_contents', (val, 0)))
            elif isinstance(val, types.IntType):
                to_add = '%d' % val
            elif isinstance(val, types.DictType):
                to_add = '{'
                stack.append(('literal', '}'))
                stack.append(('dict_contents', (val.items(), 0)))
            elif isinstance(val, types.InstanceType):
                to_add = val.__class__.__name__ + ' '
                stack.append(('val', val.__dict__))
            elif isinstance(val, types.StringType):
                to_add = "'"
                stack.append(('literal', "'"))
                stack.append(('string', (val, 0)))
            else: 
                raise UnhandledType(val)
        elif tag == 'literal':
            to_add = val
        elif tag == 'list_contents' or tag == 'dict_contents':
            val, ii = val
            if len(val) > 1 + ii:
                stack.append((tag, (val, ii + 1)))
                stack.append(('literal', ', '))
            if len(val) > ii:
                item = val[ii]
                if tag == 'list_contents':
                    stack.append(('val', item))
                else:  # tag == 'dict_contents'
                    k, v = item
                    stack.append(('val', v))
                    stack.append(('literal', ': '))
                    stack.append(('val', k))
        elif tag == 'string':
            val, ii = val
            if len(val) > ii:
                stack.append((tag, (val, ii + 1)))
                each_char = val[ii]
                if each_char == '\\': to_add = '\\\\'
                elif each_char == "'": to_add = "\\'"
                else: to_add = each_char
        if to_add is not None:
            to_add = to_add[:length]
            length -= len(to_add)
            rv.append(to_add)
            if length == 0: break
    return ''.join(rv)

### Unit tests, to verify that these functions all work.

def test_fun_on(fun, val):
    for length in range(40):
        rstr = fun(val, length)
        assert len(rstr) <= length, (rstr, val, length, len(rstr))
        assert rstr == safe_repr_1(val, length), (rstr, 
                                                  safe_repr_1(val, length))

def test_fun(fun):
    test_fun_on(fun, range(11))
    y = []
    for ii in range(10): y = [y]
    test_fun_on(fun, y)

def test_fun_harder(fun):
    class Foo: pass
    x = Foo()
    x.blarg = 37
    test_fun_on(fun, (x, {}, [(131, 'foo')]))

def test_all():
    test_fun(safe_repr_1)
    test_fun_harder(safe_repr_1)
    test_fun(safe_repr_2)

    test_fun(safe_repr_3)
    test_fun_harder(safe_repr_3)
    test_fun(safe_repr_4)
    test_fun_harder(safe_repr_4)
    test_fun(safe_repr_5)
    test_fun_harder(safe_repr_5)
    test_fun(safe_repr_6)
    test_fun_harder(safe_repr_6)

test_all()

six ways to do repr() with limited resource usage

Reply via email to