#!/usr/bin/python # Various approaches to a "safe" repr that never generates too much text. # Demonstrates various approaches to laziness in a Python setting. # In order to be more comprehensible, doesn't cover every Python type, # leaving out new-style classes in particular; and it doesn't even try # to be object-oriented.
# I want to emphasize that this code is not part of any production # code, and several of the implementations use a number of techniques # that make it unnecessarily hard to understand. There may be places # where these techniques are called for, but they should be used very # sparingly! # (If you're programming in Haskell, you're probably using these # techniques without even knowing it. They cause less trouble in # Haskell due to being implicit in the language.) # I am writing this warning because in my early days as a programmer, # I was enthusiastic about learning advanced new algorithms and ways # of structuring programs, and I would often use them in contexts # where they weren't really helpful. In this case, the entire point # of this module is to demonstrate different approaches to structuring # a problem and understand their strengths and weaknesses; most # programs have some more practical purpose that includes # maintainability, which conflicts with complexity and unfamiliar # program structures. # If you need something like this module in Python in real life, you # might try the TextRepr class inside pydoc. It works reasonably well # for most things, even though it's not too hard to get it to produce # uselessly voluminous output: # >>> yy = [] # >>> for ii in range(4): yy = [yy] * 20 # ... # >>> xx = pydoc.TextRepr().repr(yy) # Or take several hours to run; just change the 4 to a larger number, # such as 7. import types class UnhandledType(Exception): pass ### Dumb simple version: generate whole thing, then truncate it. 22 lines. # Doesn't "work" in the sense of "never generating too much text", and # it's trivial to make it painfully slow; for example: # >>> x = 'x' * 10485760 # >>> for ii in range(100): x = [x] # ... # >>> safe_repr.safe_repr_1(x, 200) # "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[['xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # It takes a few seconds to generate that result, as does repr(x)[:200]. # All the other implementations (except for safe_repr_2, which doesn't handle # strings) generate it instantly. def _safe_repr_1(val): if isinstance(val, types.ListType): return '[%s]' % ', '.join(map(_safe_repr_1, val)) elif isinstance(val, types.TupleType): return '(%s)' % ', '.join(map(_safe_repr_1, val)) elif isinstance(val, types.IntType): return '%d' % val elif isinstance(val, types.DictType): return '{%s}' % ', '.join(['%s: %s' % (_safe_repr_1(k), _safe_repr_1(v)) for k, v in val.items()]) elif isinstance(val, types.InstanceType): # note that this is very different behavior from the standard # Python repr! return '%s %s' % (val.__class__.__name__, _safe_repr_1(val.__dict__)) elif isinstance(val, types.StringType): return repr(val) # I'm not going to bother writing out how that works else: raise UnhandledType(val) def safe_repr_1(val, length): "A dumb version that generates it all, then truncates. For comparison." return _safe_repr_1(val)[:length] ### Badly programmed version: never append more than you # need to. 24 lines, incomplete. def safe_repr_2(val, length): """A partial version that never appends more than it needs to. Too painful to complete.""" rv = [] if isinstance(val, types.ListType): if length > 0: rv.append('[') length -= 1 for ii in range(len(val)): item = val[ii] if length <= 0: break rv.append(safe_repr_2(item, length)) length -= len(rv[-1]) if ii < len(val) - 1: rv.append(', '[:length]) length -= len(rv[-1]) if length > 0: rv.append(']') length -= 1 elif isinstance(val, types.IntType): rv.append(('%d' % val)[:length]) else: raise UnhandledType(val) return ''.join(rv) ### Refactored version: never append more than you need to, and use an # exception to quit when you've generated enough. 54 lines. def _safe_repr_3_items(xform, items, buf): for ii in range(len(items)): xform(items[ii], buf) if ii < len(items) - 1: buf.append(', ') def _safe_repr_3_dict_item((k, v), buf): _safe_repr_3(k, buf) buf.append(': ') _safe_repr_3(v, buf) def _safe_repr_3(val, buf): if isinstance(val, types.ListType): buf.append('[') _safe_repr_3_items(_safe_repr_3, val, buf) buf.append(']') elif isinstance(val, types.TupleType): buf.append('(') _safe_repr_3_items(_safe_repr_3, val, buf) buf.append(')') elif isinstance(val, types.IntType): buf.append('%d' % val) elif isinstance(val, types.DictType): buf.append('{') _safe_repr_3_items(_safe_repr_3_dict_item, val.items(), buf) buf.append('}') elif isinstance(val, types.InstanceType): buf.append(val.__class__.__name__ + ' ') _safe_repr_3(val.__dict__, buf) elif isinstance(val, types.StringType): buf.append(repr(val)) else: raise UnhandledType(val) class BufferFull(Exception): pass class Buf: def __init__(self, maxlen): self.maxlen = maxlen self.buf = [] self.curlen = 0 def append(self, astr): astr = astr[:self.maxlen - self.curlen] self.buf.append(astr) self.curlen += len(astr) if self.curlen == self.maxlen: raise BufferFull() def contents(self): return ''.join(self.buf) def safe_repr_3(val, length): "A version that uses an exception to break out of recursion." buf = Buf(length) try: _safe_repr_3(val, buf) except BufferFull: pass return buf.contents() ### Recursive Generator Version, in which we simply stop iterating # when we have enough. 49 lines. # This one is able to handle deeply nested data structures, as long as # you don't ask it to traverse the whole thing; on my Python, that # means you're safe as long as you don't want more than 499 # characters. def _safe_repr_4_items(xform, items): for ii in range(len(items)): for item in xform(items[ii]): yield item if ii < len(items) - 1: yield (', ') def _safe_repr_4_dict_item((k, v)): for item in _safe_repr_4(k): yield item yield ': ' for item in _safe_repr_4(v): yield item def _safe_repr_4(val): if isinstance(val, types.ListType): yield '[' for item in _safe_repr_4_items(_safe_repr_4, val): yield item yield ']' elif isinstance(val, types.TupleType): yield '(' for item in _safe_repr_4_items(_safe_repr_4, val): yield item yield ')' elif isinstance(val, types.IntType): yield '%d' % val elif isinstance(val, types.DictType): yield '{' for item in _safe_repr_4_items(_safe_repr_4_dict_item, val.items()): yield item yield '}' elif isinstance(val, types.InstanceType): yield val.__class__.__name__ + ' ' for item in _safe_repr_4(val.__dict__): yield item elif isinstance(val, types.StringType): # This time I will bother to write it out. yield "'" for each_char in val: if each_char == '\\': yield '\\\\' elif each_char == "'": yield "\\'" else: yield each_char yield "'" else: raise UnhandledType(val) def safe_repr_4(val, length): "A version using recursive generators." rv = [] for chunk in _safe_repr_4(val): chunk = chunk[:length] length -= len(chunk) rv.append(chunk) if length == 0: break return ''.join(rv) ### Stream version without generators. 64 lines, plus 11 lines of streams. # Here we see how we could do the same thing more painfully in a # language with closures but without generators. Surprisingly, this # is only about 20% more code, if you discount the first four # functions whose sole reason for being is to manipulate streams. # However, it did have several bugs in it at first, so I think it's # clearly harder to understand. # At present, it dies with a stack overflow on deeply nested data # structures, even when it shouldn't have to be traversing all the # levels in order to produce the requested number of characters. In # that sense, it's "more eager" and "less lazy" than the generator # version. It's possible to make it lazier to solve that problem, but # I haven't figured out how yet. def stream_append(a, b): afirst, arest = a if afirst is None: return b return (afirst, lambda: stream_append(arest(), b)) empty_stream = (None, None) def one_item_stream(x): return (x, lambda: empty_stream) def stream_append_n(streams): streams.reverse() rv = empty_stream for stream in streams: rv = stream_append(stream, rv) return rv def _safe_repr_5_map(xform, val, ii): if ii == len(val): return empty_stream else: return (xform(val[ii]), lambda: _safe_repr_5_map(xform, val, ii+1)) def _safe_repr_5_items(xform, val, ii): if ii == len(val): return empty_stream elif ii == len(val) - 1: return xform(val[ii]) else: # we explicitly cons up the second stream here to maintain laziness # in case there are a lot of items return stream_append(xform(val[ii]), (", ", lambda: _safe_repr_5_items(xform, val, ii+1))) def _safe_repr_5_dict_item((k, v)): return stream_append_n([_safe_repr_5(k), one_item_stream(": "), _safe_repr_5(v)]) def _safe_repr_5(val): "A stream is a (first_chunk, stream) pair, or (None, dontcare) for empty." if isinstance(val, types.ListType): return stream_append_n([one_item_stream('['), _safe_repr_5_items(_safe_repr_5, val, 0), one_item_stream(']')]) elif isinstance(val, types.TupleType): return stream_append_n([one_item_stream('('), _safe_repr_5_items(_safe_repr_5, val, 0), one_item_stream(')')]) elif isinstance(val, types.IntType): return one_item_stream('%d' % val) elif isinstance(val, types.DictType): return stream_append_n([ one_item_stream('{'), # note that we may lose some laziness here building a big list _safe_repr_5_items(_safe_repr_5_dict_item, val.items(), 0), one_item_stream('}')]) elif isinstance(val, types.InstanceType): return stream_append(one_item_stream(val.__class__.__name__ + ' '), _safe_repr_5(val.__dict__)) elif isinstance(val, types.StringType): def chartrans(each_char): if each_char == '\\': return '\\\\' elif each_char == "'": return "\\'" else: return each_char return stream_append_n([one_item_stream("'"), _safe_repr_5_map(chartrans, val, 0), one_item_stream("'")]) return one_item_stream(repr(val)) else: raise UnhandledType(val) def safe_repr_5(val, length): "A version using lazy streams." rv = [] stream = _safe_repr_5(val) while 1: chunk, tail = stream if chunk is None: break chunk = chunk[:length] length -= len(chunk) rv.append(chunk) if length == 0: break stream = tail() return ''.join(rv) ### Explicit stack version, eschewing recursion and even functions. 61 lines. # Unsurprisingly, this is nearly the longest version of all, although # it had surprisingly few bugs. # In a sense, it's "safer" than the other implementations because it # can handle data structures that are nested thousands of levels deep # without trouble. def safe_repr_6(val, length): "Explicit stack version, eschewing recursion and functions entirely." stack = [('val', val)] rv = [] while stack: tag, val = stack.pop() to_add = None if tag == 'val': if isinstance(val, types.ListType): to_add = '[' stack.append(('literal', ']')) stack.append(('list_contents', (val, 0))) elif isinstance(val, types.TupleType): to_add = '(' stack.append(('literal', ')')) stack.append(('list_contents', (val, 0))) elif isinstance(val, types.IntType): to_add = '%d' % val elif isinstance(val, types.DictType): to_add = '{' stack.append(('literal', '}')) stack.append(('dict_contents', (val.items(), 0))) elif isinstance(val, types.InstanceType): to_add = val.__class__.__name__ + ' ' stack.append(('val', val.__dict__)) elif isinstance(val, types.StringType): to_add = "'" stack.append(('literal', "'")) stack.append(('string', (val, 0))) else: raise UnhandledType(val) elif tag == 'literal': to_add = val elif tag == 'list_contents' or tag == 'dict_contents': val, ii = val if len(val) > 1 + ii: stack.append((tag, (val, ii + 1))) stack.append(('literal', ', ')) if len(val) > ii: item = val[ii] if tag == 'list_contents': stack.append(('val', item)) else: # tag == 'dict_contents' k, v = item stack.append(('val', v)) stack.append(('literal', ': ')) stack.append(('val', k)) elif tag == 'string': val, ii = val if len(val) > ii: stack.append((tag, (val, ii + 1))) each_char = val[ii] if each_char == '\\': to_add = '\\\\' elif each_char == "'": to_add = "\\'" else: to_add = each_char if to_add is not None: to_add = to_add[:length] length -= len(to_add) rv.append(to_add) if length == 0: break return ''.join(rv) ### Unit tests, to verify that these functions all work. def test_fun_on(fun, val): for length in range(40): rstr = fun(val, length) assert len(rstr) <= length, (rstr, val, length, len(rstr)) assert rstr == safe_repr_1(val, length), (rstr, safe_repr_1(val, length)) def test_fun(fun): test_fun_on(fun, range(11)) y = [] for ii in range(10): y = [y] test_fun_on(fun, y) def test_fun_harder(fun): class Foo: pass x = Foo() x.blarg = 37 test_fun_on(fun, (x, {}, [(131, 'foo')])) def test_all(): test_fun(safe_repr_1) test_fun_harder(safe_repr_1) test_fun(safe_repr_2) test_fun(safe_repr_3) test_fun_harder(safe_repr_3) test_fun(safe_repr_4) test_fun_harder(safe_repr_4) test_fun(safe_repr_5) test_fun_harder(safe_repr_5) test_fun(safe_repr_6) test_fun_harder(safe_repr_6) test_all()