Mark Shannon, 23.12.2011 12:21:
Martin v. Löwis wrote:
- it would be useful to have a specialized representation for
all-keys-are-strings. In that case, me_hash could be dropped
from the representation. You would get savings compared to
the status quo even in the non-shared case.
It might be tricky switching key tables, and I don't think it would save much
memory as keys that are widely shared take up very little memory anyway,
and not many other dicts are long-lived.

Why do you say that? In a plain 3.3 interpreter, I counted 595 dict
objects (see script below). Of these, 563 (so nearly all of them) had
only strings as keys. Among those, I found 286 different key sets,
where 231 key sets occurred only once (i.e. wouldn't be shared).

Together, the string dictionaries had 13282 keys, and you could save
as many pointers (actually more, because there will be more key slots
than keys).

The question is how much memory needs to be saved to be worth adding the
complexity: 10 kB, no; 100 MB, yes.
So data from "real" benchmarks would be useful.

Consider taking a parsed MiniDOM tree as a benchmark. It contains so many instances of just a couple of different classes that it just has to make a huge difference if each of those instances is even just a bit smaller. It should also make a clear difference for plain Python ElementTree.

I attached a benchmark script that measures the parsing speed as well as the total memory usage of the in-memory tree. You can get data files from the following places, just download them and pass their file names on the command line:

http://gnosis.cx/download/hamlet.xml

http://www.ibiblio.org/xml/examples/religion/ot/ot.xml

Here are some results from my own machine for comparison:

http://blog.behnel.de/index.php?p=197

Stefan
# $Id: benchmark.py 3248 2007-09-02 15:01:26Z fredrik $
# simple elementtree benchmark program

from xml.etree import ElementTree
try:
    from xml.etree import cElementTree
except ImportError:
    cElementTree = None
try:
    from lxml import etree
except ImportError:
    etree = None
try:
    from elementtree import XMLTreeBuilder # xmllib
except ImportError:
    XMLTreeBuilder = None
try:
    from elementtree import SimpleXMLTreeBuilder # xmllib
except ImportError:
    SimpleXMLTreeBuilder = None
try:
    from elementtree import SgmlopXMLTreeBuilder # sgmlop
except ImportError:
    SgmlopXMLTreeBuilder = None
try:
    from xml.dom import minidom # pyexpat+minidom
except ImportError:
    minidom = None

try:
    import resource
except ImportError:
    resource = None

import os, sys
import traceback
from time import time

# When true, every function decorated with @fork runs in its own forked
# child process; the script's --no-fork option sets this to False.
FORK = True


def fork(func):
    """Decorate *func* so that each call runs in a forked child process.

    If ``os.fork`` does not exist on this platform, *func* is returned
    unchanged.  Otherwise the returned wrapper behaves as follows:

    * when the module-level ``FORK`` flag is false, it calls *func*
      directly and returns its result;
    * when ``FORK`` is true, it forks, runs *func* in the child, waits for
      the child in the parent and returns ``None`` (the child's return
      value is not passed back).

    Exceptions raised by *func* in the child are printed with
    ``traceback.print_exc()``; the child then exits with status 1
    (status 0 on success).
    """
    if not hasattr(os, 'fork'):
        return func

    from functools import wraps

    @wraps(func)
    def wrap(*args, **kwargs):
        if not FORK:
            return func(*args, **kwargs)
        # Flush pending output before forking: otherwise the child inherits
        # a copy of the unflushed buffers and could emit that text again.
        sys.stdout.flush()
        sys.stderr.flush()
        cid = os.fork()
        if cid:
            os.waitpid(cid, 0)
            return None
        status = 1
        try:
            func(*args, **kwargs)
            status = 0
        except Exception:
            traceback.print_exc()
        finally:
            # os._exit() skips the interpreter's normal shutdown, so buffered
            # output must be flushed by hand or it is lost whenever stdout
            # is a file or a pipe.
            try:
                sys.stdout.flush()
                sys.stderr.flush()
            finally:
                os._exit(status)
    return wrap

def measure_mem(old=0):
    """Print and return the ``ru_maxrss`` value of the current process.

    The value comes from ``resource.getrusage(resource.RUSAGE_SELF)``.
    When *old* is greater than zero, the difference to *old* is appended
    to the printed line as ``(+N)``.

    Returns ``None`` without printing anything if the ``resource`` module
    could not be imported.
    """
    if resource is None:
        return
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if old > 0:
        delta = ' (+%s)' % (peak - old)
    else:
        delta = ''
    print('Memory usage: %s%s' % (peak, delta))
    return peak

@fork
def benchmark(file, builder_module):
    """Time an incremental, feed-based parse of *file* and print the result.

    *builder_module* must provide an ``XMLParser`` class or, failing that,
    a ``TreeBuilder`` class.  The file is read in 32768-byte chunks that
    are passed to ``parser.feed()``; ``parser.close()`` yields the tree.
    The printed line names the module and class used, the number of nodes
    in the tree and the elapsed time; ``measure_mem`` is called before and
    after to report memory usage.
    """
    oldmem = measure_mem()
    with open(file, "rb") as source:
        t = time()
        try:
            builder = builder_module.XMLParser
        except AttributeError:
            builder = builder_module.TreeBuilder
        parser = builder()
        while True:
            data = source.read(32768)
            if not data:
                break
            parser.feed(data)
        tree = parser.close()
        t = time() - t
    # Element.getiterator() was removed from xml.etree in Python 3.9, so
    # prefer iter() and fall back only for old trees that lack it.
    walk = getattr(tree, 'iter', None)
    if walk is None:
        walk = tree.getiterator
    # Count lazily: building a list of every node would add to the process
    # memory right before it is reported below.
    node_count = sum(1 for _ in walk())
    print("%s.%s.feed(): %d nodes read in %.3f seconds" % (
        builder_module.__name__, builder.__name__,
        node_count, t
        ))
    measure_mem(oldmem)
    # Release the tree only after the memory report, so it is still alive
    # when the usage is measured.
    del tree

@fork
def benchmark_parse(file, driver):
    """Time one ``driver.parse(file)`` call and print the elapsed seconds.

    *driver* needs a ``parse`` callable and a ``__name__`` attribute, which
    prefixes the printed line.  ``measure_mem`` is called before and after
    the parse to report memory usage.
    """
    baseline = measure_mem()
    started = time()
    tree = driver.parse(file)
    elapsed = time() - started
    timing = ".parse done in %.3f seconds" % elapsed
    print(driver.__name__ + timing)
    measure_mem(baseline)
    # Keep the tree referenced until the memory report has been printed.
    del tree

@fork
def benchmark_minidom(file):
    """Time ``minidom.parse(file)`` and print the elapsed seconds.

    ``measure_mem`` is called before and after the parse to report memory
    usage.
    """
    baseline = measure_mem()
    started = time()
    dom = minidom.parse(file)
    elapsed = time() - started
    print("minidom tree read in %.3f seconds" % elapsed)
    measure_mem(baseline)
    # Keep the document referenced until the memory report has been printed.
    del dom

class configure_parser(object):
    """Pair an etree-style module with one pre-configured ``XMLParser``.

    Instances expose ``__name__`` and ``parse(input)``, the two attributes
    that ``benchmark_parse`` uses on its *driver* argument.
    """

    def __init__(self, etree, name, **config):
        """Build ``etree.XMLParser(**config)`` and remember it under *name*."""
        parser = etree.XMLParser(**config)
        self.__name__ = name
        self.etree = etree
        self.parser = parser

    def parse(self, input):
        """Parse *input* with the stored parser and return the result."""
        tree = self.etree.parse(input, self.parser)
        return tree

def run_benchmark(file):
    """Run every available parser benchmark once against *file*.

    ``ElementTree`` is always benchmarked.  The optional modules
    (cElementTree, lxml.etree, the ``elementtree`` builders and minidom)
    are benchmarked only if their import at module level succeeded; for
    lxml.etree and minidom a notice is printed when they are missing.
    The ``elementtree`` builders are skipped when ``sys.platform`` is
    ``"cli"``.
    """
    benchmark_parse(file, ElementTree)

    if cElementTree is not None:
        benchmark_parse(file, cElementTree)
        benchmark(file, cElementTree)

    if etree is None:
        print("=== lxml.etree not available")
    else:
        benchmark_parse(file, etree)
        # Second lxml run with a parser that drops blank text and comments.
        stripped = configure_parser(
            etree, 'drop_whitespace',
            remove_blank_text=True, remove_comments=True)
        benchmark_parse(file, stripped)
        benchmark(file, etree)

    if sys.platform != "cli":
        # XMLTreeBuilder and SimpleXMLTreeBuilder (the latter uses xmllib)
        for legacy_builder in (XMLTreeBuilder, SimpleXMLTreeBuilder):
            if legacy_builder:
                benchmark(file, legacy_builder)
        try:
            if SgmlopXMLTreeBuilder:
                benchmark(file, SgmlopXMLTreeBuilder)  # use sgmlop
        except RuntimeError as err:
            print("=== SgmlopXMLTreeBuilder not available (%s)" % err)

    if not minidom:
        print("=== minidom not available")
    else:
        benchmark_minidom(file)

def parse_opts():
    """Parse the command line and return optparse's ``(options, args)`` pair.

    Options:
      -r / --repeat   stored as ``options.repeat`` (a string, default "1")
      -n / --no-fork  stores False in ``options.fork`` (default True)
    """
    import optparse

    cli = optparse.OptionParser()
    repeat_help = "number of times to repeat the benchmarks (default: 1)"
    cli.add_option('-r', '--repeat', dest='repeat', default="1",
                   help=repeat_help)
    fork_help = "disable forking for each test run"
    cli.add_option('-n', '--no-fork', dest='fork', action='store_false',
                   default=True, help=fork_help)
    parsed = cli.parse_args()
    return parsed

# Script entry point: benchmark every file named on the command line.
if __name__ == '__main__':
    options, args = parse_opts()
    if not args:
        # No file given: fall back to hamlet.xml in the current directory.
        args = ['hamlet.xml']
    # --repeat arrives as a string from optparse; convert it once here.
    repeat = int(options.repeat)
    # Rebind the module-level switch that the @fork wrapper consults on
    # every call.
    FORK = options.fork

    for filename in args:
        # gobble gobble
        # Read the whole file three times before benchmarking it.
        # NOTE(review): presumably this warms the OS file cache so the
        # timed runs are not dominated by disk reads -- confirm.
        for i in range(3):
            # Drop the previous buffer before reading the file again.
            text = None
            with open(filename, 'rb') as f:
                text = f.read()
        # Run the full benchmark suite `repeat` times for this file.
        for i in range(repeat):
            run_benchmark(filename)
_______________________________________________
Python-Dev mailing list
Python-Dev@python.org
http://mail.python.org/mailman/listinfo/python-dev
Unsubscribe: 
http://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com

Reply via email to