Mark Shannon, 23.12.2011 12:21:
Martin v. Löwis wrote:
- it would be useful to have a specialized representation for
all-keys-are-strings. In that case, me_hash could be dropped
from the representation. You would get savings compared to
the status quo even in the non-shared case.
It might be tricky switching key tables and I don't think it would save much
memory as keys that are widely shared take up very little memory anyway,
and not many other dicts are long-lived.
Why do you say that? In a plain 3.3 interpreter, I counted 595 dict
objects (see script below). Of these, 563 (so nearly all of them) had
only strings as keys. Among those, I found 286 different key sets,
where 231 key sets occurred only once (i.e. wouldn't be shared).
Together, the string dictionaries had 13282 keys, and you could save
as many pointers (actually more, because there will be more key slots
than keys).
The question is how much memory needs to be saved to be worth adding the
complexity: 10 KB, no; 100 MB, yes.
So data from "real" benchmarks would be useful.
Consider taking a parsed MiniDOM tree as a benchmark. It contains so many
instances of just a couple of different classes that it just has to make a
huge difference if each of those instances is even just a bit smaller. It
should also make a clear difference for plain Python ElementTree.
I attached a benchmark script that measures the parsing speed as well as
the total memory usage of the in-memory tree. You can get data files from
the following places, just download them and pass their file names on the
command line:
http://gnosis.cx/download/hamlet.xml
http://www.ibiblio.org/xml/examples/religion/ot/ot.xml
Here are some results from my own machine for comparison:
http://blog.behnel.de/index.php?p=197
Stefan
# $Id: benchmark.py 3248 2007-09-02 15:01:26Z fredrik $
# simple elementtree benchmark program
from xml.etree import ElementTree
try:
from xml.etree import cElementTree
except ImportError:
cElementTree = None
try:
from lxml import etree
except ImportError:
etree = None
try:
from elementtree import XMLTreeBuilder # xmllib
except ImportError:
XMLTreeBuilder = None
try:
from elementtree import SimpleXMLTreeBuilder # xmllib
except ImportError:
SimpleXMLTreeBuilder = None
try:
from elementtree import SgmlopXMLTreeBuilder # sgmlop
except ImportError:
SgmlopXMLTreeBuilder = None
try:
from xml.dom import minidom # pyexpat+minidom
except ImportError:
minidom = None
try:
import resource
except ImportError:
resource = None
import os, sys
import traceback
from time import time
# Module-wide switch: while true (the default), every function decorated with
# ``fork`` runs in its own child process.  The script's entry point replaces
# it with the value of the --no-fork command-line option.
FORK = True


def fork(func):
    """Decorate *func* so that each call runs in a forked child process.

    The parent waits for the child to finish and the wrapper then returns
    None; the return value of *func* is only passed through when forking is
    disabled via the module-level ``FORK`` flag.  Exceptions raised by
    *func* inside the child are printed with ``traceback.print_exc()`` and
    the child always exits with status 0.

    On platforms where ``os.fork`` does not exist, *func* is returned
    undecorated.
    """
    if not hasattr(os, 'fork'):
        return func

    from functools import wraps

    @wraps(func)
    def wrap(*args, **kwargs):
        if not FORK:
            return func(*args, **kwargs)
        # Empty the stdio buffers before forking so that pending parent
        # output is not duplicated into the child's copy of the buffers.
        sys.stdout.flush()
        sys.stderr.flush()
        cid = os.fork()
        if cid:
            os.waitpid(cid, 0)
        else:
            try:
                func(*args, **kwargs)
            except Exception:
                traceback.print_exc()
            finally:
                # os._exit() skips the normal interpreter shutdown, so
                # buffered output must be flushed by hand or it is lost
                # whenever stdout is a pipe or a file.  The nested finally
                # guarantees the child terminates even if flushing fails.
                try:
                    sys.stdout.flush()
                    sys.stderr.flush()
                finally:
                    os._exit(0)
    return wrap
def measure_mem(old=0):
    """Print and return the ``ru_maxrss`` value of the current process.

    The figure comes from ``resource.getrusage(resource.RUSAGE_SELF)`` and
    is reported as-is (its unit is platform-defined).  When *old* is
    greater than zero, the difference to *old* is appended to the printed
    line as `` (+N)``.

    Returns None, without printing anything, when the ``resource`` module
    could not be imported.
    """
    if resource is not None:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if old > 0:
            growth = ' (+%s)' % (peak - old)
        else:
            growth = ''
        print('Memory usage: %s%s' % (peak, growth))
        return peak
    return None
@fork
def benchmark(file, builder_module):
    """Time incremental (feed-based) parsing of *file* and print the result.

    *file* is a path opened in binary mode and fed to the parser in 32 KiB
    chunks.  *builder_module* must expose either an ``XMLParser`` or a
    ``TreeBuilder`` callable (tried in that order) whose instances support
    ``feed(data)`` and ``close()``.

    Prints the node count and elapsed parsing time, followed by the memory
    figure from ``measure_mem``.
    """
    oldmem = measure_mem()
    with open(file, "rb") as source:
        t = time()
        try:
            builder = builder_module.XMLParser
        except AttributeError:
            builder = builder_module.TreeBuilder
        parser = builder()
        # read() returns b"" at end of file, which stops the iteration.
        for data in iter(lambda: source.read(32768), b""):
            parser.feed(data)
        tree = parser.close()
        t = time() - t
    # Element.getiterator() was removed in Python 3.9; prefer iter() and
    # only fall back for tree implementations that lack it.
    try:
        walk = tree.iter
    except AttributeError:
        walk = tree.getiterator
    print("%s.%s.feed(): %d nodes read in %.3f seconds" % (
        builder_module.__name__, builder.__name__,
        sum(1 for _ in walk()), t
        ))
    measure_mem(oldmem)
    del tree
@fork
def benchmark_parse(file, driver):
    """Time a one-shot ``driver.parse(file)`` call and print the result.

    *driver* is any object offering ``parse(file)`` and a ``__name__``
    string, which is used as the label in the printed line.  The memory
    figure from ``measure_mem`` is printed afterwards.
    """
    baseline = measure_mem()
    started = time()
    tree = driver.parse(file)
    elapsed = time() - started
    summary = ".parse done in %.3f seconds" % elapsed
    print(driver.__name__ + summary)
    measure_mem(baseline)
    del tree
@fork
def benchmark_minidom(file):
    """Time ``minidom.parse(file)`` and print the elapsed time.

    The memory figure from ``measure_mem`` is printed afterwards.
    """
    baseline = measure_mem()
    started = time()
    dom = minidom.parse(file)
    elapsed = time() - started
    print("minidom tree read in %.3f seconds" % elapsed)
    measure_mem(baseline)
    del dom
class configure_parser(object):
    """Driver object that parses with a pre-configured ``XMLParser``.

    Instances offer the two things ``benchmark_parse`` uses from a driver:
    a ``__name__`` label and a ``parse()`` method.
    """

    def __init__(self, etree, name, **config):
        """Build one ``etree.XMLParser(**config)`` and label it *name*."""
        self.etree = etree
        self.parser = etree.XMLParser(**config)
        self.__name__ = name

    def parse(self, input):
        """Return ``etree.parse(input, parser)`` using the stored parser."""
        module, parser = self.etree, self.parser
        return module.parse(input, parser)
# Run the benchmarks for every parser that could be imported against *file*,
# printing results (and "=== ... not available" notices) as it goes.
# NOTE(review): this script's indentation was lost in transit, so the exact
# nesting of the branches below cannot be confirmed from here -- in
# particular whether the final minidom check sits inside or after the
# `sys.platform != "cli"` guard.  Verify against the original attachment
# before re-indenting.
def run_benchmark(file):
# ElementTree is imported unconditionally, so this run always happens.
benchmark_parse(file, ElementTree)
# cElementTree is None when its import failed.
if cElementTree is not None:
benchmark_parse(file, cElementTree)
benchmark(file, cElementTree)
# lxml.etree: one-shot parse with default settings, one-shot parse with a
# parser that drops blank text and comments, then the feed-based benchmark.
if etree is not None:
benchmark_parse(file, etree)
benchmark_parse(file, configure_parser(
etree, 'drop_whitespace',
remove_blank_text=True, remove_comments=True))
benchmark(file, etree)
else:
print("=== lxml.etree not available")
# The legacy `elementtree` package builders are skipped when sys.platform is
# "cli" (presumably IronPython -- TODO confirm).
if sys.platform != "cli":
if XMLTreeBuilder:
benchmark(file, XMLTreeBuilder)
if SimpleXMLTreeBuilder:
benchmark(file, SimpleXMLTreeBuilder) # use xmllib
# A RuntimeError from the sgmlop-based builder is reported, not propagated.
try:
if SgmlopXMLTreeBuilder:
benchmark(file, SgmlopXMLTreeBuilder) # use sgmlop
except RuntimeError:
print("=== SgmlopXMLTreeBuilder not available (%s)" % sys.exc_info()[1])
# minidom is None when its import failed.
if minidom:
benchmark_minidom(file)
else:
print("=== minidom not available")
def parse_opts():
    """Parse ``sys.argv`` and return optparse's ``(options, args)`` pair.

    Options:
      -r / --repeat   stored as a string in ``options.repeat`` (default "1")
      -n / --no-fork  sets ``options.fork`` to False (default True)
    """
    import optparse

    cli = optparse.OptionParser()
    cli.add_option(
        '-r', '--repeat',
        dest='repeat',
        default="1",
        help="number of times to repeat the benchmarks (default: 1)",
    )
    cli.add_option(
        '-n', '--no-fork',
        dest='fork',
        action='store_false',
        default=True,
        help="disable forking for each test run",
    )
    parsed = cli.parse_args()
    return parsed
if __name__ == '__main__':
    # Script entry point: benchmark every file named on the command line,
    # falling back to hamlet.xml when none is given.
    options, args = parse_opts()
    args = args or ['hamlet.xml']
    repeat = int(options.repeat)
    FORK = options.fork  # --no-fork turns off the per-benchmark child process

    for filename in args:
        # gobble gobble: read the whole file three times before measuring
        # (presumably to warm the OS file cache -- not stated in the script).
        for _ in range(3):
            text = None  # release the previous buffer before reading again
            with open(filename, 'rb') as source:
                text = source.read()

        for _ in range(repeat):
            run_benchmark(filename)
_______________________________________________
Python-Dev mailing list
Python-Dev@python.org
http://mail.python.org/mailman/listinfo/python-dev
Unsubscribe:
http://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com