Here's a simple test which does the parsing and the "prettifying" - the process where BeautifulSoup rewrites the HTML in an attempt to make it well-formed.
The benchmark processes 2 different URLs, loads them into BeautifulSoup and then reads out the "pretty" (or better-formed) HTML. I can't use urllib in IronPython because it apparently hasn't been implemented there yet (see http://www.codeplex.com/WorkItem/View.aspx?ProjectName=IronPython&WorkItemId=1368), so I've split the work into two scripts.
The makeFiles.py script, which should be run in CPython, reads the URLs and writes their contents to files.
test.py is the actual benchmark; it reads the saved files instead of fetching the URLs. The code for both scripts is at the end of this message.
These are the results I'm getting:
CPython 2.4
------------------
ran test_getHtml in 0.00 seconds
ran test_load in 0.28 seconds
ran test_prettify in 0.05 seconds
ran benchmark in 0.33 seconds
IronPython 1.0 RC1
----------------------------
ran test_getHtml in 0.04 seconds
ran test_load in 2.49 seconds
ran test_prettify in 0.24 seconds
ran benchmark in 2.77 seconds
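So you can see that IronPython is significantly slower than CPython on BeautifulSoup parsing: roughly 8 to 9 times slower, both overall (2.77 vs. 0.33 seconds) and in the parsing step itself (test_load, 2.49 vs. 0.28 seconds).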
#---- makeFiles.py
import urllib
def test_getHtml(url):
    """Fetch *url* with ``urllib.urlopen`` and return everything it reads.

    The handle is closed in a ``finally`` clause so it is released even
    when ``read()`` raises part-way through the download.
    """
    f = urllib.urlopen(url)
    try:
        return f.read()
    finally:
        f.close()
def saveFile(fName, data):
    """Write *data* to the file named *fName*, opened in ``"w"`` mode.

    Any existing content of the file is replaced. Returns ``None``.
    The file is closed in a ``finally`` clause so the handle is not
    leaked when ``write()`` raises.
    """
    f = open(fName, "w")
    try:
        f.write(data)
    finally:
        f.close()
# Pages to snapshot, paired by position with the local files they are
# saved to (urls[0] -> files[0], urls[1] -> files[1]).
urls = ["http://news.bbc.co.uk/2/hi/middle_east/5213602.stm",
        "http://www.cnn.com/2006/US/07/25/highway.shootings.ap/index.html"]
files = ["c:\\bbc.html", "c:\\cnn.html"]

for url, fName in zip(urls, files):
    # Download the page so the saved file actually holds its HTML;
    # writing a constant empty string would leave the benchmark with
    # nothing to parse.
    data = test_getHtml(url)
    saveFile(fName, data)
#test.py
#-----------------------------------------------------------------------------------------------------------------
#| Code Start
#-----------------------------------------------------------------------------------------------------------------
import sys
sys.path.append("C:\\Python24\\Lib")
from BeautifulSoup import BeautifulSoup
import time
def test_getFile(fileName):
f = open(fileName, "r")
html = f.read ()
f.close()
return html
def test_load(html):
    """Build a ``BeautifulSoup`` object from *html* and return it.

    This is the parsing step that the benchmark times on its own.
    """
    return BeautifulSoup(html)
def test_prettify(s):
t = s.prettify()
return t
# Input files for the benchmark.
files = ["c:\\bbc.html", "c:\\cnn.html"]
# Number of complete passes over the file list.
testCount = 2

benchmarkStart = time.clock()

# Accumulated time per phase, summed over every file and every pass.
time_getHtml = 0
time_load = 0
time_prettify = 0

for _ in range(testCount):
    # Iterate over the files themselves rather than indexing the list with
    # the pass counter, so each pass covers every file and testCount may
    # exceed len(files) without raising IndexError.
    for fName in files:
        # Phase 1: read the saved HTML from disk.
        testStart = time.clock()
        html = test_getFile(fName)
        testEnd = time.clock()
        time_getHtml += testEnd - testStart

        # Phase 2: parse the HTML into a BeautifulSoup object.
        testStart = time.clock()
        s = test_load(html)
        testEnd = time.clock()
        time_load += testEnd - testStart

        # Phase 3: render the parsed document back out via prettify().
        testStart = time.clock()
        t = test_prettify(s)
        testEnd = time.clock()
        time_prettify += testEnd - testStart

benchmarkEnd = time.clock()

# A single parenthesised expression prints the same text under Python 2's
# print statement, so the output format is unchanged.
print('ran test_getHtml in \t%.2f seconds' % (time_getHtml))
print('ran test_load in \t%.2f seconds' % (time_load))
print('ran test_prettify in \t%.2f seconds' % (time_prettify))
print('ran benchmark in \t%.2f seconds' % (benchmarkEnd - benchmarkStart))
#-----------------------------------------------------------------------------------------------------------------
#| Code End
#-----------------------------------------------------------------------------------------------------------------
_______________________________________________ users mailing list [email protected] http://lists.ironpython.com/listinfo.cgi/users-ironpython.com
