Author: Carl Friedrich Bolz <[email protected]>
Branch: extradoc
Changeset: r3655:ebbbaf7507d2
Date: 2011-06-12 22:28 +0200
http://bitbucket.org/pypy/extradoc/changeset/ebbbaf7507d2/
Log: merge
diff --git a/talk/iwtc11/benchmarks/benchmark.sh
b/talk/iwtc11/benchmarks/benchmark.sh
--- a/talk/iwtc11/benchmarks/benchmark.sh
+++ b/talk/iwtc11/benchmarks/benchmark.sh
@@ -15,6 +15,7 @@
$* convolution/conv3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000000 3
> /dev/null
$* convolution/conv3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000 1000
> /dev/null
$* convolution/dilate3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000
1000 > /dev/null
+ $* image/sobel.cc -lstdc++; /usr/bin/time -f %e ./a.out 1002 1002 >
/dev/null
rm a.out
else
$* sqrt/time_sqrt.py float
@@ -24,4 +25,7 @@
$* convolution/time_conv.py 100
$* convolution/time_conv.py 1000
$* convolution/time_conv2d.py
+ $* image/noborder.py NoBorderImagePadded
+ $* image/noborder.py NoBorderImage
+ $* image/time_sobel.py NoBorderImagePadded
fi
diff --git a/talk/iwtc11/benchmarks/image/io.py
b/talk/iwtc11/benchmarks/image/io.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/io.py
@@ -0,0 +1,39 @@
+import os, re, array
+
+def mplayer(Image, fn='tv://', options=''):
+ f = os.popen('mplayer -really-quiet -noframedrop ' + options + ' '
+ '-vo yuv4mpeg:file=/dev/stdout 2>/dev/null </dev/null ' + fn)
+ hdr = f.readline()
+ m = re.search('W(\d+) H(\d+)', hdr)
+ w, h = int(m.group(1)), int(m.group(2))
+ while True:
+ hdr = f.readline()
+ if hdr != 'FRAME\n':
+ break
+ yield Image(w, h, typecode='B', fromfile=f)
+ f.read(w*h/2) # Color data
+
+class MplayerViewer(object):
+ def __init__(self):
+ self.width = self.height = None
+ def view(self, img):
+ assert img.typecode == 'B'
+ if not self.width:
+ self.mplayer = os.popen('mplayer -really-quiet -noframedrop - ' +
+ '2> /dev/null ', 'w')
+ self.mplayer.write('YUV4MPEG2 W%d H%d F100:1 Ip A1:1\n' %
+ (img.width, img.height))
+ self.width = img.width
+ self.height = img.height
+ self.color_data = array.array('B', [127]) * (img.width *
img.height / 2)
+ assert self.width == img.width
+ assert self.height == img.height
+ self.mplayer.write('FRAME\n')
+ img.tofile(self.mplayer)
+ self.color_data.tofile(self.mplayer)
+
+default_viewer = MplayerViewer()
+
+def view(img):
+ default_viewer.view(img)
+
diff --git a/talk/iwtc11/benchmarks/image/noborder.py
b/talk/iwtc11/benchmarks/image/noborder.py
--- a/talk/iwtc11/benchmarks/image/noborder.py
+++ b/talk/iwtc11/benchmarks/image/noborder.py
@@ -3,13 +3,20 @@
class NoBorderImage(object):
"An image class for people who dont care about border effects"
- def __init__(self, w, h):
+ def __init__(self, w, h, typecode='d', fromfile=None):
self.width = w
self.height = h
- self.data = array('d', [0]) * (w*h)
+ if fromfile is not None:
+ self.data = array(typecode)
+ self.data.fromfile(fromfile, w*h)
+ else:
+ self.data = array(typecode, [0]) * (w*h)
+ self.typecode = typecode
def _idx(self, p):
if isinstance(p, Pixel):
+ assert p.image.__class__ is self.__class__
+ assert p.image.width == self.width
idx = p.idx
else:
idx = p[1] * self.width + p[0]
@@ -22,65 +29,133 @@
self.data[self._idx(p)] = val
def pixels(self):
- for i in xrange(self.width * self.height):
- yield Pixel(i, self.width)
+ for i in self.pixelrange():
+ yield Pixel(i, self)
def pixeliter(self):
- return PixelIter(self.width, self.height)
+ return PixelIter(self)
+
+ def pixelrange(self):
+ return xrange(self.width * self.height)
+
+ def setup(self, data):
+ for y in xrange(self.height):
+ for x in xrange(self.width):
+ self[x, y] = data[y][x]
+ return self
+
+ def clone(self, **kwargs):
+ return self.__class__(self.width, self.height, **kwargs)
+
+ def tofile(self, f):
+ self.data.tofile(f)
+
+class NoBorderImagePadded(NoBorderImage):
+ def __init__(self, w, h, typecode='d', fromfile=None):
+ self.width = w
+ self.height = h
+ self.typecode = typecode
+ if fromfile is None:
+ self.data = array(typecode, [0]) * (w*(h+2)+2)
+ else:
+ self.data = array(typecode, [0]) * (w + 1)
+ self.data.fromfile(fromfile, w*h)
+ self.data += array(typecode, [0]) * (w + 1)
+
+ def _idx(self, p):
+ if isinstance(p, Pixel):
+ assert p.image.__class__ is self.__class__
+ assert p.image.width == self.width
+ idx = p.idx
+ else:
+ idx = (p[1]+1) * self.width + p[0] + 1
+ return min(max(idx, 0), len(self.data)-1)
+
+ def pixelrange(self):
+ return xrange(self.width + 1, (self.width+1) * self.height + 1)
+
+ def tofile(self, f):
+ self.data[(self.width+1):(-self.width-1)].tofile(f)
+
class Pixel(object):
- def __init__(self, idx, w):
+ def __init__(self, idx, image):
self.idx = idx
- self.width = w
+ self.image = image
def __add__(self, other):
- return Pixel(self.idx + other[1]*self.width + other[0], self.width)
+ return Pixel(self.idx + other[1]*self.image.width + other[0],
self.image)
class PixelIter(object):
- def __init__(self, w, h):
- self.width = w
- self.n = w*h
- self.idx = 0
+ def __init__(self, image):
+ self.image = image
+ self.pixelrange = iter(image.pixelrange())
def __iter__(self):
return self
def next(self):
- idx = self.idx
- self.idx += 1
- if idx >=self.n:
- raise StopIteration
- return Pixel(idx, self.width)
+ return Pixel(self.pixelrange.next(), self.image)
def conv3x3(img, k):
assert k.width == k.height == 3
- res = NoBorderImage(img.width, img.height)
+ res = img.clone()
for p in img.pixels():
- res[p] = k[2,2]*img[p + (-1, -1)] + k[1,2]*img[p + (0, -1)] +
k[0,2]*img[p + (1, -1)] + \
- k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] +
k[0,1]*img[p + (1, 0)] + \
- k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] +
k[0,0]*img[p + (1, 1)]
+ res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] +
k[0,2]*img[p + (1,-1)] + \
+ k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] +
k[0,1]*img[p + (1, 0)] + \
+ k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] +
k[0,0]*img[p + (1, 1)]
return res
def conv3x3iter(img, k):
assert k.width == k.height == 3
- res = NoBorderImage(img.width, img.height)
+ res = img.clone()
for p in img.pixeliter():
- res[p] = k[2,2]*img[p + (-1, -1)] + k[1,2]*img[p + (0, -1)] +
k[0,2]*img[p + (1, -1)] + \
- k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] +
k[0,1]*img[p + (1, 0)] + \
- k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] +
k[0,0]*img[p + (1, 1)]
+ res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] +
k[0,2]*img[p + (1,-1)] + \
+ k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] +
k[0,1]*img[p + (1, 0)] + \
+ k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] +
k[0,0]*img[p + (1, 1)]
+ return res
+
+def conv3x3range(img, k):
+ assert k.width == k.height == 3
+ res = img.clone()
+ for i in img.pixelrange():
+ p = Pixel(i, img)
+ res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] +
k[0,2]*img[p + (1,-1)] + \
+ k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] +
k[0,1]*img[p + (1, 0)] + \
+ k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] +
k[0,0]*img[p + (1, 1)]
return res
if __name__ == '__main__':
+ import time, sys
+ sys.setcheckinterval(2**30)
try:
import pypyjit
pypyjit.set_param(trace_limit=200000)
except ImportError:
pass
+ Image = eval(sys.argv[1])
+ n = 1000
- import time
+ # Warmup
+ conv3x3(Image(n, n), Image(3,3))
+ conv3x3iter(Image(n, n), Image(3,3))
+ conv3x3range(Image(n, n), Image(3,3))
+
a = time.time()
for i in range(10):
- conv3x3iter(NoBorderImage(100, 100), NoBorderImage(3,3))
+ conv3x3(Image(n, n), Image(3,3))
b = time.time()
- print 'NoBorderImage:', b - a
+ print '%s:' % Image.__name__, b - a
+ a = time.time()
+ for i in range(10):
+ conv3x3iter(Image(n, n), Image(3,3))
+ b = time.time()
+ print '%s(iter):' % Image.__name__, b - a
+
+ a = time.time()
+ for i in range(10):
+ conv3x3range(Image(n, n), Image(3,3))
+ b = time.time()
+ print '%s(range):' % Image.__name__, b - a
+
diff --git a/talk/iwtc11/benchmarks/image/sobel.cc
b/talk/iwtc11/benchmarks/image/sobel.cc
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/sobel.cc
@@ -0,0 +1,51 @@
+// A safe array example.
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+class Array2D {
+ double *data;
+public:
+ int width, height;
+ Array2D(int w, int h) {
+ width = w;
+ height = h;
+ data = (double *) malloc(w*h*sizeof(double));
+ }
+ double &operator()(int x, int y) {
+ if (x >= 0 && x < width && y >= 0 && y < height) {
+ return data[y*width + x];
+ }
+ printf("IndexError\n");
+ exit(1);
+ }
+};
+
+void sobel_magnitude(Array2D &a, Array2D &b) {
+ int x, y;
+ for (y=1; y<a.height-1; y++) {
+ for (x=1; x<a.width-1; x++) {
+ double dx = -1.0*a(x-1, y-1) + 1.0*a(x+1, y-1) +
+ -2.0*a(x-1, y) + 2.0*a(x+1, y) +
+ -1.0*a(x-1, y+1) + 1.0*a(x+1, y+1);
+
+ double dy = -1.0*a(x-1, y-1) - 2.0*a(x, y-1) - 1.0*a(x+1, y-1) +
+ 1.0*a(x-1, y+1) + 2.0*a(x, y+1) + 1.0*a(x+1, y+1);
+ b(x, y) = sqrt(dx*dx + dy*dy) / 4.0;
+
+ }
+ }
+}
+
+int main(int ac, char **av) {
+ int w = atoi(av[1]), h = atoi(av[2]);
+ int i;
+
+ for (i=0; i<10; i++) {
+ Array2D a(w, h), b(w, h);
+ sobel_magnitude(a, b);
+ printf("%f\n", b(1,1));
+ }
+ fprintf(stderr, "sobel_magnitude: ", h);
+ return 0;
+}
diff --git a/talk/iwtc11/benchmarks/image/sobel.py
b/talk/iwtc11/benchmarks/image/sobel.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/sobel.py
@@ -0,0 +1,75 @@
+from noborder import NoBorderImagePadded
+from math import sqrt
+
+def sobeldx(img):
+ res = img.clone(typecode='d')
+ for p in img.pixeliter():
+ res[p] = (-1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+ -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+ -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]) / 4.0
+ return res
+
+def sobeldy(img):
+ res = img.clone(typecode='d')
+ for p in img.pixeliter():
+ res[p] = (-1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p +
(1,-1)] + \
+ 1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +2.0*img[p + (1,
1)]) / 4.0
+ return res
+
+def sobel_magnitude(img):
+ res = img.clone(typecode='d')
+ for p in img.pixeliter():
+ dx = -1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+ -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+ -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]
+ dy = -1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p + (1,-1)] +
\
+ 1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +1.0*img[p + (1, 1)]
+ res[p] = sqrt(dx*dx + dy*dy) / 4.0
+ return res
+
+def uint8(img):
+ res = img.clone(typecode='B')
+ for p in img.pixeliter():
+ res[p] = min(max(int(img[p]), 0), 255)
+ return res
+
+def sobel_magnitude_uint8(img):
+ res = img.clone(typecode='B')
+ for p in img.pixeliter():
+ dx = -1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+ -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+ -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]
+ dy = -1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p + (1,-1)] +
\
+ 1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +1.0*img[p + (1, 1)]
+ res[p] = min(int(sqrt(dx*dx + dy*dy) / 4.0), 255)
+ return res
+
+
+if __name__ == '__main__':
+ from io import mplayer, view
+ import sys
+ from time import time
+
+ if len(sys.argv) > 1:
+ fn = sys.argv[1]
+ else:
+ fn = 'test.avi -vf scale=640:480 -benchmark'
+
+ sys.setcheckinterval(2**30)
+ try:
+ import pypyjit
+ pypyjit.set_param(trace_limit=200000)
+ except ImportError:
+ pass
+
+ start = start0 = time()
+ for fcnt, img in enumerate(mplayer(NoBorderImagePadded, fn)):
+ #view(img)
+ #sobeldx(img)
+ #view(uint8(sobel_magnitude(img)))
+ view(sobel_magnitude_uint8(img))
+ #sobel_magnitude_uint8(img)
+ print 1.0 / (time() - start), 'fps, ', (fcnt-2) / (time() - start0),
'average fps'
+ start = time()
+ if fcnt==2:
+ start0 = time()
diff --git a/talk/iwtc11/benchmarks/image/test.avi
b/talk/iwtc11/benchmarks/image/test.avi
new file mode 100644
index
0000000000000000000000000000000000000000..e72f9f1b0e99f77baa54aa3f9ef4399b0b82ec45
GIT binary patch
[cut]
diff --git a/talk/iwtc11/benchmarks/image/test_image.py
b/talk/iwtc11/benchmarks/image/test_image.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/test_image.py
@@ -0,0 +1,22 @@
+from noborder import *
+
+def test_noborder():
+ for Image in (NoBorderImagePadded, NoBorderImage):
+ a = Image(5, 5).setup([[11, 12, 13, 14, 15],
+ [21, 22, 23, 24, 25],
+ [31, 32, 33, 34, 35],
+ [41, 42, 43, 44, 45],
+ [51, 52, 53, 54, 55]])
+ k = Image(3, 3).setup([[1, 2, 3],
+ [1, 1, 2],
+ [2, 1, 1]])
+ def tst(conv, a, k):
+ b = conv(a, k)
+ assert b[1,1]== 326 and b[2,1]==340 and b[3,1]==354
+ assert b[1,2]== 466 and b[2,2]==480 and b[3,2]==494
+ assert b[1,3]== 606 and b[2,3]==620 and b[3,3]==634
+
+ for c in (conv3x3, conv3x3iter, conv3x3range):
+ yield tst, c, a, k
+
+
diff --git a/talk/iwtc11/benchmarks/image/time_sobel.py
b/talk/iwtc11/benchmarks/image/time_sobel.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/time_sobel.py
@@ -0,0 +1,29 @@
+from noborder import NoBorderImagePadded, NoBorderImage
+from sobel import sobel_magnitude, sobel_magnitude_uint8
+from time import time
+import sys
+
+sys.setcheckinterval(2**30)
+try:
+ import pypyjit
+ pypyjit.set_param(trace_limit=200000)
+except ImportError:
+ pass
+
+Image = eval(sys.argv[1])
+n = 1000
+
+sobel_magnitude(Image(n, n))
+sobel_magnitude_uint8(Image(n, n, typecode='B'))
+
+a = time()
+for i in range(10):
+ sobel_magnitude(Image(n, n))
+b = time()
+print 'sobel(%s):' % Image.__name__, b - a
+
+a = time()
+for i in range(10):
+ sobel_magnitude_uint8(Image(n, n, typecode='B'))
+b = time()
+print 'sobel_uint8(%s):' % Image.__name__, b - a
diff --git a/talk/iwtc11/benchmarks/image/view.py
b/talk/iwtc11/benchmarks/image/view.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/view.py
@@ -0,0 +1,6 @@
+from noborder import NoBorderImage
+from io import mplayer, view
+
+for img in mplayer(NoBorderImage, 'test.avi'):
+ view(img)
+
diff --git a/talk/iwtc11/benchmarks/result.txt
b/talk/iwtc11/benchmarks/result.txt
--- a/talk/iwtc11/benchmarks/result.txt
+++ b/talk/iwtc11/benchmarks/result.txt
@@ -1,48 +1,129 @@
+
pypy
-sqrt(float): 1.20120882988
- sqrt(int): 2.41813898087
-sqrt(Fix16): 6.11410784721
-conv3: 2.14187502861
-conv5: 2.33459997177
+sqrt(float): 1.20290899277
+ sqrt(int): 2.41840982437
+sqrt(Fix16): 6.10620713234
+conv3(1e8): 2.5192759037
+conv5(1e8): 2.89429306984
+conv3(1e6): 0.828789949417
+conv5(1e6): 1.01669406891
+conv3(1e5): 0.777491092682
+conv5(1e5): 0.971807956696
+conv3x3(3): 0.653658866882
+conv3x3(1000): 0.748742103577
+dilate3x3(1000): 4.8826611042
+NoBorderImagePadded: 2.31043601036
+NoBorderImagePadded(iter): 0.572638988495
+NoBorderImagePadded(range): 0.494098186493
+NoBorderImage: 2.90333104134
+NoBorderImage(iter): 2.06943392754
+NoBorderImage(range): 1.99161696434
+sobel(NoBorderImagePadded): 0.668392896652
pypy --jit enable_opts=intbounds:rewrite:virtualize:heap:unroll
-sqrt(float): 1.2082631588
- sqrt(int): 2.42825579643
-sqrt(Fix16): 6.13569307327
-conv3: 2.14451694489
-conv5: 2.36811304092
+sqrt(float): 1.19338798523
+ sqrt(int): 2.42711806297
+sqrt(Fix16): 6.12403416634
+conv3(1e8): 2.06937193871
+conv5(1e8): 2.26879811287
+conv3(1e6): 0.837247848511
+conv5(1e6): 1.02573990822
+conv3(1e5): 0.779927015305
+conv5(1e5): 0.975258827209
+conv3x3(3): 0.663229942322
+conv3x3(1000): 0.763913154602
+dilate3x3(1000): 4.80735611916
+NoBorderImagePadded: 2.33380198479
+NoBorderImagePadded(iter): 0.504709005356
+NoBorderImagePadded(range): 0.503198862076
+NoBorderImage: 2.93766593933
+NoBorderImage(iter): 2.04195189476
+NoBorderImage(range): 2.02779984474
+sobel(NoBorderImagePadded): 0.670017004013
pypy --jit enable_opts=intbounds:rewrite:virtualize:heap
-sqrt(float): 1.70357894897
- sqrt(int): 3.12929701805
-sqrt(Fix16): 10.3343019485
-conv3: 3.14458608627
-conv5: 3.42248892784
+sqrt(float): 1.69957995415
+ sqrt(int): 3.13235807419
+sqrt(Fix16): 10.325592041
+conv3(1e8): 2.997631073
+conv5(1e8): 3.13820099831
+conv3(1e6): 1.7843170166
+conv5(1e6): 1.94643998146
+conv3(1e5): 1.75876712799
+conv5(1e5): 1.96709895134
+conv3x3(3): 1.09958791733
+conv3x3(1000): 1.02993702888
+dilate3x3(1000): 5.22873902321
+NoBorderImagePadded: 2.45174002647
+NoBorderImagePadded(iter): 1.60747289658
+NoBorderImagePadded(range): 1.55282211304
+NoBorderImage: 2.91020989418
+NoBorderImage(iter): 1.97922706604
+NoBorderImage(range): 2.14161992073
+sobel(NoBorderImagePadded): 1.47591900826
gcc
-sqrt(float): 1.42
+sqrt(float): 1.43
sqrt(int): 1.93
sqrt(Fix16): 2.04
-conv3: 1.94
-conv5: 2.36
+conv3(1e8): 2.03
+conv5(1e8): 2.39
+conv3(1e6): 1.66
+conv5(1e6): 2.03
+conv3(1e5): 1.60
+conv5(1e5): 2.02
+conv3x3(3): 1.81
+conv3x3(1000): 1.79
+dilate3x3(1000): 3.26
+sobel_magnitude: 1.37
gcc -O2
-sqrt(float): 1.14
+sqrt(float): 1.15
sqrt(int): 1.86
-sqrt(Fix16): 1.90
-conv3: 1.18
-conv5: 1.34
+sqrt(Fix16): 1.89
+conv3(1e8): 1.22
+conv5(1e8): 1.37
+conv3(1e6): 1.00
+conv5(1e6): 1.04
+conv3(1e5): 0.81
+conv5(1e5): 0.97
+conv3x3(3): 0.25
+conv3x3(1000): 0.23
+dilate3x3(1000): 0.27
+sobel_magnitude: 0.25
gcc -O3 -march=native
-sqrt(float): 1.14
+sqrt(float): 1.15
sqrt(int): 1.82
sqrt(Fix16): 1.89
-conv3: 1.10
-conv5: 1.16
+conv3(1e8): 1.12
+conv5(1e8): 1.16
+conv3(1e6): 0.96
+conv5(1e6): 0.97
+conv3(1e5): 0.66
+conv5(1e5): 0.75
+conv3x3(3): 0.23
+conv3x3(1000): 0.21
+dilate3x3(1000): 0.26
+sobel_magnitude: 0.25
python2.7
-sqrt(float): 35.3788838387
- sqrt(int): 19.5545659065
-sqrt(Fix16): 978.297157049
-conv3: 72.7751071453
-conv5: 103.557267904
+sqrt(float): 34.9008591175
+ sqrt(int): 19.6919620037
+sqrt(Fix16): 966.111785889
+conv3(1e8): 69.0758299828
+conv5(1e8): 101.503945827
+conv3(1e6): 62.212736845
+conv5(1e6): 93.5375850201
+conv3(1e5): 61.4343979359
+conv5(1e5): 93.6144771576
+conv3x3(3): 198.12590003
+conv3x3(1000): 193.030704975
+dilate3x3(1000): 192.323596954
+NoBorderImagePadded: 512.473811865
+NoBorderImagePadded(iter): 503.393321991
+NoBorderImagePadded(range): 493.907886028
+NoBorderImage: 501.37309289
+NoBorderImage(iter): 495.473101139
+NoBorderImage(range): 493.572232008
+sobel(NoBorderImagePadded): 433.678281069
diff --git a/talk/iwtc11/paper.tex b/talk/iwtc11/paper.tex
--- a/talk/iwtc11/paper.tex
+++ b/talk/iwtc11/paper.tex
@@ -524,22 +524,71 @@
\end{lstlisting}
\subsection{Allocation Removals}
-Using escape analysis we can XXX
+By using escape analysis it is possible to identify objects that are
+allocated within the loop but never escapes it. That is the object are
+short lived and no references to them exists outside the loop. This
+is performed by processing the operation from top to bottom and
+optimistically removing every \lstinline{new} operation. Later on if
+it is discovered that a reference to the object escapes the loop, the
+\lstinline{new} operation is inserted at this point. All operations
+(\lstinline{get} and \lstinline{set}) on the removed objects are also
+removed and the optimizer needs to keep track of the value of all
+attributes of the object.
-Let $\tilde J$ be all variables in $J$ not representing virtuals (in the
-same order). Extend it with all non-virtual fields, $H_i$, of the
-removed virtuals,
+Consider again the original unoptimized trace of
+Figure~\label{fig:peeled-trace}. Line 10 contains the first
+allocation. It is removed and $p_5$ is marked as virtual. This means
+that it refers to an virtual object that was not yet
+(and might never be) allocated. Line 12 sets the \lstinline{intval}
+attribute of $p_5$. This operation is also removed and the optimizer
+registers that the attribute \lstinline{intval} of $p_5$ is $i_4$.
+
+When the optimizer reaches line 13 it needs to construct the
+arguments for the \lstinline{jump} operation, which contains the virtual
+reference $p_5$. This can be achieved by exploding $p_5$ into it's
+attributes. In this case there is only one attribute and it's value is
+$i_4$, which means the $p_5$ is replaced with $i_4$ in the jump
+arguments.
+
+In the general case, each virtual in the jump arguments is exploded into a
+vector of variables containing the values of all it's attributes. If some
+of the attributes are themselves virtuals they are recursively exploded
+to make the vector contain only non virtual variables. Some care has
+to be taken to always place the attributes in the same order when
+performing this explosion. Notation becomes somewhat simpler if also every non
+virtual variable of the jump arguments is exploded into a vector. This will
+be a vector containing the original variable only. To summarize, for
+every variable, $J_k$, of the original jump arguments, $J$, let
\begin{equation}
- \hat J = \left(\tilde J_1, \tilde J_2, \cdots, \tilde J_{|\tilde J|},
- H_1, H_2, \cdots, H_{|H}\right)
+ \tilde J^{\left(k\right)} = \left\{
+ \begin{array}{ll}
+ \left(J_k\right) & \text{if $J_k$ is not virtual} \\
+ H^{\left(k\right)} & \text{if $J_k$ is virtual}
+ \end{array}
+ \right.
+ ,
\end{equation}
-and let
+where $H^{\left(k\right)}$ is a vector containing all non virtual
+attributes of $J_k$. The arguments of the optimized \lstinline{jump}
+operation are constructed as the concatenation all the $\tilde
J^{\left(k\right)}$ vectors,
+\begin{equation}
+ \hat J = \left(
+ \begin{array}{cccc}
+ \tilde J^{\left(1\right)} & \tilde J^{\left(2\right)} & \cdots &
+ \tilde J^{\left(|J|\right)} \\
+ \end{array}
+ \right)
+ .
+\end{equation}
+and the arguments of the \lstinline{jump} operation of the second
+operation, $K$, are replaced by inlining $\hat J$,
\begin{equation}
\hat K = \left(m\left(\hat J_1\right), m\left(\hat J_1\right),
\cdots, m\left(\hat J_{|\hat J|}\right)\right)
.
\end{equation}
-
+In the optimized trace $I$ is replaced by $\hat I$ and $K$ by $\hat
+K$. The trace from Figure~\ref{fig:unopt-trace} will be optimized into
\begin{lstlisting}[mathescape,numbers =
right,basicstyle=\setstretch{1.05}\ttfamily\scriptsize]
$l_0$($p_{0}$, $p_{1}$):
@@ -551,20 +600,94 @@
# inside BoxedInteger.add__int
$i_{3}$ = get($p_{0}$, intval)
$i_{4}$ = int_add($i_{2}$, $i_{3}$)
-jump($l_1$, $p_{0}$, $i_3$, $i_4$)
+ # inside BoxedInteger.__init__
+jump($l_1$, $p_{0}$, $i_{4}$)
-$l_1$($p_{0}$, $p_{5}$, $i_3$, $i_4$):
+$l_1$($p_{0}$, $i_{4}$):
# inside f: y = y.add(step)
# inside BoxedInteger.add
+ guard_class($p_{0}$, BoxedInteger)
# inside BoxedInteger.add__int
- $i_{8}$ = int_add($i_{4}$, $i_{3}$)
-jump($l_1$, $p_{0}$, $i_3$, $i_8$)
+ $i_{7}$ = get($p_{0}$, intval)
+ $i_{8}$ = int_add($i_{4}$, $i_{7}$)
+ # inside BoxedInteger.__init__
+jump($l_1$, $p_{0}$, $i_8$)
\end{lstlisting}
-And we're down to a single integer addition!
+Note that virtuals are only exploded into their attributes when
+constructing the arguments of the jump of the first iteration. This
+explosion can't be repeated when constructing the arguments of the
+jump of the second iteration as it has to mach the first. This means
+the objects that was passed as pointers (non virtuals) from the first
+iteration to the second also has to be passed as pointers from the
+second iteration to the third. If one of these objects are virtual
+at the end of the second iteration they need to be allocated right
+before the jump. With the simple objects considered in this paper,
+that is not a problem. However in more complicated interpreters such
+an allocation might, in combination with other optimizations, lead
+to additional variables from the first iteration being imported into
+the second. This extends both $\hat J$ and $\hat K$, which means that
+some care has to be taken, when implementing this, to allow $\hat J$ to
+grow while inlining it into $\hat K$.
\section{Benchmarks}
+The loop peeling optimization was implemented in the PyPy
+framework. That means that the jit compilers generated for all
+interpreters implemented within PyPy now can take advantage of
+it. Benchmarks have been executed for a few different interpreters and
+we see improvements in several cases. The ideal loop for this optimization
+would be short numerical calculations with no failing guards and no
+external calls.
+
+\subsection{Python}
+The python interpreter of the PyPy framework is a complete python
+version 2.7 compatible interpreter. A set of numerical
+calculations where implemented in both python and in C and their
+runtimes compared. The benchmarks are
+\begin{itemize}
+\item {\bf sqrt}: approximates the square root of $y$ as $x_\infty$
+ with $x_0=y/2$ and $x_k = \left( x_{k-1} + y/x_{k-1} \right) /
+ 2$. There are three different versions of this benchmark where $x_k$
+ is represented with different type of objects: int's, float's and
+ Fix16's. The later, Fix16, is a custom class that implements
+ fixpoint arithmetic with 16 bits precision. In python there is only
+ a single implementation of the benchmark that gets specialized
+ depending on the class of it's input argument, $y$, while in C,
+ there is three different implementations.
+\item {\bf conv3}: one dimensional convolution with a kernel of fixed
+ size $3$.
+\item {\bf conv5}: one dimensional convolution with a kernel of fixed
+ size $5$.
+\item {\bf conv3x3}: two dimensional convolution with kernel of fixed
+ size $3 \times 3$ using a custom class to represent two dimensional
+ arrays.
+\item {\bf dilate3x3}: two dimensional dilation with kernel of fixed
+ size $3 \times 3$. This is similar to convolution but instead of
+ summing over the elements, the maximum is taken. That places a
+ external call to a max function within the loop that prevents some
+ of the optimizations.
+\item {\bf sobel}: an low level video processing algorithm used to
+ locate edges in an image. It calculated the gradient magnitude
+ using sobel derivatives. The algorithm is in python implemented
+ on top of a custom image class that is specially designed for the
+ problem. It ensures that there will be no failing guards, and makes
+ a lot of the two dimension index calculations loop invariant. The
+ intention there is twofold. It shows that the performance impact of
+ having wrapper classes giving objects some application specific
+ properties is negligible. This is due to the inlining performed
+ during the tracing and the allocation removal of the index objects
+ introduced. It also shows that it is possible to do some low level
+ hand optimizations of the python code and hide those optimization
+ under a nice interface without loosing performance.
+\end{itemize}
+
+\subsection{Numpy}
+XXX: Fijal?
+
+\subsection{Prolog}
+XXX: Carl?
+
%\appendix
%\section{Appendix Title}
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit