Author: Carl Friedrich Bolz <[email protected]>
Branch: extradoc
Changeset: r3655:ebbbaf7507d2
Date: 2011-06-12 22:28 +0200
http://bitbucket.org/pypy/extradoc/changeset/ebbbaf7507d2/

Log:    merge

diff --git a/talk/iwtc11/benchmarks/benchmark.sh 
b/talk/iwtc11/benchmarks/benchmark.sh
--- a/talk/iwtc11/benchmarks/benchmark.sh
+++ b/talk/iwtc11/benchmarks/benchmark.sh
@@ -15,6 +15,7 @@
     $* convolution/conv3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000000 3 
> /dev/null
     $* convolution/conv3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000 1000 
> /dev/null
     $* convolution/dilate3x3.cc -lstdc++; /usr/bin/time -f %e ./a.out 1000 
1000 > /dev/null
+    $* image/sobel.cc -lstdc++; /usr/bin/time -f %e ./a.out 1002 1002 > 
/dev/null
     rm a.out
 else
     $* sqrt/time_sqrt.py float
@@ -24,4 +25,7 @@
     $* convolution/time_conv.py 100
     $* convolution/time_conv.py 1000
     $* convolution/time_conv2d.py
+    $* image/noborder.py NoBorderImagePadded
+    $* image/noborder.py NoBorderImage
+    $* image/time_sobel.py NoBorderImagePadded
 fi
diff --git a/talk/iwtc11/benchmarks/image/io.py 
b/talk/iwtc11/benchmarks/image/io.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/io.py
@@ -0,0 +1,39 @@
+import os, re, array
+
+def mplayer(Image, fn='tv://', options=''):
+    f = os.popen('mplayer -really-quiet -noframedrop ' + options + ' ' 
+                 '-vo yuv4mpeg:file=/dev/stdout 2>/dev/null </dev/null ' + fn)
+    hdr = f.readline()
+    m = re.search('W(\d+) H(\d+)', hdr)
+    w, h = int(m.group(1)), int(m.group(2))
+    while True:
+        hdr = f.readline()
+        if hdr != 'FRAME\n':
+            break
+        yield Image(w, h, typecode='B', fromfile=f)
+        f.read(w*h/2) # Color data
+
+class MplayerViewer(object):
+    def __init__(self):
+        self.width = self.height = None
+    def view(self, img):
+        assert img.typecode == 'B'
+        if not self.width:
+            self.mplayer = os.popen('mplayer -really-quiet -noframedrop - ' +
+                                    '2> /dev/null ', 'w')
+            self.mplayer.write('YUV4MPEG2 W%d H%d F100:1 Ip A1:1\n' %
+                               (img.width, img.height))
+            self.width = img.width
+            self.height = img.height
+            self.color_data = array.array('B', [127]) * (img.width * 
img.height / 2)
+        assert self.width == img.width
+        assert self.height == img.height
+        self.mplayer.write('FRAME\n')
+        img.tofile(self.mplayer)
+        self.color_data.tofile(self.mplayer)
+
+default_viewer = MplayerViewer()
+
+def view(img):
+    default_viewer.view(img)
+    
diff --git a/talk/iwtc11/benchmarks/image/noborder.py 
b/talk/iwtc11/benchmarks/image/noborder.py
--- a/talk/iwtc11/benchmarks/image/noborder.py
+++ b/talk/iwtc11/benchmarks/image/noborder.py
@@ -3,13 +3,20 @@
 class NoBorderImage(object):
     "An image class for people who dont care about border effects"
     
-    def __init__(self, w, h):
+    def __init__(self, w, h, typecode='d', fromfile=None):
         self.width = w
         self.height = h
-        self.data = array('d', [0]) * (w*h)
+        if fromfile is not None:
+            self.data = array(typecode)
+            self.data.fromfile(fromfile, w*h)
+        else:
+            self.data = array(typecode, [0]) * (w*h)
+        self.typecode = typecode
 
     def _idx(self, p):
         if isinstance(p, Pixel):
+            assert p.image.__class__ is self.__class__
+            assert p.image.width == self.width
             idx = p.idx
         else:
             idx = p[1] * self.width + p[0]
@@ -22,65 +29,133 @@
         self.data[self._idx(p)] = val
 
     def pixels(self):
-        for i in xrange(self.width * self.height):
-            yield Pixel(i, self.width)
+        for i in self.pixelrange():
+            yield Pixel(i, self)
 
     def pixeliter(self):
-        return PixelIter(self.width, self.height)
+        return PixelIter(self)
+
+    def pixelrange(self):
+        return xrange(self.width * self.height)
+
+    def setup(self, data):
+        for y in xrange(self.height):
+            for x in xrange(self.width):
+                self[x, y] = data[y][x]
+        return self
+
+    def clone(self, **kwargs):
+        return self.__class__(self.width, self.height, **kwargs)
+
+    def tofile(self, f):
+        self.data.tofile(f)
+
+class NoBorderImagePadded(NoBorderImage):
+    def __init__(self, w, h, typecode='d', fromfile=None):
+        self.width = w
+        self.height = h
+        self.typecode = typecode
+        if fromfile is None:
+            self.data = array(typecode, [0]) * (w*(h+2)+2)
+        else:
+            self.data = array(typecode, [0]) * (w + 1)
+            self.data.fromfile(fromfile, w*h)
+            self.data += array(typecode, [0]) * (w + 1)
+
+    def _idx(self, p):
+        if isinstance(p, Pixel):
+            assert p.image.__class__ is self.__class__
+            assert p.image.width == self.width
+            idx = p.idx
+        else:
+            idx = (p[1]+1) * self.width + p[0] + 1
+        return min(max(idx, 0), len(self.data)-1)
+
+    def pixelrange(self):
+        return xrange(self.width + 1, (self.width+1) * self.height + 1)
+
+    def tofile(self, f):
+        self.data[(self.width+1):(-self.width-1)].tofile(f)
+
 
 class Pixel(object):
-    def __init__(self, idx, w):
+    def __init__(self, idx, image):
         self.idx = idx
-        self.width = w
+        self.image = image
 
     def __add__(self, other):
-        return Pixel(self.idx + other[1]*self.width + other[0], self.width)
+        return Pixel(self.idx + other[1]*self.image.width + other[0], 
self.image)
 
 class PixelIter(object):
-    def __init__(self, w, h):
-        self.width = w
-        self.n = w*h
-        self.idx = 0
+    def __init__(self, image):
+        self.image = image
+        self.pixelrange = iter(image.pixelrange())
         
     def __iter__(self):
         return self
 
     def next(self):
-        idx = self.idx
-        self.idx += 1
-        if idx >=self.n:
-            raise StopIteration
-        return Pixel(idx, self.width)
+        return Pixel(self.pixelrange.next(), self.image)
 
 def conv3x3(img, k):
     assert k.width == k.height == 3
-    res = NoBorderImage(img.width, img.height)
+    res = img.clone()
     for p in img.pixels():
-        res[p] = k[2,2]*img[p + (-1, -1)] + k[1,2]*img[p + (0, -1)] + 
k[0,2]*img[p + (1, -1)] + \
-                 k[2,1]*img[p + (-1,  0)] + k[1,1]*img[p + (0,  0)] + 
k[0,1]*img[p + (1,  0)] + \
-                 k[2,0]*img[p + (-1,  1)] + k[1,0]*img[p + (0,  1)] + 
k[0,0]*img[p + (1,  1)]
+        res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] + 
k[0,2]*img[p + (1,-1)] + \
+                 k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] + 
k[0,1]*img[p + (1, 0)] + \
+                 k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] + 
k[0,0]*img[p + (1, 1)]
     return res
 
 def conv3x3iter(img, k):
     assert k.width == k.height == 3
-    res = NoBorderImage(img.width, img.height)
+    res = img.clone()
     for p in img.pixeliter():
-        res[p] = k[2,2]*img[p + (-1, -1)] + k[1,2]*img[p + (0, -1)] + 
k[0,2]*img[p + (1, -1)] + \
-                 k[2,1]*img[p + (-1,  0)] + k[1,1]*img[p + (0,  0)] + 
k[0,1]*img[p + (1,  0)] + \
-                 k[2,0]*img[p + (-1,  1)] + k[1,0]*img[p + (0,  1)] + 
k[0,0]*img[p + (1,  1)]
+        res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] + 
k[0,2]*img[p + (1,-1)] + \
+                 k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] + 
k[0,1]*img[p + (1, 0)] + \
+                 k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] + 
k[0,0]*img[p + (1, 1)]
+    return res
+
+def conv3x3range(img, k):
+    assert k.width == k.height == 3
+    res = img.clone()
+    for i in img.pixelrange():
+        p = Pixel(i, img)
+        res[p] = k[2,2]*img[p + (-1,-1)] + k[1,2]*img[p + (0,-1)] + 
k[0,2]*img[p + (1,-1)] + \
+                 k[2,1]*img[p + (-1, 0)] + k[1,1]*img[p + (0, 0)] + 
k[0,1]*img[p + (1, 0)] + \
+                 k[2,0]*img[p + (-1, 1)] + k[1,0]*img[p + (0, 1)] + 
k[0,0]*img[p + (1, 1)]
     return res
 
 if __name__ == '__main__':
+    import time, sys
+    sys.setcheckinterval(2**30)
     try:
         import pypyjit
         pypyjit.set_param(trace_limit=200000)
     except ImportError:
         pass
+    Image = eval(sys.argv[1])
+    n = 1000
 
-    import time
+    # Warmup
+    conv3x3(Image(n, n), Image(3,3))
+    conv3x3iter(Image(n, n), Image(3,3))
+    conv3x3range(Image(n, n), Image(3,3))
+
     a = time.time()
     for i in range(10):
-        conv3x3iter(NoBorderImage(100, 100), NoBorderImage(3,3))
+        conv3x3(Image(n, n), Image(3,3))
     b = time.time()
-    print 'NoBorderImage:', b - a
+    print '%s:' % Image.__name__, b - a
 
+    a = time.time()
+    for i in range(10):
+        conv3x3iter(Image(n, n), Image(3,3))
+    b = time.time()
+    print '%s(iter):' % Image.__name__, b - a
+
+    a = time.time()
+    for i in range(10):
+        conv3x3range(Image(n, n), Image(3,3))
+    b = time.time()
+    print '%s(range):' % Image.__name__, b - a
+
diff --git a/talk/iwtc11/benchmarks/image/sobel.cc 
b/talk/iwtc11/benchmarks/image/sobel.cc
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/sobel.cc
@@ -0,0 +1,51 @@
+// A safe array example.
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+class Array2D {
+  double *data;
+public:
+  int width, height;
+  Array2D(int w, int h) {
+    width = w;
+    height = h;
+    data = (double *) malloc(w*h*sizeof(double));
+  }
+  double &operator()(int x, int y) {
+    if (x >= 0 && x < width && y >= 0 && y < height) {
+       return data[y*width + x];
+    }
+    printf("IndexError\n");
+    exit(1);
+  }
+};
+
+void sobel_magnitude(Array2D &a, Array2D &b) {
+  int x, y;
+  for (y=1; y<a.height-1; y++) {
+    for (x=1; x<a.width-1; x++) {
+      double dx = -1.0*a(x-1, y-1) + 1.0*a(x+1, y-1) +
+                 -2.0*a(x-1, y)   + 2.0*a(x+1, y)   + 
+                 -1.0*a(x-1, y+1) + 1.0*a(x+1, y+1);
+
+      double dy = -1.0*a(x-1, y-1) - 2.0*a(x, y-1) - 1.0*a(x+1, y-1) +
+                  1.0*a(x-1, y+1) + 2.0*a(x, y+1) + 1.0*a(x+1, y+1);
+      b(x, y) = sqrt(dx*dx + dy*dy) / 4.0;
+
+    }
+  }
+}
+
+int main(int ac, char **av) {
+  int w = atoi(av[1]), h = atoi(av[2]);
+  int i;
+
+  for (i=0; i<10; i++) {
+    Array2D a(w, h), b(w, h);
+    sobel_magnitude(a, b);
+    printf("%f\n", b(1,1));
+  }
+  fprintf(stderr, "sobel_magnitude:  ", h);
+  return 0;
+}
diff --git a/talk/iwtc11/benchmarks/image/sobel.py 
b/talk/iwtc11/benchmarks/image/sobel.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/sobel.py
@@ -0,0 +1,75 @@
+from noborder import NoBorderImagePadded
+from math import sqrt
+
+def sobeldx(img):
+    res = img.clone(typecode='d')
+    for p in img.pixeliter():
+        res[p] = (-1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+                  -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+                  -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]) / 4.0
+    return res
+
+def sobeldy(img):
+    res = img.clone(typecode='d')
+    for p in img.pixeliter():
+        res[p] = (-1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p + 
(1,-1)] + \
+                   1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +2.0*img[p + (1, 
1)]) / 4.0
+    return res
+
+def sobel_magnitude(img):
+    res = img.clone(typecode='d')
+    for p in img.pixeliter():
+        dx = -1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+             -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+             -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]
+        dy = -1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p + (1,-1)] + 
\
+              1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +1.0*img[p + (1, 1)]
+        res[p] = sqrt(dx*dx + dy*dy) / 4.0
+    return res
+
+def uint8(img):
+    res = img.clone(typecode='B')
+    for p in img.pixeliter():
+        res[p] = min(max(int(img[p]), 0), 255)
+    return res
+
+def sobel_magnitude_uint8(img):
+    res = img.clone(typecode='B')
+    for p in img.pixeliter():
+        dx = -1.0 * img[p + (-1,-1)] + 1.0 * img[p + (1,-1)] + \
+             -2.0 * img[p + (-1, 0)] + 2.0 * img[p + (1, 0)] + \
+             -1.0 * img[p + (-1, 1)] + 1.0 * img[p + (1, 1)]
+        dy = -1.0*img[p + (-1,-1)] -2.0*img[p + (0,-1)] -1.0*img[p + (1,-1)] + 
\
+              1.0*img[p + (-1, 1)] +2.0*img[p + (0, 1)] +1.0*img[p + (1, 1)]
+        res[p] = min(int(sqrt(dx*dx + dy*dy) / 4.0), 255)
+    return res
+
+
+if __name__ == '__main__':
+    from io import mplayer, view
+    import sys
+    from time import time
+
+    if len(sys.argv) > 1:
+        fn = sys.argv[1]
+    else:
+        fn = 'test.avi -vf scale=640:480 -benchmark'
+
+    sys.setcheckinterval(2**30)
+    try:
+        import pypyjit
+        pypyjit.set_param(trace_limit=200000)
+    except ImportError:
+        pass
+
+    start = start0 = time()
+    for fcnt, img in enumerate(mplayer(NoBorderImagePadded, fn)):
+        #view(img)
+        #sobeldx(img)
+        #view(uint8(sobel_magnitude(img)))
+        view(sobel_magnitude_uint8(img))
+        #sobel_magnitude_uint8(img)
+        print 1.0 / (time() - start), 'fps, ', (fcnt-2) / (time() - start0), 
'average fps'
+        start = time()
+        if fcnt==2:
+            start0 = time()
diff --git a/talk/iwtc11/benchmarks/image/test.avi 
b/talk/iwtc11/benchmarks/image/test.avi
new file mode 100644
index 
0000000000000000000000000000000000000000..e72f9f1b0e99f77baa54aa3f9ef4399b0b82ec45
GIT binary patch

[cut]

diff --git a/talk/iwtc11/benchmarks/image/test_image.py 
b/talk/iwtc11/benchmarks/image/test_image.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/test_image.py
@@ -0,0 +1,22 @@
+from noborder import *
+
+def test_noborder():
+    for Image in (NoBorderImagePadded, NoBorderImage):
+        a = Image(5, 5).setup([[11, 12, 13, 14, 15],
+                               [21, 22, 23, 24, 25],
+                               [31, 32, 33, 34, 35],
+                               [41, 42, 43, 44, 45],
+                               [51, 52, 53, 54, 55]])
+        k = Image(3, 3).setup([[1, 2, 3],
+                               [1, 1, 2],
+                               [2, 1, 1]])
+        def tst(conv, a, k):
+            b = conv(a, k)
+            assert b[1,1]== 326 and b[2,1]==340 and b[3,1]==354
+            assert b[1,2]== 466 and b[2,2]==480 and b[3,2]==494
+            assert b[1,3]== 606 and b[2,3]==620 and b[3,3]==634
+
+        for c in (conv3x3, conv3x3iter, conv3x3range):
+            yield tst, c, a, k
+
+    
diff --git a/talk/iwtc11/benchmarks/image/time_sobel.py 
b/talk/iwtc11/benchmarks/image/time_sobel.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/time_sobel.py
@@ -0,0 +1,29 @@
+from noborder import NoBorderImagePadded, NoBorderImage
+from sobel import sobel_magnitude, sobel_magnitude_uint8
+from time import time
+import sys
+
+sys.setcheckinterval(2**30)
+try:
+    import pypyjit
+    pypyjit.set_param(trace_limit=200000)
+except ImportError:
+    pass
+
+Image = eval(sys.argv[1])
+n = 1000
+
+sobel_magnitude(Image(n, n))
+sobel_magnitude_uint8(Image(n, n, typecode='B'))
+    
+a = time()
+for i in range(10):
+    sobel_magnitude(Image(n, n))
+b = time()
+print 'sobel(%s):' % Image.__name__, b - a
+
+a = time()
+for i in range(10):
+    sobel_magnitude_uint8(Image(n, n, typecode='B'))
+b = time()
+print 'sobel_uint8(%s):' % Image.__name__, b - a
diff --git a/talk/iwtc11/benchmarks/image/view.py 
b/talk/iwtc11/benchmarks/image/view.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/image/view.py
@@ -0,0 +1,6 @@
+from noborder import NoBorderImage
+from io import mplayer, view
+
+for img in mplayer(NoBorderImage, 'test.avi'):
+    view(img)
+    
diff --git a/talk/iwtc11/benchmarks/result.txt 
b/talk/iwtc11/benchmarks/result.txt
--- a/talk/iwtc11/benchmarks/result.txt
+++ b/talk/iwtc11/benchmarks/result.txt
@@ -1,48 +1,129 @@
+
 pypy
-sqrt(float):   1.20120882988
-  sqrt(int):   2.41813898087
-sqrt(Fix16):   6.11410784721
-conv3:         2.14187502861
-conv5:         2.33459997177
+sqrt(float):   1.20290899277
+  sqrt(int):   2.41840982437
+sqrt(Fix16):   6.10620713234
+conv3(1e8):    2.5192759037
+conv5(1e8):    2.89429306984
+conv3(1e6):    0.828789949417
+conv5(1e6):    1.01669406891
+conv3(1e5):    0.777491092682
+conv5(1e5):    0.971807956696
+conv3x3(3):    0.653658866882
+conv3x3(1000): 0.748742103577
+dilate3x3(1000): 4.8826611042
+NoBorderImagePadded: 2.31043601036
+NoBorderImagePadded(iter): 0.572638988495
+NoBorderImagePadded(range): 0.494098186493
+NoBorderImage: 2.90333104134
+NoBorderImage(iter): 2.06943392754
+NoBorderImage(range): 1.99161696434
+sobel(NoBorderImagePadded): 0.668392896652
 
 pypy --jit enable_opts=intbounds:rewrite:virtualize:heap:unroll
-sqrt(float):   1.2082631588
-  sqrt(int):   2.42825579643
-sqrt(Fix16):   6.13569307327
-conv3:         2.14451694489
-conv5:         2.36811304092
+sqrt(float):   1.19338798523
+  sqrt(int):   2.42711806297
+sqrt(Fix16):   6.12403416634
+conv3(1e8):    2.06937193871
+conv5(1e8):    2.26879811287
+conv3(1e6):    0.837247848511
+conv5(1e6):    1.02573990822
+conv3(1e5):    0.779927015305
+conv5(1e5):    0.975258827209
+conv3x3(3):    0.663229942322
+conv3x3(1000): 0.763913154602
+dilate3x3(1000): 4.80735611916
+NoBorderImagePadded: 2.33380198479
+NoBorderImagePadded(iter): 0.504709005356
+NoBorderImagePadded(range): 0.503198862076
+NoBorderImage: 2.93766593933
+NoBorderImage(iter): 2.04195189476
+NoBorderImage(range): 2.02779984474
+sobel(NoBorderImagePadded): 0.670017004013
 
 pypy --jit enable_opts=intbounds:rewrite:virtualize:heap
-sqrt(float):   1.70357894897
-  sqrt(int):   3.12929701805
-sqrt(Fix16):   10.3343019485
-conv3:         3.14458608627
-conv5:         3.42248892784
+sqrt(float):   1.69957995415
+  sqrt(int):   3.13235807419
+sqrt(Fix16):   10.325592041
+conv3(1e8):    2.997631073
+conv5(1e8):    3.13820099831
+conv3(1e6):    1.7843170166
+conv5(1e6):    1.94643998146
+conv3(1e5):    1.75876712799
+conv5(1e5):    1.96709895134
+conv3x3(3):    1.09958791733
+conv3x3(1000): 1.02993702888
+dilate3x3(1000): 5.22873902321
+NoBorderImagePadded: 2.45174002647
+NoBorderImagePadded(iter): 1.60747289658
+NoBorderImagePadded(range): 1.55282211304
+NoBorderImage: 2.91020989418
+NoBorderImage(iter): 1.97922706604
+NoBorderImage(range): 2.14161992073
+sobel(NoBorderImagePadded): 1.47591900826
 
 gcc
-sqrt(float):   1.42
+sqrt(float):   1.43
 sqrt(int):     1.93
 sqrt(Fix16):   2.04
-conv3:         1.94
-conv5:         2.36
+conv3(1e8):     2.03
+conv5(1e8):     2.39
+conv3(1e6):     1.66
+conv5(1e6):     2.03
+conv3(1e5):     1.60
+conv5(1e5):     2.02
+conv3x3(3):  1.81
+conv3x3(1000):  1.79
+dilate3x3(1000):  3.26
+sobel_magnitude:  1.37
 
 gcc -O2
-sqrt(float):   1.14
+sqrt(float):   1.15
 sqrt(int):     1.86
-sqrt(Fix16):   1.90
-conv3:         1.18
-conv5:         1.34
+sqrt(Fix16):   1.89
+conv3(1e8):     1.22
+conv5(1e8):     1.37
+conv3(1e6):     1.00
+conv5(1e6):     1.04
+conv3(1e5):     0.81
+conv5(1e5):     0.97
+conv3x3(3):  0.25
+conv3x3(1000):  0.23
+dilate3x3(1000):  0.27
+sobel_magnitude:  0.25
 
 gcc -O3 -march=native
-sqrt(float):   1.14
+sqrt(float):   1.15
 sqrt(int):     1.82
 sqrt(Fix16):   1.89
-conv3:         1.10
-conv5:         1.16
+conv3(1e8):     1.12
+conv5(1e8):     1.16
+conv3(1e6):     0.96
+conv5(1e6):     0.97
+conv3(1e5):     0.66
+conv5(1e5):     0.75
+conv3x3(3):  0.23
+conv3x3(1000):  0.21
+dilate3x3(1000):  0.26
+sobel_magnitude:  0.25
 
 python2.7
-sqrt(float):   35.3788838387
-  sqrt(int):   19.5545659065
-sqrt(Fix16):   978.297157049
-conv3:         72.7751071453
-conv5:         103.557267904
+sqrt(float):   34.9008591175
+  sqrt(int):   19.6919620037
+sqrt(Fix16):   966.111785889
+conv3(1e8):    69.0758299828
+conv5(1e8):    101.503945827
+conv3(1e6):    62.212736845
+conv5(1e6):    93.5375850201
+conv3(1e5):    61.4343979359
+conv5(1e5):    93.6144771576
+conv3x3(3):    198.12590003
+conv3x3(1000): 193.030704975
+dilate3x3(1000): 192.323596954
+NoBorderImagePadded: 512.473811865
+NoBorderImagePadded(iter): 503.393321991
+NoBorderImagePadded(range): 493.907886028
+NoBorderImage: 501.37309289
+NoBorderImage(iter): 495.473101139
+NoBorderImage(range): 493.572232008
+sobel(NoBorderImagePadded): 433.678281069
diff --git a/talk/iwtc11/paper.tex b/talk/iwtc11/paper.tex
--- a/talk/iwtc11/paper.tex
+++ b/talk/iwtc11/paper.tex
@@ -524,22 +524,71 @@
 \end{lstlisting}
 
 \subsection{Allocation Removals}
-Using escape analysis we can XXX
+By using escape analysis it is possible to identify objects that are
+allocated within the loop but never escapes it. That is the object are
+short lived and no references to them exists outside the loop. This
+is performed by processing the operation from top to bottom and
+optimistically removing every \lstinline{new} operation. Later on if
+it is discovered that a reference to the object escapes the loop, the
+\lstinline{new} operation is inserted at this point. All operations
+(\lstinline{get} and \lstinline{set}) on the removed objects are also
+removed and the optimizer needs to keep track of the value of all
+attributes of the object.
 
-Let $\tilde J$ be all variables in $J$ not representing virtuals (in the
-same order). Extend it with all non-virtual fields, $H_i$, of the
-removed virtuals,
+Consider again the original unoptimized trace of
+Figure~\label{fig:peeled-trace}. Line 10 contains the first
+allocation. It is removed and $p_5$ is marked as virtual. This means
+that it refers to an virtual object that was not yet
+(and might never be) allocated. Line 12 sets the \lstinline{intval}
+attribute of $p_5$. This operation is also removed and the optimizer
+registers that the attribute \lstinline{intval} of $p_5$ is $i_4$.
+
+When the optimizer reaches line 13 it needs to construct the
+arguments for the \lstinline{jump} operation, which contains the virtual
+reference $p_5$. This can be achieved by exploding $p_5$ into it's
+attributes. In this case there is only one attribute and it's value is
+$i_4$, which means the $p_5$ is replaced with $i_4$ in the jump
+arguments. 
+
+In the general case, each virtual in the jump arguments is exploded into a
+vector of variables containing the values of all it's attributes. If some
+of the attributes are themselves virtuals they are recursively exploded
+to make the vector contain only non virtual variables. Some care has
+to be taken to always place the attributes in the same order when
+performing this explosion. Notation becomes somewhat simpler if also every non
+virtual variable of the jump arguments is exploded into a vector. This will
+be a vector containing the original variable only. To summarize, for
+every variable, $J_k$, of the original jump arguments, $J$, let
 \begin{equation}
-  \hat J = \left(\tilde J_1, \tilde J_2, \cdots, \tilde J_{|\tilde J|}, 
-                 H_1, H_2, \cdots, H_{|H}\right)
+  \tilde J^{\left(k\right)} = \left\{
+      \begin{array}{ll}
+        \left(J_k\right)  & \text{if $J_k$ is not virtual} \\
+        H^{\left(k\right)} & \text{if $J_k$ is virtual}
+      \end{array}
+  \right.
+  ,
 \end{equation}
-and let
+where $H^{\left(k\right)}$ is a vector containing all non virtual
+attributes of $J_k$. The arguments of the optimized \lstinline{jump}
+operation are constructed as the concatenation all the $\tilde 
J^{\left(k\right)}$ vectors,
+\begin{equation}
+  \hat J = \left( 
+    \begin{array}{cccc}
+      \tilde J^{\left(1\right)} & \tilde J^{\left(2\right)} & \cdots &
+      \tilde J^{\left(|J|\right)} \\
+    \end{array}
+  \right)      
+  .
+\end{equation}
+and the arguments of the \lstinline{jump} operation of the second
+operation, $K$, are replaced by inlining $\hat J$, 
 \begin{equation}
   \hat K = \left(m\left(\hat J_1\right), m\left(\hat J_1\right), 
                  \cdots, m\left(\hat J_{|\hat J|}\right)\right)
   .
 \end{equation}
-
+In the optimized trace $I$ is replaced by $\hat I$ and $K$ by $\hat
+K$. The trace from Figure~\ref{fig:unopt-trace} will be optimized into
 
 \begin{lstlisting}[mathescape,numbers = 
right,basicstyle=\setstretch{1.05}\ttfamily\scriptsize]
 $l_0$($p_{0}$, $p_{1}$):
@@ -551,20 +600,94 @@
         # inside BoxedInteger.add__int
         $i_{3}$ = get($p_{0}$, intval)
         $i_{4}$ = int_add($i_{2}$, $i_{3}$)
-jump($l_1$, $p_{0}$, $i_3$, $i_4$)
+            # inside BoxedInteger.__init__
+jump($l_1$, $p_{0}$, $i_{4}$)
 
-$l_1$($p_{0}$, $p_{5}$, $i_3$, $i_4$):
+$l_1$($p_{0}$, $i_{4}$):
 # inside f: y = y.add(step)
     # inside BoxedInteger.add
+    guard_class($p_{0}$, BoxedInteger)
         # inside BoxedInteger.add__int
-        $i_{8}$ = int_add($i_{4}$, $i_{3}$)
-jump($l_1$, $p_{0}$, $i_3$, $i_8$)
+        $i_{7}$ = get($p_{0}$, intval)
+        $i_{8}$ = int_add($i_{4}$, $i_{7}$)
+            # inside BoxedInteger.__init__
+jump($l_1$, $p_{0}$, $i_8$)
 \end{lstlisting}
 
-And we're down to a single integer addition!
+Note that virtuals are only exploded into their attributes when
+constructing the arguments of the jump of the first iteration. This
+explosion can't be repeated when constructing the arguments of the
+jump of the second iteration as it has to mach the first. This means
+the objects that was passed as pointers (non virtuals) from the first
+iteration to the second also has to be passed as pointers from the
+second iteration to the third. If one of these objects are virtual
+at the end of the second iteration they need to be allocated right
+before the jump. With the simple objects considered in this paper,
+that is not a problem. However in more complicated interpreters such
+an allocation might, in combination with other optimizations, lead
+to additional variables from the first iteration being imported into
+the second. This extends both $\hat J$ and $\hat K$, which means that
+some care has to be taken, when implementing this, to allow $\hat J$ to
+grow while inlining it into $\hat K$.
 
 \section{Benchmarks}
 
+The loop peeling optimization was implemented in the PyPy
+framework. That means that the jit compilers generated for all
+interpreters implemented within PyPy now can take advantage of
+it. Benchmarks have been executed for a few different interpreters and
+we see improvements in several cases. The ideal loop for this optimization
+would be short numerical calculations with no failing guards and no
+external calls.
+
+\subsection{Python}
+The python interpreter of the PyPy framework is a complete python
+version 2.7 compatible interpreter. A set of numerical
+calculations where implemented in both python and in C and their
+runtimes compared. The benchmarks are
+\begin{itemize}
+\item {\bf sqrt}: approximates the square root of $y$ as $x_\infty$
+  with $x_0=y/2$ and $x_k = \left( x_{k-1} + y/x_{k-1} \right) /
+  2$. There are three different versions of this benchmark where $x_k$
+  is represented with different type of objects: int's, float's and
+  Fix16's. The later, Fix16, is a custom class that implements
+  fixpoint arithmetic with 16 bits precision. In python there is only
+  a single implementation of the benchmark that gets specialized
+  depending on the class of it's input argument, $y$, while in C,
+  there is three different implementations.
+\item {\bf conv3}: one dimensional convolution with a kernel of fixed
+  size $3$.
+\item {\bf conv5}: one dimensional convolution with a kernel of fixed
+  size $5$.
+\item {\bf conv3x3}: two dimensional convolution with kernel of fixed
+  size $3 \times 3$ using a custom class to represent two dimensional
+  arrays.
+\item {\bf dilate3x3}: two dimensional dilation with kernel of fixed
+  size $3 \times 3$. This is similar to convolution but instead of
+  summing over the elements, the maximum is taken. That places a
+  external call to a max function within the loop that prevents some
+  of the optimizations.
+\item {\bf sobel}: an low level video processing algorithm used to
+  locate edges in an image. It calculated the gradient magnitude
+  using sobel derivatives. The algorithm is in python implemented
+  on top of a custom image class that is specially designed for the
+  problem. It ensures that there will be no failing guards, and makes
+  a lot of the two dimension index calculations loop invariant. The
+  intention there is twofold. It shows that the performance impact of
+  having wrapper classes giving objects some application specific
+  properties is negligible. This is due to the inlining performed
+  during the tracing and the allocation removal of the index objects
+  introduced. It also shows that it is possible to do some low level
+  hand optimizations of the python code and hide those optimization
+  under a nice interface without loosing performance.
+\end{itemize}
+
+\subsection{Numpy}
+XXX: Fijal?
+
+\subsection{Prolog}
+XXX: Carl?
+
 %\appendix
 %\section{Appendix Title}
 
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to