To see it happening please create two files matrix.lisp and matrix2.lisp
(they only differ by part of a declaration being commented out and a
different name for the global function--main and main2 respectively):
matrix.lisp:
;; File-wide optimization policy: speed at its highest level, with safety,
;; debug and compilation-speed all at 0.
(declaim (optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0)))
;; Begin a block-compiled unit (CMUCL extension) whose global entry point is
;; the function named here; the unit is closed by the END-BLOCK declaim.
(declaim (start-block main))
;; MATMUL multiplies two matrices held in flat (unsigned-byte 32) vectors and
;; stores the product in C, which is also the return value.
;;   A       - left operand; element (i,l) is read at index i*M + l.
;;   B       - right operand; element (l,j) is read at index l*K + j.
;;   C       - destination; element (i,j) is written at index i*M + j.
;;   N, M, K - fixnum loop bounds for i, l and j respectively.
;; NOTE(review): C is indexed with the row stride M rather than K, so the
;; result is laid out as a row-major N x K matrix only when M = K (as in the
;; 30 30 30 call made in this file) -- confirm before using other shapes.
;; The local OPTIMIZE declaration (active in matrix.lisp, commented out in
;; matrix2.lisp) repeats the file-level proclamation.
(defun matmul (a b c n m k)
(declare (optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0))
(type (simple-array (unsigned-byte 32) (*)) a b c)
(fixnum n m k))
;; SUM accumulates one output element.  I1 and K2 are running offsets that
;; are advanced by addition instead of being recomputed by multiplication;
;; I1 starts at -M so that its first increment yields 0.
(let ((sum 0)
(i1 (- m))
(k2 0))
(declare (type (unsigned-byte 32) sum) (type fixnum i1 k2))
;; Outer loop over i; the DOTIMES result form makes C the return value.
(dotimes (i n c)
(declare (fixnum i))
(setf i1 (+ i1 m)) ;; i1=i*m
(dotimes (j k)
(declare (fixnum j))
(setf sum 0)
;; Reset K2 to -K so that the first increment in the inner loop yields 0.
(setf k2 (- k))
(dotimes (l m)
(declare (fixnum l))
(setf k2 (+ k2 k)) ;; k2= l*k
;; sum += A[i1 + l] * B[k2 + j].
;; NOTE(review): the THE forms assert that every product and partial sum
;; fits in (unsigned-byte 32); under the safety 0 policy this is presumably
;; not checked at run time -- confirm.
(setf sum (the (unsigned-byte 32) (+ (the (unsigned-byte 32) sum)
(the (unsigned-byte 32) (* (aref a (+
i1 l))
(aref b (+
k2 j))))))))
;; Store the finished element at offset I1 + J of C.
(setf (aref c (+ i1 j)) sum)))))
;; Benchmark driver and global entry point of the block: builds three 30x30
;; matrices with MAKE-MATRIX, calls MATMUL 10000 times (multiplying M1 by M2
;; into M3), then prints four elements of the result on one line.
(defun main ()
(let* ((m1 (make-matrix 30 30))
(m2 (make-matrix 30 30))
(m3 (make-matrix 30 30))
;; MM is a 30x30 array displaced to M3, giving two-index access to the
;; flat result vector.
(mm (make-array '(30 30) :element-type '(unsigned-byte 32) :displaced-to m3)))
;; Repeat the multiplication so the run is long enough to time.
(dotimes (i 10000) (matmul m1 m2 m3 30 30 30))
(format t "~D ~D ~D ~D~%"
(aref mm 0 0) (aref mm 2 3) (aref mm 3 2) (aref mm 4 4))))
;; Allocate a flat (unsigned-byte 32) vector of ROWS*COLS elements, fill
;; element i with i+1 (so the contents are 1, 2, ..., ROWS*COLS) and return it.
(defun make-matrix (rows cols)
(declare (type (unsigned-byte 32) rows cols))
;; SPACE is declared FIXNUM below, so the product ROWS*COLS is assumed to
;; fit in a fixnum.
(let* ((space (* rows cols))
(matrix (make-array space
:element-type '(unsigned-byte 32))))
(declare (type (simple-array (unsigned-byte 32) (*)) matrix)
(fixnum space))
;; Initialise every element to its 1-based position in the vector.
(loop :for i :of-type fixnum :from 0 :below space
:do (setf (aref matrix i) (1+ i)))
matrix))
;; End of the block opened by the START-BLOCK declaim above.
(declaim (end-block))
matrix2.lisp:
;; File-wide optimization policy: speed at its highest level, with safety,
;; debug and compilation-speed all at 0.
(declaim (optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0)))
;; Begin a block-compiled unit (CMUCL extension) whose global entry point is
;; the function named here; the unit is closed by the END-BLOCK declaim.
(declaim (start-block main2))
;; MATMUL multiplies two matrices held in flat (unsigned-byte 32) vectors and
;; stores the product in C, which is also the return value.
;;   A       - left operand; element (i,l) is read at index i*M + l.
;;   B       - right operand; element (l,j) is read at index l*K + j.
;;   C       - destination; element (i,j) is written at index i*M + j.
;;   N, M, K - fixnum loop bounds for i, l and j respectively.
;; NOTE(review): C is indexed with the row stride M rather than K, so the
;; result is laid out as a row-major N x K matrix only when M = K (as in the
;; 30 30 30 call made in this file) -- confirm before using other shapes.
;; The local OPTIMIZE declaration (active in matrix.lisp, commented out in
;; matrix2.lisp) repeats the file-level proclamation.
(defun matmul (a b c n m k)
(declare ;;(optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0))
(type (simple-array (unsigned-byte 32) (*)) a b c)
(fixnum n m k))
;; SUM accumulates one output element.  I1 and K2 are running offsets that
;; are advanced by addition instead of being recomputed by multiplication;
;; I1 starts at -M so that its first increment yields 0.
(let ((sum 0)
(i1 (- m))
(k2 0))
(declare (type (unsigned-byte 32) sum) (type fixnum i1 k2))
;; Outer loop over i; the DOTIMES result form makes C the return value.
(dotimes (i n c)
(declare (fixnum i))
(setf i1 (+ i1 m)) ;; i1=i*m
(dotimes (j k)
(declare (fixnum j))
(setf sum 0)
;; Reset K2 to -K so that the first increment in the inner loop yields 0.
(setf k2 (- k))
(dotimes (l m)
(declare (fixnum l))
(setf k2 (+ k2 k)) ;; k2= l*k
;; sum += A[i1 + l] * B[k2 + j].
;; NOTE(review): the THE forms assert that every product and partial sum
;; fits in (unsigned-byte 32); under the safety 0 policy this is presumably
;; not checked at run time -- confirm.
(setf sum (the (unsigned-byte 32) (+ (the (unsigned-byte 32) sum)
(the (unsigned-byte 32) (* (aref a (+
i1 l))
(aref b (+
k2 j))))))))
;; Store the finished element at offset I1 + J of C.
(setf (aref c (+ i1 j)) sum)))))
;; Benchmark driver and global entry point of the block: builds three 30x30
;; matrices with MAKE-MATRIX, calls MATMUL 10000 times (multiplying M1 by M2
;; into M3), then prints four elements of the result on one line.
(defun main2 ()
(let* ((m1 (make-matrix 30 30))
(m2 (make-matrix 30 30))
(m3 (make-matrix 30 30))
;; MM is a 30x30 array displaced to M3, giving two-index access to the
;; flat result vector.
(mm (make-array '(30 30) :element-type '(unsigned-byte 32) :displaced-to m3)))
;; Repeat the multiplication so the run is long enough to time.
(dotimes (i 10000) (matmul m1 m2 m3 30 30 30))
(format t "~D ~D ~D ~D~%"
(aref mm 0 0) (aref mm 2 3) (aref mm 3 2) (aref mm 4 4))))
;; Allocate a flat (unsigned-byte 32) vector of ROWS*COLS elements, fill
;; element i with i+1 (so the contents are 1, 2, ..., ROWS*COLS) and return it.
(defun make-matrix (rows cols)
(declare (type (unsigned-byte 32) rows cols))
;; SPACE is declared FIXNUM below, so the product ROWS*COLS is assumed to
;; fit in a fixnum.
(let* ((space (* rows cols))
(matrix (make-array space
:element-type '(unsigned-byte 32))))
(declare (type (simple-array (unsigned-byte 32) (*)) matrix)
(fixnum space))
;; Initialise every element to its 1-based position in the vector.
(loop :for i :of-type fixnum :from 0 :below space
:do (setf (aref matrix i) (1+ i)))
matrix))
;; End of the block opened by the START-BLOCK declaim above.
(declaim (end-block))
Please perform a diff to confirm that these are the only differences:
$ diff matrix.lisp matrix2.lisp
3c3
< (declaim (start-block main))
---
> (declaim (start-block main2))
6c6
< (declare (optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0))
---
> (declare ;;(optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0))
28c28
< (defun main ()
---
> (defun main2 ()
Now try this speed test:
(compile-file "matrix.lisp") (compile-file "matrix2.lisp") (load "matrix.x86f") (time
(main)) (load "matrix2.x86f") (time (main2))
On my system (CMUCL current on x86) matrix2 is consistently slower. It
is not obvious why this should be so, because the global proclamation
already specifies the same optimization settings.
I have compared the disassembly side by side and it is indeed
significantly different mid way through. As this is block compilation
there is just one large chunk of assembly code for each of main and
main2.
If the declaim start- and end-blocks are commented out both versions run
at the same speed (and the disassembly of matmul is visually identical).
There appears to be an optimisation bug in block compilation.
Regards,
Adam