Optimize integer division by a constant in several cases.

author Lutz Euler <lutz.euler@freenet.de>

Mon, 25 Jul 2011 00:47:43 +0000 (02:47 +0200)

committer Nikodemus Siivola <nikodemus@sb-studio.net>

Fri, 5 Aug 2011 13:10:13 +0000 (16:10 +0300)
author Lutz Euler <lutz.euler@freenet.de>
Mon, 25 Jul 2011 00:47:43 +0000 (02:47 +0200)
committer Nikodemus Siivola <nikodemus@sb-studio.net>
Fri, 5 Aug 2011 13:10:13 +0000 (16:10 +0300)
diff --git a/NEWS b/NEWS

index 61e48f0..82ae01e 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,8 @@ changes relative to sbcl-1.0.50:
      and probe counts on Linux.
    * enhancement: building 32-bit SBCL on Linux/x86-64 now works without a
      chroot. (Use "SBCL_ARCH=x86 sh make.sh" to build.)
+  * optimization: unsigned integer divisions by a constant are implemented
+    using multiplication (affects CEILING, FLOOR, TRUNCATE, MOD, and REM.)
    * bug fix: correct RIP offset calculation in SSE comparison and shuffle
      instructions. (lp#814688)
    * bug fix: COERCE to unfinalized extended sequence classes now works.
diff --git a/src/compiler/srctran.lisp b/src/compiler/srctran.lisp

index ce1ac5a..1379d9f 100644 (file)
--- a/src/compiler/srctran.lisp
+++ b/src/compiler/srctran.lisp
@@ -3264,6 +3264,112 @@
        `(if (minusp x)
             (- (logand (- x) ,mask))
             (logand x ,mask)))))
+
+;;; Return an expression to calculate the integer quotient of X and
+;;; constant Y, using multiplication, shift and add/sub instead of
+;;; division. Both arguments must be unsigned, fit in a machine word and
+;;; Y must neither be zero nor a power of two. The quotient is rounded
+;;; towards zero.
+;;; The algorithm is taken from the paper "Division by Invariant
+;;; Integers using Multiplication", 1994 by Torbjörn Granlund and Peter
+;;; L. Montgomery, Figures 4.2 and 6.2, modified to exclude the case of
+;;; division by powers of two.
+;;; The following two examples show an average case and the worst case
+;;; with respect to the complexity of the generated expression, under
+;;; a word size of 64 bits:
+;;;
+;;; (UNSIGNED-DIV-TRANSFORMER 10) ->
+;;; (ASH (%MULTIPLY (ASH X 0) 14757395258967641293) -3)
+;;;
+;;; (UNSIGNED-DIV-TRANSFORMER 7) ->
+;;; (LET* ((NUM X)
+;;;        (T1 (%MULTIPLY NUM 2635249153387078803)))
+;;;   (ASH (LDB (BYTE 64 0)
+;;;             (+ T1 (ASH (LDB (BYTE 64 0)
+;;;                             (- NUM T1))
+;;;                        -1)))
+;;;        -2))
+;;;
+(defun gen-unsigned-div-by-constant-expr (y)
+  (declare (type (integer 3 #.most-positive-word) y))
+  (aver (not (zerop (logand y (1- y)))))
+  (labels ((ld (x)
+             ;; the floor of the binary logarithm of (positive) X
+             (integer-length (1- x)))
+           (choose-multiplier (y precision)
+             (do* ((l (ld y))
+                   (shift l (1- shift))
+                   (expt-2-n+l (expt 2 (+ sb!vm:n-word-bits l)))
+                   (m-low (truncate expt-2-n+l y) (ash m-low -1))
+                   (m-high (truncate (+ expt-2-n+l
+                                        (ash expt-2-n+l (- precision)))
+                                     y)
+                           (ash m-high -1)))
+                  ((not (and (< (ash m-low -1) (ash m-high -1))
+                             (> shift 0)))
+                   (values m-high shift)))))
+    (let ((n (expt 2 sb!vm:n-word-bits))
+          (shift1 0))
+      (multiple-value-bind (m shift2)
+          (choose-multiplier y sb!vm:n-word-bits)
+        (when (and (>= m n) (evenp y))
+          (setq shift1 (ld (logand y (- y))))
+          (multiple-value-setq (m shift2)
+            (choose-multiplier (/ y (ash 1 shift1))
+                               (- sb!vm:n-word-bits shift1))))
+        (if (>= m n)
+            (flet ((word-mod (x)
+                     `(ldb (byte #.sb!vm:n-word-bits 0) ,x)))
+              `(let* ((num x)
+                      (t1 (%multiply num ,(- m n))))
+                 (ash ,(word-mod `(+ t1 (ash ,(word-mod `(- num t1))
+                                             -1)))
+                      ,(- 1 shift2))))
+            `(ash (%multiply (ash x ,(- shift1)) ,m)
+                  ,(- shift2)))))))
+
+;;; If the divisor is constant and both args are positive and fit in a
+;;; machine word, replace the division by a multiplication and possibly
+;;; some shifts and an addition. Calculate the remainder by a second
+;;; multiplication and a subtraction. Dead code elimination will
+;;; suppress the latter part if only the quotient is needed. If the type
+;;; of the dividend allows to derive that the quotient will always have
+;;; the same value, emit much simpler code to handle that. (This case
+;;; may be rare but it's easy to detect and the compiler doesn't find
+;;; this optimization on its own.)
+(deftransform truncate ((x y) ((unsigned-byte #.sb!vm:n-word-bits)
+                               (constant-arg
+                                (unsigned-byte #.sb!vm:n-word-bits)))
+                        *
+                        :policy (and (> speed compilation-speed)
+                                     (> speed space)))
+  "convert integer division to multiplication"
+  (let ((y (lvar-value y)))
+    ;; Division by zero, one or powers of two is handled elsewhere.
+    (when (zerop (logand y (1- y)))
+      (give-up-ir1-transform))
+    ;; The compiler can't derive the result types to maximal tightness
+    ;; from the transformed expression, so we calculate them here and
+    ;; add the corresponding specifiers explicitly through TRULY-THE.
+    ;; This duplicates parts of the TRUNCATE DERIVE-TYPE optimizer but
+    ;; using that here would be too cumbersome.
+    (let* ((x-type (lvar-type x))
+           (x-low (or (and (numeric-type-p x-type)
+                           (numeric-type-low x-type))
+                      0))
+           (x-high (or (and (numeric-type-p x-type)
+                            (numeric-type-high x-type))
+                       (1- (expt 2 #.sb!vm:n-word-bits))))
+           (quot-low (truncate x-low y))
+           (quot-high (truncate x-high y)))
+      (if (= quot-low quot-high)
+          `(values ,quot-low
+                   (- x ,(* quot-low y)))
+          `(let* ((quot ,(gen-unsigned-div-by-constant-expr y))
+                  (rem (ldb (byte #.sb!vm:n-word-bits 0)
+                            (- x (* quot ,y)))))
+             (values (truly-the (integer ,quot-low ,quot-high) quot)
+                     (truly-the (integer 0 ,(1- y)) rem)))))))
  \f
  ;;;; arithmetic and logical identity operation elimination
  
diff --git a/tests/arith.pure.lisp b/tests/arith.pure.lisp

index dde318b..d570df2 100644 (file)
--- a/tests/arith.pure.lisp
+++ b/tests/arith.pure.lisp
@@ -404,3 +404,67 @@
                                  (if (eql fast-result slow-result)
                                      (print (list :ok `(,op ,@args) :=> fast-result))
                                      (error "oops: ~S, ~S" args call-args)))))))))))
+
+;;; (TRUNCATE <unsigned-word> <constant unsigned-word>) is optimized
+;;; to use multiplication instead of division. This propagates to FLOOR,
+;;; MOD and REM. Test that the transform is indeed triggered and test
+;;; several cases for correct results.
+(with-test (:name (:integer-division-using-multiplication :used)
+                  :skipped-on '(not (or :x86-64 :x86)))
+  (dolist (fun '(truncate floor ceiling mod rem))
+    (let* ((foo (compile nil `(lambda (x)
+                                (declare (optimize (speed 3)
+                                                   (space 0)
+                                                   (compilation-speed 0))
+                                         (type (unsigned-byte
+                                                ,sb-vm:n-word-bits) x))
+                                (,fun x 9))))
+           (disassembly (with-output-to-string (s)
+                          (disassemble foo :stream s))))
+      ;; KLUDGE copied from test :float-division-using-exact-reciprocal
+      ;; in compiler.pure.lisp.
+      (assert (and (not (search "DIV" disassembly))
+                   (search "MUL" disassembly))))))
+
+(with-test (:name (:integer-division-using-multiplication :correctness))
+  (let ((*random-state* (make-random-state t)))
+    (dolist (dividend-type `((unsigned-byte ,sb-vm:n-word-bits)
+                             (and fixnum unsigned-byte)
+                             (integer 10000 10100)))
+      (dolist (divisor `(;; Some special cases from the paper
+                         7 10 14 641 274177
+                         ;; Range extremes
+                         3
+                         ,most-positive-fixnum
+                         ,(1- (expt 2 sb-vm:n-word-bits))
+                         ;; Some random values
+                         ,@(loop for i from 8 to sb-vm:n-word-bits
+                                 for r = (random (expt 2 i))
+                                 ;; We don't want 0, 1 and powers of 2.
+                                 when (not (zerop (logand r (1- r))))
+                                 collect r)))
+        (dolist (fun '(truncate ceiling floor mod rem))
+          (let ((foo (compile nil `(lambda (x)
+                                     (declare (optimize (speed 3)
+                                                        (space 0)
+                                                        (compilation-speed 0))
+                                              (type ,dividend-type x))
+                                     (,fun x ,divisor)))))
+            (dolist (dividend `(0 1 ,most-positive-fixnum
+                                ,(1- divisor) ,divisor
+                                ,(1- (* divisor 2)) ,(* divisor 2)
+                                ,@(loop repeat 4
+                                        collect (+ 10000 (random 101)))
+                                ,@(loop for i from 4 to sb-vm:n-word-bits
+                                        for r = (random (expt 2 i))
+                                        collect r)))
+              (when (typep dividend dividend-type)
+                (multiple-value-bind (q1 r1)
+                    (funcall foo dividend)
+                  (multiple-value-bind (q2 r2)
+                      (funcall fun dividend divisor)
+                    (unless (and (= q1 q2)
+                                 (eql r1 r2))
+                      (error "bad results for ~s with dividend type ~s"
+                             (list fun dividend divisor)
+                             dividend-type))))))))))))
author	Lutz Euler <lutz.euler@freenet.de>
	Mon, 25 Jul 2011 00:47:43 +0000 (02:47 +0200)
committer	Nikodemus Siivola <nikodemus@sb-studio.net>
	Fri, 5 Aug 2011 13:10:13 +0000 (16:10 +0300)
NEWS		patch \| blob \| history
src/compiler/srctran.lisp		patch \| blob \| history
tests/arith.pure.lisp		patch \| blob \| history