0.8.3.13:

author Christophe Rhodes <csr21@cam.ac.uk>

Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)

committer Christophe Rhodes <csr21@cam.ac.uk>

Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)
author Christophe Rhodes <csr21@cam.ac.uk>
Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)
committer Christophe Rhodes <csr21@cam.ac.uk>
Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)
diff --git a/NEWS b/NEWS

index cc907ac..17f7410 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2017,6 +2017,8 @@ changes in sbcl-0.8.4 relative to sbcl-0.8.3:
    * optimization: compiler-internal data structure use has been
      reviewed, and changes have been made that should improve the
      performance of the compiler by about 20%.
+  * microoptimization: the compiler is better able to make use of the
+    x86 LEA instruction for multiplication by constants.
    * bug fix: in some situations compiler did not report usage of
      generic arithmetic in (SPEED 3) policy.
  
diff --git a/package-data-list.lisp-expr b/package-data-list.lisp-expr

index 71dbdd9..8eb7a7b 100644 (file)
--- a/package-data-list.lisp-expr
+++ b/package-data-list.lisp-expr
@@ -1124,6 +1124,7 @@ is a good idea, but see SB-SYS re. blurring of boundaries."
               "KEY-INFO" "KEY-INFO-NAME"
               "KEY-INFO-P" "KEY-INFO-TYPE"
               "LAYOUT-DEPTHOID" "LAYOUT-INVALID-ERROR"
+            #!+x86 "%LEA"
              "LEXENV" "LEXENV-DESIGNATOR"
               "LINE-LENGTH"
               "ANSI-STREAM"
diff --git a/src/compiler/sparc/arith.lisp b/src/compiler/sparc/arith.lisp

index 307977c..aece39a 100644 (file)
--- a/src/compiler/sparc/arith.lisp
+++ b/src/compiler/sparc/arith.lisp
@@ -591,6 +591,21 @@
      (inst sra temp y n-fixnum-tag-bits)
      (inst smul r x temp)))
  
+(define-vop (fast-v8-*-c/fixnum=>fixnum fast-safe-arith-op)
+  (:args (x :target r :scs (any-reg zero)))
+  (:info y)
+  (:arg-types tagged-num
+             (:constant (and (signed-byte 13) (not (integer 0 0)))))
+  (:results (r :scs (any-reg)))
+  (:result-types tagged-num)
+  (:note "inline fixnum arithmetic")
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+             (and (member :sparc-v9 *backend-subfeatures*)
+                  (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 1
+    (inst smul r x y)))
+
  (define-vop (fast-v8-*/signed=>signed fast-signed-binop)
    (:translate *)
    (:guard (or (member :sparc-v8 *backend-subfeatures*)
@@ -599,6 +614,14 @@
    (:generator 3
      (inst smul r x y)))
  
+(define-vop (fast-v8-*-c/signed=>signed fast-signed-binop-c)
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+             (and (member :sparc-v9 *backend-subfeatures*)
+                  (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 2
+    (inst smul r x y)))
+         
  (define-vop (fast-v8-*/unsigned=>unsigned fast-unsigned-binop)
    (:translate *)
    (:guard (or (member :sparc-v8 *backend-subfeatures*)
@@ -607,6 +630,14 @@
    (:generator 3
      (inst umul r x y)))
  
+(define-vop (fast-v8-*-c/unsigned=>unsigned fast-unsigned-binop-c)
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+             (and (member :sparc-v9 *backend-subfeatures*)
+                  (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 2
+    (inst umul r x y)))
+
  ;; The smul and umul instructions are deprecated on the Sparc V9.  Use
  ;; mulx instead.
  (define-vop (fast-v9-*/fixnum=>fixnum fast-fixnum-binop)
@@ -1212,11 +1243,76 @@
    (defun ash-right-unsigned (num shuft)
      (ash-right-unsigned num shift)))
  
+(in-package "SB!C")
+
+;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
+;;; come up with a ``better'' multiplication using multiplier
+;;; recoding. There are two different ways the multiplier can be
+;;; recoded. The more obvious is to shift X by the correct amount for
+;;; each bit set in Y and to sum the results. But if there is a string
+;;; of bits that are all set, you can add X shifted by one more then
+;;; the bit position of the first set bit and subtract X shifted by
+;;; the bit position of the last set bit. We can't use this second
+;;; method when the high order bit is bit 31 because shifting by 32
+;;; doesn't work too well.
+(deftransform * ((x y)
+                ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+                (unsigned-byte 32))
+  "recode as shifts and adds"
+  (let ((y (continuation-value y))
+       (adds 0)
+       (shifts 0)
+       (result nil)
+       (first-one nil))
+    (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
+            (add (next-factor)
+              (setf result
+                    (tub32
+                     (if result
+                         (progn (incf adds) `(+ ,result ,(tub32 next-factor)))
+                         next-factor)))))
+      (declare (inline add))
+      (dotimes (bitpos 32)
+       (if first-one
+           (when (not (logbitp bitpos y))
+             (add (if (= (1+ first-one) bitpos)
+                      ;; There is only a single bit in the string.
+                      (progn (incf shifts) `(ash x ,first-one))
+                      ;; There are at least two.
+                      (progn
+                        (incf adds)
+                        (incf shifts 2)
+                        `(- ,(tub32 `(ash x ,bitpos))
+                            ,(tub32 `(ash x ,first-one))))))
+             (setf first-one nil))
+           (when (logbitp bitpos y)
+             (setf first-one bitpos))))
+      (when first-one
+       (cond ((= first-one 31))
+             ((= first-one 30) (incf shifts) (add '(ash x 30)))
+             (t
+              (incf shifts 2)
+              (incf adds)
+              (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
+       (incf shifts)
+       (add '(ash x 31))))
+
+    (cond
+      ;; we assume, perhaps foolishly, that good SPARCs don't have an
+      ;; issue with multiplications.  (Remember that there's a
+      ;; different transform for converting x*2^k to a shift).
+      ((member :sparc-64 *backend-subfeatures*) (give-up-ir1-transform))
+      ((or (member :sparc-v9 *backend-subfeatures*)
+          (member :sparc-v8 *backend-subfeatures*))
+       ;; breakeven point as measured by Raymond Toy
+       (when (> (+ adds shifts) 9)
+        (give-up-ir1-transform))))
+    
+    (or result 0)))
+
  ;; If we can prove that we have a right shift, just do the right shift
  ;; instead of calling the inline ASH which has to check for the
  ;; direction of the shift at run-time.
-(in-package "SB!C")
-
  (deftransform ash ((num shift) (integer integer))
    (let ((num-type (continuation-type num))
         (shift-type (continuation-type shift)))
diff --git a/src/compiler/srctran.lisp b/src/compiler/srctran.lisp

index 478e578..150cb3a 100644 (file)
--- a/src/compiler/srctran.lisp
+++ b/src/compiler/srctran.lisp
@@ -2573,54 +2573,6 @@
         `(- (ash x ,len))
         `(ash x ,len))))
  
-;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
-;;; come up with a ``better'' multiplication using multiplier
-;;; recoding. There are two different ways the multiplier can be
-;;; recoded. The more obvious is to shift X by the correct amount for
-;;; each bit set in Y and to sum the results. But if there is a string
-;;; of bits that are all set, you can add X shifted by one more then
-;;; the bit position of the first set bit and subtract X shifted by
-;;; the bit position of the last set bit. We can't use this second
-;;; method when the high order bit is bit 31 because shifting by 32
-;;; doesn't work too well.
-(deftransform * ((x y)
-                ((unsigned-byte 32) (unsigned-byte 32))
-                (unsigned-byte 32))
-  "recode as shift and add"
-  (unless (constant-continuation-p y)
-    (give-up-ir1-transform))
-  (let ((y (continuation-value y))
-       (result nil)
-       (first-one nil))
-    (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
-            (add (next-factor)
-              (setf result
-                    (tub32
-                     (if result
-                         `(+ ,result ,(tub32 next-factor))
-                         next-factor)))))
-      (declare (inline add))
-      (dotimes (bitpos 32)
-       (if first-one
-           (when (not (logbitp bitpos y))
-             (add (if (= (1+ first-one) bitpos)
-                      ;; There is only a single bit in the string.
-                      `(ash x ,first-one)
-                      ;; There are at least two.
-                      `(- ,(tub32 `(ash x ,bitpos))
-                          ,(tub32 `(ash x ,first-one)))))
-             (setf first-one nil))
-           (when (logbitp bitpos y)
-             (setf first-one bitpos))))
-      (when first-one
-       (cond ((= first-one 31))
-             ((= first-one 30)
-              (add '(ash x 30)))
-             (t
-              (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
-       (add '(ash x 31))))
-    (or result 0)))
-
  ;;; If arg is a constant power of two, turn FLOOR into a shift and
  ;;; mask. If CEILING, add in (1- (ABS Y)), do FLOOR and correct a
  ;;; remainder.
diff --git a/src/compiler/x86/arith.lisp b/src/compiler/x86/arith.lisp

index bb52d48..934da04 100644 (file)
--- a/src/compiler/x86/arith.lisp
+++ b/src/compiler/x86/arith.lisp
@@ -1373,3 +1373,178 @@
      (inst mov tmp y)
      (inst shr tmp 18)
      (inst xor y tmp)))
+
+(in-package "SB!C")
+
+(defknown %lea ((or (signed-byte 32) (unsigned-byte 32))
+               (or (signed-byte 32) (unsigned-byte 32))
+               (member 1 2 4 8) (signed-byte 32))
+  (or (signed-byte 32) (unsigned-byte 32))
+  (foldable flushable))
+
+(defoptimizer (%lea derive-type) ((base index scale disp))
+  (when (and (constant-continuation-p scale)
+            (constant-continuation-p disp))
+    (let ((scale (continuation-value scale))
+         (disp (continuation-value disp))
+         (base-type (continuation-type base))
+         (index-type (continuation-type index)))
+      (when (and (numeric-type-p base-type)
+                (numeric-type-p index-type))
+       (let ((base-lo (numeric-type-low base-type))
+             (base-hi (numeric-type-high base-type))
+             (index-lo (numeric-type-low index-type))
+             (index-hi (numeric-type-high index-type)))
+         (make-numeric-type :class 'integer
+                            :complexp :real
+                            :low (when (and base-lo index-lo)
+                                   (+ base-lo (* index-lo scale) disp))
+                            :high (when (and base-hi index-hi)
+                                    (+ base-hi (* index-hi scale) disp))))))))
+
+(defun %lea (base index scale disp)
+  (+ base (* index scale) disp))
+
+(in-package "SB!VM")
+
+(define-vop (%lea/unsigned=>unsigned)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (unsigned-reg))
+        (index :scs (unsigned-reg)))
+  (:info scale disp)
+  (:arg-types unsigned-num unsigned-num
+             (:constant (member 1 2 4 8))
+             (:constant (signed-byte 32)))
+  (:results (r :scs (unsigned-reg)))
+  (:result-types unsigned-num)
+  (:generator 5
+    (inst lea r (make-ea :dword :base base :index index
+                        :scale scale :disp disp))))
+
+(define-vop (%lea/signed=>signed)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (signed-reg))
+        (index :scs (signed-reg)))
+  (:info scale disp)
+  (:arg-types signed-num signed-num
+             (:constant (member 1 2 4 8))
+             (:constant (signed-byte 32)))
+  (:results (r :scs (signed-reg)))
+  (:result-types signed-num)
+  (:generator 4
+    (inst lea r (make-ea :dword :base base :index index
+                        :scale scale :disp disp))))
+
+(define-vop (%lea/fixnum=>fixnum)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (any-reg))
+        (index :scs (any-reg)))
+  (:info scale disp)
+  (:arg-types tagged-num tagged-num
+             (:constant (member 1 2 4 8))
+             (:constant (signed-byte 32)))
+  (:results (r :scs (any-reg)))
+  (:result-types tagged-num)
+  (:generator 3
+    (inst lea r (make-ea :dword :base base :index index
+                        :scale scale :disp disp))))
+
+(in-package "SB!C")
+
+;;; This is essentially a straight implementation of the algorithm in
+;;; "Strength Reduction of Multiplications by Integer Constants",
+;;; Youfeng Wu, ACM SIGPLAN Notices, Vol. 30, No.2, February 1995.
+(defun basic-decompose-multiplication (arg num n-bits condensed)
+  (case (aref condensed 0)
+    (0
+     (let ((tmp (min 3 (aref condensed 1))))
+       (decf (aref condensed 1) tmp)
+       `(truly-the (unsigned-byte 32)
+        (%lea ,arg
+              ,(decompose-multiplication
+                arg (ash (1- num) (- tmp)) (1- n-bits) (subseq condensed 1))
+              ,(ash 1 tmp) 0))))
+    ((1 2 3)
+     (let ((r0 (aref condensed 0)))
+       (incf (aref condensed 1) r0)
+       `(truly-the (unsigned-byte 32)
+        (%lea ,(decompose-multiplication
+                arg (- num (ash 1 r0)) (1- n-bits) (subseq condensed 1))
+              ,arg
+              ,(ash 1 r0) 0))))
+    (t (let ((r0 (aref condensed 0)))
+        (setf (aref condensed 0) 0)
+        `(truly-the (unsigned-byte 32)
+          (ash ,(decompose-multiplication
+                 arg (ash num (- r0)) n-bits condensed)
+               ,r0))))))
+
+(defun decompose-multiplication (arg num n-bits condensed)
+  (cond
+    ((= n-bits 0) 0)
+    ((= num 1) arg)
+    ((= n-bits 1)
+     `(truly-the (unsigned-byte 32) (ash ,arg ,(1- (integer-length num)))))
+    ((let ((max 0) (end 0))
+       (loop for i from 2 to (length condensed)
+            for j = (reduce #'+ (subseq condensed 0 i))
+            when (and (> (- (* 2 i) 3 j) max)
+                      (< (+ (ash 1 (1+ j))
+                            (ash (ldb (byte (- 32 (1+ j)) (1+ j)) num)
+                                 (1+ j)))
+                         (ash 1 32)))
+              do (setq max (- (* 2 i) 3 j)
+                       end i))
+       (when (> max 0)
+        (let ((j (reduce #'+ (subseq condensed 0 end))))
+          (let ((n2 (+ (ash 1 (1+ j))
+                       (ash (ldb (byte (- 32 (1+ j)) (1+ j)) num) (1+ j))))
+                (n1 (1+ (ldb (byte (1+ j) 0) (lognot num)))))
+          `(truly-the (unsigned-byte 32)
+            (- ,(optimize-multiply arg n2) ,(optimize-multiply arg n1))))))))
+    ((dolist (i '(9 5 3))
+       (when (integerp (/ num i))
+        (when (< (logcount (/ num i)) (logcount num))
+          (let ((x (gensym)))
+            (return `(let ((,x ,(optimize-multiply arg (/ num i))))
+                      (truly-the (unsigned-byte 32)
+                       (%lea ,x ,x (1- ,i) 0)))))))))
+    (t (basic-decompose-multiplication arg num n-bits condensed))))
+          
+(defun optimize-multiply (arg x)
+  (let* ((n-bits (logcount x))
+        (condensed (make-array n-bits)))
+    (let ((count 0) (bit 0))
+      (dotimes (i 32)
+       (cond ((logbitp i x)
+              (setf (aref condensed bit) count)
+              (setf count 1)
+              (incf bit))
+             (t (incf count)))))
+    (decompose-multiplication arg x n-bits condensed)))
+
+(deftransform * ((x y)
+                ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+                (unsigned-byte 32))
+  "recode as leas, shifts and adds"
+  (let ((y (continuation-value y)))
+    (cond
+      ((= y (ash 1 (integer-length y)))
+       ;; there's a generic transform for y = 2^k
+       (give-up-ir1-transform))
+      ((member y '(3 5 9))
+       ;; we can do these multiplications directly using LEA
+       `(%lea x x ,(1- y) 0))
+      ((member :pentium4 *backend-subfeatures*)
+       ;; the pentium4's multiply unit is reportedly very good
+       (give-up-ir1-transform))
+      ;; FIXME: should make this more fine-grained.  If nothing else,
+      ;; there should probably be a cutoff of about 9 instructions on
+      ;; pentium-class machines.
+      (t (optimize-multiply 'x y)))))
+
+;;; FIXME: we should also be able to write an optimizer or two to
+;;; convert (+ (* x 2) 17), (- (* x 9) 5) to a %LEA.
diff --git a/tests/arith.pure.lisp b/tests/arith.pure.lisp

index c2f0ac0..7493d4e 100644 (file)
--- a/tests/arith.pure.lisp
+++ b/tests/arith.pure.lisp
@@ -81,5 +81,24 @@
                (assert (<= exact-q q))
                (assert (< q (1+ exact-q))))))
  
-;; CEILING had a corner case, spotted by Paul Dietz
+;;; CEILING had a corner case, spotted by Paul Dietz
  (assert (= (ceiling most-negative-fixnum (1+ most-positive-fixnum)) -1))
+
+;;; give any optimizers of constant multiplication a light testing.
+;;; 100 may seem low, but (a) it caught CSR's initial errors, and (b)
+;;; before checking in, CSR tested with 10000.  So one hundred
+;;; checkins later, we'll have doubled the coverage.
+(dotimes (i 100)
+  (let* ((x (random most-positive-fixnum))
+        (x2 (* x 2))
+        (x3 (* x 3)))
+    (let ((fn (handler-bind ((sb-ext:compiler-note #'error))
+               (compile nil
+                        `(lambda (y)
+                           (declare (optimize speed) (type (integer 0 3) y))
+                           (* y ,x))))))
+      (unless (and (= (funcall fn 0) 0)
+                  (= (funcall fn 1) x)
+                  (= (funcall fn 2) x2)
+                  (= (funcall fn 3) x3))
+       (error "bad results for ~D" x)))))
diff --git a/version.lisp-expr b/version.lisp-expr

index 8655a3a..542a21d 100644 (file)
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
  ;;; checkins which aren't released. (And occasionally for internal
  ;;; versions, especially for internal versions off the main CVS
  ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.8.3.12"
+"0.8.3.13"
author	Christophe Rhodes <csr21@cam.ac.uk>
	Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)
committer	Christophe Rhodes <csr21@cam.ac.uk>
	Fri, 29 Aug 2003 17:59:08 +0000 (17:59 +0000)
NEWS		patch \| blob \| history
package-data-list.lisp-expr		patch \| blob \| history
src/compiler/sparc/arith.lisp		patch \| blob \| history
src/compiler/srctran.lisp		patch \| blob \| history
src/compiler/x86/arith.lisp		patch \| blob \| history
tests/arith.pure.lisp		patch \| blob \| history
version.lisp-expr		patch \| blob \| history