From: Christophe Rhodes <csr21@cam.ac.uk>
Date: Fri, 29 Aug 2003 17:59:08 +0000 (+0000)
Subject: 0.8.3.13:
X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=3bc5fbfb7f1528cb2c2e49b2d15fcaa6c62f5b49;p=sbcl.git

0.8.3.13:
	Implement better constant multiply routines
	... have a cutoff on Sparc, as measured by Raymond Toy
	... use LEA more on x86, as per cited paper
	... don't do anything at all (yet) on other architectures.  This
		needs to be fixed before 0.8.4, at least for PPC and
		Alpha
---

diff --git a/NEWS b/NEWS
index cc907ac..17f7410 100644
--- a/NEWS
+++ b/NEWS
@@ -2017,6 +2017,8 @@ changes in sbcl-0.8.4 relative to sbcl-0.8.3:
   * optimization: compiler-internal data structure use has been
     reviewed, and changes have been made that should improve the
     performance of the compiler by about 20%.
+  * microoptimization: the compiler is better able to make use of the
+    x86 LEA instruction for multiplication by constants.
   * bug fix: in some situations compiler did not report usage of
     generic arithmetic in (SPEED 3) policy.
 
diff --git a/package-data-list.lisp-expr b/package-data-list.lisp-expr
index 71dbdd9..8eb7a7b 100644
--- a/package-data-list.lisp-expr
+++ b/package-data-list.lisp-expr
@@ -1124,6 +1124,7 @@ is a good idea, but see SB-SYS re. blurring of boundaries."
              "KEY-INFO" "KEY-INFO-NAME"
              "KEY-INFO-P" "KEY-INFO-TYPE"
              "LAYOUT-DEPTHOID" "LAYOUT-INVALID-ERROR"
+	     #!+x86 "%LEA"
 	     "LEXENV" "LEXENV-DESIGNATOR"
              "LINE-LENGTH"
              "ANSI-STREAM"
diff --git a/src/compiler/sparc/arith.lisp b/src/compiler/sparc/arith.lisp
index 307977c..aece39a 100644
--- a/src/compiler/sparc/arith.lisp
+++ b/src/compiler/sparc/arith.lisp
@@ -591,6 +591,21 @@
     (inst sra temp y n-fixnum-tag-bits)
     (inst smul r x temp)))
 
+(define-vop (fast-v8-*-c/fixnum=>fixnum fast-safe-arith-op)
+  (:args (x :target r :scs (any-reg zero)))
+  (:info y)
+  (:arg-types tagged-num
+	      (:constant (and (signed-byte 13) (not (integer 0 0)))))
+  (:results (r :scs (any-reg)))
+  (:result-types tagged-num)
+  (:note "inline fixnum arithmetic")
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+	      (and (member :sparc-v9 *backend-subfeatures*)
+		   (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 1
+    (inst smul r x y)))
+
 (define-vop (fast-v8-*/signed=>signed fast-signed-binop)
   (:translate *)
   (:guard (or (member :sparc-v8 *backend-subfeatures*)
@@ -599,6 +614,14 @@
   (:generator 3
     (inst smul r x y)))
 
+(define-vop (fast-v8-*-c/signed=>signed fast-signed-binop-c)
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+	      (and (member :sparc-v9 *backend-subfeatures*)
+		   (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 2
+    (inst smul r x y)))
+	  
 (define-vop (fast-v8-*/unsigned=>unsigned fast-unsigned-binop)
   (:translate *)
   (:guard (or (member :sparc-v8 *backend-subfeatures*)
@@ -607,6 +630,14 @@
   (:generator 3
     (inst umul r x y)))
 
+(define-vop (fast-v8-*-c/unsigned=>unsigned fast-unsigned-binop-c)
+  (:translate *)
+  (:guard (or (member :sparc-v8 *backend-subfeatures*)
+	      (and (member :sparc-v9 *backend-subfeatures*)
+		   (not (member :sparc-64 *backend-subfeatures*)))))
+  (:generator 2
+    (inst umul r x y)))
+
 ;; The smul and umul instructions are deprecated on the Sparc V9.  Use
 ;; mulx instead.
 (define-vop (fast-v9-*/fixnum=>fixnum fast-fixnum-binop)
@@ -1212,11 +1243,76 @@
   (defun ash-right-unsigned (num shuft)
     (ash-right-unsigned num shift)))
 
+(in-package "SB!C")
+
+;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
+;;; come up with a ``better'' multiplication using multiplier
+;;; recoding. There are two different ways the multiplier can be
+;;; recoded. The more obvious is to shift X by the correct amount for
+;;; each bit set in Y and to sum the results. But if there is a string
+;;; of bits that are all set, you can add X shifted by one more then
+;;; the bit position of the first set bit and subtract X shifted by
+;;; the bit position of the last set bit. We can't use this second
+;;; method when the high order bit is bit 31 because shifting by 32
+;;; doesn't work too well.
+(deftransform * ((x y)
+		 ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+		 (unsigned-byte 32))
+  "recode as shifts and adds"
+  (let ((y (continuation-value y))
+	(adds 0)
+	(shifts 0)
+	(result nil)
+	(first-one nil))
+    (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
+	     (add (next-factor)
+	       (setf result
+		     (tub32
+		      (if result
+			  (progn (incf adds) `(+ ,result ,(tub32 next-factor)))
+			  next-factor)))))
+      (declare (inline add))
+      (dotimes (bitpos 32)
+	(if first-one
+	    (when (not (logbitp bitpos y))
+	      (add (if (= (1+ first-one) bitpos)
+		       ;; There is only a single bit in the string.
+		       (progn (incf shifts) `(ash x ,first-one))
+		       ;; There are at least two.
+		       (progn
+			 (incf adds)
+			 (incf shifts 2)
+			 `(- ,(tub32 `(ash x ,bitpos))
+			     ,(tub32 `(ash x ,first-one))))))
+	      (setf first-one nil))
+	    (when (logbitp bitpos y)
+	      (setf first-one bitpos))))
+      (when first-one
+	(cond ((= first-one 31))
+	      ((= first-one 30) (incf shifts) (add '(ash x 30)))
+	      (t
+	       (incf shifts 2)
+	       (incf adds)
+	       (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
+	(incf shifts)
+	(add '(ash x 31))))
+
+    (cond
+      ;; we assume, perhaps foolishly, that good SPARCs don't have an
+      ;; issue with multiplications.  (Remember that there's a
+      ;; different transform for converting x*2^k to a shift).
+      ((member :sparc-64 *backend-subfeatures*) (give-up-ir1-transform))
+      ((or (member :sparc-v9 *backend-subfeatures*)
+	   (member :sparc-v8 *backend-subfeatures*))
+       ;; breakeven point as measured by Raymond Toy
+       (when (> (+ adds shifts) 9)
+	 (give-up-ir1-transform))))
+    
+    (or result 0)))
+
 ;; If we can prove that we have a right shift, just do the right shift
 ;; instead of calling the inline ASH which has to check for the
 ;; direction of the shift at run-time.
-(in-package "SB!C")
-
 (deftransform ash ((num shift) (integer integer))
   (let ((num-type (continuation-type num))
 	(shift-type (continuation-type shift)))
diff --git a/src/compiler/srctran.lisp b/src/compiler/srctran.lisp
index 478e578..150cb3a 100644
--- a/src/compiler/srctran.lisp
+++ b/src/compiler/srctran.lisp
@@ -2573,54 +2573,6 @@
 	`(- (ash x ,len))
 	`(ash x ,len))))
 
-;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
-;;; come up with a ``better'' multiplication using multiplier
-;;; recoding. There are two different ways the multiplier can be
-;;; recoded. The more obvious is to shift X by the correct amount for
-;;; each bit set in Y and to sum the results. But if there is a string
-;;; of bits that are all set, you can add X shifted by one more then
-;;; the bit position of the first set bit and subtract X shifted by
-;;; the bit position of the last set bit. We can't use this second
-;;; method when the high order bit is bit 31 because shifting by 32
-;;; doesn't work too well.
-(deftransform * ((x y)
-		 ((unsigned-byte 32) (unsigned-byte 32))
-		 (unsigned-byte 32))
-  "recode as shift and add"
-  (unless (constant-continuation-p y)
-    (give-up-ir1-transform))
-  (let ((y (continuation-value y))
-	(result nil)
-	(first-one nil))
-    (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
-	     (add (next-factor)
-	       (setf result
-		     (tub32
-		      (if result
-			  `(+ ,result ,(tub32 next-factor))
-			  next-factor)))))
-      (declare (inline add))
-      (dotimes (bitpos 32)
-	(if first-one
-	    (when (not (logbitp bitpos y))
-	      (add (if (= (1+ first-one) bitpos)
-		       ;; There is only a single bit in the string.
-		       `(ash x ,first-one)
-		       ;; There are at least two.
-		       `(- ,(tub32 `(ash x ,bitpos))
-			   ,(tub32 `(ash x ,first-one)))))
-	      (setf first-one nil))
-	    (when (logbitp bitpos y)
-	      (setf first-one bitpos))))
-      (when first-one
-	(cond ((= first-one 31))
-	      ((= first-one 30)
-	       (add '(ash x 30)))
-	      (t
-	       (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
-	(add '(ash x 31))))
-    (or result 0)))
-
 ;;; If arg is a constant power of two, turn FLOOR into a shift and
 ;;; mask. If CEILING, add in (1- (ABS Y)), do FLOOR and correct a
 ;;; remainder.
diff --git a/src/compiler/x86/arith.lisp b/src/compiler/x86/arith.lisp
index bb52d48..934da04 100644
--- a/src/compiler/x86/arith.lisp
+++ b/src/compiler/x86/arith.lisp
@@ -1373,3 +1373,178 @@
     (inst mov tmp y)
     (inst shr tmp 18)
     (inst xor y tmp)))
+
+(in-package "SB!C")
+
+(defknown %lea ((or (signed-byte 32) (unsigned-byte 32))
+		(or (signed-byte 32) (unsigned-byte 32))
+		(member 1 2 4 8) (signed-byte 32))
+  (or (signed-byte 32) (unsigned-byte 32))
+  (foldable flushable))
+
+(defoptimizer (%lea derive-type) ((base index scale disp))
+  (when (and (constant-continuation-p scale)
+	     (constant-continuation-p disp))
+    (let ((scale (continuation-value scale))
+	  (disp (continuation-value disp))
+	  (base-type (continuation-type base))
+	  (index-type (continuation-type index)))
+      (when (and (numeric-type-p base-type)
+		 (numeric-type-p index-type))
+	(let ((base-lo (numeric-type-low base-type))
+	      (base-hi (numeric-type-high base-type))
+	      (index-lo (numeric-type-low index-type))
+	      (index-hi (numeric-type-high index-type)))
+	  (make-numeric-type :class 'integer
+			     :complexp :real
+			     :low (when (and base-lo index-lo)
+				    (+ base-lo (* index-lo scale) disp))
+			     :high (when (and base-hi index-hi)
+				     (+ base-hi (* index-hi scale) disp))))))))
+
+(defun %lea (base index scale disp)
+  (+ base (* index scale) disp))
+
+(in-package "SB!VM")
+
+(define-vop (%lea/unsigned=>unsigned)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (unsigned-reg))
+	 (index :scs (unsigned-reg)))
+  (:info scale disp)
+  (:arg-types unsigned-num unsigned-num
+	      (:constant (member 1 2 4 8))
+	      (:constant (signed-byte 32)))
+  (:results (r :scs (unsigned-reg)))
+  (:result-types unsigned-num)
+  (:generator 5
+    (inst lea r (make-ea :dword :base base :index index
+			 :scale scale :disp disp))))
+
+(define-vop (%lea/signed=>signed)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (signed-reg))
+	 (index :scs (signed-reg)))
+  (:info scale disp)
+  (:arg-types signed-num signed-num
+	      (:constant (member 1 2 4 8))
+	      (:constant (signed-byte 32)))
+  (:results (r :scs (signed-reg)))
+  (:result-types signed-num)
+  (:generator 4
+    (inst lea r (make-ea :dword :base base :index index
+			 :scale scale :disp disp))))
+
+(define-vop (%lea/fixnum=>fixnum)
+  (:translate %lea)
+  (:policy :fast-safe)
+  (:args (base :scs (any-reg))
+	 (index :scs (any-reg)))
+  (:info scale disp)
+  (:arg-types tagged-num tagged-num
+	      (:constant (member 1 2 4 8))
+	      (:constant (signed-byte 32)))
+  (:results (r :scs (any-reg)))
+  (:result-types tagged-num)
+  (:generator 3
+    (inst lea r (make-ea :dword :base base :index index
+			 :scale scale :disp disp))))
+
+(in-package "SB!C")
+
+;;; This is essentially a straight implementation of the algorithm in
+;;; "Strength Reduction of Multiplications by Integer Constants",
+;;; Youfeng Wu, ACM SIGPLAN Notices, Vol. 30, No.2, February 1995.
+(defun basic-decompose-multiplication (arg num n-bits condensed)
+  (case (aref condensed 0)
+    (0
+     (let ((tmp (min 3 (aref condensed 1))))
+       (decf (aref condensed 1) tmp)
+       `(truly-the (unsigned-byte 32)
+	 (%lea ,arg
+	       ,(decompose-multiplication
+		 arg (ash (1- num) (- tmp)) (1- n-bits) (subseq condensed 1))
+	       ,(ash 1 tmp) 0))))
+    ((1 2 3)
+     (let ((r0 (aref condensed 0)))
+       (incf (aref condensed 1) r0)
+       `(truly-the (unsigned-byte 32)
+	 (%lea ,(decompose-multiplication
+		 arg (- num (ash 1 r0)) (1- n-bits) (subseq condensed 1))
+	       ,arg
+	       ,(ash 1 r0) 0))))
+    (t (let ((r0 (aref condensed 0)))
+	 (setf (aref condensed 0) 0)
+	 `(truly-the (unsigned-byte 32)
+	   (ash ,(decompose-multiplication
+		  arg (ash num (- r0)) n-bits condensed)
+	        ,r0))))))
+
+(defun decompose-multiplication (arg num n-bits condensed)
+  (cond
+    ((= n-bits 0) 0)
+    ((= num 1) arg)
+    ((= n-bits 1)
+     `(truly-the (unsigned-byte 32) (ash ,arg ,(1- (integer-length num)))))
+    ((let ((max 0) (end 0))
+       (loop for i from 2 to (length condensed)
+	     for j = (reduce #'+ (subseq condensed 0 i))
+	     when (and (> (- (* 2 i) 3 j) max)
+		       (< (+ (ash 1 (1+ j))
+			     (ash (ldb (byte (- 32 (1+ j)) (1+ j)) num)
+				  (1+ j)))
+			  (ash 1 32)))
+	       do (setq max (- (* 2 i) 3 j)
+			end i))
+       (when (> max 0)
+	 (let ((j (reduce #'+ (subseq condensed 0 end))))
+	   (let ((n2 (+ (ash 1 (1+ j))
+			(ash (ldb (byte (- 32 (1+ j)) (1+ j)) num) (1+ j))))
+		 (n1 (1+ (ldb (byte (1+ j) 0) (lognot num)))))
+	   `(truly-the (unsigned-byte 32)
+	     (- ,(optimize-multiply arg n2) ,(optimize-multiply arg n1))))))))
+    ((dolist (i '(9 5 3))
+       (when (integerp (/ num i))
+	 (when (< (logcount (/ num i)) (logcount num))
+	   (let ((x (gensym)))
+	     (return `(let ((,x ,(optimize-multiply arg (/ num i))))
+		       (truly-the (unsigned-byte 32)
+			(%lea ,x ,x (1- ,i) 0)))))))))
+    (t (basic-decompose-multiplication arg num n-bits condensed))))
+	   
+(defun optimize-multiply (arg x)
+  (let* ((n-bits (logcount x))
+	 (condensed (make-array n-bits)))
+    (let ((count 0) (bit 0))
+      (dotimes (i 32)
+	(cond ((logbitp i x)
+	       (setf (aref condensed bit) count)
+	       (setf count 1)
+	       (incf bit))
+	      (t (incf count)))))
+    (decompose-multiplication arg x n-bits condensed)))
+
+(deftransform * ((x y)
+		 ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+		 (unsigned-byte 32))
+  "recode as leas, shifts and adds"
+  (let ((y (continuation-value y)))
+    (cond
+      ((= y (ash 1 (integer-length y)))
+       ;; there's a generic transform for y = 2^k
+       (give-up-ir1-transform))
+      ((member y '(3 5 9))
+       ;; we can do these multiplications directly using LEA
+       `(%lea x x ,(1- y) 0))
+      ((member :pentium4 *backend-subfeatures*)
+       ;; the pentium4's multiply unit is reportedly very good
+       (give-up-ir1-transform))
+      ;; FIXME: should make this more fine-grained.  If nothing else,
+      ;; there should probably be a cutoff of about 9 instructions on
+      ;; pentium-class machines.
+      (t (optimize-multiply 'x y)))))
+
+;;; FIXME: we should also be able to write an optimizer or two to
+;;; convert (+ (* x 2) 17), (- (* x 9) 5) to a %LEA.
diff --git a/tests/arith.pure.lisp b/tests/arith.pure.lisp
index c2f0ac0..7493d4e 100644
--- a/tests/arith.pure.lisp
+++ b/tests/arith.pure.lisp
@@ -81,5 +81,24 @@
               (assert (<= exact-q q))
               (assert (< q (1+ exact-q))))))
 
-;; CEILING had a corner case, spotted by Paul Dietz
+;;; CEILING had a corner case, spotted by Paul Dietz
 (assert (= (ceiling most-negative-fixnum (1+ most-positive-fixnum)) -1))
+
+;;; give any optimizers of constant multiplication a light testing.
+;;; 100 may seem low, but (a) it caught CSR's initial errors, and (b)
+;;; before checking in, CSR tested with 10000.  So one hundred
+;;; checkins later, we'll have doubled the coverage.
+(dotimes (i 100)
+  (let* ((x (random most-positive-fixnum))
+	 (x2 (* x 2))
+	 (x3 (* x 3)))
+    (let ((fn (handler-bind ((sb-ext:compiler-note #'error))
+		(compile nil
+			 `(lambda (y)
+			    (declare (optimize speed) (type (integer 0 3) y))
+			    (* y ,x))))))
+      (unless (and (= (funcall fn 0) 0)
+		   (= (funcall fn 1) x)
+		   (= (funcall fn 2) x2)
+		   (= (funcall fn 3) x3))
+	(error "bad results for ~D" x)))))
diff --git a/version.lisp-expr b/version.lisp-expr
index 8655a3a..542a21d 100644
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.8.3.12"
+"0.8.3.13"