* optimization: compiler-internal data structure use has been
reviewed, and changes have been made that should improve the
performance of the compiler by about 20%.
+ * microoptimization: the compiler is better able to make use of the
+ x86 LEA instruction for multiplication by constants.
* bug fix: in some situations compiler did not report usage of
generic arithmetic in (SPEED 3) policy.
"KEY-INFO" "KEY-INFO-NAME"
"KEY-INFO-P" "KEY-INFO-TYPE"
"LAYOUT-DEPTHOID" "LAYOUT-INVALID-ERROR"
+ #!+x86 "%LEA"
"LEXENV" "LEXENV-DESIGNATOR"
"LINE-LENGTH"
"ANSI-STREAM"
(inst sra temp y n-fixnum-tag-bits)
(inst smul r x temp)))
+(define-vop (fast-v8-*-c/fixnum=>fixnum fast-safe-arith-op)
+ (:args (x :target r :scs (any-reg zero)))
+ (:info y)
+ (:arg-types tagged-num
+ (:constant (and (signed-byte 13) (not (integer 0 0)))))
+ (:results (r :scs (any-reg)))
+ (:result-types tagged-num)
+ (:note "inline fixnum arithmetic")
+ (:translate *)
+ (:guard (or (member :sparc-v8 *backend-subfeatures*)
+ (and (member :sparc-v9 *backend-subfeatures*)
+ (not (member :sparc-64 *backend-subfeatures*)))))
+ (:generator 1
+ (inst smul r x y)))
+
(define-vop (fast-v8-*/signed=>signed fast-signed-binop)
(:translate *)
(:guard (or (member :sparc-v8 *backend-subfeatures*)
(:generator 3
(inst smul r x y)))
+(define-vop (fast-v8-*-c/signed=>signed fast-signed-binop-c)
+ (:translate *)
+ (:guard (or (member :sparc-v8 *backend-subfeatures*)
+ (and (member :sparc-v9 *backend-subfeatures*)
+ (not (member :sparc-64 *backend-subfeatures*)))))
+ (:generator 2
+ (inst smul r x y)))
+
(define-vop (fast-v8-*/unsigned=>unsigned fast-unsigned-binop)
(:translate *)
(:guard (or (member :sparc-v8 *backend-subfeatures*)
(:generator 3
(inst umul r x y)))
+(define-vop (fast-v8-*-c/unsigned=>unsigned fast-unsigned-binop-c)
+ (:translate *)
+ (:guard (or (member :sparc-v8 *backend-subfeatures*)
+ (and (member :sparc-v9 *backend-subfeatures*)
+ (not (member :sparc-64 *backend-subfeatures*)))))
+ (:generator 2
+ (inst umul r x y)))
+
;; The smul and umul instructions are deprecated on the Sparc V9. Use
;; mulx instead.
(define-vop (fast-v9-*/fixnum=>fixnum fast-fixnum-binop)
(defun ash-right-unsigned (num shuft)
(ash-right-unsigned num shift)))
+(in-package "SB!C")
+
+;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
+;;; come up with a ``better'' multiplication using multiplier
+;;; recoding. There are two different ways the multiplier can be
+;;; recoded. The more obvious is to shift X by the correct amount for
+;;; each bit set in Y and to sum the results. But if there is a string
+;;; of bits that are all set, you can add X shifted by one more then
+;;; the bit position of the first set bit and subtract X shifted by
+;;; the bit position of the last set bit. We can't use this second
+;;; method when the high order bit is bit 31 because shifting by 32
+;;; doesn't work too well.
+(deftransform * ((x y)
+ ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+ (unsigned-byte 32))
+ "recode as shifts and adds"
+ (let ((y (continuation-value y))
+ (adds 0)
+ (shifts 0)
+ (result nil)
+ (first-one nil))
+ (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
+ (add (next-factor)
+ (setf result
+ (tub32
+ (if result
+ (progn (incf adds) `(+ ,result ,(tub32 next-factor)))
+ next-factor)))))
+ (declare (inline add))
+ (dotimes (bitpos 32)
+ (if first-one
+ (when (not (logbitp bitpos y))
+ (add (if (= (1+ first-one) bitpos)
+ ;; There is only a single bit in the string.
+ (progn (incf shifts) `(ash x ,first-one))
+ ;; There are at least two.
+ (progn
+ (incf adds)
+ (incf shifts 2)
+ `(- ,(tub32 `(ash x ,bitpos))
+ ,(tub32 `(ash x ,first-one))))))
+ (setf first-one nil))
+ (when (logbitp bitpos y)
+ (setf first-one bitpos))))
+ (when first-one
+ (cond ((= first-one 31))
+ ((= first-one 30) (incf shifts) (add '(ash x 30)))
+ (t
+ (incf shifts 2)
+ (incf adds)
+ (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
+ (incf shifts)
+ (add '(ash x 31))))
+
+ (cond
+ ;; we assume, perhaps foolishly, that good SPARCs don't have an
+ ;; issue with multiplications. (Remember that there's a
+ ;; different transform for converting x*2^k to a shift).
+ ((member :sparc-64 *backend-subfeatures*) (give-up-ir1-transform))
+ ((or (member :sparc-v9 *backend-subfeatures*)
+ (member :sparc-v8 *backend-subfeatures*))
+ ;; breakeven point as measured by Raymond Toy
+ (when (> (+ adds shifts) 9)
+ (give-up-ir1-transform))))
+
+ (or result 0)))
+
;; If we can prove that we have a right shift, just do the right shift
;; instead of calling the inline ASH which has to check for the
;; direction of the shift at run-time.
-(in-package "SB!C")
-
(deftransform ash ((num shift) (integer integer))
(let ((num-type (continuation-type num))
(shift-type (continuation-type shift)))
`(- (ash x ,len))
`(ash x ,len))))
-;;; If both arguments and the result are (UNSIGNED-BYTE 32), try to
-;;; come up with a ``better'' multiplication using multiplier
-;;; recoding. There are two different ways the multiplier can be
-;;; recoded. The more obvious is to shift X by the correct amount for
-;;; each bit set in Y and to sum the results. But if there is a string
-;;; of bits that are all set, you can add X shifted by one more then
-;;; the bit position of the first set bit and subtract X shifted by
-;;; the bit position of the last set bit. We can't use this second
-;;; method when the high order bit is bit 31 because shifting by 32
-;;; doesn't work too well.
-(deftransform * ((x y)
- ((unsigned-byte 32) (unsigned-byte 32))
- (unsigned-byte 32))
- "recode as shift and add"
- (unless (constant-continuation-p y)
- (give-up-ir1-transform))
- (let ((y (continuation-value y))
- (result nil)
- (first-one nil))
- (labels ((tub32 (x) `(truly-the (unsigned-byte 32) ,x))
- (add (next-factor)
- (setf result
- (tub32
- (if result
- `(+ ,result ,(tub32 next-factor))
- next-factor)))))
- (declare (inline add))
- (dotimes (bitpos 32)
- (if first-one
- (when (not (logbitp bitpos y))
- (add (if (= (1+ first-one) bitpos)
- ;; There is only a single bit in the string.
- `(ash x ,first-one)
- ;; There are at least two.
- `(- ,(tub32 `(ash x ,bitpos))
- ,(tub32 `(ash x ,first-one)))))
- (setf first-one nil))
- (when (logbitp bitpos y)
- (setf first-one bitpos))))
- (when first-one
- (cond ((= first-one 31))
- ((= first-one 30)
- (add '(ash x 30)))
- (t
- (add `(- ,(tub32 '(ash x 31)) ,(tub32 `(ash x ,first-one))))))
- (add '(ash x 31))))
- (or result 0)))
-
;;; If arg is a constant power of two, turn FLOOR into a shift and
;;; mask. If CEILING, add in (1- (ABS Y)), do FLOOR and correct a
;;; remainder.
(inst mov tmp y)
(inst shr tmp 18)
(inst xor y tmp)))
+
+(in-package "SB!C")
+
+(defknown %lea ((or (signed-byte 32) (unsigned-byte 32))
+ (or (signed-byte 32) (unsigned-byte 32))
+ (member 1 2 4 8) (signed-byte 32))
+ (or (signed-byte 32) (unsigned-byte 32))
+ (foldable flushable))
+
+(defoptimizer (%lea derive-type) ((base index scale disp))
+ (when (and (constant-continuation-p scale)
+ (constant-continuation-p disp))
+ (let ((scale (continuation-value scale))
+ (disp (continuation-value disp))
+ (base-type (continuation-type base))
+ (index-type (continuation-type index)))
+ (when (and (numeric-type-p base-type)
+ (numeric-type-p index-type))
+ (let ((base-lo (numeric-type-low base-type))
+ (base-hi (numeric-type-high base-type))
+ (index-lo (numeric-type-low index-type))
+ (index-hi (numeric-type-high index-type)))
+ (make-numeric-type :class 'integer
+ :complexp :real
+ :low (when (and base-lo index-lo)
+ (+ base-lo (* index-lo scale) disp))
+ :high (when (and base-hi index-hi)
+ (+ base-hi (* index-hi scale) disp))))))))
+
+(defun %lea (base index scale disp)
+ (+ base (* index scale) disp))
+
+(in-package "SB!VM")
+
+(define-vop (%lea/unsigned=>unsigned)
+ (:translate %lea)
+ (:policy :fast-safe)
+ (:args (base :scs (unsigned-reg))
+ (index :scs (unsigned-reg)))
+ (:info scale disp)
+ (:arg-types unsigned-num unsigned-num
+ (:constant (member 1 2 4 8))
+ (:constant (signed-byte 32)))
+ (:results (r :scs (unsigned-reg)))
+ (:result-types unsigned-num)
+ (:generator 5
+ (inst lea r (make-ea :dword :base base :index index
+ :scale scale :disp disp))))
+
+(define-vop (%lea/signed=>signed)
+ (:translate %lea)
+ (:policy :fast-safe)
+ (:args (base :scs (signed-reg))
+ (index :scs (signed-reg)))
+ (:info scale disp)
+ (:arg-types signed-num signed-num
+ (:constant (member 1 2 4 8))
+ (:constant (signed-byte 32)))
+ (:results (r :scs (signed-reg)))
+ (:result-types signed-num)
+ (:generator 4
+ (inst lea r (make-ea :dword :base base :index index
+ :scale scale :disp disp))))
+
+(define-vop (%lea/fixnum=>fixnum)
+ (:translate %lea)
+ (:policy :fast-safe)
+ (:args (base :scs (any-reg))
+ (index :scs (any-reg)))
+ (:info scale disp)
+ (:arg-types tagged-num tagged-num
+ (:constant (member 1 2 4 8))
+ (:constant (signed-byte 32)))
+ (:results (r :scs (any-reg)))
+ (:result-types tagged-num)
+ (:generator 3
+ (inst lea r (make-ea :dword :base base :index index
+ :scale scale :disp disp))))
+
+(in-package "SB!C")
+
+;;; This is essentially a straight implementation of the algorithm in
+;;; "Strength Reduction of Multiplications by Integer Constants",
+;;; Youfeng Wu, ACM SIGPLAN Notices, Vol. 30, No.2, February 1995.
+(defun basic-decompose-multiplication (arg num n-bits condensed)
+ (case (aref condensed 0)
+ (0
+ (let ((tmp (min 3 (aref condensed 1))))
+ (decf (aref condensed 1) tmp)
+ `(truly-the (unsigned-byte 32)
+ (%lea ,arg
+ ,(decompose-multiplication
+ arg (ash (1- num) (- tmp)) (1- n-bits) (subseq condensed 1))
+ ,(ash 1 tmp) 0))))
+ ((1 2 3)
+ (let ((r0 (aref condensed 0)))
+ (incf (aref condensed 1) r0)
+ `(truly-the (unsigned-byte 32)
+ (%lea ,(decompose-multiplication
+ arg (- num (ash 1 r0)) (1- n-bits) (subseq condensed 1))
+ ,arg
+ ,(ash 1 r0) 0))))
+ (t (let ((r0 (aref condensed 0)))
+ (setf (aref condensed 0) 0)
+ `(truly-the (unsigned-byte 32)
+ (ash ,(decompose-multiplication
+ arg (ash num (- r0)) n-bits condensed)
+ ,r0))))))
+
+(defun decompose-multiplication (arg num n-bits condensed)
+ (cond
+ ((= n-bits 0) 0)
+ ((= num 1) arg)
+ ((= n-bits 1)
+ `(truly-the (unsigned-byte 32) (ash ,arg ,(1- (integer-length num)))))
+ ((let ((max 0) (end 0))
+ (loop for i from 2 to (length condensed)
+ for j = (reduce #'+ (subseq condensed 0 i))
+ when (and (> (- (* 2 i) 3 j) max)
+ (< (+ (ash 1 (1+ j))
+ (ash (ldb (byte (- 32 (1+ j)) (1+ j)) num)
+ (1+ j)))
+ (ash 1 32)))
+ do (setq max (- (* 2 i) 3 j)
+ end i))
+ (when (> max 0)
+ (let ((j (reduce #'+ (subseq condensed 0 end))))
+ (let ((n2 (+ (ash 1 (1+ j))
+ (ash (ldb (byte (- 32 (1+ j)) (1+ j)) num) (1+ j))))
+ (n1 (1+ (ldb (byte (1+ j) 0) (lognot num)))))
+ `(truly-the (unsigned-byte 32)
+ (- ,(optimize-multiply arg n2) ,(optimize-multiply arg n1))))))))
+ ((dolist (i '(9 5 3))
+ (when (integerp (/ num i))
+ (when (< (logcount (/ num i)) (logcount num))
+ (let ((x (gensym)))
+ (return `(let ((,x ,(optimize-multiply arg (/ num i))))
+ (truly-the (unsigned-byte 32)
+ (%lea ,x ,x (1- ,i) 0)))))))))
+ (t (basic-decompose-multiplication arg num n-bits condensed))))
+
+(defun optimize-multiply (arg x)
+ (let* ((n-bits (logcount x))
+ (condensed (make-array n-bits)))
+ (let ((count 0) (bit 0))
+ (dotimes (i 32)
+ (cond ((logbitp i x)
+ (setf (aref condensed bit) count)
+ (setf count 1)
+ (incf bit))
+ (t (incf count)))))
+ (decompose-multiplication arg x n-bits condensed)))
+
+(deftransform * ((x y)
+ ((unsigned-byte 32) (constant-arg (unsigned-byte 32)))
+ (unsigned-byte 32))
+ "recode as leas, shifts and adds"
+ (let ((y (continuation-value y)))
+ (cond
+ ((= y (ash 1 (integer-length y)))
+ ;; there's a generic transform for y = 2^k
+ (give-up-ir1-transform))
+ ((member y '(3 5 9))
+ ;; we can do these multiplications directly using LEA
+ `(%lea x x ,(1- y) 0))
+ ((member :pentium4 *backend-subfeatures*)
+ ;; the pentium4's multiply unit is reportedly very good
+ (give-up-ir1-transform))
+ ;; FIXME: should make this more fine-grained. If nothing else,
+ ;; there should probably be a cutoff of about 9 instructions on
+ ;; pentium-class machines.
+ (t (optimize-multiply 'x y)))))
+
+;;; FIXME: we should also be able to write an optimizer or two to
+;;; convert (+ (* x 2) 17), (- (* x 9) 5) to a %LEA.
(assert (<= exact-q q))
(assert (< q (1+ exact-q))))))
-;; CEILING had a corner case, spotted by Paul Dietz
+;;; CEILING had a corner case, spotted by Paul Dietz
(assert (= (ceiling most-negative-fixnum (1+ most-positive-fixnum)) -1))
+
+;;; give any optimizers of constant multiplication a light testing.
+;;; 100 may seem low, but (a) it caught CSR's initial errors, and (b)
+;;; before checking in, CSR tested with 10000. So one hundred
+;;; checkins later, we'll have doubled the coverage.
+(dotimes (i 100)
+ (let* ((x (random most-positive-fixnum))
+ (x2 (* x 2))
+ (x3 (* x 3)))
+ (let ((fn (handler-bind ((sb-ext:compiler-note #'error))
+ (compile nil
+ `(lambda (y)
+ (declare (optimize speed) (type (integer 0 3) y))
+ (* y ,x))))))
+ (unless (and (= (funcall fn 0) 0)
+ (= (funcall fn 1) x)
+ (= (funcall fn 2) x2)
+ (= (funcall fn 3) x3))
+ (error "bad results for ~D" x)))))
;;; checkins which aren't released. (And occasionally for internal
;;; versions, especially for internal versions off the main CVS
;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.8.3.12"
+"0.8.3.13"