From: Juho Snellman Date: Fri, 14 Jan 2005 06:47:31 +0000 (+0000) Subject: 0.8.18.31: X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=23b070aba7a0f3339358ef7dea05684f93b065a9;p=sbcl.git 0.8.18.31: Some number-related x86-64 fixes. * Fix backend bug in negative counts for ASH. (Patch by Lutz Euler on sbcl-devel, 2005-01-09). Add some more ASH tests. * Cargo-cult signed modular arithmetic changes from the x86 backend. * Remove optimization of constant multiplication to shifts, adds and leas (except for the simple cases of 2^x, 3, 5, 9) from the x86-64 backend. It was a lot of code that's not really of any use on any existing x86-64 processor. * Fix 32-bit assumptions in deftransform of RANDOM for word-sized integers. --- diff --git a/package-data-list.lisp-expr b/package-data-list.lisp-expr index d1d0e42..44d8971 100644 --- a/package-data-list.lisp-expr +++ b/package-data-list.lisp-expr @@ -1448,7 +1448,7 @@ is a good idea, but see SB-SYS re. blurring of boundaries." "FUNCALLABLE-STRUCTURE-CLASSOID" "%RANDOM-DOUBLE-FLOAT" #!+long-float "%RANDOM-LONG-FLOAT" "%RANDOM-SINGLE-FLOAT" "RANDOM-PCL-CLASSOID" - "%FUNCALLABLE-INSTANCE-INFO" "RANDOM-CHUNK" + "%FUNCALLABLE-INSTANCE-INFO" "RANDOM-CHUNK" "BIG-RANDOM-CHUNK" "MAKE-FUNCALLABLE-STRUCTURE-CLASSOID" "LAYOUT-CLOS-HASH-MAX" "CLASSOID-CELL-NAME" "BUILT-IN-CLASSOID-DIRECT-SUPERCLASSES" diff --git a/src/code/numbers.lisp b/src/code/numbers.lisp index af798b0..ca16028 100644 --- a/src/code/numbers.lisp +++ b/src/code/numbers.lisp @@ -1472,3 +1472,9 @@ the first." (etypecase integer ((signed-byte 30) (sb!c::mask-signed-field 30 (ash integer amount))) (integer (sb!c::mask-signed-field 30 (ash (sb!c::mask-signed-field 30 integer) amount))))) + +#!+x86-64 +(defun sb!vm::ash-left-smod61 (integer amount) + (etypecase integer + ((signed-byte 61) (sb!c::mask-signed-field 61 (ash integer amount))) + (integer (sb!c::mask-signed-field 61 (ash (sb!c::mask-signed-field 61 integer) amount))))) diff --git a/src/code/target-random.lisp b/src/code/target-random.lisp index c729bba..3e35382 100644 --- a/src/code/target-random.lisp +++ b/src/code/target-random.lisp @@ -164,6 +164,13 @@ (defun random-chunk (state) (declare (type random-state state)) (sb!vm::random-mt19937 (random-state-state state))) + +#!-sb-fluid (declaim (inline big-random-chunk)) +(defun big-random-chunk (state) + (declare (type random-state state)) + (logand (1- (expt 2 64)) + (logior (ash (random-chunk state) 32) + (random-chunk state)))) ;;; Handle the single or double float case of RANDOM. We generate a ;;; float between 0.0 and 1.0 by clobbering the significand of 1.0 diff --git a/src/compiler/float-tran.lisp b/src/compiler/float-tran.lisp index c3c811d..538ccc0 100644 --- a/src/compiler/float-tran.lisp +++ b/src/compiler/float-tran.lisp @@ -50,43 +50,48 @@ ;;; through the code this way. It would be nice to move this into the ;;; same file as the other RANDOM definitions. (deftransform random ((num &optional state) - ((integer 1 #.(expt 2 32)) &optional *)) + ((integer 1 #.(expt 2 sb!vm::n-word-bits)) &optional *)) ;; FIXME: I almost conditionalized this as #!+sb-doc. Find some way ;; of automatically finding #!+sb-doc in proximity to DEFTRANSFORM ;; to let me scan for places that I made this mistake and didn't ;; catch myself. "use inline (UNSIGNED-BYTE 32) operations" - (let ((type (lvar-type num))) + (let ((type (lvar-type num)) + (limit (expt 2 sb!vm::n-word-bits)) + (random-chunk (ecase sb!vm::n-word-bits + (32 'random-chunk) + (64 'sb!kernel::big-random-chunk)))) (if (numeric-type-p type) (let ((num-high (numeric-type-high (lvar-type num)))) (aver num-high) (cond ((constant-lvar-p num) ;; Check the worst case sum absolute error for the ;; random number expectations. - (let ((rem (rem (expt 2 32) num-high))) + (let ((rem (rem limit num-high))) (unless (< (/ (* 2 rem (- num-high rem)) - num-high (expt 2 32)) + num-high limit) (expt 2 (- sb!kernel::random-integer-extra-bits))) (give-up-ir1-transform "The random number expectations are inaccurate.")) - (if (= num-high (expt 2 32)) - '(random-chunk (or state *random-state*)) - #!-x86 '(rem (random-chunk (or state *random-state*)) num) - #!+x86 + (if (= num-high limit) + `(,random-chunk (or state *random-state*)) + #!-(or x86 x86-64) + `(rem (,random-chunk (or state *random-state*)) num) + #!+(or x86 x86-64) ;; Use multiplication, which is faster. - '(values (sb!bignum::%multiply - (random-chunk (or state *random-state*)) + `(values (sb!bignum::%multiply + (,random-chunk (or state *random-state*)) num))))) ((> num-high random-fixnum-max) (give-up-ir1-transform "The range is too large to ensure an accurate result.")) - #!+x86 - ((< num-high (expt 2 32)) - '(values (sb!bignum::%multiply - (random-chunk (or state *random-state*)) + #!+(or x86 x86-64) + ((< num-high limit) + `(values (sb!bignum::%multiply + (,random-chunk (or state *random-state*)) num))) (t - '(rem (random-chunk (or state *random-state*)) num)))) + `(rem (,random-chunk (or state *random-state*)) num)))) ;; KLUDGE: a relatively conservative treatment, but better ;; than a bug (reported by PFD sbcl-devel towards the end of ;; 2004-11. diff --git a/src/compiler/generic/vm-tran.lisp b/src/compiler/generic/vm-tran.lisp index 6890462..1489bac 100644 --- a/src/compiler/generic/vm-tran.lisp +++ b/src/compiler/generic/vm-tran.lisp @@ -518,7 +518,9 @@ #!+x86 (def sb!vm::ash-left-smod30 :signed 30) (def sb!vm::ash-left-mod32 :unsigned 32)) #!+#.(cl:if (cl:= 64 sb!vm:n-machine-word-bits) '(and) '(or)) - (def sb!vm::ash-left-mod64 :unsigned 64)) + (progn + #!+x86-64 (def sb!vm::ash-left-smod61 :signed 61) + (def sb!vm::ash-left-mod64 :unsigned 64))) ;;;; word-wise logical operations diff --git a/src/compiler/x86-64/arith.lisp b/src/compiler/x86-64/arith.lisp index 35cd6ac..8f40dd8 100644 --- a/src/compiler/x86-64/arith.lisp +++ b/src/compiler/x86-64/arith.lisp @@ -601,18 +601,19 @@ (inst lea result (make-ea :qword :index number :scale 8))) (t (move result number) - (cond ((plusp amount) - ;; We don't have to worry about overflow because of the - ;; result type restriction. - (inst shl result amount)) - ((zerop amount) ) - ((< amount -63) - (inst xor result result)) - (t - ;; shift too far then back again, to zero tag bits - (inst sar result (- 3 amount)) - (inst shl result 3))))))) - + (cond ((plusp amount) + ;; We don't have to worry about overflow because of the + ;; result type restriction. + (inst shl result amount)) + (t + ;; Since the shift instructions take the shift amount + ;; modulo 64 we must special case amounts of 64 and more. + ;; Because fixnums have only 61 bits, the result is 0 or + ;; -1 for all amounts of 60 or more, so use this as the + ;; limit instead. + (inst sar result (min (- n-word-bits n-fixnum-tag-bits 1) + (- amount))) + (inst and result (lognot fixnum-tag-mask)))))))) (define-vop (fast-ash-left/fixnum=>fixnum) (:translate ash) @@ -1234,20 +1235,33 @@ ;;;; Modular functions -(define-modular-fun +-mod64 (x y) + :unsigned 64) -(define-vop (fast-+-mod64/unsigned=>unsigned fast-+/unsigned=>unsigned) - (:translate +-mod64)) -(define-vop (fast-+-mod64-c/unsigned=>unsigned fast-+-c/unsigned=>unsigned) - (:translate +-mod64)) -(define-modular-fun --mod64 (x y) - :unsigned 64) -(define-vop (fast---mod64/unsigned=>unsigned fast--/unsigned=>unsigned) - (:translate --mod64)) -(define-vop (fast---mod64-c/unsigned=>unsigned fast---c/unsigned=>unsigned) - (:translate --mod64)) - -(define-modular-fun *-mod64 (x y) * :unsigned 64) -(define-vop (fast-*-mod64/unsigned=>unsigned fast-*/unsigned=>unsigned) - (:translate *-mod64)) +(macrolet ((def (name -c-p) + (let ((fun64 (intern (format nil "~S-MOD64" name))) + (vopu (intern (format nil "FAST-~S/UNSIGNED=>UNSIGNED" name))) + (vopcu (intern (format nil "FAST-~S-C/UNSIGNED=>UNSIGNED" name))) + (vopf (intern (format nil "FAST-~S/FIXNUM=>FIXNUM" name))) + (vopcf (intern (format nil "FAST-~S-C/FIXNUM=>FIXNUM" name))) + (vop64u (intern (format nil "FAST-~S-MOD64/UNSIGNED=>UNSIGNED" name))) + (vop64f (intern (format nil "FAST-~S-MOD64/FIXNUM=>FIXNUM" name))) + (vop64cu (intern (format nil "FAST-~S-MOD64-C/UNSIGNED=>UNSIGNED" name))) + (vop64cf (intern (format nil "FAST-~S-MOD64-C/FIXNUM=>FIXNUM" name))) + (sfun61 (intern (format nil "~S-SMOD61" name))) + (svop61f (intern (format nil "FAST-~S-SMOD61/FIXNUM=>FIXNUM" name))) + (svop61cf (intern (format nil "FAST-~S-SMOD61-C/FIXNUM=>FIXNUM" name)))) + `(progn + (define-modular-fun ,fun64 (x y) ,name :unsigned 64) + (define-modular-fun ,sfun61 (x y) ,name :signed 61) + (define-vop (,vop64u ,vopu) (:translate ,fun64)) + (define-vop (,vop64f ,vopf) (:translate ,fun64)) + (define-vop (,svop61f ,vopf) (:translate ,sfun61)) + ,@(when -c-p + `((define-vop (,vop64cu ,vopcu) (:translate ,fun64)) + (define-vop (,svop61cf ,vopcf) (:translate ,sfun61)))))))) + (def + t) + (def - t) + ;; (no -C variant as x86 MUL instruction doesn't take an immediate) + (def * nil)) + ;;; (no -C variant as x86 MUL instruction doesn't take an immediate) (define-vop (fast-ash-left-mod64-c/unsigned=>unsigned @@ -1261,11 +1275,25 @@ (sb!c::give-up-ir1-transform)) '(%primitive fast-ash-left-mod64/unsigned=>unsigned integer count)) +(define-vop (fast-ash-left-smod61-c/fixnum=>fixnum + fast-ash-c/fixnum=>fixnum) + (:translate ash-left-smod61)) +(define-vop (fast-ash-left-smod61/fixnum=>fixnum + fast-ash-left/fixnum=>fixnum)) +(deftransform ash-left-smod61 ((integer count) + ((signed-byte 61) (unsigned-byte 6))) + (when (sb!c::constant-lvar-p count) + (sb!c::give-up-ir1-transform)) + '(%primitive fast-ash-left-smod61/fixnum=>fixnum integer count)) + (in-package "SB!C") (defknown sb!vm::%lea-mod64 (integer integer (member 1 2 4 8) (signed-byte 64)) (unsigned-byte 64) (foldable flushable movable)) +(defknown sb!vm::%lea-smod61 (integer integer (member 1 2 4 8) (signed-byte 64)) + (signed-byte 61) + (foldable flushable movable)) (define-modular-fun-optimizer %lea ((base index scale disp) :unsigned :width width) (when (and (<= width 64) @@ -1274,23 +1302,43 @@ (cut-to-width base :unsigned width) (cut-to-width index :unsigned width) 'sb!vm::%lea-mod64)) +(define-modular-fun-optimizer %lea ((base index scale disp) :signed :width width) + (when (and (<= width 61) + (constant-lvar-p scale) + (constant-lvar-p disp)) + (cut-to-width base :signed width) + (cut-to-width index :signed width) + 'sb!vm::%lea-smod61)) #+sb-xc-host -(defun sb!vm::%lea-mod64 (base index scale disp) - (ldb (byte 64 0) (%lea base index scale disp))) +(progn + (defun sb!vm::%lea-mod64 (base index scale disp) + (ldb (byte 64 0) (%lea base index scale disp))) + (defun sb!vm::%lea-smod61 (base index scale disp) + (mask-signed-field 61 (%lea base index scale disp)))) #-sb-xc-host -(defun sb!vm::%lea-mod64 (base index scale disp) - (let ((base (logand base #xffffffffffffffff)) - (index (logand index #xffffffffffffffff))) - ;; can't use modular version of %LEA, as we only have VOPs for - ;; constant SCALE and DISP. - (ldb (byte 64 0) (+ base (* index scale) disp)))) +(progn + (defun sb!vm::%lea-mod64 (base index scale disp) + (let ((base (logand base #xffffffffffffffff)) + (index (logand index #xffffffffffffffff))) + ;; can't use modular version of %LEA, as we only have VOPs for + ;; constant SCALE and DISP. + (ldb (byte 64 0) (+ base (* index scale) disp)))) + (defun sb!vm::%lea-smod61 (base index scale disp) + (let ((base (mask-signed-field 61 base)) + (index (mask-signed-field 61 index))) + ;; can't use modular version of %LEA, as we only have VOPs for + ;; constant SCALE and DISP. + (mask-signed-field 61 (+ base (* index scale) disp))))) (in-package "SB!VM") (define-vop (%lea-mod64/unsigned=>unsigned %lea/unsigned=>unsigned) (:translate %lea-mod64)) +(define-vop (%lea-smod61/fixnum=>fixnum + %lea/fixnum=>fixnum) + (:translate %lea-smod61)) ;;; logical operations (define-modular-fun lognot-mod64 (x) lognot :unsigned 64) @@ -1318,6 +1366,12 @@ (define-vop (fast-logxor-mod64-c/unsigned=>unsigned fast-logxor-c/unsigned=>unsigned) (:translate logxor-mod64)) +(define-vop (fast-logxor-mod64/fixnum=>fixnum + fast-logxor/fixnum=>fixnum) + (:translate logxor-mod64)) +(define-vop (fast-logxor-mod64-c/fixnum=>fixnum + fast-logxor-c/fixnum=>fixnum) + (:translate logxor-mod64)) (define-source-transform logeqv (&rest args) (if (oddp (length args)) @@ -1572,94 +1626,20 @@ (in-package "SB!C") -;;; This is essentially a straight implementation of the algorithm in -;;; "Strength Reduction of Multiplications by Integer Constants", -;;; Youfeng Wu, ACM SIGPLAN Notices, Vol. 30, No.2, February 1995. -(defun basic-decompose-multiplication (arg num n-bits condensed) - (case (aref condensed 0) - (0 - (let ((tmp (min 3 (aref condensed 1)))) - (decf (aref condensed 1) tmp) - `(logand #xffffffff - (%lea ,arg - ,(decompose-multiplication - arg (ash (1- num) (- tmp)) (1- n-bits) (subseq condensed 1)) - ,(ash 1 tmp) 0)))) - ((1 2 3) - (let ((r0 (aref condensed 0))) - (incf (aref condensed 1) r0) - `(logand #xffffffff - (%lea ,(decompose-multiplication - arg (- num (ash 1 r0)) (1- n-bits) (subseq condensed 1)) - ,arg - ,(ash 1 r0) 0)))) - (t (let ((r0 (aref condensed 0))) - (setf (aref condensed 0) 0) - `(logand #xffffffff - (ash ,(decompose-multiplication - arg (ash num (- r0)) n-bits condensed) - ,r0)))))) - -(defun decompose-multiplication (arg num n-bits condensed) - (cond - ((= n-bits 0) 0) - ((= num 1) arg) - ((= n-bits 1) - `(logand #xffffffff (ash ,arg ,(1- (integer-length num))))) - ((let ((max 0) (end 0)) - (loop for i from 2 to (length condensed) - for j = (reduce #'+ (subseq condensed 0 i)) - when (and (> (- (* 2 i) 3 j) max) - (< (+ (ash 1 (1+ j)) - (ash (ldb (byte (- 64 (1+ j)) (1+ j)) num) - (1+ j))) - (ash 1 64))) - do (setq max (- (* 2 i) 3 j) - end i)) - (when (> max 0) - (let ((j (reduce #'+ (subseq condensed 0 end)))) - (let ((n2 (+ (ash 1 (1+ j)) - (ash (ldb (byte (- 64 (1+ j)) (1+ j)) num) (1+ j)))) - (n1 (1+ (ldb (byte (1+ j) 0) (lognot num))))) - `(logand #xffffffff - (- ,(optimize-multiply arg n2) ,(optimize-multiply arg n1)))))))) - ((dolist (i '(9 5 3)) - (when (integerp (/ num i)) - (when (< (logcount (/ num i)) (logcount num)) - (let ((x (gensym))) - (return `(let ((,x ,(optimize-multiply arg (/ num i)))) - (logand #xffffffff - (%lea ,x ,x (1- ,i) 0))))))))) - (t (basic-decompose-multiplication arg num n-bits condensed)))) - -(defun optimize-multiply (arg x) - (let* ((n-bits (logcount x)) - (condensed (make-array n-bits))) - (let ((count 0) (bit 0)) - (dotimes (i 64) - (cond ((logbitp i x) - (setf (aref condensed bit) count) - (setf count 1) - (incf bit)) - (t (incf count))))) - (decompose-multiplication arg x n-bits condensed))) - (defun *-transformer (y) (cond - (t (give-up-ir1-transform)) ((= y (ash 1 (integer-length y))) ;; there's a generic transform for y = 2^k (give-up-ir1-transform)) ((member y '(3 5 9)) ;; we can do these multiplications directly using LEA `(%lea x x ,(1- y) 0)) - ((member :pentium4 *backend-subfeatures*) - ;; the pentium4's multiply unit is reportedly very good - (give-up-ir1-transform)) - ;; FIXME: should make this more fine-grained. If nothing else, - ;; there should probably be a cutoff of about 9 instructions on - ;; pentium-class machines. - (t (optimize-multiply 'x y)))) + (t + ;; A normal 64-bit multiplication takes 4 cycles on Athlon 64/Opteron. + ;; Optimizing multiplications (other than the above cases) to + ;; shifts/adds/leas gives a maximum improvement of 1 cycle, but requires + ;; quite a lot of hairy code. + (give-up-ir1-transform)))) (deftransform * ((x y) ((unsigned-byte 64) (constant-arg (unsigned-byte 64))) @@ -1667,7 +1647,6 @@ "recode as leas, shifts and adds" (let ((y (lvar-value y))) (*-transformer y))) - (deftransform sb!vm::*-mod64 ((x y) ((unsigned-byte 64) (constant-arg (unsigned-byte 64))) (unsigned-byte 64)) @@ -1675,5 +1654,15 @@ (let ((y (lvar-value y))) (*-transformer y))) -;;; FIXME: we should also be able to write an optimizer or two to -;;; convert (+ (* x 2) 17), (- (* x 9) 5) to a %LEA. +(deftransform * ((x y) + ((signed-byte 61) (constant-arg (unsigned-byte 64))) + (signed-byte 61)) + "recode as leas, shifts and adds" + (let ((y (lvar-value y))) + (*-transformer y))) +(deftransform sb!vm::*-smod61 + ((x y) ((signed-byte 61) (constant-arg (unsigned-byte 64))) + (signed-byte 61)) + "recode as leas, shifts and adds" + (let ((y (lvar-value y))) + (*-transformer y))) diff --git a/tests/arith.impure.lisp b/tests/arith.impure.lisp index e4c7e7d..fb6b7d9 100644 --- a/tests/arith.impure.lisp +++ b/tests/arith.impure.lisp @@ -90,6 +90,60 @@ (the (unsigned-byte 32) (ash x y))) (assert (= (one-more-test-case-to-catch-sparc (1- (ash 1 32)) -40) 0)) + +(eval-when (:compile-toplevel :load-toplevel :execute) + (defvar *n-fixnum-bits* (- sb-vm::n-word-bits sb-vm::n-fixnum-tag-bits)) + (defvar *shifts* (let ((list (list 0 + 1 + (1- sb-vm::n-word-bits) + sb-vm::n-word-bits + (1+ sb-vm::n-word-bits)))) + (append list (mapcar #'- list))))) + +(macrolet ((nc-list () + `(list ,@(loop for i from 0 below (length *shifts*) + collect `(frob (nth ,i *shifts*))))) + (c-list () + `(list ,@(loop for i from 0 below (length *shifts*) + collect `(frob ,(nth i *shifts*)))))) + (defun nc-ash (x) + (macrolet ((frob (y) + `(list x ,y (ash x ,y)))) + (nc-list))) + (defun c-ash (x) + (macrolet ((frob (y) + `(list x ,y (ash x ,y)))) + (c-list))) + (defun nc-modular-ash-ub (x) + (macrolet ((frob (y) + `(list x ,y (logand most-positive-fixnum (ash x ,y))))) + (nc-list))) + (defun c-modular-ash-ub (x) + (declare (type (and fixnum unsigned-byte) x) + (optimize speed)) + (macrolet ((frob (y) + `(list x ,y (logand most-positive-fixnum (ash x ,y))))) + (c-list)))) + +(let* ((values (list 0 1 most-positive-fixnum)) + (neg-values (cons most-negative-fixnum + (mapcar #'- values)))) + (labels ((test (value fun1 fun2) + (let ((res1 (funcall fun1 value)) + (res2 (funcall fun2 value))) + (mapcar (lambda (a b) + (unless (equalp a b) + (error "ash failure for ~A vs ~A: ~A not EQUALP ~A" + fun1 fun2 + a b))) + res1 res2)))) + (loop for x in values do + (test x 'nc-ash 'c-ash) + (test x 'nc-modular-ash-ub 'c-modular-ash-ub)) + (loop for x in neg-values do + (test x 'nc-ash 'c-ash)))) + + (defun 64-bit-logcount (x) (declare (optimize speed) (type (unsigned-byte 54) x)) (logcount x))