X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=src%2Fcompiler%2Fx86-64%2Farith.lisp;h=7ea7d06868141aeb55b9f888b1a6b0f3810ba88b;hb=2dbf6e6a5011edecc5361c208e9d5915ca783351;hp=aa9a1d4f7d08a1339a9cf19a4c5807dc0e315061;hpb=9c9020f77ca14545b36795fcc86db2103cc8621e;p=sbcl.git diff --git a/src/compiler/x86-64/arith.lisp b/src/compiler/x86-64/arith.lisp index aa9a1d4..7ea7d06 100644 --- a/src/compiler/x86-64/arith.lisp +++ b/src/compiler/x86-64/arith.lisp @@ -400,12 +400,12 @@ (:args (x :scs (unsigned-reg) :target eax) (y :scs (unsigned-reg unsigned-stack))) (:arg-types unsigned-num unsigned-num) - (:temporary (:sc unsigned-reg :offset eax-offset :target result + (:temporary (:sc unsigned-reg :offset eax-offset :target r :from (:argument 0) :to :result) eax) (:temporary (:sc unsigned-reg :offset edx-offset :from :eval :to :result) edx) (:ignore edx) - (:results (result :scs (unsigned-reg))) + (:results (r :scs (unsigned-reg))) (:result-types unsigned-num) (:note "inline (unsigned-byte 64) arithmetic") (:vop-var vop) @@ -413,7 +413,7 @@ (:generator 6 (move eax x) (inst mul eax y) - (move result eax))) + (move r eax))) (define-vop (fast-truncate/fixnum=>fixnum fast-safe-arith-op) @@ -593,7 +593,7 @@ (:note "inline ASH") (:generator 2 (cond ((and (= amount 1) (not (location= number result))) - (inst lea result (make-ea :qword :index number :scale 2))) + (inst lea result (make-ea :qword :base number :index number))) ((and (= amount 2) (not (location= number result))) (inst lea result (make-ea :qword :index number :scale 4))) ((and (= amount 3) (not (location= number result))) @@ -653,7 +653,7 @@ (:note "inline ASH") (:generator 3 (cond ((and (= amount 1) (not (location= number result))) - (inst lea result (make-ea :qword :index number :scale 2))) + (inst lea result (make-ea :qword :base number :index number))) ((and (= amount 2) (not (location= number result))) (inst lea result (make-ea :qword :index number :scale 4))) ((and (= amount 3) (not (location= number result))) @@ -680,7 +680,7 @@ (:note "inline ASH") (:generator 3 (cond ((and (= amount 1) (not (location= number result))) - (inst lea result (make-ea :qword :index number :scale 2))) + (inst lea result (make-ea :qword :base number :index number))) ((and (= amount 2) (not (location= number result))) (inst lea result (make-ea :qword :index number :scale 4))) ((and (= amount 3) (not (location= number result))) @@ -694,7 +694,7 @@ (inst shl result amount) (inst shr result (- amount)))) (t (if (sc-is result unsigned-reg) - (inst xor result result) + (zeroize result) (inst mov result 0)))))))) (define-vop (fast-ash-left/signed=>signed) @@ -786,7 +786,7 @@ (inst neg ecx) (inst cmp ecx 63) (inst jmp :be OKAY) - (inst xor result result) + (zeroize result) (inst jmp DONE) OKAY (inst shr result :cl) @@ -896,7 +896,7 @@ (inst or ecx ecx) (inst jmp :ns POSITIVE) (inst neg ecx) - (inst xor zero zero) + (zeroize zero) (inst shr result :cl) (inst cmp ecx 63) (inst cmov :nbe result zero) @@ -927,7 +927,7 @@ (inst inc res) (inst jmp DONE) ZERO - (inst xor res res) + (zeroize res) DONE)) (define-vop (unsigned-byte-64-len) @@ -944,89 +944,63 @@ (inst inc res) (inst jmp DONE) ZERO - (inst xor res res) + (zeroize res) DONE)) - (define-vop (unsigned-byte-64-count) (:translate logcount) (:note "inline (unsigned-byte 64) logcount") (:policy :fast-safe) - (:args (arg :scs (unsigned-reg))) + (:args (arg :scs (unsigned-reg) :target result)) (:arg-types unsigned-num) (:results (result :scs (unsigned-reg))) (:result-types positive-fixnum) - (:temporary (:sc unsigned-reg :from (:argument 0)) temp) - (:temporary (:sc unsigned-reg :from (:argument 0)) t1) - (:generator 60 + (:temporary (:sc unsigned-reg) temp) + (:temporary (:sc unsigned-reg) mask) + (:generator 14 + ;; See the comments below for how the algorithm works. The tricks + ;; used can be found for example in AMD's software optimization + ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the + ;; function "pop1", for 32-bit words. The extension to 64 bits is + ;; straightforward. + ;; Calculate 2-bit sums. Note that the value of a two-digit binary + ;; number is the sum of the right digit and twice the left digit. + ;; Thus we can calculate the sum of the two digits by shifting the + ;; left digit to the right position and doing a two-bit subtraction. + ;; This subtraction will never create a borrow and thus can be made + ;; on all 32 2-digit numbers at once. (move result arg) - (move t1 arg) - - (inst mov temp result) - (inst shr temp 1) - (inst and result #x55555555) ; note these masks will restrict the - (inst and temp #x55555555) ; count to the lower half of arg - (inst add result temp) - - (inst mov temp result) + (move temp arg) + (inst shr result 1) + (inst mov mask #x5555555555555555) + (inst and result mask) + (inst sub temp result) + ;; Calculate 4-bit sums by straightforward shift, mask and add. + ;; Note that we shift the source operand of the MOV and not its + ;; destination so that the SHR and the MOV can execute in the same + ;; clock cycle. + (inst mov result temp) (inst shr temp 2) - (inst and result #x33333333) - (inst and temp #x33333333) - (inst add result temp) - - (inst mov temp result) - (inst shr temp 4) - (inst and result #x0f0f0f0f) - (inst and temp #x0f0f0f0f) + (inst mov mask #x3333333333333333) + (inst and result mask) + (inst and temp mask) (inst add result temp) - + ;; Calculate 8-bit sums. Since each sum is at most 8, which fits + ;; into 4 bits, we can apply the mask after the addition, saving one + ;; instruction. (inst mov temp result) - (inst shr temp 8) - (inst and result #x00ff00ff) - (inst and temp #x00ff00ff) + (inst shr result 4) (inst add result temp) - - (inst mov temp result) - (inst shr temp 16) - (inst and result #x0000ffff) - (inst and temp #x0000ffff) - (inst add result temp) - - ;;; now do the upper half - (inst shr t1 32) - - (inst mov temp t1) - (inst shr temp 1) - (inst and t1 #x55555555) - (inst and temp #x55555555) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 2) - (inst and t1 #x33333333) - (inst and temp #x33333333) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 4) - (inst and t1 #x0f0f0f0f) - (inst and temp #x0f0f0f0f) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 8) - (inst and t1 #x00ff00ff) - (inst and temp #x00ff00ff) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 16) - (inst and t1 #x0000ffff) - (inst and temp #x0000ffff) - (inst add t1 temp) - (inst add result t1))) - - + (inst mov mask #x0f0f0f0f0f0f0f0f) + (inst and result mask) + ;; Add all 8 bytes at once by multiplying with #256r11111111. + ;; We need to calculate only the lower 8 bytes of the product. + ;; Of these the most significant byte contains the final result. + ;; Note that there can be no overflow from one byte to the next + ;; as the sum is at most 64 which needs only 7 bits. + (inst mov mask #x0101010101010101) + (inst imul result mask) + (inst shr result 56))) ;;;; binary conditional VOPs @@ -1233,15 +1207,56 @@ ;;;; Modular functions +(defmacro define-mod-binop ((name prototype) function) + `(define-vop (,name ,prototype) + (:args (x :target r :scs (unsigned-reg signed-reg) + :load-if (not (and (or (sc-is x unsigned-stack) + (sc-is x signed-stack)) + (or (sc-is y unsigned-reg) + (sc-is y signed-reg)) + (or (sc-is r unsigned-stack) + (sc-is r signed-stack)) + (location= x r)))) + (y :scs (unsigned-reg signed-reg unsigned-stack signed-stack))) + (:arg-types untagged-num untagged-num) + (:results (r :scs (unsigned-reg signed-reg) :from (:argument 0) + :load-if (not (and (or (sc-is x unsigned-stack) + (sc-is x signed-stack)) + (or (sc-is y unsigned-reg) + (sc-is y unsigned-reg)) + (or (sc-is r unsigned-stack) + (sc-is r unsigned-stack)) + (location= x r))))) + (:result-types unsigned-num) + (:translate ,function))) +(defmacro define-mod-binop-c ((name prototype) function) + `(define-vop (,name ,prototype) + (:args (x :target r :scs (unsigned-reg signed-reg) + :load-if (not (and (or (sc-is x unsigned-stack) + (sc-is x signed-stack)) + (or (sc-is r unsigned-stack) + (sc-is r signed-stack)) + (location= x r))))) + (:info y) + (:arg-types untagged-num (:constant (or (unsigned-byte 31) (signed-byte 32)))) + (:results (r :scs (unsigned-reg signed-reg) :from (:argument 0) + :load-if (not (and (or (sc-is x unsigned-stack) + (sc-is x signed-stack)) + (or (sc-is r unsigned-stack) + (sc-is r unsigned-stack)) + (location= x r))))) + (:result-types unsigned-num) + (:translate ,function))) + (macrolet ((def (name -c-p) (let ((fun64 (intern (format nil "~S-MOD64" name))) (vopu (intern (format nil "FAST-~S/UNSIGNED=>UNSIGNED" name))) (vopcu (intern (format nil "FAST-~S-C/UNSIGNED=>UNSIGNED" name))) (vopf (intern (format nil "FAST-~S/FIXNUM=>FIXNUM" name))) (vopcf (intern (format nil "FAST-~S-C/FIXNUM=>FIXNUM" name))) - (vop64u (intern (format nil "FAST-~S-MOD64/UNSIGNED=>UNSIGNED" name))) + (vop64u (intern (format nil "FAST-~S-MOD64/WORD=>UNSIGNED" name))) (vop64f (intern (format nil "FAST-~S-MOD64/FIXNUM=>FIXNUM" name))) - (vop64cu (intern (format nil "FAST-~S-MOD64-C/UNSIGNED=>UNSIGNED" name))) + (vop64cu (intern (format nil "FAST-~S-MOD64-C/WORD=>UNSIGNED" name))) (vop64cf (intern (format nil "FAST-~S-MOD64-C/FIXNUM=>FIXNUM" name))) (sfun61 (intern (format nil "~S-SMOD61" name))) (svop61f (intern (format nil "FAST-~S-SMOD61/FIXNUM=>FIXNUM" name))) @@ -1249,11 +1264,11 @@ `(progn (define-modular-fun ,fun64 (x y) ,name :unsigned 64) (define-modular-fun ,sfun61 (x y) ,name :signed 61) - (define-vop (,vop64u ,vopu) (:translate ,fun64)) + (define-mod-binop (,vop64u ,vopu) ,fun64) (define-vop (,vop64f ,vopf) (:translate ,fun64)) (define-vop (,svop61f ,vopf) (:translate ,sfun61)) ,@(when -c-p - `((define-vop (,vop64cu ,vopcu) (:translate ,fun64)) + `((define-mod-binop-c (,vop64cu ,vopcu) ,fun64) (define-vop (,svop61cf ,vopcf) (:translate ,sfun61)))))))) (def + t) (def - t) @@ -1356,12 +1371,12 @@ (inst not r))) (define-modular-fun logxor-mod64 (x y) logxor :unsigned 64) -(define-vop (fast-logxor-mod64/unsigned=>unsigned - fast-logxor/unsigned=>unsigned) - (:translate logxor-mod64)) -(define-vop (fast-logxor-mod64-c/unsigned=>unsigned - fast-logxor-c/unsigned=>unsigned) - (:translate logxor-mod64)) +(define-mod-binop (fast-logxor-mod64/word=>unsigned + fast-logxor/unsigned=>unsigned) + logxor-mod64) +(define-mod-binop-c (fast-logxor-mod64-c/word=>unsigned + fast-logxor-c/unsigned=>unsigned) + logxor-mod64) (define-vop (fast-logxor-mod64/fixnum=>fixnum fast-logxor/fixnum=>fixnum) (:translate logxor-mod64))