;; at the low five bits of the result.
(inst sar result (min 31 (- amount)))
;; Fixnum correction.
- (inst and result #xfffffffc)))))))
+ (inst and result (lognot fixnum-tag-mask))))))))
(define-vop (fast-ash-left/fixnum=>fixnum)
(:translate ash)
(:translate logcount)
(:note "inline (unsigned-byte 32) logcount")
(:policy :fast-safe)
- (:args (arg :scs (unsigned-reg)))
+ (:args (arg :scs (unsigned-reg) :target result))
(:arg-types unsigned-num)
(:results (result :scs (unsigned-reg)))
(:result-types positive-fixnum)
- (:temporary (:sc unsigned-reg :from (:argument 0)) temp)
- (:generator 30
+ (:temporary (:sc unsigned-reg) temp)
+ (:generator 14
+ ;; See the comments below for how the algorithm works. The tricks
+ ;; used can be found for example in AMD's software optimization
+ ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
+ ;; function "pop1".
+ ;; Calculate 2-bit sums. Note that the value of a two-digit binary
+ ;; number is the sum of the right digit and twice the left digit.
+ ;; Thus we can calculate the sum of the two digits by shifting the
+ ;; left digit to the right position and doing a two-bit subtraction.
+ ;; This subtraction will never create a borrow and thus can be made
+ ;; on all 16 2-digit numbers at once.
(move result arg)
-
- (inst mov temp result)
- (inst shr temp 1)
+ (move temp arg)
+ (inst shr result 1)
(inst and result #x55555555)
- (inst and temp #x55555555)
- (inst add result temp)
-
- (inst mov temp result)
+ (inst sub temp result)
+ ;; Calculate 4-bit sums by straightforward shift, mask and add.
+ ;; Note that we shift the source operand of the MOV and not its
+ ;; destination so that the SHR and the MOV can execute in the same
+ ;; clock cycle.
+ (inst mov result temp)
(inst shr temp 2)
(inst and result #x33333333)
(inst and temp #x33333333)
(inst add result temp)
-
+ ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
+ ;; into 4 bits, we can apply the mask after the addition, saving one
+ ;; instruction.
(inst mov temp result)
- (inst shr temp 4)
- (inst and result #x0f0f0f0f)
- (inst and temp #x0f0f0f0f)
+ (inst shr result 4)
(inst add result temp)
-
+ (inst and result #x0f0f0f0f)
+ ;; Calculate the two 16-bit sums and the 32-bit sum. No masking is
+ ;; necessary inbetween since the final sum is at most 32 which fits
+ ;; into 6 bits.
(inst mov temp result)
- (inst shr temp 8)
- (inst and result #x00ff00ff)
- (inst and temp #x00ff00ff)
+ (inst shr result 8)
(inst add result temp)
-
(inst mov temp result)
- (inst shr temp 16)
- (inst and result #x0000ffff)
- (inst and temp #x0000ffff)
- (inst add result temp)))
+ (inst shr result 16)
+ (inst add result temp)
+ (inst and result #xff)))
\f
;;;; binary conditional VOPs
(:arg-types unsigned-num (:constant (unsigned-byte 32)))
(:info target not-p y))
+(macrolet ((define-logtest-vops ()
+ `(progn
+ ,@(loop for suffix in '(/fixnum -c/fixnum
+ /signed -c/signed
+ /unsigned -c/unsigned)
+ for cost in '(4 3 6 5 6 5)
+ collect
+ `(define-vop (,(symbolicate "FAST-LOGTEST" suffix)
+ ,(symbolicate "FAST-CONDITIONAL" suffix))
+ (:translate logtest)
+ (:generator ,cost
+ (emit-optimized-test-inst x
+ ,(if (eq suffix '-c/fixnum)
+ '(fixnumize y)
+ 'y))
+ (inst jmp (if not-p :e :ne) target)))))))
+ (define-logtest-vops))
+
+(defknown %logbitp (integer unsigned-byte) boolean
+ (movable foldable flushable always-translatable))
+
+;;; only for constant folding within the compiler
+(defun %logbitp (integer index)
+ (logbitp index integer))
+
+;;; too much work to do the non-constant case (maybe?)
+(define-vop (fast-logbitp-c/fixnum fast-conditional-c/fixnum)
+ (:translate %logbitp)
+ (:arg-types tagged-num (:constant (integer 0 29)))
+ (:generator 4
+ (inst bt x (+ y n-fixnum-tag-bits))
+ (inst jmp (if not-p :nc :c) target)))
+
+(define-vop (fast-logbitp/signed fast-conditional/signed)
+ (:args (x :scs (signed-reg signed-stack))
+ (y :scs (signed-reg)))
+ (:translate %logbitp)
+ (:generator 6
+ (inst bt x y)
+ (inst jmp (if not-p :nc :c) target)))
+
+(define-vop (fast-logbitp-c/signed fast-conditional-c/signed)
+ (:translate %logbitp)
+ (:arg-types signed-num (:constant (integer 0 31)))
+ (:generator 5
+ (inst bt x y)
+ (inst jmp (if not-p :nc :c) target)))
+
+(define-vop (fast-logbitp/unsigned fast-conditional/unsigned)
+ (:args (x :scs (unsigned-reg unsigned-stack))
+ (y :scs (unsigned-reg)))
+ (:translate %logbitp)
+ (:generator 6
+ (inst bt x y)
+ (inst jmp (if not-p :nc :c) target)))
+
+(define-vop (fast-logbitp-c/unsigned fast-conditional-c/unsigned)
+ (:translate %logbitp)
+ (:arg-types unsigned-num (:constant (integer 0 31)))
+ (:generator 5
+ (inst bt x y)
+ (inst jmp (if not-p :nc :c) target)))
(macrolet ((define-conditional-vop (tran cond unsigned not-cond not-unsigned)
`(progn
(define-full-reffer bignum-ref * bignum-digits-offset other-pointer-lowtag
(unsigned-reg) unsigned-num sb!bignum:%bignum-ref)
-
+(define-full-reffer+offset bignum-ref-with-offset *
+ bignum-digits-offset other-pointer-lowtag
+ (unsigned-reg) unsigned-num sb!bignum:%bignum-ref-with-offset)
(define-full-setter bignum-set * bignum-digits-offset other-pointer-lowtag
(unsigned-reg) unsigned-num sb!bignum:%bignum-set)
:load-if (not (and (sc-is result unsigned-stack)
(location= digit result)))))
(:result-types unsigned-num)
- (:generator 1
+ (:generator 2
(move result digit)
(move ecx count)
(inst sar result :cl)))
+(define-vop (digit-ashr/c)
+ (:translate sb!bignum:%ashr)
+ (:policy :fast-safe)
+ (:args (digit :scs (unsigned-reg unsigned-stack) :target result))
+ (:arg-types unsigned-num (:constant (integer 0 31)))
+ (:info count)
+ (:results (result :scs (unsigned-reg) :from (:argument 0)
+ :load-if (not (and (sc-is result unsigned-stack)
+ (location= digit result)))))
+ (:result-types unsigned-num)
+ (:generator 1
+ (move result digit)
+ (inst sar result count)))
+
(define-vop (digit-lshr digit-ashr)
(:translate sb!bignum:%digit-logical-shift-right)
(:generator 1