From: Juho Snellman Date: Fri, 28 Jul 2006 01:08:40 +0000 (+0000) Subject: 0.9.15.1: X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=1a1f1815159e714a635e92e9f0f2f7845e64fc91;p=sbcl.git 0.9.15.1: Faster implementation of the LOGCOUNT VOP for x86 and x86-64, and a faster BIGNUM-LOGCOUNT on all platforms. (Patch from Lutz Euler on sbcl-devel, "Patch: Optimisation of LOGCOUNT" on 2006-07-23). --- diff --git a/NEWS b/NEWS index cfc0b4b..7d2ef39 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,8 @@ ;;;; -*- coding: utf-8; -*- +changes in sbcl-0.9.16 relative to sbcl-0.9.15: + * optimization: faster LOGCOUNT implementation on x86 and x86-64 + (thanks to Lutz Euler) + changes in sbcl-0.9.15 relative to sbcl-0.9.14: * added support for the ucs-2 external format. (contributed by Ivan Boldyrev) diff --git a/src/code/bignum.lisp b/src/code/bignum.lisp index 9374acc..c07fc19 100644 --- a/src/code/bignum.lisp +++ b/src/code/bignum.lisp @@ -1320,16 +1320,18 @@ (defun bignum-logcount (bignum) (declare (type bignum-type bignum)) - (let* ((length (%bignum-length bignum)) - (plusp (%bignum-0-or-plusp bignum length)) - (result 0)) + (let ((length (%bignum-length bignum)) + (result 0)) (declare (type bignum-index length) (fixnum result)) (do ((index 0 (1+ index))) - ((= index length) result) + ((= index length) + (if (%bignum-0-or-plusp bignum length) + result + (- (* length digit-size) result))) (let ((digit (%bignum-ref bignum index))) (declare (type bignum-element-type digit)) - (incf result (logcount (if plusp digit (%lognot digit)))))))) + (incf result (logcount digit)))))) ;;;; logical operations diff --git a/src/compiler/x86-64/arith.lisp b/src/compiler/x86-64/arith.lisp index aa9a1d4..100ca5e 100644 --- a/src/compiler/x86-64/arith.lisp +++ b/src/compiler/x86-64/arith.lisp @@ -947,86 +947,60 @@ (inst xor res res) DONE)) - (define-vop (unsigned-byte-64-count) (:translate logcount) (:note "inline (unsigned-byte 64) logcount") (:policy :fast-safe) - (:args (arg :scs 
(unsigned-reg))) + (:args (arg :scs (unsigned-reg) :target result)) (:arg-types unsigned-num) (:results (result :scs (unsigned-reg))) (:result-types positive-fixnum) - (:temporary (:sc unsigned-reg :from (:argument 0)) temp) - (:temporary (:sc unsigned-reg :from (:argument 0)) t1) - (:generator 60 + (:temporary (:sc unsigned-reg) temp) + (:temporary (:sc unsigned-reg) mask) + (:generator 14 + ;; See the comments below for how the algorithm works. The tricks + ;; used can be found for example in AMD's software optimization + ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the + ;; function "pop1", for 32-bit words. The extension to 64 bits is + ;; straightforward. + ;; Calculate 2-bit sums. Note that the value of a two-digit binary + ;; number is the sum of the right digit and twice the left digit. + ;; Thus we can calculate the sum of the two digits by shifting the + ;; left digit to the right position and doing a two-bit subtraction. + ;; This subtraction will never create a borrow and thus can be made + ;; on all 32 2-digit numbers at once. (move result arg) - (move t1 arg) - - (inst mov temp result) - (inst shr temp 1) - (inst and result #x55555555) ; note these masks will restrict the - (inst and temp #x55555555) ; count to the lower half of arg - (inst add result temp) - - (inst mov temp result) + (move temp arg) + (inst shr result 1) + (inst mov mask #x5555555555555555) + (inst and result mask) + (inst sub temp result) + ;; Calculate 4-bit sums by straightforward shift, mask and add. + ;; Note that we shift the source operand of the MOV and not its + ;; destination so that the SHR and the MOV can execute in the same + ;; clock cycle. 
+ (inst mov result temp) (inst shr temp 2) - (inst and result #x33333333) - (inst and temp #x33333333) - (inst add result temp) - - (inst mov temp result) - (inst shr temp 4) - (inst and result #x0f0f0f0f) - (inst and temp #x0f0f0f0f) + (inst mov mask #x3333333333333333) + (inst and result mask) + (inst and temp mask) (inst add result temp) - + ;; Calculate 8-bit sums. Since each sum is at most 8, which fits + ;; into 4 bits, we can apply the mask after the addition, saving one + ;; instruction. (inst mov temp result) - (inst shr temp 8) - (inst and result #x00ff00ff) - (inst and temp #x00ff00ff) + (inst shr result 4) (inst add result temp) - - (inst mov temp result) - (inst shr temp 16) - (inst and result #x0000ffff) - (inst and temp #x0000ffff) - (inst add result temp) - - ;;; now do the upper half - (inst shr t1 32) - - (inst mov temp t1) - (inst shr temp 1) - (inst and t1 #x55555555) - (inst and temp #x55555555) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 2) - (inst and t1 #x33333333) - (inst and temp #x33333333) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 4) - (inst and t1 #x0f0f0f0f) - (inst and temp #x0f0f0f0f) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 8) - (inst and t1 #x00ff00ff) - (inst and temp #x00ff00ff) - (inst add t1 temp) - - (inst mov temp t1) - (inst shr temp 16) - (inst and t1 #x0000ffff) - (inst and temp #x0000ffff) - (inst add t1 temp) - (inst add result t1))) - - + (inst mov mask #x0f0f0f0f0f0f0f0f) + (inst and result mask) + ;; Add all 8 bytes at once by multiplying with #256r11111111. + ;; We need to calculate only the lower 8 bytes of the product. + ;; Of these the most significant byte contains the final result. + ;; Note that there can be no overflow from one byte to the next + ;; as the sum is at most 64 which needs only 7 bits. 
+ (inst mov mask #x0101010101010101) + (inst imul result mask) + (inst shr result 56))) ;;;; binary conditional VOPs diff --git a/src/compiler/x86/arith.lisp b/src/compiler/x86/arith.lisp index 1078a27..ffd540a 100644 --- a/src/compiler/x86/arith.lisp +++ b/src/compiler/x86/arith.lisp @@ -953,43 +953,53 @@ (:translate logcount) (:note "inline (unsigned-byte 32) logcount") (:policy :fast-safe) - (:args (arg :scs (unsigned-reg))) + (:args (arg :scs (unsigned-reg) :target result)) (:arg-types unsigned-num) (:results (result :scs (unsigned-reg))) (:result-types positive-fixnum) - (:temporary (:sc unsigned-reg :from (:argument 0)) temp) - (:generator 30 + (:temporary (:sc unsigned-reg) temp) + (:generator 14 + ;; See the comments below for how the algorithm works. The tricks + ;; used can be found for example in AMD's software optimization + ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the + ;; function "pop1". + ;; Calculate 2-bit sums. Note that the value of a two-digit binary + ;; number is the sum of the right digit and twice the left digit. + ;; Thus we can calculate the sum of the two digits by shifting the + ;; left digit to the right position and doing a two-bit subtraction. + ;; This subtraction will never create a borrow and thus can be made + ;; on all 16 2-digit numbers at once. (move result arg) - - (inst mov temp result) - (inst shr temp 1) + (move temp arg) + (inst shr result 1) (inst and result #x55555555) - (inst and temp #x55555555) - (inst add result temp) - - (inst mov temp result) + (inst sub temp result) + ;; Calculate 4-bit sums by straightforward shift, mask and add. + ;; Note that we shift the source operand of the MOV and not its + ;; destination so that the SHR and the MOV can execute in the same + ;; clock cycle. + (inst mov result temp) (inst shr temp 2) (inst and result #x33333333) (inst and temp #x33333333) (inst add result temp) - + ;; Calculate 8-bit sums. 
Since each sum is at most 8, which fits + ;; into 4 bits, we can apply the mask after the addition, saving one + ;; instruction. (inst mov temp result) - (inst shr temp 4) - (inst and result #x0f0f0f0f) - (inst and temp #x0f0f0f0f) + (inst shr result 4) (inst add result temp) - + (inst and result #x0f0f0f0f) + ;; Calculate the two 16-bit sums and the 32-bit sum. No masking is + ;; necessary in between since the final sum is at most 32 which fits + ;; into 6 bits. (inst mov temp result) - (inst shr temp 8) - (inst and result #x00ff00ff) - (inst and temp #x00ff00ff) + (inst shr result 8) (inst add result temp) - (inst mov temp result) - (inst shr temp 16) - (inst and result #x0000ffff) - (inst and temp #x0000ffff) - (inst add result temp))) + (inst shr result 16) + (inst add result temp) + (inst and result #xff))) ;;;; binary conditional VOPs diff --git a/tests/arith.pure.lisp b/tests/arith.pure.lisp index 630c0a8..915c545 100644 --- a/tests/arith.pure.lisp +++ b/tests/arith.pure.lisp @@ -264,3 +264,29 @@ (funcall (lambda () (declare (notinline logxor)) (min (logxor 0 0 0 286142502)))))) + +;; Small bugs in LOGCOUNT can still allow SBCL to be built and thus go +;; unnoticed, so check more thoroughly here. +(with-test (:name :logcount) + (flet ((test (x n) + (unless (= (logcount x) n) + (error "logcount failure for ~a" x)))) + ;; Test with some patterns with well known number of ones/zeroes ... + (dotimes (i 128) + (let ((x (ash 1 i))) + (test x 1) + (test (- x) i) + (test (1- x) i))) + ;; ... and with some random integers of varying length. 
+ (flet ((test-logcount (x) + (declare (type integer x)) + (do ((result 0 (1+ result)) + (x (if (minusp x) + (lognot x) + x) + (logand x (1- x)))) + ((zerop x) result)))) + (dotimes (i 200) + (let ((x (random (ash 1 i)))) + (test x (test-logcount x)) + (test (- x) (test-logcount (- x)))))))) diff --git a/version.lisp-expr b/version.lisp-expr index 469961e..3956eb4 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.9.15" +"0.9.15.1"