From b6e9c116826f9461e2660d87c031fd1c4488e776 Mon Sep 17 00:00:00 2001 From: Juho Snellman Date: Thu, 3 Nov 2005 12:41:07 +0000 Subject: [PATCH] 0.9.6.14: Faster allocation on x86-64 (25% speedup on memory-intensive CL-BENCH tests, 5% on more generic stuff like COMPILER): * Inline allocation was using a memory-to-register XCHG (latency 16 on Athlon 64) on the fast path. Use a temporary register instead. * Change the temp-tn from r13 to r11, which has a shorter encoding (results in smaller core and better icache behaviour) TODO: Check whether the XCHG issue also caused the bizarre P4 performance problems with the (disabled) x86 inline allocation support, and whether anything can be done to fix the problem. Using the same solution is probably impossible due to the lack of extra registers. --- NEWS | 1 + src/assembly/x86-64/support.lisp | 12 ++++++------ src/compiler/x86-64/macros.lisp | 14 ++++++++------ src/compiler/x86-64/vm.lisp | 18 ++++++++++++++---- version.lisp-expr | 2 +- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 1b7ab15..215c3e2 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,7 @@ changes in sbcl-0.9.7 relative to sbcl-0.9.6: merged with *DEFAULT-PATHNAME-DEFAULTS*. * optimization: performance improvements to IO on file streams of :ELEMENT-TYPE CHARACTER + * optimization: much faster memory allocation on x86-64 changes in sbcl-0.9.6 relative to sbcl-0.9.5: * bug fix: add a workaround to SBCL looping infinitely at startup on diff --git a/src/assembly/x86-64/support.lisp b/src/assembly/x86-64/support.lisp index 8f5e239..5e10c88 100644 --- a/src/assembly/x86-64/support.lisp +++ b/src/assembly/x86-64/support.lisp @@ -13,24 +13,24 @@ (ecase style (:raw (values - `((inst lea r13-tn + `((inst lea temp-reg-tn (make-ea :qword :disp (make-fixup ',name :assembly-routine))) - (inst call r13-tn)) + (inst call temp-reg-tn)) nil)) (:full-call (values `((note-this-location ,vop :call-site) - (inst lea r13-tn + (inst lea temp-reg-tn (make-ea :qword :disp (make-fixup ',name :assembly-routine))) - (inst call r13-tn) + (inst call temp-reg-tn) (note-this-location ,vop :single-value-return) (move rsp-tn rbx-tn)) '((:save-p :compute-only)))) (:none (values - `((inst lea r13-tn + `((inst lea temp-reg-tn (make-ea :qword :disp (make-fixup ',name :assembly-routine))) - (inst jmp r13-tn)) + (inst jmp temp-reg-tn)) nil)))) (!def-vm-support-routine generate-return-sequence (style) diff --git a/src/compiler/x86-64/macros.lisp b/src/compiler/x86-64/macros.lisp index 4a6a36b..ccfc7d7 100644 --- a/src/compiler/x86-64/macros.lisp +++ b/src/compiler/x86-64/macros.lisp @@ -130,9 +130,9 @@ (defun allocation-tramp (alloc-tn size &optional ignored) (declare (ignore ignored)) (inst push size) - (inst lea r13-tn (make-ea :qword + (inst lea temp-reg-tn (make-ea :qword :disp (make-fixup "alloc_tramp" :foreign))) - (inst call r13-tn) + (inst call temp-reg-tn) (inst pop alloc-tn) (values)) @@ -168,12 +168,14 @@ (cond (in-elsewhere (allocation-tramp alloc-tn size)) (t + (inst mov temp-reg-tn free-pointer) (unless (and (tn-p size) (location= alloc-tn size)) (inst mov alloc-tn size)) - (inst add alloc-tn free-pointer) + (inst add alloc-tn temp-reg-tn) (inst cmp end-addr alloc-tn) (inst jmp :be NOT-INLINE) - (inst xchg free-pointer alloc-tn) + (inst mov free-pointer alloc-tn) + (inst mov alloc-tn temp-reg-tn) (emit-label DONE) (assemble (*elsewhere*) (emit-label NOT-INLINE) @@ -189,9 +191,9 @@ (defun allocation (alloc-tn size &optional ignored) (declare (ignore ignored)) (inst push size) - (inst lea r13-tn (make-ea :qword + (inst lea temp-reg-tn (make-ea :qword :disp (make-fixup "alloc_tramp" :foreign))) - (inst call r13-tn) + (inst call temp-reg-tn) (inst pop alloc-tn) (values)) diff --git a/src/compiler/x86-64/vm.lisp b/src/compiler/x86-64/vm.lisp index 32a59d9..ed05703 100644 --- a/src/compiler/x86-64/vm.lisp +++ b/src/compiler/x86-64/vm.lisp @@ -71,7 +71,7 @@ (defreg r15b 30 :byte) (defregset *byte-regs* al cl dl bl sil dil r8b r9b r10b - r11b #+nil r12b #+nil r13b r14b r15b) + #+nil r11b #+nil r12b r13b r14b r15b) ;; word registers (defreg ax 0 :word) @@ -116,8 +116,13 @@ ;; list of qword registers. However ;; r13 is already used as temporary [#lisp irc 2005/01/30] ;; and we're now going to use r12 for the struct thread* + ;; + ;; Except that now we use r11 instead of r13 as the temporary, + ;; since it's got a more compact encoding than r13, and experimentally + ;; the temporary gets used more than the other registers that are never + ;; wired. -- JES, 2005-11-02 (defregset *qword-regs* rax rcx rdx rbx rsi rdi - r8 r9 r10 r11 r14 r15) + r8 r9 r10 #+nil r11 #+nil r12 r13 r14 r15) ;; floating point registers (defreg float0 0 :float) @@ -382,15 +387,20 @@ `(progn ,@(forms))))) (def-misc-reg-tns unsigned-reg rax rbx rcx rdx rbp rsp rdi rsi - r8 r9 r10 r11 r12 r13 r14 r15) + r8 r9 r10 r11 r12 r13 r14 r15) (def-misc-reg-tns dword-reg eax ebx ecx edx ebp esp edi esi) (def-misc-reg-tns word-reg ax bx cx dx bp sp di si) (def-misc-reg-tns byte-reg al cl dl bl sil dil r8b r9b r10b - r11b r14b r15b) + r11b r12b r13b r14b r15b) (def-misc-reg-tns single-reg float0 float1 float2 float3 float4 float5 float6 float7 float8 float9 float10 float11 float12 float13 float14 float15)) +;; A register that's never used by the code generator, and can therefore +;; be used as an assembly temporary in cases where a VOP :TEMPORARY can't +;; be used. +(defparameter temp-reg-tn r11-tn) + ;;; TNs for registers used to pass arguments (defparameter *register-arg-tns* (mapcar (lambda (register-arg-name) diff --git a/version.lisp-expr b/version.lisp-expr index 19b7386..bb543ba 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.9.6.13" +"0.9.6.14" -- 1.7.10.4