From 462b946eaf95cdba2648a67ad2bc3b06b7c7a4f2 Mon Sep 17 00:00:00 2001 From: Juho Snellman Date: Mon, 13 Nov 2006 06:10:15 +0000 Subject: [PATCH] 0.9.18.47: Faster &MORE-handling VOPs on x86 and x86-64. * The performance of LOOP is really bad on modern x86, rewrite the loops in %LISTIFY-REST-ARGS and %MORE-ARG-VALUES to do the index manipulation and branching explicitly. * REP MOVS isn't very good for copying small blocks of memory, use an explicit loop in COPY-MORE-ARG. * Rewrite the x86-64 COPY-MORE-ARG to take advantage of the extra registers. * Implement %MORE-ARG (exists on all other platforms). --- src/compiler/fndb.lisp | 1 - src/compiler/x86-64/call.lisp | 115 ++++++++++++++++++--------------------- src/compiler/x86-64/values.lisp | 15 ++--- src/compiler/x86/call.lisp | 45 +++++++++++---- src/compiler/x86/values.lisp | 15 ++--- version.lisp-expr | 2 +- 6 files changed, 103 insertions(+), 90 deletions(-) diff --git a/src/compiler/fndb.lisp b/src/compiler/fndb.lisp index 71eb272..fe73ff0 100644 --- a/src/compiler/fndb.lisp +++ b/src/compiler/fndb.lisp @@ -1407,7 +1407,6 @@ (defknown %special-unbind (t) t) (defknown %listify-rest-args (t index) list (flushable)) (defknown %more-arg-context (t t) (values t index) (flushable)) -#!-stack-grows-downward-not-upward (defknown %more-arg (t index) t) #!+stack-grows-downward-not-upward (defknown %more-kw-arg (t index) (values t t)) diff --git a/src/compiler/x86-64/call.lisp b/src/compiler/x86-64/call.lisp index e0af725..da925b3 100644 --- a/src/compiler/x86-64/call.lisp +++ b/src/compiler/x86-64/call.lisp @@ -1148,25 +1148,10 @@ ;;; Copy a &MORE arg from the argument area to the end of the current ;;; frame. FIXED is the number of non-&MORE arguments. -;;; -;;; The tricky part is doing this without trashing any of the calling -;;; convention registers that are still needed. This vop is emitted -;;; directly after the xep-allocate frame. That means the registers -;;; are in use as follows: -;;; -;;; RAX -- The lexenv. -;;; RBX -- Available. -;;; RCX -- The total number of arguments. -;;; RDX -- The first arg. -;;; RDI -- The second arg. -;;; RSI -- The third arg. -;;; -;;; So basically, we have one register available for our use: RBX. -;;; -;;; What we can do is push the other regs onto the stack, and then -;;; restore their values by looking directly below where we put the -;;; more-args. (define-vop (copy-more-arg) + (:temporary (:sc any-reg :offset r8-offset) copy-index) + (:temporary (:sc any-reg :offset r9-offset) source) + (:temporary (:sc descriptor-reg :offset r10-offset) temp) (:info fixed) (:generator 20 ;; Avoid the copy if there are no more args. @@ -1202,29 +1187,23 @@ ;; Number to copy = nargs-fixed (inst sub rcx-tn (fixnumize fixed)))) - ;; Save rdi and rsi register args. - (inst push rdi-tn) - (inst push rsi-tn) - ;; Okay, we have pushed the register args. We can trash them - ;; now. - - ;; Initialize dst to be end of stack; skiping the values pushed - ;; above. - (inst lea rdi-tn (make-ea :qword :base rsp-tn :disp 16)) - - ;; Initialize src to be end of args. - (inst mov rsi-tn rbp-tn) - (inst sub rsi-tn rbx-tn) + ;; Initialize R8 to be the end of args. + (inst mov source rbp-tn) + (inst sub source rbx-tn) - (inst shr rcx-tn word-shift) ; make word count - ;; And copy the args. - (inst cld) ; auto-inc RSI and RDI. - (inst rep) - (inst movs :qword) + ;; We need to copy from downwards up to avoid overwriting some of + ;; the yet uncopied args. So we need to use R9 as the copy index + ;; and RCX as the loop counter, rather than using RCX for both. + (inst xor copy-index copy-index) - ;; So now we need to restore RDI and RSI. - (inst pop rsi-tn) - (inst pop rdi-tn) + ;; We used to use REP MOVS here, but on modern x86 it performs + ;; much worse than an explicit loop for small blocks. + COPY-LOOP + (inst mov temp (make-ea :qword :base source :index copy-index)) + (inst mov (make-ea :qword :base rsp-tn :index copy-index) temp) + (inst add copy-index n-word-bytes) + (inst sub rcx-tn n-word-bytes) + (inst jmp :nz COPY-LOOP) DO-REGS @@ -1233,26 +1212,26 @@ ;; Here: nargs>=1 && nargs>fixed (when (< fixed register-arg-count) - ;; Now we have to deposit any more args that showed up in - ;; registers. - (do ((i fixed)) - ( nil ) - ;; Store it relative to rbp - (inst mov (make-ea :qword :base rbp-tn - :disp (- (* n-word-bytes - (+ 1 (- i fixed) - (max 3 (sb-allocated-size 'stack)))))) - (nth i *register-arg-tns*)) - - (incf i) - (when (>= i register-arg-count) - (return)) - - ;; Don't deposit any more than there are. - (if (zerop i) - (inst test rcx-tn rcx-tn) - (inst cmp rcx-tn (fixnumize i))) - (inst jmp :eq DONE))) + ;; Now we have to deposit any more args that showed up in + ;; registers. + (do ((i fixed)) + ( nil ) + ;; Store it relative to rbp + (inst mov (make-ea :qword :base rbp-tn + :disp (- (* n-word-bytes + (+ 1 (- i fixed) + (max 3 (sb-allocated-size 'stack)))))) + (nth i *register-arg-tns*)) + + (incf i) + (when (>= i register-arg-count) + (return)) + + ;; Don't deposit any more than there are. + (if (zerop i) + (inst test rcx-tn rcx-tn) + (inst cmp rcx-tn (fixnumize i))) + (inst jmp :eq DONE))) (inst jmp DONE) @@ -1278,6 +1257,19 @@ (inst mov keyword (make-ea :qword :base object :index index :disp n-word-bytes)))) +(define-vop (more-arg) + (:translate sb!c::%more-arg) + (:policy :fast-safe) + (:args (object :scs (descriptor-reg) :to (:result 1)) + (index :scs (any-reg) :to (:result 1) :target value)) + (:arg-types * tagged-num) + (:results (value :scs (descriptor-reg any-reg))) + (:result-types *) + (:generator 4 + (move value index) + (inst neg value) + (inst mov value (make-ea :qword :base object :index value)))) + ;;; Turn more arg (context, count) into a list. (defoptimizer (%listify-rest-args stack-allocate-result) ((&rest args)) t) @@ -1308,8 +1300,6 @@ (maybe-pseudo-atomic stack-allocate-p (allocation dst dst node stack-allocate-p) (inst lea dst (make-ea :byte :base dst :disp list-pointer-lowtag)) - ;; Convert the count into a raw value, so that we can use the - ;; LOOP instruction. (inst shr rcx (1- n-lowtag-bits)) ;; Set decrement mode (successive args at lower addresses) (inst std) @@ -1328,7 +1318,8 @@ (inst lods rax) (storew rax dst 0 list-pointer-lowtag) ;; Go back for more. - (inst loop loop) + (inst sub rcx 1) + (inst jmp :nz loop) ;; NIL out the last cons. (storew nil-value dst 1 list-pointer-lowtag)) (emit-label done)))) diff --git a/src/compiler/x86-64/values.lisp b/src/compiler/x86-64/values.lisp index 4f5f5ae..0c3f4e7 100644 --- a/src/compiler/x86-64/values.lisp +++ b/src/compiler/x86-64/values.lisp @@ -113,7 +113,7 @@ (:arg-types * positive-fixnum positive-fixnum) (:temporary (:sc any-reg :offset rsi-offset :from (:argument 0)) src) (:temporary (:sc descriptor-reg :offset rax-offset) temp) - (:temporary (:sc unsigned-reg :offset rcx-offset) temp1) + (:temporary (:sc unsigned-reg :offset rcx-offset) loop-index) (:results (start :scs (any-reg)) (count :scs (any-reg))) (:generator 20 @@ -135,17 +135,18 @@ (move count num) (inst sub count skip))) - (move temp1 count) + (move loop-index count) (inst mov start rsp-tn) (inst jecxz DONE) ; check for 0 count? - (inst shr temp1 word-shift) ; convert the fixnum to a count. + (inst sub rsp-tn count) + (inst sub src count) - (inst std) ; move down the stack as more value are copied to the bottom. LOOP - (inst lods temp) - (inst push temp) - (inst loop LOOP) + (inst mov temp (make-ea :qword :base src :index loop-index)) + (inst sub loop-index n-word-bytes) + (inst mov (make-ea :qword :base rsp-tn :index loop-index) temp) + (inst jmp :nz LOOP) DONE)) diff --git a/src/compiler/x86/call.lisp b/src/compiler/x86/call.lisp index 3cca339..ad0bac1 100644 --- a/src/compiler/x86/call.lisp +++ b/src/compiler/x86/call.lisp @@ -1236,24 +1236,33 @@ ;; Save edi and esi register args. (inst push edi-tn) (inst push esi-tn) + (inst push ebx-tn) ;; Okay, we have pushed the register args. We can trash them ;; now. - ;; Initialize dst to be end of stack; skiping the values pushed - ;; above. - (inst lea edi-tn (make-ea :dword :base esp-tn :disp 8)) - ;; Initialize src to be end of args. (inst mov esi-tn ebp-tn) (inst sub esi-tn ebx-tn) - (inst shr ecx-tn word-shift) ; make word count - ;; And copy the args. - (inst cld) ; auto-inc ESI and EDI. - (inst rep) - (inst movs :dword) + ;; We need to copy from downwards up to avoid overwriting some of + ;; the yet uncopied args. So we need to use EBX as the copy index + ;; and ECX as the loop counter, rather than using ECX for both. + (inst xor ebx-tn ebx-tn) + + ;; We used to use REP MOVS here, but on modern x86 it performs + ;; much worse than an explicit loop for small blocks. + COPY-LOOP + (inst mov edi-tn (make-ea :dword :base esi-tn :index ebx-tn)) + ;; The :DISP is to account for the registers saved on the stack + (inst mov (make-ea :dword :base esp-tn :disp (* 3 n-word-bytes) + :index ebx-tn) + edi-tn) + (inst add ebx-tn n-word-bytes) + (inst sub ecx-tn n-word-bytes) + (inst jmp :nz COPY-LOOP) ;; So now we need to restore EDI and ESI. + (inst pop ebx-tn) (inst pop esi-tn) (inst pop edi-tn) @@ -1315,6 +1324,19 @@ (inst mov keyword (make-ea :dword :base object :index index :disp n-word-bytes)))))) +(define-vop (more-arg) + (:translate sb!c::%more-arg) + (:policy :fast-safe) + (:args (object :scs (descriptor-reg) :to (:result 1)) + (index :scs (any-reg) :to (:result 1) :target value)) + (:arg-types * tagged-num) + (:results (value :scs (descriptor-reg any-reg))) + (:result-types *) + (:generator 4 + (move value index) + (inst neg value) + (inst mov value (make-ea :dword :base object :index value)))) + ;;; Turn more arg (context, count) into a list. (defoptimizer (%listify-rest-args stack-allocate-result) ((&rest args)) t) @@ -1345,8 +1367,6 @@ (maybe-pseudo-atomic stack-allocate-p (allocation dst dst node stack-allocate-p) (inst lea dst (make-ea :byte :base dst :disp list-pointer-lowtag)) - ;; Convert the count into a raw value, so that we can use the - ;; LOOP instruction. (inst shr ecx 2) ;; Set decrement mode (successive args at lower addresses) (inst std) @@ -1365,7 +1385,8 @@ (inst lods eax) (storew eax dst 0 list-pointer-lowtag) ;; Go back for more. - (inst loop loop) + (inst sub ecx 1) + (inst jmp :nz loop) ;; NIL out the last cons. (storew nil-value dst 1 list-pointer-lowtag)) (emit-label done) diff --git a/src/compiler/x86/values.lisp b/src/compiler/x86/values.lisp index 9524d43..e1b9a5f 100644 --- a/src/compiler/x86/values.lisp +++ b/src/compiler/x86/values.lisp @@ -115,7 +115,7 @@ (:arg-types * positive-fixnum positive-fixnum) (:temporary (:sc any-reg :offset esi-offset :from (:argument 0)) src) (:temporary (:sc descriptor-reg :offset eax-offset) temp) - (:temporary (:sc unsigned-reg :offset ecx-offset) temp1) + (:temporary (:sc unsigned-reg :offset ecx-offset) loop-index) (:results (start :scs (any-reg)) (count :scs (any-reg))) (:generator 20 @@ -137,17 +137,18 @@ (move count num) (inst sub count skip))) - (move temp1 count) + (move loop-index count) (inst mov start esp-tn) (inst jecxz done) ; check for 0 count? - (inst shr temp1 word-shift) ; convert the fixnum to a count. + (inst sub esp-tn count) + (inst sub src count) - (inst std) ; move down the stack as more value are copied to the bottom. LOOP - (inst lods temp) - (inst push temp) - (inst loop loop) + (inst mov temp (make-ea :dword :base src :index loop-index)) + (inst sub loop-index n-word-bytes) + (inst mov (make-ea :dword :base esp-tn :index loop-index) temp) + (inst jmp :nz LOOP) DONE ;; solaris requires DF being zero. diff --git a/version.lisp-expr b/version.lisp-expr index 664b379..f5cabe0 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.9.18.46" +"0.9.18.47" -- 1.7.10.4