From ea3096bd2e37ec173fc4931c7d76f233c459dc86 Mon Sep 17 00:00:00 2001 From: Gabor Melis Date: Sat, 12 Nov 2005 19:50:48 +0000 Subject: [PATCH] 0.9.6.38: * enabled a faster inline-alloc on x86 * +300k to core size * tested on P4 and Pentium M, it is roughly 25% faster on pure consing --- src/compiler/x86/macros.lisp | 49 ++++++++++++++++++++++++------------------ src/runtime/x86-assem.S | 8 ------- version.lisp-expr | 2 +- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/compiler/x86/macros.lisp b/src/compiler/x86/macros.lisp index e61732b..0b8f234 100644 --- a/src/compiler/x86/macros.lisp +++ b/src/compiler/x86/macros.lisp @@ -125,19 +125,17 @@ ;;;; allocation helpers -;;; All allocation is done by calls to assembler routines that -;;; eventually invoke the C alloc() function. Once upon a time -;;; (before threads) allocation within an alloc_region could also be -;;; done inline, with the aid of two C symbols storing the current -;;; allocation region boundaries; however, C symbols are global. +;;; Allocation within alloc_region (which is thread local) can be done +;;; inline. If the alloc_region is overflown allocation is done by +;;; calling the C alloc() function. ;;; C calls for allocation don't /seem/ to make an awful lot of -;;; difference to speed. Guessing from historical context, it looks -;;; like inline allocation was introduced before pseudo-atomic, at -;;; which time all calls to alloc() would have needed a syscall to -;;; mask signals for the duration. Now we have pseudoatomic there's -;;; no need for that overhead. Still, inline alloc would be a neat -;;; addition someday (except see below). +;;; difference to speed. On pure consing it's about a 25% +;;; gain. Guessing from historical context, it looks like inline +;;; allocation was introduced before pseudo-atomic, at which time all +;;; calls to alloc() would have needed a syscall to mask signals for +;;; the duration. Now we have pseudoatomic there's no need for that +;;; overhead. (defun allocation-dynamic-extent (alloc-tn size) (inst sub esp-tn size) @@ -175,6 +173,7 @@ (defun allocation-inline (alloc-tn size) (let ((ok (gen-label)) + (done (gen-label)) (free-pointer (make-ea :dword :disp #!+sb-thread (* n-word-bytes thread-alloc-region-slot) @@ -191,7 +190,7 @@ (inst add alloc-tn free-pointer) #!+sb-thread (inst fs-segment-prefix) (inst cmp alloc-tn end-addr) - (inst jmp :be OK) + (inst jmp :be ok) (let ((dst (ecase (tn-offset alloc-tn) (#.eax-offset "alloc_overflow_eax") (#.ecx-offset "alloc_overflow_ecx") @@ -200,9 +199,23 @@ (#.esi-offset "alloc_overflow_esi") (#.edi-offset "alloc_overflow_edi")))) (inst call (make-fixup dst :foreign))) + (inst jmp-short done) (emit-label ok) - #!+sb-thread (inst fs-segment-prefix) - (inst xchg free-pointer alloc-tn)) + ;; Swap ALLOC-TN and FREE-POINTER + (cond ((and (tn-p size) (location= alloc-tn size)) + ;; XCHG is extremely slow, use the xor swap trick + #!+sb-thread (inst fs-segment-prefix) + (inst xor alloc-tn free-pointer) + #!+sb-thread (inst fs-segment-prefix) + (inst xor free-pointer alloc-tn) + #!+sb-thread (inst fs-segment-prefix) + (inst xor alloc-tn free-pointer)) + (t + ;; It's easier if SIZE is still available. + #!+sb-thread (inst fs-segment-prefix) + (inst mov free-pointer alloc-tn) + (inst sub alloc-tn size))) + (emit-label done)) (values)) @@ -219,13 +232,7 @@ (defun allocation (alloc-tn size &optional inline dynamic-extent) (cond (dynamic-extent (allocation-dynamic-extent alloc-tn size)) - ;; FIXME: for reasons unknown, inline allocation is a speed win on - ;; non-P4s, and a speed loss on P4s (and probably other such - ;; high-spec high-cache machines). :INLINE-ALLOCATION-IS-GOOD is - ;; a bit of a KLUDGE, really. -- CSR, 2004-08-05 (following - ;; observations made by ASF and Juho Snellman) - ((and (member :inline-allocation-is-good *backend-subfeatures*) - (or (null inline) (policy inline (>= speed space)))) + ((or (null inline) (policy inline (>= speed space))) (allocation-inline alloc-tn size)) (t (allocation-notinline alloc-tn size))) (values)) diff --git a/src/runtime/x86-assem.S b/src/runtime/x86-assem.S index c16b3ef..ad9e089 100644 --- a/src/runtime/x86-assem.S +++ b/src/runtime/x86-assem.S @@ -669,10 +669,8 @@ GNAME(alloc_16_to_edi): #ifdef LISP_FEATURE_SB_THREAD #define START_REGION %fs:THREAD_ALLOC_REGION_OFFSET -#define DISPLACEMENT $7 #else #define START_REGION boxed_region -#define DISPLACEMENT $6 #endif /* This routine handles an overflow with eax=crfp+size. So the @@ -690,7 +688,6 @@ GNAME(alloc_overflow_eax): addl $4,%esp # pop the size arg. popl %edx # Restore edx. popl %ecx # Restore ecx. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_eax),.-GNAME(alloc_overflow_eax) @@ -708,7 +705,6 @@ GNAME(alloc_overflow_ecx): movl %eax,%ecx # setup the destination. popl %edx # Restore edx. popl %eax # Restore eax. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_ecx),.-GNAME(alloc_overflow_ecx) @@ -726,7 +722,6 @@ GNAME(alloc_overflow_edx): movl %eax,%edx # setup the destination. popl %ecx # Restore ecx. popl %eax # Restore eax. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_edx),.-GNAME(alloc_overflow_edx) @@ -748,7 +743,6 @@ GNAME(alloc_overflow_ebx): popl %edx # Restore edx. popl %ecx # Restore ecx. popl %eax # Restore eax. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_ebx),.-GNAME(alloc_overflow_ebx) @@ -770,7 +764,6 @@ GNAME(alloc_overflow_esi): popl %edx # Restore edx. popl %ecx # Restore ecx. popl %eax # Restore eax. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_esi),.-GNAME(alloc_overflow_esi) @@ -790,7 +783,6 @@ GNAME(alloc_overflow_edi): popl %edx # Restore edx. popl %ecx # Restore ecx. popl %eax # Restore eax. - addl DISPLACEMENT,(%esp) # Adjust the return address to skip the next inst. ret .size GNAME(alloc_overflow_edi),.-GNAME(alloc_overflow_edi) diff --git a/version.lisp-expr b/version.lisp-expr index cdd8eaf..f2961b1 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.9.6.37" +"0.9.6.38" -- 1.7.10.4