From 28aaa39f4e31e7a71e7f82fce53bd0ad804efa5e Mon Sep 17 00:00:00 2001 From: Alastair Bridgewater Date: Sun, 8 Aug 2010 01:14:04 +0000 Subject: [PATCH] 1.0.41.40: ppc: Shorten the gencgc allocation sequence. * Rearrange the allocation sequence to avoid all branches, relying on the runtime to manipulate the point at which execution resumes from an allocation trap to compensate. * Update the runtime to match the new allocation sequence. * There is a further possible optimization here: The runtime allocation trap handler can also accept an ADDI instruction where the current sequence uses an ADD. In the case of a fixed allocation size, this would save loading the temp register with the size. * Another optimization, along the same lines as the previous one: With a fixed allocation size, adjusting the pointer to point to the beginning of the data block and setting the lowtag could be done in a single instruction. * A third optimization, one which would entail modifying the allocation trap handler slightly, and depends on at least the first optimization above being in place: Once temp-tn is no longer being used to hold the allocation size for fixed allocations, it is available to hold the address of the alloc region when threading is disabled, thus saving having to reload it (two instructions). 
--- src/compiler/ppc/macros.lisp | 107 +++++++++++++++++++++--------------------- src/runtime/ppc-arch.c | 20 +++++--- version.lisp-expr | 2 +- 3 files changed, 69 insertions(+), 60 deletions(-) diff --git a/src/compiler/ppc/macros.lisp b/src/compiler/ppc/macros.lisp index 7921078..c52d4d5 100644 --- a/src/compiler/ppc/macros.lisp +++ b/src/compiler/ppc/macros.lisp @@ -201,67 +201,68 @@ (inst addi alloc-tn alloc-tn ,alloc-size) (inst add alloc-tn alloc-tn ,alloc-size)))) #!+gencgc - (let ((fix-addr (gensym)) - (inline-alloc (gensym))) - `(let ((,fix-addr (gen-label)) - (,inline-alloc (gen-label))) - ;; Make temp-tn be the size - (cond ((numberp ,size) - (inst lr ,temp-tn ,size)) - (t - (move ,temp-tn ,size))) - - #!-sb-thread - (inst lr ,flag-tn (make-fixup "boxed_region" :foreign)) - #!-sb-thread - (inst lwz ,result-tn ,flag-tn 0) - #!+sb-thread - (inst lwz ,result-tn thread-base-tn (* thread-alloc-region-slot - n-word-bytes)) - - ;; we can optimize this to only use one fixup here, once we get - ;; it working - ;; (inst lr ,flag-tn (make-fixup "boxed_region" :foreign 4)) - ;; (inst lwz ,flag-tn ,flag-tn 0) - #!-sb-thread - (inst lwz ,flag-tn ,flag-tn 4) - #!+sb-thread - (inst lwz ,flag-tn thread-base-tn (* (1+ thread-alloc-region-slot) + `(progn + ;; Make temp-tn be the size + (cond ((numberp ,size) + (inst lr ,temp-tn ,size)) + (t + (move ,temp-tn ,size))) + + #!-sb-thread + (inst lr ,flag-tn (make-fixup "boxed_region" :foreign)) + #!-sb-thread + (inst lwz ,result-tn ,flag-tn 0) + #!+sb-thread + (inst lwz ,result-tn thread-base-tn (* thread-alloc-region-slot n-word-bytes)) - (without-scheduling () - ;; CAUTION: The C code depends on the exact order of - ;; instructions here. In particular, three instructions before - ;; the TW instruction must be an ADD or ADDI instruction, so it - ;; can figure out the size of the desired allocation. - ;; Now make result-tn point at the end of the object, to - ;; figure out if we overflowed the current region. 
- (inst add ,result-tn ,result-tn ,temp-tn) - ;; result-tn points to the new end of the region. Did we go past - ;; the actual end of the region? If so, we need a full alloc. - ;; The C code depends on this exact form of instruction. If - ;; either changes, you have to change the other appropriately! - (inst cmpw ,result-tn ,flag-tn) - - (inst bng ,inline-alloc) - (inst tw :lge ,result-tn ,flag-tn)) - (inst b ,fix-addr) - - (emit-label ,inline-alloc) + ;; we can optimize this to only use one fixup here, once we get + ;; it working + ;; (inst lr ,flag-tn (make-fixup "boxed_region" :foreign 4)) + ;; (inst lwz ,flag-tn ,flag-tn 0) + #!-sb-thread + (inst lwz ,flag-tn ,flag-tn 4) + #!+sb-thread + (inst lwz ,flag-tn thread-base-tn (* (1+ thread-alloc-region-slot) + n-word-bytes)) + + (without-scheduling () + ;; CAUTION: The C code depends on the exact order of + ;; instructions here. In particular, immediately before the + ;; TW instruction must be an ADD or ADDI instruction, so it + ;; can figure out the size of the desired allocation and + ;; storing the new base pointer back to the allocation region + ;; must take three machine instructions (one on threaded targets). + + ;; Now make result-tn point at the end of the object, to + ;; figure out if we overflowed the current region. + (inst add ,result-tn ,result-tn ,temp-tn) + ;; result-tn points to the new end of the region. Did we go past + ;; the actual end of the region? If so, we need a full alloc. + ;; The C code depends on this exact form of instruction. If + ;; either changes, you have to change the other appropriately! + (inst tw :lge ,result-tn ,flag-tn) + + ;; The C code depends on this instruction sequence taking up + ;; #!-sb-thread three #!+sb-thread one machine instruction. + ;; The lr of a fixup counts as two instructions. 
#!-sb-thread (inst lr ,flag-tn (make-fixup "boxed_region" :foreign)) #!-sb-thread (inst stw ,result-tn ,flag-tn 0) #!+sb-thread (inst stw ,result-tn thread-base-tn (* thread-alloc-region-slot - n-word-bytes)) - - (emit-label ,fix-addr) - ;; At this point, result-tn points at the end of the object. - ;; Adjust to point to the beginning. - (inst sub ,result-tn ,result-tn ,temp-tn) - ;; Set the lowtag appropriately - (inst ori ,result-tn ,result-tn ,lowtag)))) + n-word-bytes))) + + ;; Should the allocation trap above have fired, the runtime + ;; arranges for execution to resume here, just after where we + ;; would have updated the free pointer in the alloc region. + + ;; At this point, result-tn points at the end of the object. + ;; Adjust to point to the beginning. + (inst sub ,result-tn ,result-tn ,temp-tn) + ;; Set the lowtag appropriately + (inst ori ,result-tn ,result-tn ,lowtag))) (defmacro with-fixed-allocation ((result-tn flag-tn temp-tn type-code size &key (lowtag other-pointer-lowtag)) diff --git a/src/runtime/ppc-arch.c b/src/runtime/ppc-arch.c index a4df6f5..281de3c 100644 --- a/src/runtime/ppc-arch.c +++ b/src/runtime/ppc-arch.c @@ -300,15 +300,15 @@ allocation_trap_p(os_context_t * context) && (4 == ((inst >> 1) & 0x3ff))) { /* * We got the instruction. Now, look back to make sure it was - * proceeded by what we expected. 2 instructions back should be - * an ADD or ADDI instruction. + * preceded by what we expected. The previous instruction + * should be an ADD or ADDI instruction. */ unsigned int add_inst; - add_inst = pc[-3]; + add_inst = pc[-1]; #if 0 fprintf(stderr, " add inst at %p: inst = 0x%08x\n", - pc - 3, add_inst); + pc - 1, add_inst); #endif opcode = add_inst >> 26; if ((opcode == 31) && (266 == ((add_inst >> 1) & 0x1ff))) { @@ -389,7 +389,7 @@ handle_allocation_trap(os_context_t * context) * is the size of the allocation. Get it and call alloc to allocate * new space. 
*/ - inst = pc[-3]; + inst = pc[-1]; opcode = inst >> 26; #if 0 fprintf(stderr, " add inst = 0x%08x, opcode = %d\n", inst, opcode); @@ -491,6 +491,15 @@ handle_allocation_trap(os_context_t * context) undo_fake_foreign_function_call(context); } + /* Skip the allocation trap and the write of the updated free + * pointer back to the allocation region. This is two + * instructions when threading is enabled and four instructions + * otherwise. */ +#ifdef LISP_FEATURE_SB_THREAD + (*os_context_pc_addr(context)) = pc + 2; +#else + (*os_context_pc_addr(context)) = pc + 4; +#endif } #endif @@ -550,7 +559,6 @@ sigtrap_handler(int signal, siginfo_t *siginfo, os_context_t *context) /* Is this an allocation trap? */ if (allocation_trap_p(context)) { handle_allocation_trap(context); - arch_skip_instruction(context); return; } #endif diff --git a/version.lisp-expr b/version.lisp-expr index c7105bd..a57a651 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.41.39" +"1.0.41.40" -- 1.7.10.4