From 2973941cf214a5ea274cd0381a651ce0e1a7fab2 Mon Sep 17 00:00:00 2001
From: Gabor Melis <mega@hotpop.com>
Date: Tue, 21 Apr 2009 10:24:15 +0000
Subject: [PATCH] 1.0.27.11: swap ocfp and return-pc slots in x86oid call
 frames

Forward port of Alastair Bridgewater's patch. Also, port it to x86-64.
Bring x86 and x86-64 sources closer in the process.

Plus cleanups, indentation, remove dead code, comments, more checks.
---
 doc/internals/calling-convention.texinfo |  213 +++++++++++++++---------------
 src/assembly/x86-64/arith.lisp           |   28 ++--
 src/assembly/x86-64/assem-rtns.lisp      |   24 ++--
 src/assembly/x86/arith.lisp              |   29 ++--
 src/assembly/x86/assem-rtns.lisp         |   38 +++---
 src/compiler/x86-64/call.lisp            |  194 +++++++++++----------------
 src/compiler/x86-64/insts.lisp           |    2 +-
 src/compiler/x86-64/nlx.lisp             |    2 +-
 src/compiler/x86-64/static-fn.lisp       |   10 +-
 src/compiler/x86-64/vm.lisp              |   12 +-
 src/compiler/x86/call.lisp               |  200 +++++++++++-----------------
 src/compiler/x86/insts.lisp              |    4 +-
 src/compiler/x86/nlx.lisp                |    2 +-
 src/compiler/x86/static-fn.lisp          |   10 +-
 src/compiler/x86/vm.lisp                 |    4 +-
 src/runtime/backtrace.c                  |    4 +-
 src/runtime/x86-64-assem.S               |    3 +-
 src/runtime/x86-assem.S                  |    5 +-
 version.lisp-expr                        |    2 +-
 19 files changed, 360 insertions(+), 426 deletions(-)

diff --git a/doc/internals/calling-convention.texinfo b/doc/internals/calling-convention.texinfo
index 3e403f0..bf5b2f3 100644
--- a/doc/internals/calling-convention.texinfo
+++ b/doc/internals/calling-convention.texinfo
@@ -11,31 +11,27 @@
 * Additional Notes::            
 @end menu
 
-The calling convention used within Lisp code on SBCL/x86 was,
-for the longest time, really bad. If it weren't for the fact
-that it predates modern x86 CPUs, one might almost believe it to
-have been designed explicitly to defeat the branch-prediction
-hardware therein. This chapter is somewhat of a brain-dump of
-information that might be useful when attempting to improve the
-situation further, mostly written immediately after having made
-a dent in the problem.
-
-Assumptions about the calling convention are embedded throughout
-the system. The runtime knows how to call in to Lisp and receive
-a value from Lisp, the assembly-routines have intimate knowledge
-of what registers are involved in a call situation,
+The calling convention used within Lisp code on SBCL/x86 was, for the
+longest time, really bad. If it weren't for the fact that it predates
+modern x86 CPUs, one might almost believe it to have been designed
+explicitly to defeat the branch-prediction hardware therein. This
+chapter is somewhat of a brain-dump of information that might be
+useful when attempting to improve the situation further, mostly
+written immediately after having made a dent in the problem.
+
+Assumptions about the calling convention are embedded throughout the
+system. The runtime knows how to call in to Lisp and receive a value
+from Lisp, the assembly-routines have intimate knowledge of what
+registers are involved in a call situation,
 @file{src/compiler/target/call.lisp} contains the VOPs involved in
-implementing function call/return, and @file{src/compiler/ir2tran.lisp} has
-assumptions about frame allocation and argument/return-value
-passing locations.
-
-The current round of changes has been limited to VOPs, assembly-routines,
-related support functions, and the required support in the runtime.
+implementing function call/return, and
+@file{src/compiler/ir2tran.lisp} has assumptions about frame
+allocation and argument/return-value passing locations.
 
 Note that most of this documentation also applies to other CPUs,
-modulo the actual registers involved, the displacement used
-in the single-value return convention, and the fact that they
-use the ``old'' convention anywhere it is mentioned.
+modulo the actual registers involved, the displacement used in the
+single-value return convention, and the fact that they use the ``old''
+convention anywhere it is mentioned.
 
 
 @node Assembly Routines
@@ -58,13 +54,13 @@ use the ``old'' convention anywhere it is mentioned.
 ;;;    -- AB, 2006/Feb/05.
 @end example
 
-There are a couple of assembly-routines that implement parts of
-the process of returning or tail-calling with a variable number
-of values. These are @code{return-multiple} and @code{tail-call-variable} in
+There are a couple of assembly-routines that implement parts of the
+process of returning or tail-calling with a variable number of values.
+These are @code{return-multiple} and @code{tail-call-variable} in
 @file{src/assembly/x86/assem-rtns.lisp}. They have their own calling
-convention for invocation from a VOP, but implement various
-block-move operations on the stack contents followed by a return
-or tail-call operation.
+convention for invocation from a VOP, but implement various block-move
+operations on the stack contents followed by a return or tail-call
+operation.
 
 That's about all I have to say about the assembly-routines.
 
@@ -73,22 +69,22 @@ That's about all I have to say about the assembly-routines.
 @comment  node-name,  next,  previous,  up
 @section Local Calls
 
-Calls within a block, whatever a block is, can use a local
-calling convention in which the compiler knows where all of the
-values are to be stored, and thus can elide the check for number
-of return values, stack-pointer restoration, etc. Alternately,
-they can use the full unknown-values return convention while
-trying to short-circuit the call convention. There is probably
-some low-hanging fruit here in terms of CPU branch-prediction.
+Calls within a block, whatever a block is, can use a local calling
+convention in which the compiler knows where all of the values are to
+be stored, and thus can elide the check for number of return values,
+stack-pointer restoration, etc. Alternately, they can use the full
+unknown-values return convention while trying to short-circuit the
+call convention. There is probably some low-hanging fruit here in
+terms of CPU branch-prediction.
 
-The local (known-values) calling convention is implemented by
-the @code{known-call-local} and @code{known-return} VOPs.
+The local (known-values) calling convention is implemented by the
+@code{known-call-local} and @code{known-return} VOPs.
 
 Local unknown-values calls are handled at the call site by the
-@code{call-local} and @code{mutiple-call-local} VOPs. The main difference
-between the full call and local call protocols here is that
-local calls use a different frame setup protocol, and will tend
-to not use the normal frame layout for the old frame-pointer and
+@code{call-local} and @code{mutiple-call-local} VOPs. The main
+difference between the full call and local call protocols here is that
+local calls use a different frame setup protocol, and will tend to not
+use the normal frame layout for the old frame-pointer and
 return-address.
 
 
@@ -108,96 +104,97 @@ return-address.
 @end example
 
 Basically, we use caller-allocated frames, pass an fdefinition,
-function, or closure in @code{EAX},
-argcount in @code{ECX}, and first three args in @code{EDX}, @code{EDI},
-and @code{ESI}. @code{EBP} points to just past the start of the frame
-(the first frame slot is at @code{[EBP-4]}, not the traditional @code{[EBP]},
-due in part to how the frame allocation works). The caller stores the
-link for the old frame at @code{[EBP-4]} and reserved space for a
-return address at @code{[EBP-8]}. @code{[EBP-12]} appears to be an
-empty slot available to the compiler within a function, it
-may-or-may-not be used by some of the call/return junk. The first stack
-argument is at @code{[EBP-16]}. The callee then reallocates the
+function, or closure in @code{EAX}, argcount in @code{ECX}, and first
+three args in @code{EDX}, @code{EDI}, and @code{ESI}. @code{EBP}
+points to just past the start of the frame (the first frame slot is at
+@code{[EBP-4]}, not the traditional @code{[EBP]}, due in part to how
+the frame allocation works). The caller stores the link for the old
+frame at @code{[EBP-4]} and reserved space for a return address at
+@code{[EBP-8]}. @code{[EBP-12]} appears to be an empty slot that
+conveniently makes just enough space for the first three multiple
+return values (returned in the argument passing registers) to be
+written over the beginning of the frame by the receiver. The first
+stack argument is at @code{[EBP-16]}. The callee then reallocates the
 frame to include sufficient space for its local variables, after
 possibly converting any @code{&rest} arguments to a proper list.
 
+The above scheme was changed in 1.0.27 on x86 and x86-64 by swapping
+the old frame pointer and the return address.
 
 @node Unknown-Values Returns
 @comment  node-name,  next,  previous,  up
 @section Unknown-Values Returns
 
-The unknown-values return convention consists of two parts. The
-first part is that of returning a single value. The second is
-that of returning a different number of values. We also changed
-the convention here recently, so we should describe both the old
-and new versions. The three interesting VOPs here are @code{return-single},
-@code{return}, and @code{return-multiple}.
+The unknown-values return convention consists of two parts. The first
+part is that of returning a single value. The second is that of
+returning a different number of values. We also changed the convention
+in 0.9.10, so we should describe both the old and new versions. The
+three interesting VOPs here are @code{return-single}, @code{return},
+and @code{return-multiple}.
 
 For a single-value return, we load the return value in the first
-argument-passing register (@code{A0}, or @code{EDI}), reload the old frame
-pointer, burn the stack frame, and return. The old convention
-was to increment the return address by two before returning,
-typically via a @code{JMP}, which was guaranteed to screw up branch-
-prediction hardware. The new convention is to return with the
-carry flag clear.
-
-For a multiple-value return, we pass the first three values in
-the argument-passing registers, and the remainder on the stack.
-@code{ECX} contains the total number of values as a fixnum, @code{EBX} points
-to where the callee frame was, @code{EBP} has been restored to point to
-the caller frame, and the first of the values on the stack (the
-fourth overall) is at @code{[EBP-16]}. The old convention was just to
-jump to the return address at this point. The newer one has us
-setting the carry flag first.
-
-The code at the call site for accepting some number of unknown-
-values is fairly well boilerplated. If we are expecting zero or
-one values, then we need to reset the stack pointer if we are in
-a multiple-value return. In the old convention we just encoded a
-@code{MOV ESP, EBX} instruction, which neatly fit in the two byte gap
-that was skipped by a single-value return. In the new convention
-we have to explicitly check the carry flag with a conditional
-jump around the @code{MOV ESP, EBX} instruction. When expecting more
-than one value, we need to arrange to set up default values when
-a single-value return happens, so we encode a jump around a
-stub of code which fakes up the register use convention of a
-multiple-value return. Again, in the old convention this was a
-two-byte unconditionl jump, and in the new convention this is
-a conditional jump based on the carry flag.
+argument-passing register (@code{A0}, or @code{EDI}), reload the old
+frame pointer, burn the stack frame, and return. The old convention
+was to increment the return address by two before returning, typically
+via a @code{JMP}, which was guaranteed to screw up branch- prediction
+hardware. The new convention is to return with the carry flag clear.
+
+For a multiple-value return, we pass the first three values in the
+argument-passing registers, and the remainder on the stack. @code{ECX}
+contains the total number of values as a fixnum, @code{EBX} points to
+where the callee frame was, @code{EBP} has been restored to point to
+the caller frame, and the first of the values on the stack (the fourth
+overall) is at @code{[EBP-16]}. The old convention was just to jump to
+the return address at this point. The newer one has us setting the
+carry flag first.
+
+The code at the call site for accepting some number of unknown- values
+is fairly well boilerplated. If we are expecting zero or one values,
+then we need to reset the stack pointer if we are in a multiple-value
+return. In the old convention we just encoded a @code{MOV ESP, EBX}
+instruction, which neatly fit in the two byte gap that was skipped by
+a single-value return. In the new convention we have to explicitly
+check the carry flag with a conditional jump around the @code{MOV ESP,
+EBX} instruction. When expecting more than one value, we need to
+arrange to set up default values when a single-value return happens,
+so we encode a jump around a stub of code which fakes up the register
+use convention of a multiple-value return. Again, in the old
+convention this was a two-byte unconditionl jump, and in the new
+convention this is a conditional jump based on the carry flag.
 
 
 @node IR2 Conversion
 @comment  node-name,  next,  previous,  up
 @section IR2 Conversion
 
-The actual selection of VOPs for implementing call/return for a
-given function is handled in ir2tran.lisp. Returns are handled
-by @code{ir2-convert-return}, calls are handled by @code{ir2-convert-local-call},
-@code{ir2-convert-full-call}, and @code{ir2-convert-mv-call}, and
-function prologues are handled by @code{ir2-convert-bind} (which calls
-@code{init-xep-environment} for the case of an entry point for a full
-call).
+The actual selection of VOPs for implementing call/return for a given
+function is handled in ir2tran.lisp. Returns are handled by
+@code{ir2-convert-return}, calls are handled by
+@code{ir2-convert-local-call}, @code{ir2-convert-full-call}, and
+@code{ir2-convert-mv-call}, and function prologues are handled by
+@code{ir2-convert-bind} (which calls @code{init-xep-environment} for
+the case of an entry point for a full call).
 
 
 @node Additional Notes
 @comment  node-name,  next,  previous,  up
 @section Additional Notes
 
-The low-hanging fruit here is going to be changing every call
-and return to use @code{CALL} and @code{RETURN} instructions
-instead of @code{JMP} instructions.
+The low-hanging fruit here is going to be changing every call and
+return to use @code{CALL} and @code{RETURN} instructions instead of
+@code{JMP} instructions.
 
 A more involved change would be to reduce the number of argument
-passing registers from three to two, which may be beneficial in
-terms of our quest to free up a GPR for use on Win32 boxes for a
-thread structure.
+passing registers from three to two, which may be beneficial in terms
+of our quest to free up a GPR for use on Win32 boxes for a thread
+structure.
 
 Another possible win could be to store multiple return-values
-somewhere other than the stack, such as a dedicated area of the
-thread structure. The main concern here in terms of clobbering
-would be to make sure that interrupts (and presumably the
-internal-error machinery) know to save the area and that the
-compiler knows that the area cannot be live across a function
-call. Actually implementing this would involve hacking the IR2
-conversion, since as it stands now the same argument conventions
-are used for both call and return value storage (same TNs).
+somewhere other than the stack, such as a dedicated area of the thread
+structure. The main concern here in terms of clobbering would be to
+make sure that interrupts (and presumably the internal-error
+machinery) know to save the area and that the compiler knows that the
+area cannot be live across a function call. Actually implementing this
+would involve hacking the IR2 conversion, since as it stands now the
+same argument conventions are used for both call and return value
+storage (same TNs).
diff --git a/src/assembly/x86-64/arith.lisp b/src/assembly/x86-64/arith.lisp
index d47b720..b6a1934 100644
--- a/src/assembly/x86-64/arith.lisp
+++ b/src/assembly/x86-64/arith.lisp
@@ -36,17 +36,16 @@
                 (inst jmp :nz DO-STATIC-FUN)    ; no - do generic
 
                 ,@body
-                (inst clc)
+                (inst clc) ; single-value return
                 (inst ret)
 
                 DO-STATIC-FUN
-                (inst pop rax)
                 (inst push rbp-tn)
-                (inst lea
-                      rbp-tn
-                      (make-ea :qword :base rsp-tn :disp n-word-bytes))
-                (inst sub rsp-tn (fixnumize 2))
-                (inst push rax)              ; callers return addr
+                (inst lea rbp-tn (make-ea :qword
+                                          :base rsp-tn
+                                          :disp (* 2 n-word-bytes)))
+                (inst sub rsp-tn (fixnumize 1))
+                (inst push (make-ea :qword :base rbp-tn :disp (- n-word-bytes)))
                 (inst mov rcx (fixnumize 2)) ; arg count
                 (inst jmp
                       (make-ea :qword
@@ -127,11 +126,10 @@
   (inst test x fixnum-tag-mask)
   (inst jmp :z FIXNUM)
 
-  (inst pop rax)
   (inst push rbp-tn)
-  (inst lea rbp-tn (make-ea :qword :base rsp-tn :disp n-word-bytes))
-  (inst sub rsp-tn (fixnumize 2))
-  (inst push rax)
+  (inst lea rbp-tn (make-ea :qword :base rsp-tn :disp (* 2 n-word-bytes)))
+  (inst sub rsp-tn (fixnumize 1))
+  (inst push (make-ea :qword :base rbp-tn :disp (- n-word-bytes)))
   (inst mov rcx (fixnumize 1))    ; arg count
   (inst jmp (make-ea :qword
                      :disp (+ nil-value (static-fun-offset '%negate))))
@@ -163,7 +161,7 @@
                 (inst mov rcx x)
                 (inst or rcx y)
                 (inst test rcx fixnum-tag-mask)
-                (inst jmp :nz DO-STATIC-FUN)
+                (inst jmp :nz DO-STATIC-FUN)  ; are both fixnums?
 
                 (inst cmp x y)
                 (inst ret)
@@ -173,7 +171,7 @@
                 (inst sub rsp-tn (fixnumize 3))
                 (inst mov (make-ea :qword
                                    :base rcx
-                                   :disp (fixnumize -1))
+                                   :disp (frame-byte-offset ocfp-save-offset))
                       rbp-tn)
                 (move rbp-tn rcx)
                 (inst mov rcx (fixnumize 2))
@@ -238,7 +236,7 @@
   (inst sub rsp-tn (fixnumize 3))
   (inst mov (make-ea :qword
                      :base rcx
-                     :disp (fixnumize -1))
+                     :disp (frame-byte-offset ocfp-save-offset))
         rbp-tn)
   (move rbp-tn rcx)
   (inst mov rcx (fixnumize 2))
@@ -295,7 +293,7 @@
   (inst sub rsp-tn (fixnumize 3))
   (inst mov (make-ea :qword
                      :base rcx
-                     :disp (fixnumize -1))
+                     :disp (frame-byte-offset ocfp-save-offset))
         rbp-tn)
   (move rbp-tn rcx)
   (inst mov rcx (fixnumize 2))
diff --git a/src/assembly/x86-64/assem-rtns.lisp b/src/assembly/x86-64/assem-rtns.lisp
index dea6cbb..7270802 100644
--- a/src/assembly/x86-64/assem-rtns.lisp
+++ b/src/assembly/x86-64/assem-rtns.lisp
@@ -135,11 +135,11 @@
   (inst cmp ecx (fixnumize 3))
   (inst jmp :le REGISTER-ARGS)
 
-  ;; Save the OLD-FP and RETURN-PC because the blit it going to trash
-  ;; those stack locations. Save the ECX, because the loop is going
-  ;; to trash it.
-  (pushw rbp-tn -1)
-  (loadw ebx rbp-tn -2)
+  ;; Save the OLD-FP and RETURN-PC because the blit is going to trash
+  ;; those stack locations. Save the ECX, because the loop is going to
+  ;; trash it.
+  (pushw rbp-tn (frame-word-offset ocfp-save-offset))
+  (loadw ebx rbp-tn (frame-word-offset return-pc-save-offset))
   (inst push ecx)
 
   ;; Do the blit. Because we are coping from smaller addresses to
@@ -147,25 +147,27 @@
   ;; our way down.
   (inst shr ecx 3)                      ; fixnum to raw words
   (inst std)                            ; count down
-  (inst lea edi (make-ea :qword :base rbp-tn :disp (- n-word-bytes)))
+  (inst lea edi (make-ea :qword :base rbp-tn :disp (frame-byte-offset 0)))
   (inst sub esi (fixnumize 1))
   (inst rep)
   (inst movs :qword)
   (inst cld)
 
   ;; Load the register arguments carefully.
-  (loadw edx rbp-tn -1)
+  (loadw edx rbp-tn (frame-word-offset ocfp-save-offset))
 
   ;; Restore OLD-FP and ECX.
   (inst pop ecx)
-  (popw rbp-tn -1)                      ; overwrites a0
+  ;; Overwrites a1
+  (popw rbp-tn (frame-word-offset ocfp-save-offset))
 
   ;; Blow off the stack above the arguments.
   (inst lea rsp-tn (make-ea :qword :base edi :disp n-word-bytes))
 
   ;; remaining register args
-  (loadw edi rbp-tn -2)
-  (loadw esi rbp-tn -3)
+  (inst mov edi edx)
+  (loadw edx rbp-tn (frame-word-offset 0))
+  (loadw esi rbp-tn (frame-word-offset 2))
 
   ;; Push the (saved) return-pc so it looks like we just called.
   (inst push ebx)
@@ -187,7 +189,7 @@
         (make-ea :qword :base rbp-tn :disp (* -3 n-word-bytes)))
 
   ;; Push the return-pc so it looks like we just called.
-  (pushw rbp-tn -2)    ; XXX dan ?
+  (pushw rbp-tn (frame-word-offset return-pc-save-offset))
 
   ;; And away we go.
   (inst jmp (make-ea :byte :base eax
diff --git a/src/assembly/x86/arith.lisp b/src/assembly/x86/arith.lisp
index f3081f8..28d510f 100644
--- a/src/assembly/x86/arith.lisp
+++ b/src/assembly/x86/arith.lisp
@@ -40,13 +40,12 @@
                 (inst ret)
 
                 DO-STATIC-FUN
-                (inst pop eax)
                 (inst push ebp-tn)
-                (inst lea
-                      ebp-tn
-                      (make-ea :dword :base esp-tn :disp n-word-bytes))
-                (inst sub esp-tn (fixnumize 2))
-                (inst push eax)  ; callers return addr
+                (inst lea ebp-tn (make-ea :dword
+                                          :base esp-tn
+                                          :disp (* 2 n-word-bytes)))
+                (inst sub esp-tn (fixnumize 1))
+                (inst push (make-ea :dword :base ebp-tn :disp (- n-word-bytes)))
                 (inst mov ecx (fixnumize 2)) ; arg count
                 (inst jmp
                       (make-ea :dword
@@ -86,7 +85,7 @@
     (move eax x)                          ; must use eax for 64-bit result
     (inst sar eax n-fixnum-tag-bits)      ; remove *4 fixnum bias
     (inst imul y)                         ; result in edx:eax
-    (inst jmp :no okay)                   ; still fixnum
+    (inst jmp :no OKAY)                   ; still fixnum
 
     ;; zzz jrd changed edx to ebx in here, as edx isn't listed as a temp, above
     ;;     pfw says that loses big -- edx is target for arg x and result res
@@ -130,11 +129,10 @@
   (inst test x fixnum-tag-mask)
   (inst jmp :z FIXNUM)
 
-  (inst pop eax)
   (inst push ebp-tn)
-  (inst lea ebp-tn (make-ea :dword :base esp-tn :disp n-word-bytes))
-  (inst sub esp-tn (fixnumize 2))
-  (inst push eax)
+  (inst lea ebp-tn (make-ea :dword :base esp-tn :disp (* 2 n-word-bytes)))
+  (inst sub esp-tn (fixnumize 1))
+  (inst push (make-ea :dword :base ebp-tn :disp (- n-word-bytes)))
   (inst mov ecx (fixnumize 1))    ; arg count
   (inst jmp (make-ea :dword
                      :disp (+ nil-value (static-fun-offset '%negate))))
@@ -174,7 +172,8 @@
                 (move ecx esp-tn)
                 (inst sub esp-tn (fixnumize 3))
                 (inst mov (make-ea :dword
-                                   :base ecx :disp (fixnumize -1))
+                                   :base ecx
+                                   :disp (frame-byte-offset ocfp-save-offset))
                       ebp-tn)
                 (move ebp-tn ecx)
                 (inst mov ecx (fixnumize 2))
@@ -189,7 +188,7 @@
                     (:g `((inst cmp x (1+ nil-value)))))
                 (inst ret))
              #-sb-assembling
-                          `(define-vop (,name)
+             `(define-vop (,name)
                 (:translate ,translate)
                 (:policy :safe)
                 (:save-p t)
@@ -244,7 +243,7 @@
   (inst sub esp-tn (fixnumize 3))
   (inst mov (make-ea :dword
                      :base ecx
-                     :disp (fixnumize -1))
+                     :disp (frame-byte-offset ocfp-save-offset))
         ebp-tn)
   (move ebp-tn ecx)
   (inst mov ecx (fixnumize 2))
@@ -301,7 +300,7 @@
   (inst sub esp-tn (fixnumize 3))
   (inst mov (make-ea :dword
                      :base ecx
-                     :disp (fixnumize -1))
+                     :disp (frame-byte-offset ocfp-save-offset))
         ebp-tn)
   (move ebp-tn ecx)
   (inst mov ecx (fixnumize 2))
diff --git a/src/assembly/x86/assem-rtns.lisp b/src/assembly/x86/assem-rtns.lisp
index 930c2e7..36152cd 100644
--- a/src/assembly/x86/assem-rtns.lisp
+++ b/src/assembly/x86/assem-rtns.lisp
@@ -31,13 +31,13 @@
      (:temp edi unsigned-reg edi-offset))
 
   ;; Pick off the cases where everything fits in register args.
-  (inst jecxz zero-values)
+  (inst jecxz ZERO-VALUES)
   (inst cmp ecx (fixnumize 1))
-  (inst jmp :e one-value)
+  (inst jmp :e ONE-VALUE)
   (inst cmp ecx (fixnumize 2))
-  (inst jmp :e two-values)
+  (inst jmp :e TWO-VALUES)
   (inst cmp ecx (fixnumize 3))
-  (inst jmp :e three-values)
+  (inst jmp :e THREE-VALUES)
 
   ;; Save the count, because the loop is going to destroy it.
   (inst mov edx ecx)
@@ -135,11 +135,11 @@
   (inst cmp ecx (fixnumize 3))
   (inst jmp :le REGISTER-ARGS)
 
-  ;; Save the OLD-FP and RETURN-PC because the blit it going to trash
-  ;; those stack locations. Save the ECX, because the loop is going
-  ;; to trash it.
-  (pushw ebp-tn -1)
-  (loadw ebx ebp-tn -2)
+  ;; Save the OLD-FP and RETURN-PC because the blit is going to trash
+  ;; those stack locations. Save the ECX, because the loop is going to
+  ;; trash it.
+  (pushw ebp-tn (frame-word-offset ocfp-save-offset))
+  (loadw ebx ebp-tn (frame-word-offset return-pc-save-offset))
   (inst push ecx)
 
   ;; Do the blit. Because we are coping from smaller addresses to
@@ -147,25 +147,27 @@
   ;; our way down.
   (inst shr ecx 2)                      ; fixnum to raw words
   (inst std)                            ; count down
-  (inst lea edi (make-ea :dword :base ebp-tn :disp (- n-word-bytes)))
+  (inst lea edi (make-ea :dword :base ebp-tn :disp (frame-byte-offset 0)))
   (inst sub esi (fixnumize 1))
   (inst rep)
   (inst movs :dword)
   (inst cld)
 
   ;; Load the register arguments carefully.
-  (loadw edx ebp-tn -1)
+  (loadw edx ebp-tn (frame-word-offset ocfp-save-offset))
 
   ;; Restore OLD-FP and ECX.
   (inst pop ecx)
-  (popw ebp-tn -1)                      ; overwrites a0
+  ;; Overwrites a1
+  (popw ebp-tn (frame-word-offset ocfp-save-offset))
 
   ;; Blow off the stack above the arguments.
   (inst lea esp-tn (make-ea :dword :base edi :disp n-word-bytes))
 
   ;; remaining register args
-  (loadw edi ebp-tn -2)
-  (loadw esi ebp-tn -3)
+  (inst mov edi edx)
+  (loadw edx ebp-tn (frame-word-offset 0))
+  (loadw esi ebp-tn (frame-word-offset 2))
 
   ;; Push the (saved) return-pc so it looks like we just called.
   (inst push ebx)
@@ -184,7 +186,7 @@
         (make-ea :dword :base ebp-tn :disp (* -3 n-word-bytes)))
 
   ;; Push the return-pc so it looks like we just called.
-  (pushw ebp-tn -2)
+  (pushw ebp-tn (frame-word-offset return-pc-save-offset))
 
   ;; And away we go.
   (inst jmp (make-ea-for-object-slot eax closure-fun-slot fun-pointer-lowtag)))
@@ -207,10 +209,10 @@
     (inst jmp :z error))
 
   (inst cmp target (make-ea-for-object-slot catch catch-block-tag-slot 0))
-  (inst jmp :e exit)
+  (inst jmp :e EXIT)
 
   (loadw catch catch catch-block-previous-catch-slot)
-  (inst jmp loop)
+  (inst jmp LOOP)
 
   EXIT
 
@@ -241,7 +243,7 @@
   (inst cmp uwp
         (make-ea-for-object-slot block unwind-block-current-uwp-slot 0))
   ;; If a match, return to context in arg block.
-  (inst jmp :e do-exit)
+  (inst jmp :e DO-EXIT)
 
   ;; Not a match - return to *CURRENT-UNWIND-PROTECT-BLOCK* context.
   ;; Important! Must save (and return) the arg 'block' for later use!!
diff --git a/src/compiler/x86-64/call.lisp b/src/compiler/x86-64/call.lisp
index a0ae3a4..73ff2ac 100644
--- a/src/compiler/x86-64/call.lisp
+++ b/src/compiler/x86-64/call.lisp
@@ -133,7 +133,7 @@
 
     ;; The start of the actual code.
     ;; Save the return-pc.
-    (popw rbp-tn (- (1+ return-pc-save-offset)))
+    (popw rbp-tn (frame-word-offset return-pc-save-offset))
 
     ;; If copy-more-arg follows it will allocate the correct stack
     ;; size. The stack is not allocated first here as this may expose
@@ -260,7 +260,7 @@
 
             (inst cmp rcx-tn (fixnumize i))
             (inst jmp :be default-lab)
-            (loadw rdx-tn rbx-tn (- (1+ i)))
+            (loadw rdx-tn rbx-tn (frame-word-offset i))
             (inst mov tn rdx-tn)))
 
         (emit-label defaulting-done)
@@ -298,7 +298,7 @@
       (emit-label no-stack-args)
       (inst lea rdi-tn
             (make-ea :qword :base rbp-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+                     :disp (frame-byte-offset register-arg-count)))
       ;; Load RAX with NIL so we can quickly store it, and set up
       ;; stuff for the loop.
       (inst mov rax-tn nil-value)
@@ -311,7 +311,7 @@
       ;; and then default the remaining stack arguments.
       (emit-label regs-defaulted)
       ;; Save EDI.
-      (storew rdi-tn rbx-tn (- (1+ 1)))
+      (storew rdi-tn rbx-tn (frame-word-offset 1))
       ;; Compute the number of stack arguments, and if it's zero or
       ;; less, don't copy any stack arguments.
       (inst sub rcx-tn (fixnumize register-arg-count))
@@ -327,19 +327,19 @@
       ;; Compute a pointer to where the stack args go.
       (inst lea rdi-tn
             (make-ea :qword :base rbp-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+                     :disp (frame-byte-offset register-arg-count)))
       ;; Save ESI, and compute a pointer to where the args come from.
-      (storew rsi-tn rbx-tn (- (1+ 2)))
+      (storew rsi-tn rbx-tn (frame-word-offset 2))
       (inst lea rsi-tn
             (make-ea :qword :base rbx-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+                     :disp (frame-byte-offset register-arg-count)))
       ;; Do the copy.
       (inst shr rcx-tn word-shift)              ; make word count
       (inst std)
       (inst rep)
       (inst movs :qword)
       ;; Restore RSI.
-      (loadw rsi-tn rbx-tn (- (1+ 2)))
+      (loadw rsi-tn rbx-tn (frame-word-offset 2))
       ;; Now we have to default the remaining args. Find out how many.
       (inst sub rax-tn (fixnumize (- nvals register-arg-count)))
       (inst neg rax-tn)
@@ -355,7 +355,7 @@
       (inst stos rax-tn)
       ;; Restore EDI, and reset the stack.
       (emit-label restore-edi)
-      (loadw rdi-tn rbx-tn (- (1+ 1)))
+      (loadw rdi-tn rbx-tn (frame-word-offset 1))
       (inst mov rsp-tn rbx-tn)
       (inst cld))))
   (values))
@@ -387,7 +387,7 @@
 
     (cond ((location= start (first *register-arg-tns*))
            (inst push (first *register-arg-tns*))
-           (inst lea start (make-ea :qword :base rsp-tn :disp 8)))
+           (inst lea start (make-ea :qword :base rsp-tn :disp n-word-bytes)))
           (t (inst mov start rsp-tn)
              (inst push (first *register-arg-tns*))))
     (inst mov count (fixnumize 1))
@@ -464,12 +464,14 @@
       ;; Is the return-pc on the stack or in a register?
       (sc-case ret-tn
         ((sap-stack)
+         (unless (= (tn-offset ret-tn) return-pc-save-offset)
+           (error "ret-tn ~A in wrong stack slot" ret-tn))
          #+nil (format t "*call-local: ret-tn on stack; offset=~S~%"
                        (tn-offset ret-tn))
          (inst lea return-label (make-fixup nil :code-object RETURN))
-         (storew return-label rbp-tn (- (1+ (tn-offset ret-tn)))))
-        ((sap-reg)
-         (inst lea ret-tn (make-fixup nil :code-object RETURN)))))
+         (storew return-label rbp-tn (frame-word-offset (tn-offset ret-tn))))
+        (t
+         (error "ret-tn ~A in sap-reg" ret-tn))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -507,10 +509,9 @@
                        (tn-offset ret-tn))
          ;; Stack
          (inst lea return-label (make-fixup nil :code-object RETURN))
-         (storew return-label rbp-tn (- (1+ (tn-offset ret-tn)))))
-        ((sap-reg)
-         ;; Register
-         (inst lea ret-tn (make-fixup nil :code-object RETURN)))))
+         (storew return-label rbp-tn (frame-word-offset (tn-offset ret-tn))))
+        (t
+         (error "multiple-call-local: return-pc not on stack."))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -556,10 +557,9 @@
                        (tn-offset ret-tn))
          ;; Stack
          (inst lea return-label (make-fixup nil :code-object RETURN))
-         (storew return-label rbp-tn (- (1+ (tn-offset ret-tn)))))
-        ((sap-reg)
-         ;; Register
-         (inst lea ret-tn (make-fixup nil :code-object RETURN)))))
+         (storew return-label rbp-tn (frame-word-offset (tn-offset ret-tn))))
+        (t
+         (error "known-call-local: return-pc not on stack."))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -621,39 +621,23 @@
     (trace-table-entry trace-table-fun-epilogue)
     ;; return-pc may be either in a register or on the stack.
     (sc-case return-pc
-      ((sap-reg)
-       (sc-case old-fp
-         ((control-stack)
-          (cond ((zerop (tn-offset old-fp))
-                 ;; Zot all of the stack except for the old-fp.
-                 (inst lea rsp-tn (make-ea :qword :base rbp-tn
-                                           :disp (- (* (1+ ocfp-save-offset)
-                                                       n-word-bytes))))
-                 ;; Restore the old fp from its save location on the stack,
-                 ;; and zot the stack.
-                 (inst pop rbp-tn))
-
-                (t
-                 (cerror "Continue anyway"
-                         "VOP return-local doesn't work if old-fp (in slot ~
-                          ~S) is not in slot 0"
-                         (tn-offset old-fp)))))
-
-         ((any-reg descriptor-reg)
-          ;; Zot all the stack.
-          (move rsp-tn rbp-tn)
-          ;; Restore the old-fp.
-          (move rbp-tn old-fp)))
-
-       ;; Return; return-pc is in a register.
-       (inst jmp return-pc))
-
       ((sap-stack)
+       (unless (and (sc-is old-fp control-stack)
+                    (= (tn-offset old-fp) ocfp-save-offset))
+         (error "known-return: ocfp not on stack in standard save location?"))
+       (unless (and (sc-is return-pc sap-stack)
+                    (= (tn-offset return-pc) return-pc-save-offset))
+         (error
+          "known-return: return-pc not on stack in standard save location?"))
+
+       ;; Zot all of the stack except for the old-fp and return-pc.
        (inst lea rsp-tn
              (make-ea :qword :base rbp-tn
-                      :disp (- (* (1+ (tn-offset return-pc)) n-word-bytes))))
-       (move rbp-tn old-fp)
-       (inst ret (* (tn-offset return-pc) n-word-bytes))))
+                      :disp (frame-byte-offset ocfp-save-offset)))
+       (inst pop rbp-tn)
+       (inst ret (* (tn-offset return-pc) n-word-bytes)))
+      (t
+       (error "known-return, return-pc not on stack")))
 
     (trace-table-entry trace-table-normal)))
 
@@ -820,20 +804,20 @@
                                       ;; FIXME: FORMAT T for stale
                                       ;; diagnostic output (several of
                                       ;; them around here), ick
-                                      (format t "** tail-call old-fp not S0~%")
+                                      (error "** tail-call old-fp not S0~%")
                                       (move old-fp-tmp old-fp)
                                       (storew old-fp-tmp
                                               rbp-tn
-                                              (- (1+ ocfp-save-offset)))))
+                                              (frame-word-offset ocfp-save-offset))))
                                    ((any-reg descriptor-reg)
-                                    (format t "** tail-call old-fp in reg not S0~%")
+                                    (error "** tail-call old-fp in reg not S0~%")
                                     (storew old-fp
                                             rbp-tn
-                                            (- (1+ ocfp-save-offset)))))
+                                            (frame-word-offset ocfp-save-offset))))
 
                           ;; For tail call, we have to push the
                           ;; return-pc so that it looks like we CALLed
-                          ;; drspite the fact that we are going to JMP.
+                          ;; despite the fact that we are going to JMP.
                           (inst push return-pc)
                           ))
                        (t
@@ -855,7 +839,8 @@
                                '(inst sub rsp-tn (fixnumize 3)))
 
                           ;; Save the fp
-                          (storew rbp-tn new-fp (- (1+ ocfp-save-offset)))
+                          (storew rbp-tn new-fp
+                                  (frame-word-offset ocfp-save-offset))
 
                           (move rbp-tn new-fp) ; NB - now on new stack frame.
                           )))
@@ -886,7 +871,7 @@
                (trace-table-entry trace-table-normal)))))
 
   (define-full-call call nil :fixed nil)
-  (define-full-call call-named t  :fixed nil)
+  (define-full-call call-named t :fixed nil)
   (define-full-call multiple-call nil :unknown nil)
   (define-full-call multiple-call-named t :unknown nil)
   (define-full-call tail-call nil :tail nil)
@@ -930,15 +915,15 @@
 
 ;;;; unknown values return
 
-;;; Return a single-value using the Unknown-Values convention. Specifically,
-;;; we jump to clear the stack and jump to return-pc+3.
-;;;
-;;; We require old-fp to be in a register, because we want to reset RSP before
-;;; restoring RBP. If old-fp were still on the stack, it could get clobbered
-;;; by a signal.
+;;; Return a single-value using the Unknown-Values convention.
 ;;;
 ;;; pfw--get wired-tn conflicts sometimes if register sc specd for args
 ;;; having problems targeting args to regs -- using temps instead.
+;;;
+;;; First off, modifying the return-pc defeats the branch-prediction
+;;; optimizations on modern CPUs quite handily. Second, we can do all
+;;; this without needing a temp register. Fixed the latter, at least.
+;;; -- AB 2006/Feb/04
 (define-vop (return-single)
   (:args (old-fp)
          (return-pc)
@@ -948,55 +933,31 @@
     (trace-table-entry trace-table-fun-epilogue)
     ;; Code structure lifted from known-return.
     (sc-case return-pc
-      ((sap-reg)
-       ;; return PC in register for some reason (local call?)
-       ;; we jmp to the return pc after fixing the stack and frame.
-       (sc-case old-fp
-         ((control-stack)
-          ;; ofp on stack must be in slot 0 (the traditional storage place).
-          ;; Drop the stack above it and pop it off.
-          (cond ((zerop (tn-offset old-fp))
-                 (inst lea rsp-tn (make-ea :dword :base rbp-tn
-                                           :disp (- (* (1+ ocfp-save-offset)
-                                                       n-word-bytes))))
-                 (inst pop rbp-tn))
-                (t
-                 ;; Should this ever happen, we do the same as above, but
-                 ;; using (tn-offset old-fp) instead of ocfp-save-offset
-                 ;; (which is 0 anyway, see src/compiler/x86/vm.lisp) and
-                 ;; then lea rsp again against itself with a displacement
-                 ;; of (* (tn-offset old-fp) n-word-bytes) to clear the
-                 ;; rest of the stack.
-                 (cerror "Continue anyway"
-                         "VOP return-single doesn't work if old-fp (in slot ~S) is not in slot 0" (tn-offset old-fp)))))
-         ((any-reg descriptor-reg)
-          ;; ofp in reg, drop the stack and load the real fp.
-          (move rsp-tn rbp-tn)
-          (move rbp-tn old-fp)))
-
-       ;; Set single-value-return flag
-       (inst clc)
-       ;; And return
-       (inst jmp return-pc))
-
       ((sap-stack)
        ;; Note that this will only work right if, when old-fp is on
        ;; the stack, it has a lower tn-offset than return-pc. One of
        ;; the comments in known-return indicate that this is the case
        ;; (in that it will be in its save location), but we may wish
        ;; to assert that (in either the weaker or stronger forms).
-       ;; Should this ever not be the case, we should load old-fp
-       ;; into a temp reg while we fix the stack.
-       ;; Drop stack above return-pc
-       (inst lea rsp-tn (make-ea :dword :base rbp-tn
-                                 :disp (- (* (1+ (tn-offset return-pc))
-                                             n-word-bytes))))
+       ;; Should this ever not be the case, we should load old-fp into
+       ;; a temp reg while we fix the stack.
+       (unless (and (sc-is old-fp control-stack)
+                    (= (tn-offset old-fp) ocfp-save-offset))
+         (error "ocfp not on stack in standard save location?"))
+       (unless (and (sc-is return-pc sap-stack)
+                    (= (tn-offset return-pc) return-pc-save-offset))
+         (error "return-pc not on stack in standard save location?"))
+       ;; Drop stack above old-fp
+       (inst lea rsp-tn (make-ea :qword :base rbp-tn
+                                 :disp (frame-byte-offset (tn-offset old-fp))))
        ;; Set single-value return flag
        (inst clc)
        ;; Restore the old frame pointer
-       (move rbp-tn old-fp)
+       (inst pop rbp-tn)
        ;; And return, dropping the rest of the stack as we go.
-       (inst ret (* (tn-offset return-pc) n-word-bytes))))))
+       (inst ret (* (tn-offset return-pc) n-word-bytes)))
+      (t
+       (error "return pc not on stack")))))
 
 ;;; Do unknown-values return of a fixed (other than 1) number of
 ;;; values. The VALUES are required to be set up in the standard
@@ -1038,8 +999,10 @@
     (move rbp-tn old-fp)
     ;; Clear as much of the stack as possible, but not past the return
     ;; address.
-    (inst lea rsp-tn (make-ea :qword :base rbx
-                              :disp (- (* (max nvals 2) n-word-bytes))))
+    (inst lea rsp-tn
+          (make-ea :qword :base rbx
+                   :disp (frame-byte-offset (max (1- nvals)
+                                                 return-pc-save-offset))))
     ;; Pre-default any argument register that need it.
     (when (< nvals register-arg-count)
       (let* ((arg-tns (nthcdr nvals (list a0 a1 a2)))
@@ -1053,16 +1016,17 @@
     ;; stack and we've changed the stack pointer. So we have to
     ;; tell it to index off of RBX instead of RBP.
     (cond ((zerop nvals)
-           ;; Return popping the return address and the OCFP.
-           (inst ret n-word-bytes))
+           ;; Return popping the return address and what's earlier in
+           ;; the frame.
+           (inst ret (* return-pc-save-offset n-word-bytes)))
           ((= nvals 1)
-           ;; Return popping the return, leaving 1 slot. Can this
-           ;; happen, or is a single value return handled elsewhere?
-           (inst ret))
+           ;; This is handled in RETURN-SINGLE.
+           (error "nvalues is 1"))
           (t
-           (inst jmp (make-ea :qword :base rbx
-                              :disp (- (* (1+ (tn-offset return-pc))
-                                          n-word-bytes))))))
+           ;; Thou shalt not JMP unto thy return address.
+           (inst push (make-ea :qword :base rbx
+                               :disp (frame-byte-offset (tn-offset return-pc))))
+           (inst ret)))
 
     (trace-table-entry trace-table-normal)))
 
@@ -1113,8 +1077,8 @@
         ;; clear the multiple-value return flag
         (inst clc)
         ;; Out of here.
-        (inst jmp rax)
-
+        (inst push rax)
+        (inst ret)
         ;; Nope, not the single case. Jump to the assembly routine.
         (emit-label not-single)))
     (move rsi vals)
diff --git a/src/compiler/x86-64/insts.lisp b/src/compiler/x86-64/insts.lisp
index 7b800e4..fdf8ad7 100644
--- a/src/compiler/x86-64/insts.lisp
+++ b/src/compiler/x86-64/insts.lisp
@@ -2583,7 +2583,7 @@
   (:printer byte ((op #b11000010) (imm nil :type 'imm-word-16))
             '(:name :tab imm))
   (:emitter
-   (cond (stack-delta
+   (cond ((and stack-delta (not (zerop stack-delta)))
           (emit-byte segment #b11000010)
           (emit-word segment stack-delta))
          (t
diff --git a/src/compiler/x86-64/nlx.lisp b/src/compiler/x86-64/nlx.lisp
index 6208d5c..a277afa 100644
--- a/src/compiler/x86-64/nlx.lisp
+++ b/src/compiler/x86-64/nlx.lisp
@@ -274,7 +274,7 @@
           (make-ea :qword :base rbp-tn :disp (* -3 n-word-bytes)))
 
     ;; Push the return-pc so it looks like we just called.
-    (pushw rbp-tn -2)
+    (pushw rbp-tn (frame-word-offset return-pc-save-offset))
 
     ;; Call it
     (inst jmp (make-ea :qword :base block
diff --git a/src/compiler/x86-64/static-fn.lisp b/src/compiler/x86-64/static-fn.lisp
index ae94061..ee2baea 100644
--- a/src/compiler/x86-64/static-fn.lisp
+++ b/src/compiler/x86-64/static-fn.lisp
@@ -16,7 +16,7 @@
   (:policy :safe)
   (:variant-vars function)
   (:vop-var vop)
-  (:node-var node)
+  ;;(:node-var node)
   (:temporary (:sc unsigned-reg :offset ebx-offset
                    :from (:eval 0) :to (:eval 2)) ebx)
   (:temporary (:sc unsigned-reg :offset ecx-offset
@@ -79,15 +79,17 @@
          ;; If speed not more important than size, duplicate the
          ;; effect of the ENTER with discrete instructions. Takes
          ;; 2+1+3+2=8 bytes as opposed to 4+3=7 bytes.
-         (cond ((policy node (>= speed space))
+         (cond (t ;(policy node (>= speed space))
                 (inst mov ebx rsp-tn)
+                ;; Dummy for return address
+                (inst push rbp-tn)
                 ;; Save the old-fp
                 (inst push rbp-tn)
                 ;; Ensure that at least three slots are available; one
                 ;; above, two more needed.
-                (inst sub rsp-tn (fixnumize 2))
+                (inst sub rsp-tn (fixnumize 1))
                 (inst mov rbp-tn ebx))
-               (t
+               #+(or) (t
                 (inst enter (fixnumize 2))
                 ;; The enter instruction pushes EBP and then copies
                 ;; ESP into EBP. We want the new EBP to be the
diff --git a/src/compiler/x86-64/vm.lisp b/src/compiler/x86-64/vm.lisp
index 7803220..405ca67 100644
--- a/src/compiler/x86-64/vm.lisp
+++ b/src/compiler/x86-64/vm.lisp
@@ -446,10 +446,18 @@
 ;;;; miscellaneous function call parameters
 
 ;;; offsets of special stack frame locations
-(def!constant ocfp-save-offset 0)
-(def!constant return-pc-save-offset 1)
+(def!constant ocfp-save-offset 1)
+(def!constant return-pc-save-offset 0)
 (def!constant code-save-offset 2)
 
+(declaim (inline frame-word-offset))
+(defun frame-word-offset (index)
+  (- (1+ index)))
+
+(declaim (inline frame-byte-offset))
+(defun frame-byte-offset (index)
+  (* (frame-word-offset index) n-word-bytes))
+
 (def!constant lra-save-offset return-pc-save-offset) ; ?
 
 ;;; This is used by the debugger.
diff --git a/src/compiler/x86/call.lisp b/src/compiler/x86/call.lisp
index bdd1d54..f4f0c29 100644
--- a/src/compiler/x86/call.lisp
+++ b/src/compiler/x86/call.lisp
@@ -212,7 +212,7 @@
     (let ((regs-defaulted (gen-label)))
       (note-this-location vop :unknown-return)
       (inst jmp :c regs-defaulted)
-      ;; Default the unsuppled registers.
+      ;; Default the unsupplied registers.
       (let* ((2nd-tn-ref (tn-ref-across values))
              (2nd-tn (tn-ref-tn 2nd-tn-ref)))
         (inst mov 2nd-tn nil-value)
@@ -395,7 +395,7 @@
 
     (cond ((location= start (first *register-arg-tns*))
            (inst push (first *register-arg-tns*))
-           (inst lea start (make-ea :dword :base esp-tn :disp 4)))
+           (inst lea start (make-ea :dword :base esp-tn :disp n-word-bytes)))
           (t (inst mov start esp-tn)
              (inst push (first *register-arg-tns*))))
     (inst mov count (fixnumize 1))
@@ -471,12 +471,14 @@
       ;; Is the return-pc on the stack or in a register?
       (sc-case ret-tn
         ((sap-stack)
+         (unless (= (tn-offset ret-tn) return-pc-save-offset)
+           (error "ret-tn ~A in wrong stack slot" ret-tn))
          #+nil (format t "*call-local: ret-tn on stack; offset=~S~%"
                        (tn-offset ret-tn))
-         (storew (make-fixup nil :code-object return)
+         (storew (make-fixup nil :code-object RETURN)
                  ebp-tn (frame-word-offset (tn-offset ret-tn))))
-        ((sap-reg)
-         (inst lea ret-tn (make-fixup nil :code-object return)))))
+        (t
+         (error "ret-tn ~A in sap-reg" ret-tn))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -512,11 +514,10 @@
          #+nil (format t "*multiple-call-local: ret-tn on stack; offset=~S~%"
                        (tn-offset ret-tn))
          ;; Stack
-         (storew (make-fixup nil :code-object return)
+         (storew (make-fixup nil :code-object RETURN)
                  ebp-tn (frame-word-offset (tn-offset ret-tn))))
-        ((sap-reg)
-         ;; Register
-         (inst lea ret-tn (make-fixup nil :code-object return)))))
+        (t
+         (error "multiple-call-local: return-pc not on stack."))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -560,11 +561,10 @@
          #+nil (format t "*known-call-local: ret-tn on stack; offset=~S~%"
                        (tn-offset ret-tn))
          ;; Stack
-         (storew (make-fixup nil :code-object return)
+         (storew (make-fixup nil :code-object RETURN)
                  ebp-tn (frame-word-offset (tn-offset ret-tn))))
-        ((sap-reg)
-         ;; Register
-         (inst lea ret-tn (make-fixup nil :code-object return)))))
+        (t
+         (error "known-call-local: return-pc not on stack."))))
 
     (note-this-location vop :call-site)
     (inst jmp target)
@@ -636,52 +636,25 @@
 
     ;; return-pc may be either in a register or on the stack.
     (sc-case return-pc
-      ((sap-reg)
-       (sc-case old-fp
-         ((control-stack)
-
-          #+nil (format t "*known-return: old-fp ~S on stack; offset=~S~%"
-                        old-fp (tn-offset old-fp))
-
-          (cond ((zerop (tn-offset old-fp))
-                 ;; Zot all of the stack except for the old-fp.
-                 (inst lea esp-tn (make-ea :dword :base ebp-tn
-                                           :disp (frame-byte-offset ocfp-save-offset)))
-                 ;; Restore the old fp from its save location on the stack,
-                 ;; and zot the stack.
-                 (inst pop ebp-tn))
-
-                (t
-                 (cerror "Continue anyway"
-                         "VOP return-local doesn't work if old-fp (in slot ~
-                          ~S) is not in slot 0"
-                         (tn-offset old-fp)))))
-
-         ((any-reg descriptor-reg)
-          ;; Zot all the stack.
-          (move esp-tn ebp-tn)
-          ;; Restore the old-fp.
-          (move ebp-tn old-fp)))
-
-       ;; Return; return-pc is in a register.
-       (inst jmp return-pc))
-
       ((sap-stack)
-
        #+nil (format t "*known-return: return-pc ~S on stack; offset=~S~%"
                      return-pc (tn-offset return-pc))
+       (unless (and (sc-is old-fp control-stack)
+                    (= (tn-offset old-fp) ocfp-save-offset))
+         (error "known-return: ocfp not on stack in standard save location?"))
+       (unless (and (sc-is return-pc sap-stack)
+                    (= (tn-offset return-pc) return-pc-save-offset))
+         (error
+          "known-return: return-pc not on stack in standard save location?"))
 
        ;; Zot all of the stack except for the old-fp and return-pc.
        (inst lea esp-tn
              (make-ea :dword :base ebp-tn
-                      :disp (frame-byte-offset (tn-offset return-pc))))
-       ;; Restore the old fp. old-fp may be either on the stack in its
-       ;; save location or in a register, in either case this restores it.
-       (move ebp-tn old-fp)
-       ;; The return pops the return address (4 bytes), then we need
-       ;; to pop all the slots before the return-pc which includes the
-       ;; 4 bytes for the old-fp.
-       (inst ret (* (tn-offset return-pc) n-word-bytes))))
+                      :disp (frame-byte-offset ocfp-save-offset)))
+       (inst pop ebp-tn)
+       (inst ret (* (tn-offset return-pc) n-word-bytes)))
+      (t
+       (error "known-return, return-pc not on stack")))
 
     (trace-table-entry trace-table-normal)))
 
@@ -848,13 +821,13 @@
                                       ;; FIXME: FORMAT T for stale
                                       ;; diagnostic output (several of
                                       ;; them around here), ick
-                                      (format t "** tail-call old-fp not S0~%")
+                                      (error "** tail-call old-fp not S0~%")
                                       (move old-fp-tmp old-fp)
                                       (storew old-fp-tmp
                                               ebp-tn
                                               (frame-word-offset ocfp-save-offset))))
                                    ((any-reg descriptor-reg)
-                                    (format t "** tail-call old-fp in reg not S0~%")
+                                    (error "** tail-call old-fp in reg not S0~%")
                                     (storew old-fp
                                             ebp-tn
                                             (frame-word-offset ocfp-save-offset))))
@@ -883,7 +856,8 @@
                                '(inst sub esp-tn (fixnumize 3)))
 
                           ;; Save the fp
-                          (storew ebp-tn new-fp (frame-word-offset ocfp-save-offset))
+                          (storew ebp-tn new-fp
+                                  (frame-word-offset ocfp-save-offset))
 
                           (move ebp-tn new-fp) ; NB - now on new stack frame.
                           )))
@@ -952,12 +926,7 @@
 
 ;;;; unknown values return
 
-;;; Return a single-value using the Unknown-Values convention. Specifically,
-;;; we jump to clear the stack and jump to return-pc+2.
-;;;
-;;; We require old-fp to be in a register, because we want to reset ESP before
-;;; restoring EBP. If old-fp were still on the stack, it could get clobbered
-;;; by a signal.
+;;; Return a single-value using the Unknown-Values convention.
 ;;;
 ;;; pfw--get wired-tn conflicts sometimes if register sc specd for args
 ;;; having problems targeting args to regs -- using temps instead.
@@ -975,53 +944,31 @@
     (trace-table-entry trace-table-fun-epilogue)
     ;; Code structure lifted from known-return.
     (sc-case return-pc
-      ((sap-reg)
-       ;; return PC in register for some reason (local call?)
-       ;; we jmp to the return pc after fixing the stack and frame.
-       (sc-case old-fp
-         ((control-stack)
-          ;; ofp on stack must be in slot 0 (the traditional storage place).
-          ;; Drop the stack above it and pop it off.
-          (cond ((zerop (tn-offset old-fp))
-                 (inst lea esp-tn (make-ea :dword :base ebp-tn
-                                           :disp (frame-byte-offset ocfp-save-offset)))
-                 (inst pop ebp-tn))
-                (t
-                 ;; Should this ever happen, we do the same as above, but
-                 ;; using (tn-offset old-fp) instead of ocfp-save-offset
-                 ;; (which is 0 anyway, see src/compiler/x86/vm.lisp) and
-                 ;; then lea esp again against itself with a displacement
-                 ;; of (* (tn-offset old-fp) n-word-bytes) to clear the
-                 ;; rest of the stack.
-                 (cerror "Continue anyway"
-                         "VOP return-single doesn't work if old-fp (in slot ~S) is not in slot 0" (tn-offset old-fp)))))
-         ((any-reg descriptor-reg)
-          ;; ofp in reg, drop the stack and load the real fp.
-          (move esp-tn ebp-tn)
-          (move ebp-tn old-fp)))
-
-       ;; Set single-value-return flag
-       (inst clc)
-       ;; And return
-       (inst jmp return-pc))
-
       ((sap-stack)
        ;; Note that this will only work right if, when old-fp is on
        ;; the stack, it has a lower tn-offset than return-pc. One of
        ;; the comments in known-return indicate that this is the case
        ;; (in that it will be in its save location), but we may wish
        ;; to assert that (in either the weaker or stronger forms).
-       ;; Should this ever not be the case, we should load old-fp
-       ;; into a temp reg while we fix the stack.
-       ;; Drop stack above return-pc
+       ;; Should this ever not be the case, we should load old-fp into
+       ;; a temp reg while we fix the stack.
+       (unless (and (sc-is old-fp control-stack)
+                    (= (tn-offset old-fp) ocfp-save-offset))
+         (error "ocfp not on stack in standard save location?"))
+       (unless (and (sc-is return-pc sap-stack)
+                    (= (tn-offset return-pc) return-pc-save-offset))
+         (error "return-pc not on stack in standard save location?"))
+       ;; Drop stack above old-fp
        (inst lea esp-tn (make-ea :dword :base ebp-tn
-                                 :disp (frame-byte-offset (tn-offset return-pc))))
+                                 :disp (frame-byte-offset (tn-offset old-fp))))
        ;; Set single-value return flag
        (inst clc)
        ;; Restore the old frame pointer
-       (move ebp-tn old-fp)
+       (inst pop ebp-tn)
        ;; And return, dropping the rest of the stack as we go.
-       (inst ret (* (tn-offset return-pc) n-word-bytes))))))
+       (inst ret (* (tn-offset return-pc) n-word-bytes)))
+      (t
+       (error "return pc not on stack")))))
 
 ;;; Do unknown-values return of a fixed (other than 1) number of
 ;;; values. The VALUES are required to be set up in the standard
@@ -1053,18 +1000,27 @@
                    :from :eval) a2)
 
   (:generator 6
+    (unless (and (sc-is old-fp control-stack)
+                 (= (tn-offset old-fp) ocfp-save-offset))
+      (error "ocfp not on stack in standard save location?"))
+    (unless (and (sc-is return-pc sap-stack)
+                 (= (tn-offset return-pc) return-pc-save-offset))
+      (error "return-pc not on stack in standard save location?"))
+
     (trace-table-entry trace-table-fun-epilogue)
     ;; Establish the values pointer and values count.
     (move ebx ebp-tn)
     (if (zerop nvals)
-        (inst xor ecx ecx) ; smaller
-      (inst mov ecx (fixnumize nvals)))
+        (inst xor ecx ecx)              ; smaller
+        (inst mov ecx (fixnumize nvals)))
     ;; Restore the frame pointer.
     (move ebp-tn old-fp)
     ;; Clear as much of the stack as possible, but not past the return
     ;; address.
-    (inst lea esp-tn (make-ea :dword :base ebx
-                              :disp (- (* (max nvals 2) n-word-bytes))))
+    (inst lea esp-tn
+          (make-ea :dword :base ebx
+                   :disp (frame-byte-offset (max (1- nvals)
+                                                 return-pc-save-offset))))
     ;; Pre-default any argument register that need it.
     (when (< nvals register-arg-count)
       (let* ((arg-tns (nthcdr nvals (list a0 a1 a2)))
@@ -1072,21 +1028,23 @@
         (inst mov first nil-value)
         (dolist (tn (cdr arg-tns))
           (inst mov tn first))))
-    ;; Set multi-value return flag.
+    ;; Set the multiple value return flag.
     (inst stc)
     ;; And away we go. Except that return-pc is still on the
     ;; stack and we've changed the stack pointer. So we have to
     ;; tell it to index off of EBX instead of EBP.
     (cond ((zerop nvals)
-           ;; Return popping the return address and the OCFP.
-           (inst ret n-word-bytes))
+           ;; Return popping the return address and what's earlier in
+           ;; the frame.
+           (inst ret (* return-pc-save-offset n-word-bytes)))
           ((= nvals 1)
-           ;; Return popping the return, leaving 1 slot. Can this
-           ;; happen, or is a single value return handled elsewhere?
-           (inst ret))
+           ;; This is handled in RETURN-SINGLE.
+           (error "nvalues is 1"))
           (t
-           (inst jmp (make-ea :dword :base ebx
-                              :disp (frame-byte-offset (tn-offset return-pc))))))
+           ;; Thou shalt not JMP unto thy return address.
+           (inst push (make-ea :dword :base ebx
+                               :disp (frame-byte-offset (tn-offset return-pc))))
+           (inst ret)))
 
     (trace-table-entry trace-table-normal)))
 
@@ -1133,11 +1091,11 @@
         (move old-fp-temp old-fp)
         (move esp-tn ebp-tn)
         (move ebp-tn old-fp-temp)
-        ;; Set the single-value return flag.
+        ;; clear the multiple-value return flag
         (inst clc)
         ;; Out of here.
-        (inst jmp eax)
-
+        (inst push eax)
+        (inst ret)
         ;; Nope, not the single case. Jump to the assembly routine.
         (emit-label not-single)))
     (move esi vals)
@@ -1191,10 +1149,10 @@
   (:generator 20
     ;; Avoid the copy if there are no more args.
     (cond ((zerop fixed)
-           (inst jecxz just-alloc-frame))
+           (inst jecxz JUST-ALLOC-FRAME))
           (t
            (inst cmp ecx-tn (fixnumize fixed))
-           (inst jmp :be just-alloc-frame)))
+           (inst jmp :be JUST-ALLOC-FRAME)))
 
     ;; Allocate the space on the stack.
     ;; stack = ebp - (max 3 frame-size) - (nargs - fixed)
@@ -1217,7 +1175,7 @@
            ;; Number to copy = nargs-3
            (inst sub ecx-tn (fixnumize register-arg-count))
            ;; Everything of interest in registers.
-           (inst jmp :be do-regs))
+           (inst jmp :be DO-REGS))
           (t
            ;; Number to copy = nargs-fixed
            (inst sub ecx-tn (fixnumize fixed))))
@@ -1268,7 +1226,7 @@
               ( nil )
               ;; Store it relative to ebp
               (inst mov (make-ea :dword :base ebp-tn
-                                 :disp (- (* 4
+                                 :disp (- (* n-word-bytes
                                              (+ 1 (- i fixed)
                                                 (max 3 (sb-allocated-size 'stack))))))
                     (nth i *register-arg-tns*))
@@ -1281,9 +1239,9 @@
               (if (zerop i)
                   (inst test ecx-tn ecx-tn)
                 (inst cmp ecx-tn (fixnumize i)))
-              (inst jmp :eq done)))
+              (inst jmp :eq DONE)))
 
-    (inst jmp done)
+    (inst jmp DONE)
 
     JUST-ALLOC-FRAME
     (inst lea esp-tn
@@ -1352,12 +1310,12 @@
       (inst lea dst (make-ea :dword :base ecx :index ecx))
       (maybe-pseudo-atomic stack-allocate-p
        (allocation dst dst node stack-allocate-p list-pointer-lowtag)
-       (inst shr ecx 2)
+       (inst shr ecx (1- n-lowtag-bits))
        ;; Set decrement mode (successive args at lower addresses)
        (inst std)
        ;; Set up the result.
        (move result dst)
-       ;; Jump into the middle of the loop, 'cause that's were we want
+       ;; Jump into the middle of the loop, 'cause that's where we want
        ;; to start.
        (inst jmp enter)
        (emit-label loop)
@@ -1403,7 +1361,7 @@
     ;; Point to the first more-arg, not above it.
     (inst lea context (make-ea :dword :base esp-tn
                                :index count :scale 1
-                               :disp (- (+ (fixnumize fixed) 4))))
+                               :disp (- (+ (fixnumize fixed) n-word-bytes))))
     (unless (zerop fixed)
       (inst sub count (fixnumize fixed)))))
 
diff --git a/src/compiler/x86/insts.lisp b/src/compiler/x86/insts.lisp
index 39a7d09..794c90d 100644
--- a/src/compiler/x86/insts.lisp
+++ b/src/compiler/x86/insts.lisp
@@ -1796,7 +1796,7 @@
                     y))
              ((sc-is x control-stack)
               (inst test (make-ea :byte :base ebp-tn
-                                  :disp (- (* (1+ offset) n-word-bytes)))
+                                  :disp (frame-byte-offset offset))
                     y))
              (t
               (inst test x y)))))
@@ -2078,7 +2078,7 @@
   (:printer byte ((op #b11000010) (imm nil :type 'imm-word-16))
             '(:name :tab imm))
   (:emitter
-   (cond (stack-delta
+   (cond ((and stack-delta (not (zerop stack-delta)))
           (emit-byte segment #b11000010)
           (emit-word segment stack-delta))
          (t
diff --git a/src/compiler/x86/nlx.lisp b/src/compiler/x86/nlx.lisp
index 37b752a..7a910a1 100644
--- a/src/compiler/x86/nlx.lisp
+++ b/src/compiler/x86/nlx.lisp
@@ -293,7 +293,7 @@
           (make-ea :dword :base ebp-tn :disp (* -3 n-word-bytes)))
 
     ;; Push the return-pc so it looks like we just called.
-    (pushw ebp-tn -2)
+    (pushw ebp-tn (frame-word-offset return-pc-save-offset))
 
     ;; Call it
     (inst jmp (make-ea :dword :base block
diff --git a/src/compiler/x86/static-fn.lisp b/src/compiler/x86/static-fn.lisp
index cf8fdc5..675d204 100644
--- a/src/compiler/x86/static-fn.lisp
+++ b/src/compiler/x86/static-fn.lisp
@@ -16,7 +16,7 @@
   (:policy :safe)
   (:variant-vars function)
   (:vop-var vop)
-  (:node-var node)
+  ;;(:node-var node)
   (:temporary (:sc unsigned-reg :offset ebx-offset
                    :from (:eval 0) :to (:eval 2)) ebx)
   (:temporary (:sc unsigned-reg :offset ecx-offset
@@ -78,15 +78,17 @@
          ;; If speed not more important than size, duplicate the
          ;; effect of the ENTER with discrete instructions. Takes
          ;; 2+1+3+2=8 bytes as opposed to 4+3=7 bytes.
-         (cond ((policy node (>= speed space))
+         (cond (t ;(policy node (>= speed space))
                 (inst mov ebx esp-tn)
+                ;; Dummy for return address
+                (inst push ebp-tn)
                 ;; Save the old-fp
                 (inst push ebp-tn)
                 ;; Ensure that at least three slots are available; one
                 ;; above, two more needed.
-                (inst sub esp-tn (fixnumize 2))
+                (inst sub esp-tn (fixnumize 1))
                 (inst mov ebp-tn ebx))
-               (t
+               #+(or) (t
                 (inst enter (fixnumize 2))
                 ;; The enter instruction pushes EBP and then copies
                 ;; ESP into EBP. We want the new EBP to be the
diff --git a/src/compiler/x86/vm.lisp b/src/compiler/x86/vm.lisp
index d0023c6..7b1f486 100644
--- a/src/compiler/x86/vm.lisp
+++ b/src/compiler/x86/vm.lisp
@@ -422,8 +422,8 @@
 ;;;; miscellaneous function call parameters
 
 ;;; offsets of special stack frame locations
-(def!constant ocfp-save-offset 0)
-(def!constant return-pc-save-offset 1)
+(def!constant ocfp-save-offset 1)
+(def!constant return-pc-save-offset 0)
 
 (declaim (inline frame-word-offset))
 (defun frame-word-offset (index)
diff --git a/src/runtime/backtrace.c b/src/runtime/backtrace.c
index b6dfc44..ea53069 100644
--- a/src/runtime/backtrace.c
+++ b/src/runtime/backtrace.c
@@ -335,8 +335,8 @@ x86_call_context (void *fp, void **ra, void **ocfp)
 
   c_ocfp    = *((void **) fp);
   c_ra      = *((void **) fp + 1);
-  lisp_ocfp = *((void **) fp - 1);
-  lisp_ra   = *((void **) fp - 2);
+  lisp_ocfp = *((void **) fp - 2);
+  lisp_ra   = *((void **) fp - 1);
 
   lisp_valid_p = (lisp_ocfp > fp
                   && stack_pointer_p(lisp_ocfp)
diff --git a/src/runtime/x86-64-assem.S b/src/runtime/x86-64-assem.S
index fb12a06..64f04ee 100644
--- a/src/runtime/x86-64-assem.S
+++ b/src/runtime/x86-64-assem.S
@@ -210,8 +210,9 @@ Ldone:
 
 	/* Alloc new frame. */
 	mov	%rsp,%rbx	# The current sp marks start of new frame.
+	push	%rbp		# dummy for return address
 	push	%rbp		# fp in save location S0
-	sub	$16,%rsp	# Ensure 3 slots are allocated, one above.
+	sub	$8,%rsp	        # Ensure 3 slots are allocated, two above.
 	mov	%rbx,%rbp	# Switch to new frame.
 
 Lcall:
diff --git a/src/runtime/x86-assem.S b/src/runtime/x86-assem.S
index 8101da2..04c6031 100644
--- a/src/runtime/x86-assem.S
+++ b/src/runtime/x86-assem.S
@@ -273,8 +273,9 @@ Ldone:
 
 	/* Alloc new frame. */
 	mov	%esp,%ebx	# The current sp marks start of new frame.
-	push	%ebp		# fp in save location S0
-	sub	$8,%esp		# Ensure 3 slots are allocated, one above.
+	push	%ebp		# dummy for return address
+	push	%ebp		# fp in save location S1
+	sub	$4,%esp		# Ensure 3 slots are allocated, two above.
 	mov	%ebx,%ebp	# Switch to new frame.
 
 	call	*CLOSURE_FUN_OFFSET(%eax)
diff --git a/version.lisp-expr b/version.lisp-expr
index 19cf88c..091d37f 100644
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.27.10"
+"1.0.27.11"
-- 
1.7.10.4