From d42fc83f2dd4d1f191aa164425f33d2d60fc4b36 Mon Sep 17 00:00:00 2001 From: Juho Snellman Date: Thu, 17 Feb 2005 21:59:01 +0000 Subject: [PATCH] 0.8.19.31: x86-64 floating point improvements: * Optimization: fewer moves on floating point add/sub/mul/div. Only use a temporary xmm register if absolutely needed. * Optimization: When restoring xmm15 to zero after a c-call, use xorpd instead of a temporary register and movq. * Replace broken x87 FLOATING-POINT-MODES and SET-FLOATING-POINT-MODES with SSE code that uses the mxcsr control register. * Fix movd / movq from a xmm register to a gp register. * Add new instructions: ldmxcsr, stmxcsr --- NEWS | 2 + src/compiler/x86-64/c-call.lisp | 9 ++-- src/compiler/x86-64/float.lisp | 100 +++++++++++++++++++++------------------ src/compiler/x86-64/insts.lisp | 19 +++++++- version.lisp-expr | 2 +- 5 files changed, 77 insertions(+), 55 deletions(-) diff --git a/NEWS b/NEWS index 302a2f3..77c9937 100644 --- a/NEWS +++ b/NEWS @@ -46,6 +46,8 @@ changes in sbcl-0.8.20 (0.9alpha.0?) relative to sbcl-0.8.19: ** The disassembler understands more x86-64. (thanks to Cheuksan Wang) ** The regression tests use SB-ALIEN:INT instead of SB-ALIEN:INTEGER for enums. (thanks to Vincent Arkesteijn) + ** Multiple small optimizations and bugfixes for floating point + operations. * fixed some bugs revealed by Paul Dietz' test suite: ** Space, Tab, Linefeed, Return and Page have the invalid secondary constituent character trait. diff --git a/src/compiler/x86-64/c-call.lisp b/src/compiler/x86-64/c-call.lisp index 99b1525..da3c82c 100644 --- a/src/compiler/x86-64/c-call.lisp +++ b/src/compiler/x86-64/c-call.lisp @@ -238,8 +238,6 @@ (args :more t)) (:results (results :more t)) (:temporary (:sc unsigned-reg :offset rax-offset :to :result) rax) - (:temporary (:sc unsigned-reg :offset rcx-offset - :from :eval :to :result) rcx) (:ignore results) (:vop-var vop) (:save-p t) @@ -255,11 +253,10 @@ ;; To give the debugger a clue. XX not really internal-error? (note-this-location vop :internal-error) ;; FLOAT15 needs to contain FP zero in Lispland - (inst xor rcx rcx) - (inst movd (make-random-tn :kind :normal + (let ((float15 (make-random-tn :kind :normal :sc (sc-or-lose 'double-reg) - :offset float15-offset) - rcx))) + :offset float15-offset))) + (inst xorpd float15 float15)))) (define-vop (alloc-number-stack-space) (:info amount) diff --git a/src/compiler/x86-64/float.lisp b/src/compiler/x86-64/float.lisp index 9c018cf..b0dfce0 100644 --- a/src/compiler/x86-64/float.lisp +++ b/src/compiler/x86-64/float.lisp @@ -438,29 +438,36 @@ (frob single-float-op single-reg single-float) (frob double-float-op double-reg double-float)) -(macrolet ((frob (op sinst sname scost dinst dname dcost) +(macrolet ((generate (movinst opinst commutative) + `(progn + (cond + ((location= x r) + (inst ,opinst x y)) + ((and ,commutative (location= y r)) + (inst ,opinst y x)) + ((not (location= r y)) + (inst ,movinst r x) + (inst ,opinst r y)) + (t + (inst ,movinst tmp x) + (inst ,opinst tmp y) + (inst ,movinst r tmp))))) + (frob (op sinst sname scost dinst dname dcost commutative) `(progn (define-vop (,sname single-float-op) (:translate ,op) - (:results (r :scs (single-reg))) (:temporary (:sc single-reg) tmp) (:generator ,scost - (inst movss tmp x) - (inst ,sinst tmp y) - (inst movss r tmp))) + (generate movss ,sinst ,commutative))) (define-vop (,dname double-float-op) (:translate ,op) - (:results (r :scs (double-reg))) (:temporary (:sc single-reg) tmp) (:generator ,dcost - (inst movsd tmp x) - (inst ,dinst tmp y) - (inst movsd r tmp)))))) - (frob + addss +/single-float 2 addsd +/double-float 2) - (frob - subss -/single-float 2 subsd -/double-float 2) - (frob * mulss */single-float 4 mulsd */double-float 5) - (frob / divss //single-float 12 divsd //double-float 19)) - + (generate movsd ,dinst ,commutative)))))) + (frob + addss +/single-float 2 addsd +/double-float 2 t) + (frob - subss -/single-float 2 subsd -/double-float 2 nil) + (frob * mulss */single-float 4 mulsd */double-float 5 t) + (frob / divss //single-float 12 divsd //double-float 19 nil)) (macrolet ((frob ((name translate sc type) &body body) @@ -857,31 +864,26 @@ (defknown ((setf floating-point-modes)) (float-modes) float-modes) -(def!constant npx-env-size (* 7 n-word-bytes)) -(def!constant npx-cw-offset 0) -(def!constant npx-sw-offset 4) - (define-vop (floating-point-modes) (:results (res :scs (unsigned-reg))) (:result-types unsigned-num) (:translate floating-point-modes) (:policy :fast-safe) - (:temporary (:sc unsigned-reg :offset eax-offset :target res - :to :result) eax) + (:temporary (:sc unsigned-stack :from :argument :to :result) temp) (:generator 8 - (inst sub rsp-tn npx-env-size) ; Make space on stack. - (inst wait) ; Catch any pending FPE exceptions - (inst fstenv (make-ea :dword :base rsp-tn)) ; masks all exceptions - (inst fldenv (make-ea :dword :base rsp-tn)) ; Restore previous state. - ;; Move current status to high word. - (inst movzxd eax (make-ea :dword :base rsp-tn :disp (- npx-sw-offset 2))) - ;; Move exception mask to low word. - (inst mov ax-tn (make-ea :word :base rsp-tn :disp npx-cw-offset)) - (inst add rsp-tn npx-env-size) ; Pop stack. - (inst xor eax #x3f) ; Flip exception mask to trap enable bits. - (move res eax))) - -;;; XXX BROKEN + (inst stmxcsr temp) + (move res temp) + ;; Extract status from bytes 0-5 to bytes 16-21 + (inst and temp (1- (expt 2 6))) + (inst shl temp 16) + ;; Extract mask from bytes 7-12 to bytes 0-5 + (inst shr res 7) + (inst and res (1- (expt 2 6))) + ;; Flip the bits to convert from "1 means exception masked" to + ;; "1 means exception enabled". + (inst xor res (1- (expt 2 6))) + (inst or res temp))) + (define-vop (set-floating-point-modes) (:args (new :scs (unsigned-reg) :to :result :target res)) (:results (res :scs (unsigned-reg))) @@ -889,20 +891,26 @@ (:result-types unsigned-num) (:translate (setf floating-point-modes)) (:policy :fast-safe) - (:temporary (:sc unsigned-reg :offset eax-offset - :from :eval :to :result) eax) + (:temporary (:sc unsigned-reg :from :argument :to :result) temp1) + (:temporary (:sc unsigned-stack :from :argument :to :result) temp2) (:generator 3 - (inst sub rsp-tn npx-env-size) ; Make space on stack. - (inst wait) ; Catch any pending FPE exceptions. - (inst fstenv (make-ea :dword :base rsp-tn)) - (inst mov eax new) - (inst xor eax #x3f) ; Turn trap enable bits into exception mask. - (inst mov (make-ea :word :base rsp-tn :disp npx-cw-offset) ax-tn) - (inst shr eax 16) ; position status word - (inst mov (make-ea :word :base rsp-tn :disp npx-sw-offset) ax-tn) - (inst fldenv (make-ea :dword :base rsp-tn)) - (inst add rsp-tn npx-env-size) ; Pop stack. - (move res new))) + (move res new) + (inst stmxcsr temp2) + ;; Clear status + masks + (inst and temp2 (lognot (logior (1- (expt 2 6)) + (ash (1- (expt 2 6)) 7)))) + ;; Replace current status + (move temp1 new) + (inst shr temp1 16) + (inst and temp1 (1- (expt 2 6))) + (inst or temp2 temp1) + ;; Replace exception masks + (move temp1 new) + (inst and temp1 (1- (expt 2 6))) + (inst xor temp1 (1- (expt 2 6))) + (inst shl temp1 7) + (inst or temp2 temp1) + (inst ldmxcsr temp2))) ;;;; complex float VOPs diff --git a/src/compiler/x86-64/insts.lisp b/src/compiler/x86-64/insts.lisp index 2abd0ea..a03f5ac 100644 --- a/src/compiler/x86-64/insts.lisp +++ b/src/compiler/x86-64/insts.lisp @@ -3181,13 +3181,14 @@ (define-instruction movd (segment dst src) ; (:printer reg-reg/mem ((op #x10) (width 1))) ;wrong (:emitter - (cond ((typep dst 'tn) + (cond ((fp-reg-tn-p dst) (emit-byte segment #x66) (maybe-emit-rex-for-ea segment src dst) (emit-byte segment #x0f) (emit-byte segment #x6e) (emit-ea segment src (reg-tn-encoding dst))) (t + (aver (fp-reg-tn-p src)) (emit-byte segment #x66) (maybe-emit-rex-for-ea segment dst src) (emit-byte segment #x0f) @@ -3197,13 +3198,14 @@ (define-instruction movq (segment dst src) ; (:printer reg-reg/mem ((op #x10) (width 1))) ;wrong (:emitter - (cond ((typep dst 'tn) + (cond ((fp-reg-tn-p dst) (emit-byte segment #xf3) (maybe-emit-rex-for-ea segment src dst) (emit-byte segment #x0f) (emit-byte segment #x7e) (emit-ea segment src (reg-tn-encoding dst))) (t + (aver (fp-reg-tn-p src)) (emit-byte segment #x66) (maybe-emit-rex-for-ea segment dst src) (emit-byte segment #x0f) @@ -3389,3 +3391,16 @@ (emit-byte segment #x0f) (emit-byte segment #x5c) (emit-ea segment src (reg-tn-encoding dst)))) + +(define-instruction ldmxcsr (segment src) + (:emitter + (emit-byte segment #x0f) + (emit-byte segment #xae) + (emit-ea segment src 2))) + +(define-instruction stmxcsr (segment dst) + (:emitter + (emit-byte segment #x0f) + (emit-byte segment #xae) + (emit-ea segment dst 3))) + diff --git a/version.lisp-expr b/version.lisp-expr index b075cec..92a9dba 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.8.19.30" +"0.8.19.31" -- 1.7.10.4