From 9b1fade83db8453b75b8c7380eb12ce41b5b889c Mon Sep 17 00:00:00 2001 From: Nikodemus Siivola Date: Sat, 17 May 2008 11:02:27 +0000 Subject: [PATCH] 1.0.16.35: improved TIME output * Print measured times using fixed-width decimal output with the measured precision, instead of converting to floats for printing. * Report processor cycle counts on x86 and x86-64. ** Since Intel doesn't seem to consider it necessary to issue a CPUID both before and after RDTSC, maybe we don't need to do that either. ** New feature, :CYCLE-COUNTER, for platforms that implement SB-VM::%READ-CYCLE-COUNTER. * Instead of reporting %EVAL calls, report "interpreted forms", which means both %EVAL and SIMPLE-EVAL-IN-LEXENV. * Report "lambdas converted" for the compiler, not counting TL-XEPs. * Report CPU percentage (computed from real and run time.) * Report total run time separately. Condence run time output slightly by reporting total, user, and system on the same line. * Report non-GC time as well. * Condence output by omitting page faults, converted lambdas, and interpreted forms when they are zero. --- NEWS | 10 +++ make-config.sh | 4 +- package-data-list.lisp-expr | 8 +- src/code/cold-init.lisp | 3 +- src/code/early-full-eval.lisp | 7 -- src/code/eval.lisp | 10 +++ src/code/time.lisp | 163 ++++++++++++++++++++++++++++++-------- src/compiler/early-c.lisp | 1 + src/compiler/ir1tran-lambda.lisp | 6 ++ src/compiler/x86-64/system.lisp | 10 +-- src/compiler/x86/system.lisp | 10 +-- version.lisp-expr | 2 +- 12 files changed, 174 insertions(+), 60 deletions(-) diff --git a/NEWS b/NEWS index db3a9d3..a00bc78 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,16 @@ changes in sbcl-1.0.17 relative to 1.0.16: use this feature in the meanwhile. * new feature: runtime argument --control-stack-size can be used to adjust thread default control stack size. + * enhancement: improved TIME output + ** all times are reported using the measured accuracy (milliseconds + for real and GC times, microseconds for everything else.) + ** processor cycle counts on x86 and x86-64. + ** interpreted forms are counted for both evaluator modes. + ** number of lambdas converted by the compiler is reported. + ** CPU percentage report (computed from real and total run time.) + ** more comprehensive run time reporting, using a condenced format + ** interperted form, lambda, and page fault counts are omitted + when zero. * optimization: ADJOIN and PUSHNEW are upto ~70% faster in normal SPEED policies. * optimization: APPEND is upto ~10% faster in normal SPEED policies. diff --git a/make-config.sh b/make-config.sh index 23f8dcc..7c6997d 100644 --- a/make-config.sh +++ b/make-config.sh @@ -283,7 +283,7 @@ cd "$original_dir" if [ "$sbcl_arch" = "x86" ]; then printf ' :gencgc :stack-grows-downward-not-upward :c-stack-is-control-stack' >> $ltf printf ' :compare-and-swap-vops :unwind-to-frame-and-call-vop' >> $ltf - printf ' :stack-allocatable-closures :alien-callbacks' >> $ltf + printf ' :stack-allocatable-closures :alien-callbacks :cycle-counter' >> $ltf case "$sbcl_os" in linux | freebsd | netbsd | openbsd | sunos | darwin | win32) printf ' :linkage-table' >> $ltf @@ -296,7 +296,7 @@ if [ "$sbcl_arch" = "x86" ]; then elif [ "$sbcl_arch" = "x86-64" ]; then printf ' :gencgc :stack-grows-downward-not-upward :c-stack-is-control-stack :linkage-table' >> $ltf printf ' :compare-and-swap-vops :unwind-to-frame-and-call-vop' >> $ltf - printf ' :stack-allocatable-closures :alien-callbacks' >> $ltf + printf ' :stack-allocatable-closures :alien-callbacks :cycle-counter' >> $ltf elif [ "$sbcl_arch" = "mips" ]; then printf ' :linkage-table' >> $ltf printf ' :stack-allocatable-closures' >> $ltf diff --git a/package-data-list.lisp-expr b/package-data-list.lisp-expr index e8b95e7..129bfb6 100644 --- a/package-data-list.lisp-expr +++ b/package-data-list.lisp-expr @@ -1256,7 +1256,9 @@ is a good idea, but see SB-SYS re. blurring of boundaries." "%WITH-ARRAY-DATA/FP" "%WITH-ARRAY-DATA-MACRO" "*CURRENT-LEVEL-IN-PRINT*" - "*EMPTY-TYPE*" "*GC-INHIBIT*" "*GC-PENDING*" + "*EMPTY-TYPE*" + "*EVAL-CALLS*" + "*GC-INHIBIT*" "*GC-PENDING*" #!+sb-thread "*STOP-FOR-GC-PENDING*" "*UNIVERSAL-TYPE*" "*UNIVERSAL-FUN-TYPE*" "*UNPARSE-FUN-TYPE-SIMPLIFY*" @@ -2549,6 +2551,7 @@ structure representations" :name "SB!EVAL" :doc "internal: the evaluator implementation used to execute code without compiling it." :use ("CL" "SB!KERNEL" "SB!EXT") + :reexport ("*EVAL-CALLS*") :export ("INTERPRETED-FUNCTION" "INTERPRETED-FUNCTION-P" "INTERPRETED-FUNCTION-NAME" @@ -2561,8 +2564,7 @@ structure representations" "EVAL-IN-NATIVE-ENVIRONMENT" "PREPARE-FOR-COMPILE" "COUNT-EVAL-CALLS" - "*EVAL-LEVEL*" - "*EVAL-CALLS*")) + "*EVAL-LEVEL*")) #!+win32 #s(sb-cold:package-data diff --git a/src/code/cold-init.lisp b/src/code/cold-init.lisp index c5ee9f2..2d07ef1 100644 --- a/src/code/cold-init.lisp +++ b/src/code/cold-init.lisp @@ -108,8 +108,7 @@ sb!kernel::*gc-epoch* (cons nil nil)) ;; I'm not sure where eval is first called, so I put this first. - #!+sb-eval - (show-and-call sb!eval::!full-eval-cold-init) + (show-and-call !eval-cold-init) (show-and-call thread-init-or-reinit) (show-and-call !typecheckfuns-cold-init) diff --git a/src/code/early-full-eval.lisp b/src/code/early-full-eval.lisp index 7114306..97c85ef 100644 --- a/src/code/early-full-eval.lisp +++ b/src/code/early-full-eval.lisp @@ -12,15 +12,8 @@ (in-package "SB!EVAL") (defparameter *eval-level* -1) -(defparameter *eval-calls* 0) (defparameter *eval-verbose* nil) -(defun !full-eval-cold-init () - (setf *eval-level* -1 - *eval-calls* 0 - *eval-verbose* nil - *evaluator-mode* :compile)) - ;; !defstruct-with-alternate-metaclass is unslammable and the ;; RECOMPILE restart doesn't work on it. This is the main reason why ;; this stuff is split out into its own file. Also, it lets the diff --git a/src/code/eval.lisp b/src/code/eval.lisp index 99fea12..39c295d 100644 --- a/src/code/eval.lisp +++ b/src/code/eval.lisp @@ -11,6 +11,15 @@ (in-package "SB!IMPL") +(defparameter *eval-calls* 0) + +(defun !eval-cold-init () + (setf *eval-calls* 0 + *evaluator-mode* :compile) + #!+sb-eval + (setf sb!eval::*eval-level* -1 + sb!eval::*eval-verbose* nil)) + ;;; general case of EVAL (except in that it can't handle toplevel ;;; EVAL-WHEN magic properly): Delegate to #'COMPILE. (defun %simple-eval (expr lexenv) @@ -93,6 +102,7 @@ (defun simple-eval-in-lexenv (original-exp lexenv) (declare (optimize (safety 1))) ;; (aver (lexenv-simple-p lexenv)) + (incf *eval-calls*) (handler-bind ((sb!c:compiler-error (lambda (c) diff --git a/src/code/time.lisp b/src/code/time.lisp index 60bf2a5..deda6de 100644 --- a/src/code/time.lisp +++ b/src/code/time.lisp @@ -265,7 +265,21 @@ format." (defmacro time (form) #!+sb-doc - "Execute FORM and print timing information on *TRACE-OUTPUT*." + "Execute FORM and print timing information on *TRACE-OUTPUT*. + +On some hardware platforms estimated processor cycle counts are +included in this output; this number is slightly inflated, since it +includes the pipeline involved in reading the cycle counter -- +executing \(TIME NIL) a few times will give you an idea of the +overhead, and its variance. The cycle counters are also per processor, +not per thread: if multiple threads are running on the same processor, +the reported counts will include cycles taken up by all threads +running on the processor where TIME was executed. Furthermore, if the +operating system migrates the thread to another processor between +reads of the cycle counter, the results will be completely bogus. +Finally, the counter is cycle counter, incremented by the hardware +even when the process is halted -- which is to say that cycles pass +normally during operations like SLEEP." `(%time (lambda () ,form))) ;;; Return all the data that we want TIME to report. @@ -273,6 +287,71 @@ format." (multiple-value-bind (user sys faults) (sb!sys:get-system-info) (values user sys faults (get-bytes-consed)))) + +(defun elapsed-cycles (h0 l0 h1 l1) + (declare (ignorable h0 l0 h1 l1)) + #!+cycle-counter + (+ (ash (- h1 h0) 32) + (- l1 l0)) + #!-cycle-counter + nil) +(declaim (inline read-cycle-counter)) +(defun read-cycle-counter () + #!+cycle-counter + (sb!vm::%read-cycle-counter) + #!-cycle-counter + (values 0 0)) + +;;; This is so that we don't have to worry about the vagaries of +;;; floating point printing, or about conversions to floats dropping +;;; or introducing decimals, which are liable to imply wrong precision. +(defun format-microseconds (stream usec &optional colonp atp) + (declare (ignore colonp)) + (%format-decimal stream usec 6) + (unless atp + (write-string " seconds" stream))) + +(defun format-milliseconds (stream usec &optional colonp atp) + (declare (ignore colonp)) + (%format-decimal stream usec 3) + (unless atp + (write-string " seconds" stream))) + +(defun %format-decimal (stream number power) + (declare (stream stream) + (integer number power)) + (when (minusp number) + (write-char #\- stream) + (setf number (- number))) + (let ((scale (expt 10 power))) + (flet ((%fraction (fraction) + (let ((scaled (* 10 fraction))) + (loop while (< scaled scale) + do (write-char #\0 stream) + (setf scaled (* scaled 10)))) + (format stream "~D" fraction)) + (%zeroes () + (let ((scaled (/ scale 10))) + (write-char #\0 stream) + (loop while (> scaled 1) + do (write-char #\0 stream) + (setf scaled (/ scaled 10)))))) + (cond ((zerop number) + (write-string "0." stream) + (%zeroes)) + ((< number scale) + (write-string "0." stream) + (%fraction number)) + ((= number scale) + (write-string "1." stream) + (%zeroes)) + ((> number scale) + (multiple-value-bind (whole fraction) (floor number scale) + (format stream "~D." whole) + (%fraction fraction)))))) + + nil) + ;;; The guts of the TIME macro. Compute overheads, run the (compiled) ;;; function, report the times. (defun %time (fun) @@ -316,35 +395,53 @@ format." (old-run-utime old-run-stime old-page-faults old-bytes-consed) (time-get-sys-info)) (setq old-real-time (get-internal-real-time)) - (let ((start-gc-run-time *gc-run-time*) - #!+sb-eval (sb!eval:*eval-calls* 0)) - (declare #!+sb-eval (special sb!eval:*eval-calls*)) - (multiple-value-prog1 - ;; Execute the form and return its values. - (funcall fun) - (multiple-value-setq - (new-run-utime new-run-stime new-page-faults new-bytes-consed) - (time-get-sys-info)) - (setq new-real-time (- (get-internal-real-time) real-time-overhead)) - (let ((gc-run-time (max (- *gc-run-time* start-gc-run-time) 0))) - (format *trace-output* - "~&Evaluation took:~% ~ - ~S second~:P of real time~% ~ - ~S second~:P of user run time~% ~ - ~S second~:P of system run time~% ~ - ~@[[Run times include ~S second~:P GC run time.]~% ~]~ - ~@[~S call~:P to %EVAL~% ~]~ - ~S page fault~:P and~% ~ - ~:D bytes consed.~%" - (max (/ (- new-real-time old-real-time) - (float sb!xc:internal-time-units-per-second)) - 0.0) - (max (/ (- new-run-utime old-run-utime) 1000000.0) 0.0) - (max (/ (- new-run-stime old-run-stime) 1000000.0) 0.0) - (unless (zerop gc-run-time) - (/ (float gc-run-time) - (float sb!xc:internal-time-units-per-second))) - #!+sb-eval sb!eval:*eval-calls* #!-sb-eval nil - (max (- new-page-faults old-page-faults) 0) - (max (- new-bytes-consed old-bytes-consed) 0))))))) - + (let ((start-gc-internal-run-time *gc-run-time*) + (*eval-calls* 0) + (sb!c::*lambda-conversions* 0)) + (declare (special *eval-calls* sb!c::*lambda-conversions*)) + (multiple-value-bind (h0 l0) (read-cycle-counter) + (multiple-value-prog1 + ;; Execute the form and return its values. + (funcall fun) + (multiple-value-bind (h1 l1) (read-cycle-counter) + (let ((stop-gc-internal-run-time *gc-run-time*)) + (multiple-value-setq + (new-run-utime new-run-stime new-page-faults new-bytes-consed) + (time-get-sys-info)) + (setq new-real-time (- (get-internal-real-time) real-time-overhead)) + (let* ((gc-internal-run-time (max (- stop-gc-internal-run-time start-gc-internal-run-time) 0)) + (real-time (max (- new-real-time old-real-time) 0)) + (user-run-time (max (- new-run-utime old-run-utime) 0)) + (system-run-time (max (- new-run-stime old-run-stime) 0)) + (total-run-time (+ user-run-time system-run-time)) + (cycles (elapsed-cycles h0 l0 h1 l1)) + (page-faults (max (- new-page-faults old-page-faults) 0))) + (format *trace-output* + "~&Evaluation took:~%~ + ~@< ~@;~/sb-impl::format-milliseconds/ of real time~%~ + ~/sb-impl::format-microseconds/ of total run time ~ + (~@/sb-impl::format-microseconds/ user, ~@/sb-impl::format-microseconds/ system)~%~ + ~[[ Run times consist of ~/sb-impl::format-milliseconds/ GC time, ~ + and ~/sb-impl::format-milliseconds/ non-GC time. ]~%~;~2*~]~ + ~,2F% CPU~%~ + ~@[~:D form~:P interpreted~%~]~ + ~@[~:D lambda~:P converted~%~]~ + ~@[~:D processor cycles~%~]~ + ~@[~:D page fault~:P~%~]~ + ~:D bytes consed~:>~%" + real-time + total-run-time + user-run-time + system-run-time + (if (zerop gc-internal-run-time) 1 0) + gc-internal-run-time + ;; Round up so we don't mislead by saying 0.0 seconds of non-GC time... + (- (ceiling total-run-time 1000) gc-internal-run-time) + (if (zerop real-time) + 100.0 + (float (* 100 (/ (round total-run-time 1000) real-time)))) + (unless (zerop *eval-calls*) *eval-calls*) + (unless (zerop sb!c::*lambda-conversions*) sb!c::*lambda-conversions*) + cycles + (unless (zerop page-faults) page-faults) + (max (- new-bytes-consed old-bytes-consed) 0)))))))))) diff --git a/src/compiler/early-c.lisp b/src/compiler/early-c.lisp index 9df9131..cafd675 100644 --- a/src/compiler/early-c.lisp +++ b/src/compiler/early-c.lisp @@ -123,6 +123,7 @@ (defvar *trace-table*) (defvar *undefined-warnings*) (defvar *warnings-p*) +(defvar *lambda-conversions*) ;;; This lock is seized in the compiler, and related areas: the ;;; compiler is not presently thread-safe diff --git a/src/compiler/ir1tran-lambda.lisp b/src/compiler/ir1tran-lambda.lisp index 6fd0661..ee321c0 100644 --- a/src/compiler/ir1tran-lambda.lisp +++ b/src/compiler/ir1tran-lambda.lisp @@ -933,6 +933,12 @@ :debug-name debug-name)))) (setf (functional-inline-expansion res) form) (setf (functional-arg-documentation res) (cadr form)) + (when (boundp '*lambda-conversions*) + ;; KLUDGE: Not counting TL-XEPs is a lie, of course, but + ;; keeps things less confusing to users of TIME, where this + ;; count gets used. + (unless (and (consp debug-name) (eq 'tl-xep (car debug-name))) + (incf *lambda-conversions*))) res)))) (defun wrap-forms-in-debug-catch (forms) diff --git a/src/compiler/x86-64/system.lisp b/src/compiler/x86-64/system.lisp index 88f9997..ce91fb4 100644 --- a/src/compiler/x86-64/system.lisp +++ b/src/compiler/x86-64/system.lisp @@ -336,14 +336,12 @@ (:result-types unsigned-num unsigned-num) (:generator 5 (zeroize eax) + ;; Intel docs seem quite consistent on only using CPUID before RDTSC, + ;; not both before and after. Go figure. (inst cpuid) (inst rdtsc) - (inst push edx) - (inst push eax) - (zeroize eax) - (inst cpuid) - (inst pop lo) - (inst pop hi))) + (move lo eax) + (move hi edx))) (defmacro with-cycle-counter (&body body) "Returns the primary value of BODY as the primary value, and the diff --git a/src/compiler/x86/system.lisp b/src/compiler/x86/system.lisp index ca691a8..d65cda3 100644 --- a/src/compiler/x86/system.lisp +++ b/src/compiler/x86/system.lisp @@ -330,14 +330,12 @@ (:result-types unsigned-num unsigned-num) (:generator 5 (inst xor eax eax) + ;; Intel docs seem quite consistent on only using CPUID before RDTSC, + ;; not both before and after. Go figure. (inst cpuid) (inst rdtsc) - (inst push edx) - (inst push eax) - (inst xor eax eax) - (inst cpuid) - (inst pop lo) - (inst pop hi))) + (move lo eax) + (move hi edx))) (defmacro with-cycle-counter (&body body) "Returns the primary value of BODY as the primary value, and the diff --git a/version.lisp-expr b/version.lisp-expr index a4bb747..4c254c0 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.16.34" +"1.0.16.35" -- 1.7.10.4