1.0.16.35: improved TIME output
authorNikodemus Siivola <nikodemus@random-state.net>
Sat, 17 May 2008 11:02:27 +0000 (11:02 +0000)
committerNikodemus Siivola <nikodemus@random-state.net>
Sat, 17 May 2008 11:02:27 +0000 (11:02 +0000)
 * Print measured times using fixed-width decimal output with the
   measured precision, instead of converting to floats for printing.

 * Report processor cycle counts on x86 and x86-64.
   ** Since Intel doesn't seem to consider it necessary to issue a
      CPUID both before and after RDTSC, maybe we don't need to do
      that either.
   ** New feature, :CYCLE-COUNTER, for platforms that implement
      SB-VM::%READ-CYCLE-COUNTER.

 * Instead of reporting %EVAL calls, report "interpreted forms", which means
   both %EVAL and SIMPLE-EVAL-IN-LEXENV.

 * Report "lambdas converted" for the compiler, not counting TL-XEPs.

 * Report CPU percentage (computed from real and run time.)

 * Report total run time separately. Condence run time output slightly
   by reporting total, user, and system on the same line.

 * Report non-GC time as well.

 * Condence output by omitting page faults, converted lambdas, and
   interpreted forms when they are zero.

12 files changed:
NEWS
make-config.sh
package-data-list.lisp-expr
src/code/cold-init.lisp
src/code/early-full-eval.lisp
src/code/eval.lisp
src/code/time.lisp
src/compiler/early-c.lisp
src/compiler/ir1tran-lambda.lisp
src/compiler/x86-64/system.lisp
src/compiler/x86/system.lisp
version.lisp-expr

diff --git a/NEWS b/NEWS
index db3a9d3..a00bc78 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,16 @@ changes in sbcl-1.0.17 relative to 1.0.16:
     use this feature in the meanwhile.
   * new feature: runtime argument --control-stack-size can be used to
     adjust thread default control stack size.
+  * enhancement: improved TIME output
+    ** all times are reported using the measured accuracy (milliseconds
+       for real and GC times, microseconds for everything else.)
+    ** processor cycle counts on x86 and x86-64.
+    ** interpreted forms are counted for both evaluator modes.
+    ** number of lambdas converted by the compiler is reported.
+    ** CPU percentage report (computed from real and total run time.)
+    ** more comprehensive run time reporting, using a condenced format
+    ** interperted form, lambda, and page fault counts are omitted
+       when zero.
   * optimization: ADJOIN and PUSHNEW are upto ~70% faster in normal
     SPEED policies.
   * optimization: APPEND is upto ~10% faster in normal SPEED policies.
index 23f8dcc..7c6997d 100644 (file)
@@ -283,7 +283,7 @@ cd "$original_dir"
 if [ "$sbcl_arch" = "x86" ]; then
     printf ' :gencgc :stack-grows-downward-not-upward :c-stack-is-control-stack' >> $ltf
     printf ' :compare-and-swap-vops :unwind-to-frame-and-call-vop' >> $ltf
-    printf ' :stack-allocatable-closures :alien-callbacks' >> $ltf
+    printf ' :stack-allocatable-closures :alien-callbacks :cycle-counter' >> $ltf
     case "$sbcl_os" in
     linux | freebsd | netbsd | openbsd | sunos | darwin | win32)
         printf ' :linkage-table' >> $ltf
@@ -296,7 +296,7 @@ if [ "$sbcl_arch" = "x86" ]; then
 elif [ "$sbcl_arch" = "x86-64" ]; then
     printf ' :gencgc :stack-grows-downward-not-upward :c-stack-is-control-stack :linkage-table' >> $ltf
     printf ' :compare-and-swap-vops :unwind-to-frame-and-call-vop' >> $ltf
-    printf ' :stack-allocatable-closures :alien-callbacks' >> $ltf
+    printf ' :stack-allocatable-closures :alien-callbacks :cycle-counter' >> $ltf
 elif [ "$sbcl_arch" = "mips" ]; then
     printf ' :linkage-table' >> $ltf
     printf ' :stack-allocatable-closures' >> $ltf
index e8b95e7..129bfb6 100644 (file)
@@ -1256,7 +1256,9 @@ is a good idea, but see SB-SYS re. blurring of boundaries."
                "%WITH-ARRAY-DATA/FP"
                "%WITH-ARRAY-DATA-MACRO"
                "*CURRENT-LEVEL-IN-PRINT*"
-               "*EMPTY-TYPE*" "*GC-INHIBIT*" "*GC-PENDING*"
+               "*EMPTY-TYPE*"
+               "*EVAL-CALLS*"
+               "*GC-INHIBIT*" "*GC-PENDING*"
                #!+sb-thread "*STOP-FOR-GC-PENDING*"
                "*UNIVERSAL-TYPE*"
                "*UNIVERSAL-FUN-TYPE*" "*UNPARSE-FUN-TYPE-SIMPLIFY*"
@@ -2549,6 +2551,7 @@ structure representations"
       :name "SB!EVAL"
       :doc "internal: the evaluator implementation used to execute code without compiling it."
       :use ("CL" "SB!KERNEL" "SB!EXT")
+      :reexport ("*EVAL-CALLS*")
       :export ("INTERPRETED-FUNCTION"
                "INTERPRETED-FUNCTION-P"
                "INTERPRETED-FUNCTION-NAME"
@@ -2561,8 +2564,7 @@ structure representations"
                "EVAL-IN-NATIVE-ENVIRONMENT"
                "PREPARE-FOR-COMPILE"
                "COUNT-EVAL-CALLS"
-               "*EVAL-LEVEL*"
-               "*EVAL-CALLS*"))
+               "*EVAL-LEVEL*"))
 
    #!+win32
    #s(sb-cold:package-data
index c5ee9f2..2d07ef1 100644 (file)
         sb!kernel::*gc-epoch* (cons nil nil))
 
   ;; I'm not sure where eval is first called, so I put this first.
-  #!+sb-eval
-  (show-and-call sb!eval::!full-eval-cold-init)
+  (show-and-call !eval-cold-init)
 
   (show-and-call thread-init-or-reinit)
   (show-and-call !typecheckfuns-cold-init)
index 7114306..97c85ef 100644 (file)
 (in-package "SB!EVAL")
 
 (defparameter *eval-level* -1)
-(defparameter *eval-calls* 0)
 (defparameter *eval-verbose* nil)
 
-(defun !full-eval-cold-init ()
-  (setf *eval-level* -1
-        *eval-calls* 0
-        *eval-verbose* nil
-        *evaluator-mode* :compile))
-
 ;; !defstruct-with-alternate-metaclass is unslammable and the
 ;; RECOMPILE restart doesn't work on it.  This is the main reason why
 ;; this stuff is split out into its own file.  Also, it lets the
index 99fea12..39c295d 100644 (file)
 
 (in-package "SB!IMPL")
 
+(defparameter *eval-calls* 0)
+
+(defun !eval-cold-init ()
+  (setf *eval-calls* 0
+        *evaluator-mode* :compile)
+  #!+sb-eval
+  (setf sb!eval::*eval-level* -1
+        sb!eval::*eval-verbose* nil))
+
 ;;; general case of EVAL (except in that it can't handle toplevel
 ;;; EVAL-WHEN magic properly): Delegate to #'COMPILE.
 (defun %simple-eval (expr lexenv)
 (defun simple-eval-in-lexenv (original-exp lexenv)
   (declare (optimize (safety 1)))
   ;; (aver (lexenv-simple-p lexenv))
+  (incf *eval-calls*)
   (handler-bind
       ((sb!c:compiler-error
         (lambda (c)
index 60bf2a5..deda6de 100644 (file)
@@ -265,7 +265,21 @@ format."
 
 (defmacro time (form)
   #!+sb-doc
-  "Execute FORM and print timing information on *TRACE-OUTPUT*."
+  "Execute FORM and print timing information on *TRACE-OUTPUT*.
+
+On some hardware platforms estimated processor cycle counts are
+included in this output; this number is slightly inflated, since it
+includes the pipeline involved in reading the cycle counter --
+executing \(TIME NIL) a few times will give you an idea of the
+overhead, and its variance. The cycle counters are also per processor,
+not per thread: if multiple threads are running on the same processor,
+the reported counts will include cycles taken up by all threads
+running on the processor where TIME was executed. Furthermore, if the
+operating system migrates the thread to another processor between
+reads of the cycle counter, the results will be completely bogus.
+Finally, the counter is cycle counter, incremented by the hardware
+even when the process is halted -- which is to say that cycles pass
+normally during operations like SLEEP."
   `(%time (lambda () ,form)))
 
 ;;; Return all the data that we want TIME to report.
@@ -273,6 +287,71 @@ format."
   (multiple-value-bind (user sys faults) (sb!sys:get-system-info)
     (values user sys faults (get-bytes-consed))))
 
+
+(defun elapsed-cycles (h0 l0 h1 l1)
+  (declare (ignorable h0 l0 h1 l1))
+  #!+cycle-counter
+  (+ (ash (- h1 h0) 32)
+     (- l1 l0))
+  #!-cycle-counter
+  nil)
+(declaim (inline read-cycle-counter))
+(defun read-cycle-counter ()
+  #!+cycle-counter
+  (sb!vm::%read-cycle-counter)
+  #!-cycle-counter
+  (values 0 0))
+
+;;; This is so that we don't have to worry about the vagaries of
+;;; floating point printing, or about conversions to floats dropping
+;;; or introducing decimals, which are liable to imply wrong precision.
+(defun format-microseconds (stream usec &optional colonp atp)
+  (declare (ignore colonp))
+  (%format-decimal stream usec 6)
+  (unless atp
+    (write-string " seconds" stream)))
+
+(defun format-milliseconds (stream usec &optional colonp atp)
+  (declare (ignore colonp))
+  (%format-decimal stream usec 3)
+  (unless atp
+    (write-string " seconds" stream)))
+
+(defun %format-decimal (stream number power)
+  (declare (stream stream)
+           (integer number power))
+  (when (minusp number)
+    (write-char #\- stream)
+    (setf number (- number)))
+  (let ((scale (expt 10 power)))
+    (flet ((%fraction (fraction)
+             (let ((scaled (* 10 fraction)))
+               (loop while (< scaled scale)
+                     do (write-char #\0 stream)
+                        (setf scaled (* scaled 10))))
+             (format stream "~D" fraction))
+           (%zeroes ()
+             (let ((scaled (/ scale 10)))
+               (write-char #\0 stream)
+               (loop while (> scaled 1)
+                     do (write-char #\0 stream)
+                        (setf scaled (/ scaled 10))))))
+      (cond ((zerop number)
+             (write-string "0." stream)
+             (%zeroes))
+            ((< number scale)
+             (write-string "0." stream)
+             (%fraction number))
+            ((= number scale)
+             (write-string "1." stream)
+             (%zeroes))
+            ((> number scale)
+             (multiple-value-bind (whole fraction) (floor number scale)
+               (format stream "~D." whole)
+               (%fraction fraction))))))
+
+  nil)
+
 ;;; The guts of the TIME macro. Compute overheads, run the (compiled)
 ;;; function, report the times.
 (defun %time (fun)
@@ -316,35 +395,53 @@ format."
         (old-run-utime old-run-stime old-page-faults old-bytes-consed)
       (time-get-sys-info))
     (setq old-real-time (get-internal-real-time))
-    (let ((start-gc-run-time *gc-run-time*)
-          #!+sb-eval (sb!eval:*eval-calls* 0))
-      (declare #!+sb-eval (special sb!eval:*eval-calls*))
-      (multiple-value-prog1
-          ;; Execute the form and return its values.
-          (funcall fun)
-        (multiple-value-setq
-            (new-run-utime new-run-stime new-page-faults new-bytes-consed)
-          (time-get-sys-info))
-        (setq new-real-time (- (get-internal-real-time) real-time-overhead))
-        (let ((gc-run-time (max (- *gc-run-time* start-gc-run-time) 0)))
-          (format *trace-output*
-                  "~&Evaluation took:~%  ~
-                 ~S second~:P of real time~%  ~
-                 ~S second~:P of user run time~%  ~
-                 ~S second~:P of system run time~%  ~
-              ~@[[Run times include ~S second~:P GC run time.]~%  ~]~
-              ~@[~S call~:P to %EVAL~%  ~]~
-                 ~S page fault~:P and~%  ~
-                 ~:D bytes consed.~%"
-                  (max (/ (- new-real-time old-real-time)
-                          (float sb!xc:internal-time-units-per-second))
-                       0.0)
-                  (max (/ (- new-run-utime old-run-utime) 1000000.0) 0.0)
-                  (max (/ (- new-run-stime old-run-stime) 1000000.0) 0.0)
-                  (unless (zerop gc-run-time)
-                    (/ (float gc-run-time)
-                       (float sb!xc:internal-time-units-per-second)))
-                  #!+sb-eval sb!eval:*eval-calls* #!-sb-eval nil
-                  (max (- new-page-faults old-page-faults) 0)
-                  (max (- new-bytes-consed old-bytes-consed) 0)))))))
-
+    (let ((start-gc-internal-run-time *gc-run-time*)
+          (*eval-calls* 0)
+          (sb!c::*lambda-conversions* 0))
+      (declare (special *eval-calls* sb!c::*lambda-conversions*))
+      (multiple-value-bind (h0 l0) (read-cycle-counter)
+        (multiple-value-prog1
+            ;; Execute the form and return its values.
+            (funcall fun)
+          (multiple-value-bind (h1 l1) (read-cycle-counter)
+            (let ((stop-gc-internal-run-time *gc-run-time*))
+              (multiple-value-setq
+                  (new-run-utime new-run-stime new-page-faults new-bytes-consed)
+                (time-get-sys-info))
+              (setq new-real-time (- (get-internal-real-time) real-time-overhead))
+              (let* ((gc-internal-run-time (max (- stop-gc-internal-run-time start-gc-internal-run-time) 0))
+                     (real-time (max (- new-real-time old-real-time) 0))
+                     (user-run-time (max (- new-run-utime old-run-utime) 0))
+                     (system-run-time (max (- new-run-stime old-run-stime) 0))
+                     (total-run-time (+ user-run-time system-run-time))
+                     (cycles (elapsed-cycles h0 l0 h1 l1))
+                     (page-faults (max (- new-page-faults old-page-faults) 0)))
+                (format *trace-output*
+                        "~&Evaluation took:~%~
+                         ~@<  ~@;~/sb-impl::format-milliseconds/ of real time~%~
+                                 ~/sb-impl::format-microseconds/ of total run time ~
+                                  (~@/sb-impl::format-microseconds/ user, ~@/sb-impl::format-microseconds/ system)~%~
+                                 ~[[ Run times consist of ~/sb-impl::format-milliseconds/ GC time, ~
+                                                      and ~/sb-impl::format-milliseconds/ non-GC time. ]~%~;~2*~]~
+                                 ~,2F% CPU~%~
+                                 ~@[~:D form~:P interpreted~%~]~
+                                 ~@[~:D lambda~:P converted~%~]~
+                                 ~@[~:D processor cycles~%~]~
+                                 ~@[~:D page fault~:P~%~]~
+                                 ~:D bytes consed~:>~%"
+                        real-time
+                        total-run-time
+                        user-run-time
+                        system-run-time
+                        (if (zerop gc-internal-run-time) 1 0)
+                        gc-internal-run-time
+                        ;; Round up so we don't mislead by saying 0.0 seconds of non-GC time...
+                        (- (ceiling total-run-time 1000) gc-internal-run-time)
+                        (if (zerop real-time)
+                            100.0
+                            (float (* 100 (/ (round total-run-time 1000) real-time))))
+                        (unless (zerop *eval-calls*) *eval-calls*)
+                        (unless (zerop sb!c::*lambda-conversions*) sb!c::*lambda-conversions*)
+                        cycles
+                        (unless (zerop page-faults) page-faults)
+                        (max (- new-bytes-consed old-bytes-consed) 0))))))))))
index 9df9131..cafd675 100644 (file)
 (defvar *trace-table*)
 (defvar *undefined-warnings*)
 (defvar *warnings-p*)
+(defvar *lambda-conversions*)
 
 ;;; This lock is seized in the compiler, and related areas: the
 ;;; compiler is not presently thread-safe
index 6fd0661..ee321c0 100644 (file)
                                                    :debug-name debug-name))))
         (setf (functional-inline-expansion res) form)
         (setf (functional-arg-documentation res) (cadr form))
+        (when (boundp '*lambda-conversions*)
+          ;; KLUDGE: Not counting TL-XEPs is a lie, of course, but
+          ;; keeps things less confusing to users of TIME, where this
+          ;; count gets used.
+          (unless (and (consp debug-name) (eq 'tl-xep (car debug-name)))
+            (incf *lambda-conversions*)))
         res))))
 
 (defun wrap-forms-in-debug-catch (forms)
index 88f9997..ce91fb4 100644 (file)
   (:result-types unsigned-num unsigned-num)
   (:generator 5
      (zeroize eax)
+     ;; Intel docs seem quite consistent on only using CPUID before RDTSC,
+     ;; not both before and after. Go figure.
      (inst cpuid)
      (inst rdtsc)
-     (inst push edx)
-     (inst push eax)
-     (zeroize eax)
-     (inst cpuid)
-     (inst pop lo)
-     (inst pop hi)))
+     (move lo eax)
+     (move hi edx)))
 
 (defmacro with-cycle-counter (&body body)
   "Returns the primary value of BODY as the primary value, and the
index ca691a8..d65cda3 100644 (file)
   (:result-types unsigned-num unsigned-num)
   (:generator 5
      (inst xor eax eax)
+     ;; Intel docs seem quite consistent on only using CPUID before RDTSC,
+     ;; not both before and after. Go figure.
      (inst cpuid)
      (inst rdtsc)
-     (inst push edx)
-     (inst push eax)
-     (inst xor eax eax)
-     (inst cpuid)
-     (inst pop lo)
-     (inst pop hi)))
+     (move lo eax)
+     (move hi edx)))
 
 (defmacro with-cycle-counter (&body body)
   "Returns the primary value of BODY as the primary value, and the
index a4bb747..4c254c0 100644 (file)
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.16.34"
+"1.0.16.35"