X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=contrib%2Fsb-sprof%2Fsb-sprof.lisp;h=dd1760c0d5164b8706208db6b6c9dbc73610c6ca;hb=f7a78dd3554bd977b006e5da349a11d4e8463bb5;hp=4802e9e55b6c1dd47fd0a7b61a3615fa76bcebe8;hpb=d64e6164df53ee0467f68ddfaedbf0efadd4ab78;p=sbcl.git diff --git a/contrib/sb-sprof/sb-sprof.lisp b/contrib/sb-sprof/sb-sprof.lisp index 4802e9e..dd1760c 100644 --- a/contrib/sb-sprof/sb-sprof.lisp +++ b/contrib/sb-sprof/sb-sprof.lisp @@ -100,8 +100,10 @@ (defpackage #:sb-sprof (:use #:cl #:sb-ext #:sb-unix #:sb-alien #:sb-sys) (:export #:*sample-interval* #:*max-samples* #:*alloc-interval* + #:*report-sort-by* #:*report-sort-order* #:start-sampling #:stop-sampling #:with-sampling #:with-profiling #:start-profiling #:stop-profiling + #:profile-call-counts #:unprofile-call-counts #:reset #:report)) (in-package #:sb-sprof) @@ -248,11 +250,13 @@ ;; the graph was created (depending on the current allocation mode) (sample-interval (sb-impl::missing-arg) :type number) ;; the sampling-mode that was used for the profiling run - (sampling-mode (sb-impl::missing-arg) :type (member :cpu :alloc)) + (sampling-mode (sb-impl::missing-arg) :type (member :cpu :alloc :time)) ;; number of samples taken - (nsamples (sb-impl::missing-arg) :type sb-impl::index) + (nsamples (sb-impl::missing-arg) :type sb-int:index) + ;; threads that have been sampled + (sampled-threads nil :type list) ;; sample count for samples not in any function - (elsewhere-count (sb-impl::missing-arg) :type sb-impl::index) + (elsewhere-count (sb-impl::missing-arg) :type sb-int:index) ;; a flat list of NODEs, sorted by sample count (flat-nodes () :type list)) @@ -280,7 +284,10 @@ ;; the debug-info that this node was created from (debug-info nil :type t) ;; list of NODEs for functions calling this one - (callers () :type list)) + (callers () :type list) + ;; the call count for the function that corresponds to this node (or NIL + ;; if call counting wasn't enabled for this function) + (call-count nil :type (or null integer))) ;;; A cycle in a call graph. The functions forming the cycle are ;;; found in the SCC-VERTICES slot of the VERTEX structure. @@ -291,7 +298,21 @@ (defstruct (call (:include edge) (:constructor make-call (vertex))) ;; number of times the call was sampled - (count 1 :type sb-impl::index)) + (count 1 :type sb-int:index)) + +(defvar *sample-interval* 0.01 + "Default number of seconds between samples.") +(declaim (type number *sample-interval*)) + +(defvar *alloc-interval* 4 + "Default number of allocation region openings between samples.") +(declaim (type number *alloc-interval*)) + +(defvar *max-samples* 50000 + "Default number of traces taken. This variable is somewhat misnamed: +each trace may actually consist of an arbitrary number of samples, depending +on the depth of the call stack.") +(declaim (type sb-int:index *max-samples*)) ;;; Encapsulate all the information about a sampling run (defstruct (samples) @@ -306,13 +327,14 @@ ;; Each sample takes two cells in the vector 2)) :type simple-vector) - (trace-count 0 :type sb-impl::index) - (index 0 :type sb-impl::index) - (mode nil :type (member :cpu :alloc)) - (sample-interval *sample-interval* :type number) - (alloc-interval *alloc-interval* :type number) + (trace-count 0 :type sb-int:index) + (index 0 :type sb-int:index) + (mode nil :type (member :cpu :alloc :time)) + (sample-interval (sb-int:missing-arg) :type number) + (alloc-interval (sb-int:missing-arg) :type number) (max-depth most-positive-fixnum :type number) - (max-samples *max-samples* :type sb-impl::index)) + (max-samples (sb-int:missing-arg) :type sb-int:index) + (sampled-threads nil :type list)) (defmethod print-object ((call-graph call-graph) stream) (print-unreadable-object (call-graph stream :type t :identity t) @@ -333,42 +355,30 @@ (defvar *sampling-mode* :cpu "Default sampling mode. :CPU for cpu profiling, :ALLOC for allocation profiling") -(declaim (type (member :cpu :alloc) *sampling-mode*)) - -(defvar *sample-interval* 0.01 - "Default number of seconds between samples.") -(declaim (number *sample-interval*)) +(declaim (type (member :cpu :alloc :time) *sampling-mode*)) (defvar *alloc-region-size* #-gencgc - 4096 - ;; This hardcoded 2 matches the one in gc_find_freeish_pages. It's not - ;; really worth genesifying. + (get-page-size) #+gencgc - (* 2 sb-vm:gencgc-page-size)) -(declaim (number *alloc-region-size*)) - -(defvar *alloc-interval* 4 - "Default number of allocation region openings between samples.") -(declaim (number *alloc-interval*)) - -(defvar *max-samples* 50000 - "Default number of traces taken. This variable is somewhat misnamed: -each trace may actually consist of an arbitrary number of samples, depending -on the depth of the call stack.") -(declaim (type sb-impl::index *max-samples*)) + (max sb-vm:gencgc-alloc-granularity sb-vm:gencgc-card-bytes)) +(declaim (type number *alloc-region-size*)) (defvar *samples* nil) (declaim (type (or null samples) *samples*)) (defvar *profiling* nil) +(declaim (type (member nil :alloc :cpu :time) *profiling*)) (defvar *sampling* nil) -(declaim (type boolean *profiling* *sampling*)) +(declaim (type boolean *sampling*)) (defvar *show-progress* nil) (defvar *old-sampling* nil) +;; Call count encapsulation information +(defvar *encapsulations* (make-hash-table :test 'equal)) + (defun turn-off-sampling () (setq *old-sampling* *sampling*) (setq *sampling* nil)) @@ -471,68 +481,123 @@ on the depth of the call stack.") 'trace-start)) (incf (samples-index samples) 2)) -;;; Ensure that only one thread at a time will be executing sigprof handler. -(defvar *sigprof-handler-lock* (sb-thread:make-mutex :name "SIGPROF handler")) +;;; List of thread currently profiled, or :ALL for all threads. +(defvar *profiled-threads* nil) +(declaim (type (or list (member :all)) *profiled-threads*)) + +;;; Thread which runs the wallclock timers, if any. +(defvar *timer-thread* nil) + +(defun profiled-threads () + (let ((profiled-threads *profiled-threads*)) + (remove *timer-thread* + (if (eq :all profiled-threads) + (sb-thread:list-all-threads) + profiled-threads)))) + +(defun profiled-thread-p (thread) + (let ((profiled-threads *profiled-threads*)) + (or (and (eq :all profiled-threads) + (not (eq *timer-thread* thread))) + (member thread profiled-threads :test #'eq)))) -;;; SIGPROF handler. Record current PC and return address in -;;; *SAMPLES*. #+(or x86 x86-64) -(defun sigprof-handler (signal code scp) - (declare (ignore signal code) - (optimize speed (space 0)) - (muffle-conditions compiler-note) - (disable-package-locks sb-di::x86-call-context) - (type system-area-pointer scp)) - (sb-sys:without-interrupts - (let ((sb-vm:*alloc-signal* nil) - (samples *samples*)) +(progn + ;; Ensure that only one thread at a time will be doing profiling stuff. + (defvar *profiler-lock* (sb-thread:make-mutex :name "Statistical Profiler")) + (defvar *distribution-lock* (sb-thread:make-mutex :name "Wallclock profiling lock")) + + (declaim (inline pthread-kill)) + (define-alien-routine pthread-kill int (os-thread unsigned-long) (signal int)) + + ;;; A random thread will call this in response to either a timer firing, + ;;; This in turn will distribute the notice to those threads we are + ;;; interested using SIGPROF. + (defun thread-distribution-handler () + (declare (optimize speed (space 0))) + (when *sampling* + #+sb-thread + (let ((lock *distribution-lock*)) + ;; Don't flood the system with more interrupts if the last + ;; set is still being delivered. + (unless (sb-thread:mutex-value lock) + (sb-thread::with-system-mutex (lock) + (dolist (thread (profiled-threads)) + ;; This may occasionally fail to deliver the signal, but that + ;; seems better then using kill_thread_safely with it's 1 + ;; second backoff. + (let ((os-thread (sb-thread::thread-os-thread thread))) + (when os-thread + (pthread-kill os-thread sb-unix:sigprof))))))) + #-sb-thread + (unix-kill 0 sb-unix:sigprof))) + + (defun sigprof-handler (signal code scp) + (declare (ignore signal code) (optimize speed (space 0)) + (disable-package-locks sb-di::x86-call-context) + (muffle-conditions compiler-note) + (type system-area-pointer scp)) + (let ((self sb-thread:*current-thread*) + (profiling *profiling*)) + ;; Turn off allocation counter when it is not needed. Doing this in the + ;; signal handler means we don't have to worry about racing with the runtime + (unless (eq :alloc profiling) + (setf sb-vm::*alloc-signal* nil)) (when (and *sampling* - samples - (< (samples-trace-count samples) - (samples-max-samples samples))) - (sb-sys:without-gcing - (sb-thread:with-mutex (*sigprof-handler-lock*) - (with-alien ((scp (* os-context-t) :local scp)) - (let* ((pc-ptr (sb-vm:context-pc scp)) - (fp (sb-vm::context-register scp #.sb-vm::ebp-offset))) - ;; For some reason completely bogus small values for the - ;; frame pointer are returned every now and then, leading - ;; to segfaults. Try to avoid these cases. - ;; - ;; FIXME: Do a more thorough sanity check on ebp, or figure - ;; out why this is happening. - ;; -- JES, 2005-01-11 - (when (< fp 4096) - (return-from sigprof-handler nil)) - (incf (samples-trace-count samples)) - (let ((fp (int-sap fp)) - (ok t)) - (declare (type system-area-pointer fp pc-ptr)) - ;; FIXME: How annoying. The XC doesn't store enough - ;; type information about SB-DI::X86-CALL-CONTEXT, - ;; even if we declaim the ftype explicitly in - ;; src/code/debug-int. And for some reason that type - ;; information is needed for the inlined version to - ;; be compiled without boxing the returned saps. So - ;; we declare the correct ftype here manually, even - ;; if the compiler should be able to deduce this - ;; exact same information. - (declare (ftype (function (system-area-pointer) - (values (member nil t) - system-area-pointer - system-area-pointer)) - sb-di::x86-call-context)) - (record-trace-start samples) - (dotimes (i (samples-max-depth samples)) + ;; Normal SIGPROF gets practically speaking delivered to threads + ;; depending on the run time they use, so we need to filter + ;; out those we don't care about. For :ALLOC and :TIME profiling + ;; only the interesting threads get SIGPROF in the first place. + ;; + ;; ...except that Darwin at least doesn't seem to work like we + ;; would want it to, which makes multithreaded :CPU profiling pretty + ;; pointless there -- though it may be that our mach magic is + ;; partially to blame? + (or (not (eq :cpu profiling)) (profiled-thread-p self))) + (sb-thread::with-system-mutex (*profiler-lock* :without-gcing t) + (let ((samples *samples*)) + (when (and samples + (< (samples-trace-count samples) + (samples-max-samples samples))) + (with-alien ((scp (* os-context-t) :local scp)) + (let* ((pc-ptr (sb-vm:context-pc scp)) + (fp (sb-vm::context-register scp #.sb-vm::ebp-offset))) + ;; foreign code might not have a useful frame + ;; pointer in ebp/rbp, so make sure it looks + ;; reasonable before walking the stack + (unless (sb-di::control-stack-pointer-valid-p (sb-sys:int-sap fp)) (record samples pc-ptr) - (setf (values ok pc-ptr fp) - (sb-di::x86-call-context fp)) - (unless ok - (return)))))))))) - ;; Reset the allocation counter - (when (and sb-vm:*alloc-signal* - (<= sb-vm:*alloc-signal* 0)) - (setf sb-vm:*alloc-signal* (1- *alloc-interval*))) + (return-from sigprof-handler nil)) + (incf (samples-trace-count samples)) + (pushnew self (samples-sampled-threads samples)) + (let ((fp (int-sap fp)) + (ok t)) + (declare (type system-area-pointer fp pc-ptr)) + ;; FIXME: How annoying. The XC doesn't store enough + ;; type information about SB-DI::X86-CALL-CONTEXT, + ;; even if we declaim the ftype explicitly in + ;; src/code/debug-int. And for some reason that type + ;; information is needed for the inlined version to + ;; be compiled without boxing the returned saps. So + ;; we declare the correct ftype here manually, even + ;; if the compiler should be able to deduce this + ;; exact same information. + (declare (ftype (function (system-area-pointer) + (values (member nil t) + system-area-pointer + system-area-pointer)) + sb-di::x86-call-context)) + (record-trace-start samples) + (dotimes (i (samples-max-depth samples)) + (record samples pc-ptr) + (setf (values ok pc-ptr fp) + (sb-di::x86-call-context fp)) + (unless ok + (return)))))) + ;; Reset thread-local allocation counter before interrupts + ;; are enabled. + (when (eq t sb-vm::*alloc-signal*) + (setf sb-vm:*alloc-signal* (1- (samples-alloc-interval samples))))))))) nil)) ;; FIXME: On non-x86 platforms we don't yet walk the call stack deeper @@ -549,6 +614,7 @@ on the depth of the call stack.") (sb-sys:without-gcing (with-alien ((scp (* os-context-t) :local scp)) (locally (declare (optimize (inhibit-warnings 2))) + (incf (samples-trace-count samples)) (record-trace-start samples) (let* ((pc-ptr (sb-vm:context-pc scp)) (fp (sb-vm::context-register scp #.sb-vm::cfp-offset)) @@ -578,6 +644,7 @@ on the depth of the call stack.") (loop t) (max-depth most-positive-fixnum) show-progress + (threads '(list sb-thread:*current-thread*)) (report nil report-p)) &body body) "Repeatedly evaluate BODY with statistical profiling turned on. @@ -596,8 +663,9 @@ on the depth of the call stack.") *ALLOC-INTERVAL*. :MODE - If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run - the profiler in allocation profiling mode. + If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run the + profiler in allocation profiling mode. If :TIME, run the profiler + in wallclock profiling mode. :MAX-SAMPLES Repeat evaluating body until samples are taken. @@ -612,7 +680,20 @@ on the depth of the call stack.") :RESET It true, call RESET at the beginning. -e + + :THREADS + Form that evaluates to the list threads to profile, or :ALL to indicate + that all threads should be profiled. Defaults to the current + thread. (Note: START-PROFILING defaults to all threads.) + + :THREADS has no effect on call-counting at the moment. + + On some platforms (eg. Darwin) the signals used by the profiler are + not properly delivered to threads in proportion to their CPU usage + when doing :CPU profiling. If you see empty call graphs, or are obviously + missing several samples from certain threads, you may be falling afoul + of this. + :LOOP If true (the default) repeatedly evaluate BODY. If false, evaluate if only once." @@ -620,13 +701,12 @@ e `(let* ((*sample-interval* ,sample-interval) (*alloc-interval* ,alloc-interval) (*sampling* nil) - (sb-vm:*alloc-signal* nil) (*sampling-mode* ,mode) (*max-samples* ,max-samples)) ,@(when reset '((reset))) (unwind-protect (progn - (start-profiling :max-depth ',max-depth) + (start-profiling :max-depth ,max-depth :threads ,threads) (loop (when (>= (samples-trace-count *samples*) (samples-max-samples *samples*)) @@ -645,14 +725,20 @@ e (stop-profiling)) ,@(when report-p `((report :type ,report))))) +(defvar *timer* nil) + +(defvar *old-alloc-interval* nil) +(defvar *old-sample-interval* nil) + (defun start-profiling (&key (max-samples *max-samples*) (mode *sampling-mode*) (sample-interval *sample-interval*) (alloc-interval *alloc-interval*) (max-depth most-positive-fixnum) + (threads :all) (sampling t)) - "Start profiling statistically if not already profiling. - The following keyword args are recognized: + "Start profiling statistically in the current thread if not already profiling. +The following keyword args are recognized: :SAMPLE-INTERVAL Take a sample every seconds. Default is *SAMPLE-INTERVAL*. @@ -664,7 +750,8 @@ e :MODE If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run - the profiler in allocation profiling mode. + the profiler in allocation profiling mode. If :TIME, run the profiler + in wallclock profiling mode. :MAX-SAMPLES Maximum number of samples. Default is *MAX-SAMPLES*. @@ -673,6 +760,19 @@ e Maximum call stack depth that the profiler should consider. Only has an effect on x86 and x86-64. + :THREADS + List threads to profile, or :ALL to indicate that all threads should be + profiled. Defaults to :ALL. (Note: WITH-PROFILING defaults to the current + thread.) + + :THREADS has no effect on call-counting at the moment. + + On some platforms (eg. Darwin) the signals used by the profiler are + not properly delivered to threads in proportion to their CPU usage + when doing :CPU profiling. If you see empty call graphs, or are obviously + missing several samples from certain threads, you may be falling afoul + of this. + :SAMPLING If true, the default, start sampling right away. If false, START-SAMPLING can be used to turn sampling on." @@ -687,26 +787,69 @@ e (setf *sampling* sampling *samples* (make-samples :max-depth max-depth :max-samples max-samples + :sample-interval sample-interval + :alloc-interval alloc-interval :mode mode)) + (enable-call-counting) + (setf *profiled-threads* threads) (sb-sys:enable-interrupt sb-unix:sigprof #'sigprof-handler) - (if (eq mode :alloc) - (setf sb-vm:*alloc-signal* (1- alloc-interval)) - (progn - (unix-setitimer :profile secs usecs secs usecs) - (setf sb-vm:*alloc-signal* nil))) - (setq *profiling* t))) + (ecase mode + (:alloc + (let ((alloc-signal (1- alloc-interval))) + #+sb-thread + (progn + (when (eq :all threads) + ;; Set the value new threads inherit. + (sb-thread::with-all-threads-lock + (setf sb-thread::*default-alloc-signal* alloc-signal))) + ;; Turn on allocation profiling in existing threads. + (dolist (thread (profiled-threads)) + (sb-thread::%set-symbol-value-in-thread 'sb-vm::*alloc-signal* thread alloc-signal))) + #-sb-thread + (setf sb-vm:*alloc-signal* alloc-signal))) + (:cpu + (unix-setitimer :profile secs usecs secs usecs)) + (:time + #+sb-thread + (let ((setup (sb-thread:make-semaphore :name "Timer thread setup semaphore"))) + (setf *timer-thread* + (sb-thread:make-thread (lambda () + (sb-thread:wait-on-semaphore setup) + (loop while (eq sb-thread:*current-thread* *timer-thread*) + do (sleep 1.0))) + :name "SB-SPROF wallclock timer thread")) + (sb-thread:signal-semaphore setup)) + #-sb-thread + (setf *timer-thread* nil) + (setf *timer* (make-timer #'thread-distribution-handler :name "SB-PROF wallclock timer" + :thread *timer-thread*)) + (schedule-timer *timer* sample-interval :repeat-interval sample-interval))) + (setq *profiling* mode))) (values)) (defun stop-profiling () "Stop profiling if profiling." - (when *profiling* - (unix-setitimer :profile 0 0 0 0) - ;; Even with the timer shut down we cannot be sure that there is - ;; no undelivered sigprof. Besides, leaving the signal handler - ;; installed won't hurt. - (setq *sampling* nil) - (setq sb-vm:*alloc-signal* nil) - (setq *profiling* nil)) + (let ((profiling *profiling*)) + (when profiling + ;; Even with the timers shut down we cannot be sure that there is no + ;; undelivered sigprof. The handler is also responsible for turning the + ;; *ALLOC-SIGNAL* off in individual threads. + (ecase profiling + (:alloc + #+sb-thread + (setf sb-thread::*default-alloc-signal* nil) + #-sb-thread + (setf sb-vm:*alloc-signal* nil)) + (:cpu + (unix-setitimer :profile 0 0 0 0)) + (:time + (unschedule-timer *timer*) + (setf *timer* nil + *timer-thread* nil))) + (disable-call-counting) + (setf *profiling* nil + *sampling* nil + *profiled-threads* nil))) (values)) (defun reset () @@ -780,6 +923,8 @@ e (when info (multiple-value-bind (new key) (make-node info) + (when (eql (node-name new) 'call-counter) + (return-from lookup-node (values nil nil))) (let* ((key (cons (node-name new) key)) (found (gethash key *name->node*))) (cond (found @@ -791,6 +936,11 @@ e (node-end-pc-or-offset new))) found) (t + (let ((call-count-info (gethash (node-name new) + *encapsulations*))) + (when call-count-info + (setf (node-call-count new) + (car call-count-info)))) (setf (gethash key *name->node*) new) new)))))) @@ -846,6 +996,7 @@ e (samples-alloc-interval *samples*) (samples-sample-interval *samples*)) :sampling-mode (samples-mode *samples*) + :sampled-threads (samples-sampled-threads *samples*) :elsewhere-count elsewhere-count :vertices sorted-nodes))))) @@ -912,26 +1063,42 @@ e count (scc-p v)))) (if (eq (call-graph-sampling-mode call-graph) :alloc) (format t "~2&Number of samples: ~d~%~ - Sample interval: ~a regions (approximately ~a kB)~%~ - Total sampling amount: ~a regions (approximately ~a kB)~%~ - Number of cycles: ~d~2%" + Alloc interval: ~a regions (approximately ~a kB)~%~ + Total sampling amount: ~a regions (approximately ~a kB)~%~ + Number of cycles: ~d~%~ + Sampled threads:~{~% ~S~}~2%" nsamples interval (truncate (* interval *alloc-region-size*) 1024) (* nsamples interval) (truncate (* nsamples interval *alloc-region-size*) 1024) - ncycles) + ncycles + (call-graph-sampled-threads call-graph)) (format t "~2&Number of samples: ~d~%~ - Sample interval: ~f seconds~%~ - Total sampling time: ~f seconds~%~ - Number of cycles: ~d~2%" + Sample interval: ~f seconds~%~ + Total sampling time: ~f seconds~%~ + Number of cycles: ~d~%~ + Sampled threads:~{~% ~S~}~2%" nsamples interval (* nsamples interval) - ncycles)))) + ncycles + (call-graph-sampled-threads call-graph))))) + +(declaim (type (member :samples :cumulative-samples) *report-sort-by*)) +(defvar *report-sort-by* :samples + "Method for sorting the flat report: either by :SAMPLES or by :CUMULATIVE-SAMPLES.") + +(declaim (type (member :descending :ascending) *report-sort-order*)) +(defvar *report-sort-order* :descending + "Order for sorting the flat report: either :DESCENDING or :ASCENDING.") (defun print-flat (call-graph &key (stream *standard-output*) max - min-percent (print-header t)) + min-percent (print-header t) + (sort-by *report-sort-by*) + (sort-order *report-sort-order*)) + (declare (type (member :descending :ascending) sort-order) + (type (member :samples :cumulative-samples) sort-by)) (let ((*standard-output* stream) (*print-pretty* nil) (total-count 0) @@ -943,11 +1110,23 @@ e (when print-header (print-call-graph-header call-graph)) (format t "~& Self Total Cumul~%") - (format t "~& Nr Count % Count % Count % Function~%") + (format t "~& Nr Count % Count % Count % Calls Function~%") (print-separator) (let ((elsewhere-count (call-graph-elsewhere-count call-graph)) - (i 0)) - (dolist (node (call-graph-flat-nodes call-graph)) + (i 0) + (nodes (stable-sort (copy-list (call-graph-flat-nodes call-graph)) + (let ((cmp (if (eq :descending sort-order) #'> #'<))) + (multiple-value-bind (primary secondary) + (if (eq :samples sort-by) + (values #'node-count #'node-accrued-count) + (values #'node-accrued-count #'node-count)) + (lambda (x y) + (let ((cx (funcall primary x)) + (cy (funcall primary y))) + (if (= cx cy) + (funcall cmp (funcall secondary x) (funcall secondary y)) + (funcall cmp cx cy))))))))) + (dolist (node nodes) (when (or (and max (> (incf i) max)) (< (node-count node) min-count)) (return)) @@ -957,20 +1136,22 @@ e (accrued-percent (samples-percent call-graph accrued-count))) (incf total-count count) (incf total-percent percent) - (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~s~%" - (node-index node) + (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~8@a ~s~%" + (incf i) count percent accrued-count accrued-percent total-count total-percent + (or (node-call-count node) "-") (node-name node)) (finish-output))) (print-separator) - (format t "~& ~6d ~5,1f elsewhere~%" + (format t "~& ~6d ~5,1f~36a elsewhere~%" elsewhere-count - (samples-percent call-graph elsewhere-count))))) + (samples-percent call-graph elsewhere-count) + "")))) (defun print-cycles (call-graph) (when (some #'cycle-p (graph-vertices call-graph)) @@ -1037,6 +1218,8 @@ e :min-percent min-percent :print-header nil)))) (defun report (&key (type :graph) max min-percent call-graph + ((:sort-by *report-sort-by*) *report-sort-by*) + ((:sort-order *report-sort-order*) *report-sort-order*) (stream *standard-output*) ((:show-progress *show-progress*))) "Report statistical profiling results. The following keyword args are recognized: @@ -1057,6 +1240,16 @@ e Don't show functions taking less than of the total time in the flat report. + :SORT-BY + If :SAMPLES, sort flat report by number of samples taken. + If :CUMULATIVE-SAMPLES, sort flat report by cumulative number of samples + taken (shows how much time each function spent on stack.) Default + is *REPORT-SORT-BY*. + + :SORT-ORDER + If :DESCENDING, sort flat report in descending order. If :ASCENDING, + sort flat report in ascending order. Default is *REPORT-SORT-ORDER*. + :SHOW-PROGRESS If true, print progress messages while generating the call graph. @@ -1064,16 +1257,23 @@ e Print a report from instead of the latest profiling results. - Value of this function is a CALL-GRAPH object representing the - resulting call-graph." - (let ((graph (or call-graph (make-call-graph most-positive-fixnum)))) - (ecase type - (:flat - (print-flat graph :stream stream :max max :min-percent min-percent)) - (:graph - (print-graph graph :stream stream :max max :min-percent min-percent)) - ((nil))) - graph)) +Value of this function is a CALL-GRAPH object representing the +resulting call-graph, or NIL if there are no samples (eg. right after +calling RESET.) + +Profiling is stopped before the call graph is generated." + (cond (*samples* + (let ((graph (or call-graph (make-call-graph most-positive-fixnum)))) + (ecase type + (:flat + (print-flat graph :stream stream :max max :min-percent min-percent)) + (:graph + (print-graph graph :stream stream :max max :min-percent min-percent)) + ((nil))) + graph)) + (t + (format stream "~&; No samples to report.~%") + nil))) ;;; Interface to DISASSEMBLE @@ -1113,6 +1313,87 @@ e (pushnew 'add-disassembly-profile-note sb-disassem::*default-dstate-hooks*) + +;;;; Call counting + +;;; The following functions tell sb-sprof to do call count profiling +;;; for the named functions in addition to normal statistical +;;; profiling. The benefit of this over using SB-PROFILE is that this +;;; encapsulation is a lot more lightweight, due to not needing to +;;; track cpu usage / consing. (For example, compiling asdf 20 times +;;; took 13s normally, 15s with call counting for all functions in +;;; SB-C, and 94s with SB-PROFILE profiling SB-C). + +(defun profile-call-counts (&rest names) + "Mark the functions named by NAMES as being subject to call counting +during statistical profiling. If a string is used as a name, it will +be interpreted as a package name. In this case call counting will be +done for all functions with names like X or (SETF X), where X is a symbol +with the package as its home package." + (dolist (name names) + (if (stringp name) + (let ((package (find-package name))) + (do-symbols (symbol package) + (when (eql (symbol-package symbol) package) + (dolist (function-name (list symbol (list 'setf symbol))) + (profile-call-counts-for-function function-name))))) + (profile-call-counts-for-function name)))) + +(defun profile-call-counts-for-function (function-name) + (unless (gethash function-name *encapsulations*) + (setf (gethash function-name *encapsulations*) nil))) + +(defun unprofile-call-counts () + "Clear all call counting information. Call counting will be done for no +functions during statistical profiling." + (clrhash *encapsulations*)) + +;;; Called when profiling is started to enable the call counting +;;; encapsulation. Wrap all the call counted functions +(defun enable-call-counting () + (maphash (lambda (k v) + (declare (ignore v)) + (enable-call-counting-for-function k)) + *encapsulations*)) + +;;; Called when profiling is stopped to disable the encapsulation. Restore +;;; the original functions. +(defun disable-call-counting () + (maphash (lambda (k v) + (when v + (assert (cdr v)) + (without-package-locks + (setf (fdefinition k) (cdr v))) + (setf (cdr v) nil))) + *encapsulations*)) + +(defun enable-call-counting-for-function (function-name) + (let ((info (gethash function-name *encapsulations*))) + ;; We should never try to encapsulate an fdefn multiple times. + (assert (or (null info) + (null (cdr info)))) + (when (and (fboundp function-name) + (or (not (symbolp function-name)) + (and (not (special-operator-p function-name)) + (not (macro-function function-name))))) + (let* ((original-fun (fdefinition function-name)) + (info (cons 0 original-fun))) + (setf (gethash function-name *encapsulations*) info) + (without-package-locks + (setf (fdefinition function-name) + (sb-int:named-lambda call-counter (sb-int:&more more-context more-count) + (declare (optimize speed (safety 0))) + ;; 2^59 calls should be enough for anybody, and it + ;; allows using fixnum arithmetic on x86-64. 2^32 + ;; isn't enough, so we can't do that on 32 bit platforms. + (incf (the (unsigned-byte 59) + (car info))) + (multiple-value-call original-fun + (sb-c:%more-arg-values more-context + 0 + more-count))))))))) + + ;;; silly examples (defun test-0 (n &optional (depth 0))