1 ;;; Copyright (C) 2003 Gerd Moellmann <gerd.moellmann@t-online.de>
2 ;;; All rights reserved.
4 ;;; Redistribution and use in source and binary forms, with or without
5 ;;; modification, are permitted provided that the following conditions
8 ;;; 1. Redistributions of source code must retain the above copyright
9 ;;; notice, this list of conditions and the following disclaimer.
10 ;;; 2. Redistributions in binary form must reproduce the above copyright
11 ;;; notice, this list of conditions and the following disclaimer in the
12 ;;; documentation and/or other materials provided with the distribution.
13 ;;; 3. The name of the author may not be used to endorse or promote
14 ;;; products derived from this software without specific prior written
17 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
21 ;;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 ;;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
23 ;;; OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24 ;;; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ;;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
27 ;;; USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30 ;;; Statistical profiler.
34 ;;; This profiler arranges for SIGPROF interrupts to interrupt a
35 ;;; running program at regular intervals. Each time a SIGPROF occurs,
36 ;;; the current program counter and return address is recorded in a
37 ;;; vector, until a configurable maximum number of samples have been
40 ;;; A profiling report is generated from the samples array by
41 ;;; determining the Lisp functions corresponding to the recorded
42 ;;; addresses. Each program counter/return address pair forms one
43 ;;; edge in a call graph.
47 ;;; The code being generated on x86 makes determining callers reliably
48 ;;; something between extremely difficult and impossible. Example:
50 ;;; 10979F00: .entry eval::eval-stack-args(arg-count)
51 ;;; 18: pop dword ptr [ebp-8]
52 ;;; 1B: lea esp, [ebp-32]
57 ;;; 29: mov [ebp-12], edi
58 ;;; 2C: mov dword ptr [ebp-16], #x28F0000B ; nil
59 ;;; ; No-arg-parsing entry point
60 ;;; 33: mov dword ptr [ebp-20], 0
62 ;;; 3C: L0: mov edx, esp
64 ;;; 41: mov eax, [#x10979EF8] ; #<FDEFINITION object for eval::eval-stack-pop>
66 ;;; 49: mov [edx-4], ebp
68 ;;; 4E: call dword ptr [eax+5]
71 ;;; Suppose this function is interrupted by SIGPROF at 4E. At that
72 ;;; point, the frame pointer EBP has been modified so that the
73 ;;; original return address of the caller of eval-stack-args is no
74 ;;; longer where it can be found by x86-call-context, and the new
75 ;;; return address, for the call to eval-stack-pop, is not yet on the
76 ;;; stack. The effect is that x86-call-context returns something
77 ;;; bogus, which leads to wrong edges in the call graph.
79 ;;; One thing that one might try is filtering cases where the program
80 ;;; is interrupted at a call instruction. But since the above example
81 ;;; of an interrupt at a call instruction isn't the only case where
82 ;;; the stack is something x86-call-context can't really cope with,
83 ;;; this is not a general solution.
85 ;;; Random ideas for implementation:
87 ;;; * Space profiler. Sample when new pages are allocated instead of
90 ;;; * Record a configurable number of callers up the stack. That
91 ;;; could give a more complete graph when there are many small
94 ;;; * Print help strings for reports, include hints to the problem
97 ;;; * Make flat report the default since call-graph isn't that
100 (defpackage #:sb-sprof
101 (:use #:cl #:sb-ext #:sb-unix #:sb-alien #:sb-sys)
102 (:export #:*sample-interval* #:*max-samples* #:*alloc-interval*
103 #:start-sampling #:stop-sampling #:with-sampling
104 #:with-profiling #:start-profiling #:stop-profiling
105 #:profile-call-counts #:unprofile-call-counts
108 (in-package #:sb-sprof)
113 (defstruct (vertex (:constructor make-vertex)
114 (:constructor make-scc (scc-vertices edges)))
115 (visited nil :type boolean)
116 (root nil :type (or null vertex))
118 (edges () :type list)
119 (scc-vertices () :type list))
122 (vertex (sb-impl::missing-arg) :type vertex))
125 (vertices () :type list))
127 (declaim (inline scc-p))
128 (defun scc-p (vertex)
129 (not (null (vertex-scc-vertices vertex))))
131 (defmacro do-vertices ((vertex graph) &body body)
132 `(dolist (,vertex (graph-vertices ,graph))
135 (defmacro do-edges ((edge edge-to vertex) &body body)
136 `(dolist (,edge (vertex-edges ,vertex))
137 (let ((,edge-to (edge-vertex ,edge)))
140 (defun self-cycle-p (vertex)
141 (do-edges (e to vertex)
145 (defun map-vertices (fn vertices)
147 (setf (vertex-visited v) nil))
149 (unless (vertex-visited v)
152 ;;; Eeko Nuutila, Eljas Soisalon-Soininen, around 1992. Improves on
153 ;;; Tarjan's original algorithm by not using the stack when processing
154 ;;; trivial components. Trivial components should appear frequently
155 ;;; in a call-graph such as ours, I think. Same complexity O(V+E) as
157 (defun strong-components (vertices)
158 (let ((in-component (make-array (length vertices)
159 :element-type 'boolean
160 :initial-element nil))
164 (labels ((min-root (x y)
165 (let ((rx (vertex-root x))
166 (ry (vertex-root y)))
167 (if (< (vertex-dfn rx) (vertex-dfn ry))
171 (aref in-component (vertex-dfn v)))
172 ((setf in-component) (in v)
173 (setf (aref in-component (vertex-dfn v)) in))
175 (> (vertex-dfn x) (vertex-dfn y)))
177 (setf (vertex-dfn v) (incf dfn)
180 (vertex-visited v) t)
182 (unless (vertex-visited w)
184 (unless (in-component w)
185 (setf (vertex-root v) (min-root v w))))
186 (if (eq v (vertex-root v))
187 (loop while (and stack (vertex-> (car stack) v))
189 collect w into this-component
190 do (setf (in-component w) t)
192 (setf (in-component v) t)
193 (push (cons v this-component) components))
195 (map-vertices #'visit vertices)
198 ;;; Given a dag as a list of vertices, return the list sorted
199 ;;; topologically, children first.
200 (defun topological-sort (dag)
203 (labels ((rec-sort (v)
204 (setf (vertex-visited v) t)
205 (setf (vertex-dfn v) (incf dfn))
206 (dolist (e (vertex-edges v))
207 (unless (vertex-visited (edge-vertex e))
208 (rec-sort (edge-vertex e))))
210 (map-vertices #'rec-sort dag)
213 ;;; Reduce graph G to a dag by coalescing strongly connected components
214 ;;; into vertices. Sort the result topologically.
215 (defun reduce-graph (graph &optional (scc-constructor #'make-scc))
216 (sb-int:collect ((sccs) (trivial))
217 (dolist (c (strong-components (graph-vertices graph)))
218 (if (or (cdr c) (self-cycle-p (car c)))
219 (sb-int:collect ((outgoing))
224 (sccs (funcall scc-constructor c (outgoing))))
227 (dolist (v (trivial))
229 (when (member w (vertex-scc-vertices scc))
230 (setf (edge-vertex e) scc)))))
231 (setf (graph-vertices graph)
232 (topological-sort (nconc (sccs) (trivial))))))
237 "Type used for addresses, for instance, program counters,
238 code start/end locations etc."
239 '(unsigned-byte #.sb-vm::n-machine-word-bits))
241 (defconstant +unknown-address+ 0
242 "Constant representing an address that cannot be determined.")
244 ;;; A call graph. Vertices are NODE structures, edges are CALL
246 (defstruct (call-graph (:include graph)
247 (:constructor %make-call-graph))
248 ;; the value of *SAMPLE-INTERVAL* or *ALLOC-INTERVAL* at the time
249 ;; the graph was created (depending on the current allocation mode)
250 (sample-interval (sb-impl::missing-arg) :type number)
251 ;; the sampling-mode that was used for the profiling run
252 (sampling-mode (sb-impl::missing-arg) :type (member :cpu :alloc))
253 ;; number of samples taken
254 (nsamples (sb-impl::missing-arg) :type sb-int:index)
255 ;; sample count for samples not in any function
256 (elsewhere-count (sb-impl::missing-arg) :type sb-int:index)
257 ;; a flat list of NODEs, sorted by sample count
258 (flat-nodes () :type list))
260 ;;; A node in a call graph, representing a function that has been
261 ;;; sampled. The edges of a node are CALL structures that represent
262 ;;; functions called from a given node.
263 (defstruct (node (:include vertex)
264 (:constructor %make-node))
265 ;; A numeric label for the node. The most frequently called function
266 ;; gets label 1. This is just for identification purposes in the
268 (index 0 :type fixnum)
269 ;; Start and end address of the function's code. Depending on the
270 ;; debug-info, this might be either as absolute addresses for things
271 ;; that won't move around in memory, or as relative offsets from
272 ;; some point for things that might move.
273 (start-pc-or-offset 0 :type address)
274 (end-pc-or-offset 0 :type address)
275 ;; the name of the function
277 ;; sample count for this function
278 (count 0 :type fixnum)
279 ;; count including time spent in functions called from this one
280 (accrued-count 0 :type fixnum)
281 ;; the debug-info that this node was created from
282 (debug-info nil :type t)
283 ;; list of NODEs for functions calling this one
284 (callers () :type list)
285 ;; the call count for the function that corresponds to this node (or NIL
286 ;; if call counting wasn't enabled for this function)
287 (call-count nil :type (or null integer)))
289 ;;; A cycle in a call graph. The functions forming the cycle are
290 ;;; found in the SCC-VERTICES slot of the VERTEX structure.
291 (defstruct (cycle (:include node)))
293 ;;; An edge in a call graph. EDGE-VERTEX is the function being
295 (defstruct (call (:include edge)
296 (:constructor make-call (vertex)))
297 ;; number of times the call was sampled
298 (count 1 :type sb-int:index))
300 ;;; Encapsulate all the information about a sampling run
302 ;; When this vector fills up, we allocate a new one and copy over
304 (vector (make-array (* *max-samples*
305 ;; Arbitrary guess at how many samples we'll be
306 ;; taking for each trace. The exact amount doesn't
307 ;; matter, this is just to decrease the amount of
308 ;; re-allocation that will need to be done.
310 ;; Each sample takes two cells in the vector
313 (trace-count 0 :type sb-int:index)
314 (index 0 :type sb-int:index)
315 (mode nil :type (member :cpu :alloc))
316 (sample-interval *sample-interval* :type number)
317 (alloc-interval *alloc-interval* :type number)
318 (max-depth most-positive-fixnum :type number)
319 (max-samples *max-samples* :type sb-int:index))
321 (defmethod print-object ((call-graph call-graph) stream)
322 (print-unreadable-object (call-graph stream :type t :identity t)
323 (format stream "~d samples" (call-graph-nsamples call-graph))))
325 (defmethod print-object ((node node) stream)
326 (print-unreadable-object (node stream :type t :identity t)
327 (format stream "~s [~d]" (node-name node) (node-index node))))
329 (defmethod print-object ((call call) stream)
330 (print-unreadable-object (call stream :type t :identity t)
331 (format stream "~s [~d]" (node-name (call-vertex call))
332 (node-index (call-vertex call)))))
334 (deftype report-type ()
335 '(member nil :flat :graph))
337 (defvar *sampling-mode* :cpu
338 "Default sampling mode. :CPU for cpu profiling, :ALLOC for allocation
340 (declaim (type (member :cpu :alloc) *sampling-mode*))
342 (defvar *sample-interval* 0.01
343 "Default number of seconds between samples.")
344 (declaim (number *sample-interval*))
346 (defvar *alloc-region-size*
349 ;; This hardcoded 2 matches the one in gc_find_freeish_pages. It's not
350 ;; really worth genesifying.
352 (* 2 sb-vm:gencgc-page-size))
353 (declaim (number *alloc-region-size*))
355 (defvar *alloc-interval* 4
356 "Default number of allocation region openings between samples.")
357 (declaim (number *alloc-interval*))
359 (defvar *max-samples* 50000
360 "Default number of traces taken. This variable is somewhat misnamed:
361 each trace may actually consist of an arbitrary number of samples, depending
362 on the depth of the call stack.")
363 (declaim (type sb-int:index *max-samples*))
365 (defvar *samples* nil)
366 (declaim (type (or null samples) *samples*))
368 (defvar *profiling* nil)
369 (defvar *sampling* nil)
370 (declaim (type boolean *profiling* *sampling*))
372 (defvar *show-progress* nil)
374 (defvar *old-sampling* nil)
376 ;; Call count encapsulation information
377 (defvar *encapsulations* (make-hash-table :test 'equal))
379 (defun turn-off-sampling ()
380 (setq *old-sampling* *sampling*)
381 (setq *sampling* nil))
383 (defun turn-on-sampling ()
384 (setq *sampling* *old-sampling*))
386 (defun show-progress (format-string &rest args)
387 (when *show-progress*
388 (apply #'format t format-string args)
391 (defun start-sampling ()
392 "Switch on statistical sampling."
395 (defun stop-sampling ()
396 "Switch off statistical sampling."
397 (setq *sampling* nil))
399 (defmacro with-sampling ((&optional (on t)) &body body)
400 "Evaluate body with statistical sampling turned on or off."
401 `(let ((*sampling* ,on)
402 (sb-vm:*alloc-signal* sb-vm:*alloc-signal*))
405 ;;; Return something serving as debug info for address PC.
406 (declaim (inline debug-info))
407 (defun debug-info (pc)
408 (declare (type system-area-pointer pc)
409 (muffle-conditions compiler-note))
410 (let ((ptr (sb-di::component-ptr-from-pc pc)))
411 (cond ((sap= ptr (int-sap 0))
412 (let ((name (sap-foreign-symbol pc)))
414 (values (format nil "foreign function ~a" name)
416 (values nil (sap-int pc)))))
418 (let* ((code (sb-di::component-from-component-ptr ptr))
419 (code-header-len (* (sb-kernel:get-header-data code)
421 (pc-offset (- (sap-int pc)
422 (- (sb-kernel:get-lisp-obj-address code)
423 sb-vm:other-pointer-lowtag)
425 (df (sb-di::debug-fun-from-pc code pc-offset)))
426 (cond ((typep df 'sb-di::bogus-debug-fun)
427 (values code (sap-int pc)))
429 ;; The code component might be moved by the GC. Store
430 ;; a PC offset, and reconstruct the data in
431 ;; SAMPLE-PC-FROM-PC-OR-OFFSET.
432 (values df pc-offset))
434 (values nil 0))))))))
436 (defun ensure-samples-vector (samples)
437 (let ((vector (samples-vector samples))
438 (index (samples-index samples)))
439 ;; Allocate a new sample vector if the old one is full
440 (if (= (length vector) index)
441 (let ((new-vector (make-array (* 2 index))))
442 (format *trace-output* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%"
443 (samples-trace-count samples)
445 (replace new-vector vector)
446 (setf (samples-vector samples) new-vector))
449 (declaim (inline record))
450 (defun record (samples pc)
451 (declare (type system-area-pointer pc)
452 (muffle-conditions compiler-note))
453 (multiple-value-bind (info pc-or-offset)
455 (let ((vector (ensure-samples-vector samples))
456 (index (samples-index samples)))
457 (declare (type simple-vector vector))
458 ;; Allocate a new sample vector if the old one is full
459 (when (= (length vector) index)
460 (let ((new-vector (make-array (* 2 index))))
461 (format *trace-output* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%"
462 (samples-trace-count samples)
464 (replace new-vector vector)
465 (setf vector new-vector
466 (samples-vector samples) new-vector)))
467 ;; For each sample, store the debug-info and the PC/offset into
469 (setf (aref vector index) info
470 (aref vector (1+ index)) pc-or-offset)))
471 (incf (samples-index samples) 2))
473 (defun record-trace-start (samples)
474 ;; Mark the start of the trace.
475 (let ((vector (ensure-samples-vector samples)))
476 (declare (type simple-vector vector))
477 (setf (aref vector (samples-index samples))
479 (incf (samples-index samples) 2))
481 ;;; Ensure that only one thread at a time will be executing sigprof handler.
482 (defvar *sigprof-handler-lock* (sb-thread:make-mutex :name "SIGPROF handler"))
484 ;;; SIGPROF handler. Record current PC and return address in
487 (defun sigprof-handler (signal code scp)
488 (declare (ignore signal code)
489 (optimize speed (space 0))
490 (muffle-conditions compiler-note)
491 (disable-package-locks sb-di::x86-call-context)
492 (type system-area-pointer scp))
493 (sb-sys:without-interrupts
494 (let ((sb-vm:*alloc-signal* nil)
496 (when (and *sampling*
498 (< (samples-trace-count samples)
499 (samples-max-samples samples)))
500 (sb-sys:without-gcing
501 (sb-thread:with-mutex (*sigprof-handler-lock*)
502 (with-alien ((scp (* os-context-t) :local scp))
503 (let* ((pc-ptr (sb-vm:context-pc scp))
504 (fp (sb-vm::context-register scp #.sb-vm::ebp-offset)))
505 ;; For some reason completely bogus small values for the
506 ;; frame pointer are returned every now and then, leading
507 ;; to segfaults. Try to avoid these cases.
509 ;; FIXME: Do a more thorough sanity check on ebp, or figure
510 ;; out why this is happening.
511 ;; -- JES, 2005-01-11
513 (return-from sigprof-handler nil))
514 (incf (samples-trace-count samples))
515 (let ((fp (int-sap fp))
517 (declare (type system-area-pointer fp pc-ptr))
518 ;; FIXME: How annoying. The XC doesn't store enough
519 ;; type information about SB-DI::X86-CALL-CONTEXT,
520 ;; even if we declaim the ftype explicitly in
521 ;; src/code/debug-int. And for some reason that type
522 ;; information is needed for the inlined version to
523 ;; be compiled without boxing the returned saps. So
524 ;; we declare the correct ftype here manually, even
525 ;; if the compiler should be able to deduce this
526 ;; exact same information.
527 (declare (ftype (function (system-area-pointer)
528 (values (member nil t)
530 system-area-pointer))
531 sb-di::x86-call-context))
532 (record-trace-start samples)
533 (dotimes (i (samples-max-depth samples))
534 (record samples pc-ptr)
535 (setf (values ok pc-ptr fp)
536 (sb-di::x86-call-context fp))
539 ;; Reset the allocation counter
540 (when (and sb-vm:*alloc-signal*
541 (<= sb-vm:*alloc-signal* 0))
542 (setf sb-vm:*alloc-signal* (1- *alloc-interval*)))
545 ;; FIXME: On non-x86 platforms we don't yet walk the call stack deeper
548 (defun sigprof-handler (signal code scp)
549 (declare (ignore signal code))
550 (sb-sys:without-interrupts
551 (let ((samples *samples*))
552 (when (and *sampling*
554 (< (samples-trace-count samples)
555 (samples-max-samples samples)))
556 (sb-sys:without-gcing
557 (with-alien ((scp (* os-context-t) :local scp))
558 (locally (declare (optimize (inhibit-warnings 2)))
559 (incf (samples-trace-count samples))
560 (record-trace-start samples)
561 (let* ((pc-ptr (sb-vm:context-pc scp))
562 (fp (sb-vm::context-register scp #.sb-vm::cfp-offset))
565 (* sb-vm::lra-save-offset sb-vm::n-word-bytes))))
566 (record samples pc-ptr)
567 (record samples (int-sap ra))))))))))
569 ;;; Return the start address of CODE.
570 (defun code-start (code)
571 (declare (type sb-kernel:code-component code))
572 (sap-int (sb-kernel:code-instructions code)))
574 ;;; Return start and end address of CODE as multiple values.
575 (defun code-bounds (code)
576 (declare (type sb-kernel:code-component code))
577 (let* ((start (code-start code))
578 (end (+ start (sb-kernel:%code-code-size code))))
581 (defmacro with-profiling ((&key (sample-interval '*sample-interval*)
582 (alloc-interval '*alloc-interval*)
583 (max-samples '*max-samples*)
585 (mode '*sampling-mode*)
587 (max-depth most-positive-fixnum)
589 (report nil report-p))
591 "Repeatedly evaluate BODY with statistical profiling turned on.
592 In multi-threaded operation, only the thread in which WITH-PROFILING
593 was evaluated will be profiled by default. If you want to profile
594 multiple threads, invoke the profiler with START-PROFILING.
596 The following keyword args are recognized:
599 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
602 Take a sample every time <n> allocation regions (approximately
603 8kB) have been allocated since the last sample. Default is
607 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
608 the profiler in allocation profiling mode.
611 Repeat evaluating body until <max> samples are taken.
612 Default is *MAX-SAMPLES*.
615 Maximum call stack depth that the profiler should consider. Only
616 has an effect on x86 and x86-64.
619 If specified, call REPORT with :TYPE <type> at the end.
622 It true, call RESET at the beginning.
625 If true (the default) repeatedly evaluate BODY. If false, evaluate
627 (declare (type report-type report))
628 `(let* ((*sample-interval* ,sample-interval)
629 (*alloc-interval* ,alloc-interval)
631 (sb-vm:*alloc-signal* nil)
632 (*sampling-mode* ,mode)
633 (*max-samples* ,max-samples))
634 ,@(when reset '((reset)))
637 (start-profiling :max-depth ',max-depth)
639 (when (>= (samples-trace-count *samples*)
640 (samples-max-samples *samples*))
642 ,@(when show-progress
643 `((format t "~&===> ~d of ~d samples taken.~%"
644 (samples-trace-count *samples*)
645 (samples-max-samples *samples*))))
646 (let ((.last-index. (samples-index *samples*)))
648 (when (= .last-index. (samples-index *samples*))
649 (warn "No sampling progress; possibly a profiler bug.")
654 ,@(when report-p `((report :type ,report)))))
656 (defun start-profiling (&key (max-samples *max-samples*)
657 (mode *sampling-mode*)
658 (sample-interval *sample-interval*)
659 (alloc-interval *alloc-interval*)
660 (max-depth most-positive-fixnum)
662 "Start profiling statistically if not already profiling.
663 The following keyword args are recognized:
666 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
669 Take a sample every time <n> allocation regions (approximately
670 8kB) have been allocated since the last sample. Default is
674 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
675 the profiler in allocation profiling mode.
678 Maximum number of samples. Default is *MAX-SAMPLES*.
681 Maximum call stack depth that the profiler should consider. Only
682 has an effect on x86 and x86-64.
685 If true, the default, start sampling right away.
686 If false, START-SAMPLING can be used to turn sampling on."
688 (when (eq mode :alloc)
689 (error "Allocation profiling is only supported for builds using the generational garbage collector."))
691 (multiple-value-bind (secs usecs)
692 (multiple-value-bind (secs rest)
693 (truncate sample-interval)
694 (values secs (truncate (* rest 1000000))))
695 (setf *sampling* sampling
696 *samples* (make-samples :max-depth max-depth
697 :max-samples max-samples
699 (enable-call-counting)
700 (sb-sys:enable-interrupt sb-unix:sigprof #'sigprof-handler)
702 (setf sb-vm:*alloc-signal* (1- alloc-interval))
704 (unix-setitimer :profile secs usecs secs usecs)
705 (setf sb-vm:*alloc-signal* nil)))
706 (setq *profiling* t)))
709 (defun stop-profiling ()
710 "Stop profiling if profiling."
712 (unix-setitimer :profile 0 0 0 0)
713 (disable-call-counting)
714 ;; Even with the timer shut down we cannot be sure that there is
715 ;; no undelivered sigprof. Besides, leaving the signal handler
716 ;; installed won't hurt.
717 (setq *sampling* nil)
718 (setq sb-vm:*alloc-signal* nil)
719 (setq *profiling* nil))
723 "Reset the profiler."
725 (setq *sampling* nil)
729 ;;; Make a NODE for debug-info INFO.
730 (defun make-node (info)
731 (flet ((clean-name (name)
732 (if (and (consp name)
734 '(sb-c::xep sb-c::tl-xep sb-c::&more-processor
737 sb-c::hairy-arg-processor
738 sb-c::&optional-processor)))
742 (sb-kernel::code-component
743 (multiple-value-bind (start end)
746 (%make-node :name (or (sb-disassem::find-assembler-routine start)
747 (format nil "~a" info))
749 :start-pc-or-offset start
750 :end-pc-or-offset end)
752 (sb-di::compiled-debug-fun
753 (let* ((name (sb-di::debug-fun-name info))
754 (cdf (sb-di::compiled-debug-fun-compiler-debug-fun info))
755 (start-offset (sb-c::compiled-debug-fun-start-pc cdf))
756 (end-offset (sb-c::compiled-debug-fun-elsewhere-pc cdf))
757 (component (sb-di::compiled-debug-fun-component info))
758 (start-pc (code-start component)))
759 ;; Call graphs are mostly useless unless we somehow
760 ;; distinguish a gazillion different (LAMBDA ())'s.
761 (when (equal name '(lambda ()))
762 (setf name (format nil "Unknown component: #x~x" start-pc)))
763 (values (%make-node :name (clean-name name)
765 :start-pc-or-offset start-offset
766 :end-pc-or-offset end-offset)
769 (%make-node :name (clean-name (sb-di::debug-fun-name info))
772 (%make-node :name (coerce info 'string)
773 :debug-info info)))))
775 ;;; One function can have more than one COMPILED-DEBUG-FUNCTION with
776 ;;; the same name. Reduce the number of calls to Debug-Info by first
777 ;;; looking for a given PC in a red-black tree. If not found in the
778 ;;; tree, get debug info, and look for a node in a hash-table by
779 ;;; function name. If not found in the hash-table, make a new node.
781 (defvar *name->node*)
783 (defmacro with-lookup-tables (() &body body)
784 `(let ((*name->node* (make-hash-table :test 'equal)))
787 ;;; Find or make a new node for INFO. Value is the NODE found or
788 ;;; made; NIL if not enough information exists to make a NODE for INFO.
789 (defun lookup-node (info)
791 (multiple-value-bind (new key)
793 (when (eql (node-name new) 'call-counter)
794 (return-from lookup-node (values nil nil)))
795 (let* ((key (cons (node-name new) key))
796 (found (gethash key *name->node*)))
798 (setf (node-start-pc-or-offset found)
799 (min (node-start-pc-or-offset found)
800 (node-start-pc-or-offset new)))
801 (setf (node-end-pc-or-offset found)
802 (max (node-end-pc-or-offset found)
803 (node-end-pc-or-offset new)))
806 (let ((call-count-info (gethash (node-name new)
808 (when call-count-info
809 (setf (node-call-count new)
810 (car call-count-info))))
811 (setf (gethash key *name->node*) new)
814 ;;; Return a list of all nodes created by LOOKUP-NODE.
815 (defun collect-nodes ()
816 (loop for node being the hash-values of *name->node*
819 ;;; Value is a CALL-GRAPH for the current contents of *SAMPLES*.
820 (defun make-call-graph-1 (max-depth)
821 (let ((elsewhere-count 0)
823 (with-lookup-tables ()
824 (loop for i below (- (samples-index *samples*) 2) by 2
826 for debug-info = (aref (samples-vector *samples*) i)
827 for next-info = (aref (samples-vector *samples*)
829 do (if (eq debug-info 'trace-start)
831 (let ((callee (lookup-node debug-info))
832 (caller (unless (eq next-info 'trace-start)
833 (lookup-node next-info))))
834 (when (< depth max-depth)
836 (setf visited-nodes nil)
838 (incf (node-accrued-count callee))
839 (incf (node-count callee)))
841 (incf elsewhere-count))))
844 (push callee visited-nodes))
846 (unless (member caller visited-nodes)
847 (incf (node-accrued-count caller)))
849 (let ((call (find callee (node-edges caller)
850 :key #'call-vertex)))
851 (pushnew caller (node-callers callee))
853 (unless (member caller visited-nodes)
854 (incf (call-count call)))
855 (push (make-call callee)
856 (node-edges caller))))))))))
857 (let ((sorted-nodes (sort (collect-nodes) #'> :key #'node-count)))
858 (loop for node in sorted-nodes and i from 1 do
859 (setf (node-index node) i))
860 (%make-call-graph :nsamples (samples-trace-count *samples*)
861 :sample-interval (if (eq (samples-mode *samples*)
863 (samples-alloc-interval *samples*)
864 (samples-sample-interval *samples*))
865 :sampling-mode (samples-mode *samples*)
866 :elsewhere-count elsewhere-count
867 :vertices sorted-nodes)))))
869 ;;; Reduce CALL-GRAPH to a dag, creating CYCLE structures for call
871 (defun reduce-call-graph (call-graph)
873 (flet ((make-one-cycle (vertices edges)
874 (let* ((name (format nil "<Cycle ~d>" (incf cycle-no)))
875 (count (loop for v in vertices sum (node-count v))))
876 (make-cycle :name name
879 :scc-vertices vertices
881 (reduce-graph call-graph #'make-one-cycle))))
883 ;;; For all nodes in CALL-GRAPH, compute times including the time
884 ;;; spent in functions called from them. Note that the call-graph
885 ;;; vertices are in reverse topological order, children first, so we
886 ;;; will have computed accrued counts of called functions before they
887 ;;; are used to compute accrued counts for callers.
888 (defun compute-accrued-counts (call-graph)
889 (do-vertices (from call-graph)
890 (setf (node-accrued-count from) (node-count from))
891 (do-edges (call to from)
892 (incf (node-accrued-count from)
893 (round (* (/ (call-count call) (node-count to))
894 (node-accrued-count to)))))))
896 ;;; Return a CALL-GRAPH structure for the current contents of
897 ;;; *SAMPLES*. The result contain a list of nodes sorted by self-time
898 ;;; in the FLAT-NODES slot, and a dag in VERTICES, with call cycles
899 ;;; reduced to CYCLE structures.
900 (defun make-call-graph (max-depth)
902 (show-progress "~&Computing call graph ")
903 (let ((call-graph (without-gcing (make-call-graph-1 max-depth))))
904 (setf (call-graph-flat-nodes call-graph)
905 (copy-list (graph-vertices call-graph)))
906 (show-progress "~&Finding cycles")
908 (reduce-call-graph call-graph)
909 (show-progress "~&Propagating counts")
911 (compute-accrued-counts call-graph)
917 (defun print-separator (&key (length 72) (char #\-))
918 (format t "~&~V,,,V<~>~%" length char))
920 (defun samples-percent (call-graph count)
922 (* 100.0 (/ count (call-graph-nsamples call-graph)))
925 (defun print-call-graph-header (call-graph)
926 (let ((nsamples (call-graph-nsamples call-graph))
927 (interval (call-graph-sample-interval call-graph))
928 (ncycles (loop for v in (graph-vertices call-graph)
930 (if (eq (call-graph-sampling-mode call-graph) :alloc)
931 (format t "~2&Number of samples: ~d~%~
932 Sample interval: ~a regions (approximately ~a kB)~%~
933 Total sampling amount: ~a regions (approximately ~a kB)~%~
934 Number of cycles: ~d~2%"
937 (truncate (* interval *alloc-region-size*) 1024)
938 (* nsamples interval)
939 (truncate (* nsamples interval *alloc-region-size*) 1024)
941 (format t "~2&Number of samples: ~d~%~
942 Sample interval: ~f seconds~%~
943 Total sampling time: ~f seconds~%~
944 Number of cycles: ~d~2%"
947 (* nsamples interval)
950 (defun print-flat (call-graph &key (stream *standard-output*) max
951 min-percent (print-header t))
952 (let ((*standard-output* stream)
956 (min-count (if min-percent
957 (round (* (/ min-percent 100.0)
958 (call-graph-nsamples call-graph)))
961 (print-call-graph-header call-graph))
962 (format t "~& Self Total Cumul~%")
963 (format t "~& Nr Count % Count % Count % Calls Function~%")
965 (let ((elsewhere-count (call-graph-elsewhere-count call-graph))
967 (dolist (node (call-graph-flat-nodes call-graph))
968 (when (or (and max (> (incf i) max))
969 (< (node-count node) min-count))
971 (let* ((count (node-count node))
972 (percent (samples-percent call-graph count))
973 (accrued-count (node-accrued-count node))
974 (accrued-percent (samples-percent call-graph accrued-count)))
975 (incf total-count count)
976 (incf total-percent percent)
977 (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~8@a ~s~%"
985 (or (node-call-count node) "-")
989 (format t "~& ~6d ~5,1f~36a elsewhere~%"
991 (samples-percent call-graph elsewhere-count)
994 (defun print-cycles (call-graph)
995 (when (some #'cycle-p (graph-vertices call-graph))
996 (format t "~& Cycle~%")
997 (format t "~& Count % Parts~%")
998 (do-vertices (node call-graph)
1000 (flet ((print-info (indent index count percent name)
1001 (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
1002 count percent indent name index)))
1004 (format t "~&~6d ~5,1f ~a...~%"
1006 (samples-percent call-graph (cycle-count node))
1008 (dolist (v (vertex-scc-vertices node))
1009 (print-info 4 (node-index v) (node-count v)
1010 (samples-percent call-graph (node-count v))
1015 (defun print-graph (call-graph &key (stream *standard-output*)
1017 (let ((*standard-output* stream)
1018 (*print-pretty* nil))
1019 (print-call-graph-header call-graph)
1020 (print-cycles call-graph)
1021 (flet ((find-call (from to)
1022 (find to (node-edges from) :key #'call-vertex))
1023 (print-info (indent index count percent name)
1024 (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
1025 count percent indent name index)))
1026 (format t "~& Callers~%")
1027 (format t "~& Total. Function~%")
1028 (format t "~& Count % Count % Callees~%")
1029 (do-vertices (node call-graph)
1032 ;; Print caller information.
1033 (dolist (caller (node-callers node))
1034 (let ((call (find-call caller node)))
1035 (print-info 4 (node-index caller)
1037 (samples-percent call-graph (call-count call))
1038 (node-name caller))))
1039 ;; Print the node itself.
1040 (format t "~&~6d ~5,1f ~6d ~5,1f ~s [~d]~%"
1042 (samples-percent call-graph (node-count node))
1043 (node-accrued-count node)
1044 (samples-percent call-graph (node-accrued-count node))
1048 (do-edges (call called node)
1049 (print-info 4 (node-index called)
1051 (samples-percent call-graph (call-count call))
1052 (node-name called))))
1055 (print-flat call-graph :stream stream :max max
1056 :min-percent min-percent :print-header nil))))
1058 (defun report (&key (type :graph) max min-percent call-graph
1059 (stream *standard-output*) ((:show-progress *show-progress*)))
1060 "Report statistical profiling results. The following keyword
1061 args are recognized:
1064 Specifies the type of report to generate. If :FLAT, show
1065 flat report, if :GRAPH show a call graph and a flat report.
1066 If nil, don't print out a report.
1069 Specify a stream to print the report on. Default is
1073 Don't show more than <max> entries in the flat report.
1075 :MIN-PERCENT <min-percent>
1076 Don't show functions taking less than <min-percent> of the
1077 total time in the flat report.
1079 :SHOW-PROGRESS <bool>
1080 If true, print progress messages while generating the call graph.
1083 Print a report from <graph> instead of the latest profiling
1086 Value of this function is a CALL-GRAPH object representing the
1087 resulting call-graph."
1088 (let ((graph (or call-graph (make-call-graph most-positive-fixnum))))
1091 (print-flat graph :stream stream :max max :min-percent min-percent))
1093 (print-graph graph :stream stream :max max :min-percent min-percent))
1097 ;;; Interface to DISASSEMBLE
1099 (defun sample-pc-from-pc-or-offset (sample pc-or-offset)
1101 ;; Assembly routines or foreign functions don't move around, so we've
1103 ((or sb-kernel:code-component string)
1105 ;; Lisp functions might move, so we've stored a offset from the
1106 ;; start of the code component.
1107 (sb-di::compiled-debug-fun
1108 (let* ((component (sb-di::compiled-debug-fun-component sample))
1109 (start-pc (code-start component)))
1110 (+ start-pc pc-or-offset)))))
1112 (defun add-disassembly-profile-note (chunk stream dstate)
1113 (declare (ignore chunk stream))
1115 (let* ((location (+ (sb-disassem::seg-virtual-location
1116 (sb-disassem:dstate-segment dstate))
1117 (sb-disassem::dstate-cur-offs dstate)))
1118 (samples (loop with index = (samples-index *samples*)
1119 for x from 0 below (- index 2) by 2
1120 for last-sample = nil then sample
1121 for sample = (aref (samples-vector *samples*) x)
1122 for pc-or-offset = (aref (samples-vector *samples*)
1124 when (and sample (eq last-sample 'trace-start))
1126 (sample-pc-from-pc-or-offset sample
1128 (unless (zerop samples)
1129 (sb-disassem::note (format nil "~A/~A samples"
1130 samples (samples-trace-count *samples*))
1133 (pushnew 'add-disassembly-profile-note sb-disassem::*default-dstate-hooks*)
1138 ;;; The following functions tell sb-sprof to do call count profiling
1139 ;;; for the named functions in addition to normal statistical
1140 ;;; profiling. The benefit of this over using SB-PROFILE is that this
1141 ;;; encapsulation is a lot more lightweight, due to not needing to
1142 ;;; track cpu usage / consing. (For example, compiling asdf 20 times
1143 ;;; took 13s normally, 15s with call counting for all functions in
1144 ;;; SB-C, and 94s with SB-PROFILE profiling SB-C).
1146 (defun profile-call-counts (&rest names)
1147 "Mark the functions named by NAMES as being subject to call counting
1148 during statistical profiling. If a string is used as a name, it will
1149 be interpreted as a package name. In this case call counting will be
1150 done for all functions with names like X or (SETF X), where X is a symbol
1151 with the package as its home package."
1152 (dolist (name names)
1154 (let ((package (find-package name)))
1155 (do-symbols (symbol package)
1156 (when (eql (symbol-package symbol) package)
1157 (dolist (function-name (list symbol (list 'setf symbol)))
1158 (profile-call-counts-for-function function-name)))))
1159 (profile-call-counts-for-function name))))
1161 (defun profile-call-counts-for-function (function-name)
1162 (unless (gethash function-name *encapsulations*)
1163 (setf (gethash function-name *encapsulations*) nil)))
1165 (defun unprofile-call-counts ()
1166 "Clear all call counting information. Call counting will be done for no
1167 functions during statistical profiling."
1168 (clrhash *encapsulations*))
1170 ;;; Called when profiling is started to enable the call counting
1171 ;;; encapsulation. Wrap all the call counted functions
1172 (defun enable-call-counting ()
1173 (maphash (lambda (k v)
1174 (declare (ignore v))
1175 (enable-call-counting-for-function k))
1178 ;;; Called when profiling is stopped to disable the encapsulation. Restore
1179 ;;; the original functions.
1180 (defun disable-call-counting ()
1181 (maphash (lambda (k v)
1184 (without-package-locks
1185 (setf (fdefinition k) (cdr v)))
1186 (setf (cdr v) nil)))
1189 (defun enable-call-counting-for-function (function-name)
1190 (let ((info (gethash function-name *encapsulations*)))
1191 ;; We should never try to encapsulate an fdefn multiple times.
1192 (assert (or (null info)
1194 (when (and (fboundp function-name)
1195 (or (not (symbolp function-name))
1196 (and (not (special-operator-p function-name))
1197 (not (macro-function function-name)))))
1198 (let* ((original-fun (fdefinition function-name))
1199 (info (cons 0 original-fun)))
1200 (setf (gethash function-name *encapsulations*) info)
1201 (without-package-locks
1202 (setf (fdefinition function-name)
1203 (sb-int:named-lambda call-counter (sb-int:&more more-context more-count)
1204 (declare (optimize speed (safety 0)))
1205 ;; 2^59 calls should be enough for anybody, and it
1206 ;; allows using fixnum arithmetic on x86-64. 2^32
1207 ;; isn't enough, so we can't do that on 32 bit platforms.
1208 (incf (the (unsigned-byte 59)
1210 (multiple-value-call original-fun
1211 (sb-c:%more-arg-values more-context
1218 (defun test-0 (n &optional (depth 0))
1219 (declare (optimize (debug 3)))
1222 (test-0 n (1+ depth))
1223 (test-0 n (1+ depth)))))
1226 (with-profiling (:reset t :max-samples 1000 :report :graph)