1 ;;; Copyright (C) 2003 Gerd Moellmann <gerd.moellmann@t-online.de>
2 ;;; All rights reserved.
4 ;;; Redistribution and use in source and binary forms, with or without
5 ;;; modification, are permitted provided that the following conditions
8 ;;; 1. Redistributions of source code must retain the above copyright
9 ;;; notice, this list of conditions and the following disclaimer.
10 ;;; 2. Redistributions in binary form must reproduce the above copyright
11 ;;; notice, this list of conditions and the following disclaimer in the
12 ;;; documentation and/or other materials provided with the distribution.
13 ;;; 3. The name of the author may not be used to endorse or promote
14 ;;; products derived from this software without specific prior written
17 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
21 ;;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 ;;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
23 ;;; OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24 ;;; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ;;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
27 ;;; USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30 ;;; Statistical profiler.
34 ;;; This profiler arranges for SIGPROF interrupts to interrupt a
35 ;;; running program at regular intervals. Each time a SIGPROF occurs,
36 ;;; the current program counter and return address is recorded in a
37 ;;; vector, until a configurable maximum number of samples have been
40 ;;; A profiling report is generated from the samples array by
41 ;;; determining the Lisp functions corresponding to the recorded
42 ;;; addresses. Each program counter/return address pair forms one
43 ;;; edge in a call graph.
47 ;;; The code being generated on x86 makes determining callers reliably
48 ;;; something between extremely difficult and impossible. Example:
50 ;;; 10979F00: .entry eval::eval-stack-args(arg-count)
51 ;;; 18: pop dword ptr [ebp-8]
52 ;;; 1B: lea esp, [ebp-32]
57 ;;; 29: mov [ebp-12], edi
58 ;;; 2C: mov dword ptr [ebp-16], #x28F0000B ; nil
59 ;;; ; No-arg-parsing entry point
60 ;;; 33: mov dword ptr [ebp-20], 0
62 ;;; 3C: L0: mov edx, esp
64 ;;; 41: mov eax, [#x10979EF8] ; #<FDEFINITION object for eval::eval-stack-pop>
66 ;;; 49: mov [edx-4], ebp
68 ;;; 4E: call dword ptr [eax+5]
71 ;;; Suppose this function is interrupted by SIGPROF at 4E. At that
72 ;;; point, the frame pointer EBP has been modified so that the
73 ;;; original return address of the caller of eval-stack-args is no
74 ;;; longer where it can be found by x86-call-context, and the new
75 ;;; return address, for the call to eval-stack-pop, is not yet on the
76 ;;; stack. The effect is that x86-call-context returns something
77 ;;; bogus, which leads to wrong edges in the call graph.
79 ;;; One thing that one might try is filtering cases where the program
80 ;;; is interrupted at a call instruction. But since the above example
81 ;;; of an interrupt at a call instruction isn't the only case where
82 ;;; the stack is something x86-call-context can't really cope with,
83 ;;; this is not a general solution.
85 ;;; Random ideas for implementation:
87 ;;; * Space profiler. Sample when new pages are allocated instead of
90 ;;; * Record a configurable number of callers up the stack. That
91 ;;; could give a more complete graph when there are many small
94 ;;; * Print help strings for reports, include hints to the problem
97 ;;; * Make flat report the default since call-graph isn't that
100 (defpackage #:sb-sprof
101 (:use #:cl #:sb-ext #:sb-unix #:sb-alien #:sb-sys)
102 (:export #:*sample-interval* #:*max-samples* #:*alloc-interval*
103 #:start-sampling #:stop-sampling #:with-sampling
104 #:with-profiling #:start-profiling #:stop-profiling
107 (in-package #:sb-sprof)
112 (defstruct (vertex (:constructor make-vertex)
113 (:constructor make-scc (scc-vertices edges)))
114 (visited nil :type boolean)
115 (root nil :type (or null vertex))
117 (edges () :type list)
118 (scc-vertices () :type list))
121 (vertex (sb-impl::missing-arg) :type vertex))
124 (vertices () :type list))
126 (declaim (inline scc-p))
127 (defun scc-p (vertex)
128 (not (null (vertex-scc-vertices vertex))))
130 (defmacro do-vertices ((vertex graph) &body body)
131 `(dolist (,vertex (graph-vertices ,graph))
134 (defmacro do-edges ((edge edge-to vertex) &body body)
135 `(dolist (,edge (vertex-edges ,vertex))
136 (let ((,edge-to (edge-vertex ,edge)))
139 (defun self-cycle-p (vertex)
140 (do-edges (e to vertex)
144 (defun map-vertices (fn vertices)
146 (setf (vertex-visited v) nil))
148 (unless (vertex-visited v)
151 ;;; Eeko Nuutila, Eljas Soisalon-Soininen, around 1992. Improves on
152 ;;; Tarjan's original algorithm by not using the stack when processing
153 ;;; trivial components. Trivial components should appear frequently
154 ;;; in a call-graph such as ours, I think. Same complexity O(V+E) as
156 (defun strong-components (vertices)
157 (let ((in-component (make-array (length vertices)
158 :element-type 'boolean
159 :initial-element nil))
163 (labels ((min-root (x y)
164 (let ((rx (vertex-root x))
165 (ry (vertex-root y)))
166 (if (< (vertex-dfn rx) (vertex-dfn ry))
170 (aref in-component (vertex-dfn v)))
171 ((setf in-component) (in v)
172 (setf (aref in-component (vertex-dfn v)) in))
174 (> (vertex-dfn x) (vertex-dfn y)))
176 (setf (vertex-dfn v) (incf dfn)
179 (vertex-visited v) t)
181 (unless (vertex-visited w)
183 (unless (in-component w)
184 (setf (vertex-root v) (min-root v w))))
185 (if (eq v (vertex-root v))
186 (loop while (and stack (vertex-> (car stack) v))
188 collect w into this-component
189 do (setf (in-component w) t)
191 (setf (in-component v) t)
192 (push (cons v this-component) components))
194 (map-vertices #'visit vertices)
197 ;;; Given a dag as a list of vertices, return the list sorted
198 ;;; topologically, children first.
199 (defun topological-sort (dag)
202 (labels ((rec-sort (v)
203 (setf (vertex-visited v) t)
204 (setf (vertex-dfn v) (incf dfn))
205 (dolist (e (vertex-edges v))
206 (unless (vertex-visited (edge-vertex e))
207 (rec-sort (edge-vertex e))))
209 (map-vertices #'rec-sort dag)
212 ;;; Reduce graph G to a dag by coalescing strongly connected components
213 ;;; into vertices. Sort the result topologically.
214 (defun reduce-graph (graph &optional (scc-constructor #'make-scc))
215 (sb-int:collect ((sccs) (trivial))
216 (dolist (c (strong-components (graph-vertices graph)))
217 (if (or (cdr c) (self-cycle-p (car c)))
218 (sb-int:collect ((outgoing))
223 (sccs (funcall scc-constructor c (outgoing))))
226 (dolist (v (trivial))
228 (when (member w (vertex-scc-vertices scc))
229 (setf (edge-vertex e) scc)))))
230 (setf (graph-vertices graph)
231 (topological-sort (nconc (sccs) (trivial))))))
236 "Type used for addresses, for instance, program counters,
237 code start/end locations etc."
238 '(unsigned-byte #.sb-vm::n-machine-word-bits))
240 (defconstant +unknown-address+ 0
241 "Constant representing an address that cannot be determined.")
243 ;;; A call graph. Vertices are NODE structures, edges are CALL
245 (defstruct (call-graph (:include graph)
246 (:constructor %make-call-graph))
247 ;; the value of *SAMPLE-INTERVAL* or *ALLOC-INTERVAL* at the time
248 ;; the graph was created (depending on the current allocation mode)
249 (sample-interval (sb-impl::missing-arg) :type number)
250 ;; the value of *SAMPLING-MODE* at the time the graph was created
251 (sampling-mode (sb-impl::missing-arg) :type (member :cpu :alloc))
252 ;; number of samples taken
253 (nsamples (sb-impl::missing-arg) :type sb-impl::index)
254 ;; sample count for samples not in any function
255 (elsewhere-count (sb-impl::missing-arg) :type sb-impl::index)
256 ;; a flat list of NODEs, sorted by sample count
257 (flat-nodes () :type list))
259 ;;; A node in a call graph, representing a function that has been
260 ;;; sampled. The edges of a node are CALL structures that represent
261 ;;; functions called from a given node.
262 (defstruct (node (:include vertex)
263 (:constructor %make-node))
264 ;; A numeric label for the node. The most frequently called function
265 ;; gets label 1. This is just for identification purposes in the
267 (index 0 :type fixnum)
268 ;; Start and end address of the function's code. Depending on the
269 ;; debug-info, this might be either as absolute addresses for things
270 ;; that won't move around in memory, or as relative offsets from
271 ;; some point for things that might move.
272 (start-pc-or-offset 0 :type address)
273 (end-pc-or-offset 0 :type address)
274 ;; the name of the function
276 ;; sample count for this function
277 (count 0 :type fixnum)
278 ;; count including time spent in functions called from this one
279 (accrued-count 0 :type fixnum)
280 ;; the debug-info that this node was created from
281 (debug-info nil :type t)
282 ;; list of NODEs for functions calling this one
283 (callers () :type list))
285 ;;; A cycle in a call graph. The functions forming the cycle are
286 ;;; found in the SCC-VERTICES slot of the VERTEX structure.
287 (defstruct (cycle (:include node)))
289 ;;; An edge in a call graph. EDGE-VERTEX is the function being
291 (defstruct (call (:include edge)
292 (:constructor make-call (vertex)))
293 ;; number of times the call was sampled
294 (count 1 :type sb-impl::index))
296 ;;; Encapsulate all the information about a sampling run
298 (vector (make-array (* *max-samples* +sample-size+)) :type simple-vector)
299 (index 0 :type sb-impl::index)
300 (mode *sampling-mode* :type (member :cpu :alloc))
301 (sample-interval *sample-interval* :type number)
302 (alloc-interval *alloc-interval* :type number))
304 (defmethod print-object ((call-graph call-graph) stream)
305 (print-unreadable-object (call-graph stream :type t :identity t)
306 (format stream "~d samples" (call-graph-nsamples call-graph))))
308 (defmethod print-object ((node node) stream)
309 (print-unreadable-object (node stream :type t :identity t)
310 (format stream "~s [~d]" (node-name node) (node-index node))))
312 (defmethod print-object ((call call) stream)
313 (print-unreadable-object (call stream :type t :identity t)
314 (format stream "~s [~d]" (node-name (call-vertex call))
315 (node-index (call-vertex call)))))
317 (deftype report-type ()
318 '(member nil :flat :graph))
320 (defvar *sampling-mode* :cpu
321 "Default sampling mode. :CPU for cpu profiling, :ALLOC for allocation
323 (declaim (type (member :cpu :alloc) *sampling-mode*))
325 (defvar *sample-interval* 0.01
326 "Default number of seconds between samples.")
327 (declaim (number *sample-interval*))
329 (defvar *alloc-region-size*
332 ;; This hardcoded 2 matches the one in gc_find_freeish_pages. It's not
333 ;; really worth genesifying.
335 (* 2 sb-vm:gencgc-page-size))
336 (declaim (number *alloc-region-size*))
338 (defvar *alloc-interval* 4
339 "Default number of allocation region openings between samples.")
340 (declaim (number *alloc-interval*))
342 (defvar *max-samples* 50000
343 "Default number of samples taken.")
344 (declaim (type sb-impl::index *max-samples*))
346 ;; For every profiler event we store this many samples (frames 0-n on
348 (defconstant +sample-depth+
352 ;; We store two elements for each sample. The debug-info of the sample
353 ;; and either its absolute PC or a PC offset, depending on the type of
355 (defconstant +sample-size+ (* +sample-depth+ 2))
357 (defvar *samples* nil)
358 (declaim (type (or null samples) *samples*))
360 (defvar *profiling* nil)
361 (defvar *sampling* nil)
362 (declaim (type boolean *profiling* *sampling*))
364 (defvar *show-progress* nil)
366 (defvar *old-sampling* nil)
368 (defun turn-off-sampling ()
369 (setq *old-sampling* *sampling*)
370 (setq *sampling* nil))
372 (defun turn-on-sampling ()
373 (setq *sampling* *old-sampling*))
375 (defun show-progress (format-string &rest args)
376 (when *show-progress*
377 (apply #'format t format-string args)
380 (defun start-sampling ()
381 "Switch on statistical sampling."
384 (defun stop-sampling ()
385 "Switch off statistical sampling."
386 (setq *sampling* nil))
388 (defmacro with-sampling ((&optional (on t)) &body body)
389 "Evaluate body with statistical sampling turned on or off."
390 `(let ((*sampling* ,on)
391 (sb-vm:*alloc-signal* sb-vm:*alloc-signal*))
394 ;;; Return something serving as debug info for address PC.
395 (declaim (inline debug-info))
396 (defun debug-info (pc)
397 (declare (type system-area-pointer pc)
398 (muffle-conditions compiler-note))
399 (let ((ptr (sb-di::component-ptr-from-pc pc)))
400 (cond ((sap= ptr (int-sap 0))
401 (let ((name (sap-foreign-symbol pc)))
403 (values (format nil "foreign function ~a" name)
405 (values nil (sap-int pc)))))
407 (let* ((code (sb-di::component-from-component-ptr ptr))
408 (code-header-len (* (sb-kernel:get-header-data code)
410 (pc-offset (- (sap-int pc)
411 (- (sb-kernel:get-lisp-obj-address code)
412 sb-vm:other-pointer-lowtag)
414 (df (sb-di::debug-fun-from-pc code pc-offset)))
415 (cond ((typep df 'sb-di::bogus-debug-fun)
416 (values code (sap-int pc)))
418 ;; The code component might be moved by the GC. Store
419 ;; a PC offset, and reconstruct the data in
420 ;; SAMPLE-PC-FROM-PC-OR-OFFSET.
421 (values df pc-offset))
423 (values nil 0))))))))
425 (declaim (inline record))
427 (declare (type system-area-pointer pc)
428 (muffle-conditions compiler-note))
429 (multiple-value-bind (info pc-or-offset)
431 ;; For each sample, store the debug-info and the PC/offset into
433 (let ((vector (samples-vector *samples*)))
434 (setf (aref vector (samples-index *samples*)) info
435 (aref vector (1+ (samples-index *samples*))) pc-or-offset)))
436 (incf (samples-index *samples*) 2))
438 ;;; Ensure that only one thread at a time will be executing sigprof handler.
439 (defvar *sigprof-handler-lock* (sb-thread:make-mutex :name "SIGPROF handler"))
441 ;;; SIGPROF handler. Record current PC and return address in
444 (defun sigprof-handler (signal code scp)
445 (declare (ignore signal code)
446 (optimize speed (space 0))
447 (muffle-conditions compiler-note)
448 (type system-area-pointer scp))
449 (sb-sys:without-interrupts
450 (let ((sb-vm:*alloc-signal* nil))
451 (when (and *sampling*
453 (< (samples-index *samples*)
454 (length (samples-vector *samples*))))
455 (sb-sys:without-gcing
456 (sb-thread:with-mutex (*sigprof-handler-lock*)
457 (with-alien ((scp (* os-context-t) :local scp))
458 (let* ((pc-ptr (sb-vm:context-pc scp))
459 (fp (sb-vm::context-register scp #.sb-vm::ebp-offset)))
460 ;; For some reason completely bogus small values for the
461 ;; frame pointer are returned every now and then, leading
462 ;; to segfaults. Try to avoid these cases.
464 ;; FIXME: Do a more thorough sanity check on ebp, or figure
465 ;; out why this is happening.
466 ;; -- JES, 2005-01-11
468 (dotimes (i +sample-depth+)
469 (record (int-sap 0)))
470 (return-from sigprof-handler nil))
471 (let ((fp (int-sap fp))
473 (declare (type system-area-pointer fp pc-ptr))
474 (dotimes (i +sample-depth+)
477 (setf (values ok pc-ptr fp)
478 (sb-di::x86-call-context fp)))))))))))
479 ;; Reset the allocation counter
480 (when (and sb-vm:*alloc-signal*
481 (<= sb-vm:*alloc-signal* 0))
482 (setf sb-vm:*alloc-signal* (1- *alloc-interval*)))
485 ;; FIXME: On non-x86 platforms we don't yet walk the call stack deeper
488 (defun sigprof-handler (signal code scp)
489 (declare (ignore signal code))
490 (sb-sys:without-interrupts
491 (when (and *sampling*
492 (< (samples-index *samples*) (length (samples-vector *samples*))))
493 (sb-sys:without-gcing
494 (with-alien ((scp (* os-context-t) :local scp))
495 (locally (declare (optimize (inhibit-warnings 2)))
496 (let* ((pc-ptr (sb-vm:context-pc scp))
497 (fp (sb-vm::context-register scp #.sb-vm::cfp-offset))
500 (* sb-vm::lra-save-offset sb-vm::n-word-bytes))))
502 (record (int-sap ra)))))))))
504 ;;; Return the start address of CODE.
505 (defun code-start (code)
506 (declare (type sb-kernel:code-component code))
507 (sap-int (sb-kernel:code-instructions code)))
509 ;;; Return start and end address of CODE as multiple values.
510 (defun code-bounds (code)
511 (declare (type sb-kernel:code-component code))
512 (let* ((start (code-start code))
513 (end (+ start (sb-kernel:%code-code-size code))))
516 (defmacro with-profiling ((&key (sample-interval '*sample-interval*)
517 (alloc-interval '*alloc-interval*)
518 (max-samples '*max-samples*)
520 (mode '*sampling-mode*)
523 (report nil report-p))
525 "Repeatedly evaluate BODY with statistical profiling turned on.
526 In multi-threaded operation, only the thread in which WITH-PROFILING
527 was evaluated will be profiled by default. If you want to profile
528 multiple threads, invoke the profiler with START-PROFILING.
530 The following keyword args are recognized:
533 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
536 Take a sample every time <n> allocation regions (approximately
537 8kB) have been allocated since the last sample. Default is
541 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
542 the profiler in allocation profiling mode.
545 Repeat evaluating body until <max> samples are taken.
546 Default is *MAX-SAMPLES*.
549 If specified, call REPORT with :TYPE <type> at the end.
552 It true, call RESET at the beginning.
555 If true (the default) repeatedly evaluate BODY. If false, evaluate
557 (declare (type report-type report))
558 `(let* ((*sample-interval* ,sample-interval)
559 (*alloc-interval* ,alloc-interval)
561 (sb-vm:*alloc-signal* nil)
562 (*sampling-mode* ,mode)
563 (*max-samples* ,max-samples))
564 ,@(when reset '((reset)))
569 (when (>= (samples-index *samples*)
570 (length (samples-vector *samples*)))
572 ,@(when show-progress
573 `((format t "~&===> ~d of ~d samples taken.~%"
574 (/ (samples-index *samples*) +sample-size+)
576 (let ((.last-index. (samples-index *samples*)))
578 (when (= .last-index. (samples-index *samples*))
579 (warn "No sampling progress; possibly a profiler bug.")
584 ,@(when report-p `((report :type ,report)))))
586 (defun start-profiling (&key (max-samples *max-samples*)
587 (mode *sampling-mode*)
588 (sample-interval *sample-interval*)
589 (alloc-interval *alloc-interval*)
591 "Start profiling statistically if not already profiling.
592 The following keyword args are recognized:
595 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
598 Take a sample every time <n> allocation regions (approximately
599 8kB) have been allocated since the last sample. Default is
603 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
604 the profiler in allocation profiling mode.
607 Maximum number of samples. Default is *MAX-SAMPLES*.
610 If true, the default, start sampling right away.
611 If false, START-SAMPLING can be used to turn sampling on."
613 (when (eq mode :alloc)
614 (error "Allocation profiling is only supported for builds using the generational garbage collector."))
616 (multiple-value-bind (secs usecs)
617 (multiple-value-bind (secs rest)
618 (truncate sample-interval)
619 (values secs (truncate (* rest 1000000))))
620 (setf *sampling-mode* mode
621 *max-samples* max-samples
623 *samples* (make-samples))
624 (sb-sys:enable-interrupt sb-unix:sigprof #'sigprof-handler)
626 (setf sb-vm:*alloc-signal* (1- alloc-interval))
628 (unix-setitimer :profile secs usecs secs usecs)
629 (setf sb-vm:*alloc-signal* nil)))
630 (setq *profiling* t)))
633 (defun stop-profiling ()
634 "Stop profiling if profiling."
636 (unix-setitimer :profile 0 0 0 0)
637 ;; Even with the timer shut down we cannot be sure that there is
638 ;; no undelivered sigprof. Besides, leaving the signal handler
639 ;; installed won't hurt.
640 (setq *sampling* nil)
641 (setq sb-vm:*alloc-signal* nil)
642 (setq *profiling* nil))
646 "Reset the profiler."
648 (setq *sampling* nil)
652 ;;; Make a NODE for debug-info INFO.
653 (defun make-node (info)
654 (flet ((clean-name (name)
655 (if (and (consp name)
657 '(sb-c::xep sb-c::tl-xep sb-c::&more-processor
660 sb-c::hairy-arg-processor
661 sb-c::&optional-processor)))
665 (sb-kernel::code-component
666 (multiple-value-bind (start end)
669 (%make-node :name (or (sb-disassem::find-assembler-routine start)
670 (format nil "~a" info))
672 :start-pc-or-offset start
673 :end-pc-or-offset end)
675 (sb-di::compiled-debug-fun
676 (let* ((name (sb-di::debug-fun-name info))
677 (cdf (sb-di::compiled-debug-fun-compiler-debug-fun info))
678 (start-offset (sb-c::compiled-debug-fun-start-pc cdf))
679 (end-offset (sb-c::compiled-debug-fun-elsewhere-pc cdf))
680 (component (sb-di::compiled-debug-fun-component info))
681 (start-pc (code-start component)))
682 ;; Call graphs are mostly useless unless we somehow
683 ;; distinguish a gazillion different (LAMBDA ())'s.
684 (when (equal name '(lambda ()))
685 (setf name (format nil "Unknown component: #x~x" start-pc)))
686 (values (%make-node :name (clean-name name)
688 :start-pc-or-offset start-offset
689 :end-pc-or-offset end-offset)
692 (%make-node :name (clean-name (sb-di::debug-fun-name info))
695 (%make-node :name (coerce info 'string)
696 :debug-info info)))))
698 ;;; One function can have more than one COMPILED-DEBUG-FUNCTION with
699 ;;; the same name. Reduce the number of calls to Debug-Info by first
700 ;;; looking for a given PC in a red-black tree. If not found in the
701 ;;; tree, get debug info, and look for a node in a hash-table by
702 ;;; function name. If not found in the hash-table, make a new node.
704 (defvar *name->node*)
706 (defmacro with-lookup-tables (() &body body)
707 `(let ((*name->node* (make-hash-table :test 'equal)))
710 ;;; Find or make a new node for INFO. Value is the NODE found or
711 ;;; made; NIL if not enough information exists to make a NODE for INFO.
712 (defun lookup-node (info)
714 (multiple-value-bind (new key)
716 (let* ((key (cons (node-name new) key))
717 (found (gethash key *name->node*)))
719 (setf (node-start-pc-or-offset found)
720 (min (node-start-pc-or-offset found)
721 (node-start-pc-or-offset new)))
722 (setf (node-end-pc-or-offset found)
723 (max (node-end-pc-or-offset found)
724 (node-end-pc-or-offset new)))
727 (setf (gethash key *name->node*) new)
730 ;;; Return a list of all nodes created by LOOKUP-NODE.
731 (defun collect-nodes ()
732 (loop for node being the hash-values of *name->node*
735 ;;; Value is a CALL-GRAPH for the current contents of *SAMPLES*.
736 (defun make-call-graph-1 (depth)
737 (let ((elsewhere-count 0)
739 (with-lookup-tables ()
740 (loop for i below (- (samples-index *samples*) 2) by 2
741 for callee = (lookup-node (aref (samples-vector *samples*) i))
742 for caller = (lookup-node (aref (samples-vector *samples*) (+ i 2)))
744 (when (and *show-progress* (plusp i))
745 (cond ((zerop (mod i 1000))
746 (show-progress "~d" i))
748 (show-progress "."))))
749 (when (< (mod i +sample-size+) depth)
750 (when (= (mod i +sample-size+) 0)
751 (setf visited-nodes nil)
753 (incf (node-accrued-count callee))
754 (incf (node-count callee)))
756 (incf elsewhere-count))))
758 (push callee visited-nodes))
760 (unless (member caller visited-nodes)
761 (incf (node-accrued-count caller)))
763 (let ((call (find callee (node-edges caller)
764 :key #'call-vertex)))
765 (pushnew caller (node-callers callee))
767 (unless (member caller visited-nodes)
768 (incf (call-count call)))
769 (push (make-call callee) (node-edges caller))))))))
770 (let ((sorted-nodes (sort (collect-nodes) #'> :key #'node-count)))
771 (loop for node in sorted-nodes and i from 1 do
772 (setf (node-index node) i))
773 (%make-call-graph :nsamples (/ (samples-index *samples*) +sample-size+)
774 :sample-interval (if (eq (samples-mode *samples*)
776 (samples-alloc-interval *samples*)
777 (samples-sample-interval *samples*))
778 :sampling-mode (samples-mode *samples*)
779 :elsewhere-count elsewhere-count
780 :vertices sorted-nodes)))))
782 ;;; Reduce CALL-GRAPH to a dag, creating CYCLE structures for call
784 (defun reduce-call-graph (call-graph)
786 (flet ((make-one-cycle (vertices edges)
787 (let* ((name (format nil "<Cycle ~d>" (incf cycle-no)))
788 (count (loop for v in vertices sum (node-count v))))
789 (make-cycle :name name
792 :scc-vertices vertices
794 (reduce-graph call-graph #'make-one-cycle))))
796 ;;; For all nodes in CALL-GRAPH, compute times including the time
797 ;;; spent in functions called from them. Note that the call-graph
798 ;;; vertices are in reverse topological order, children first, so we
799 ;;; will have computed accrued counts of called functions before they
800 ;;; are used to compute accrued counts for callers.
801 (defun compute-accrued-counts (call-graph)
802 (do-vertices (from call-graph)
803 (setf (node-accrued-count from) (node-count from))
804 (do-edges (call to from)
805 (incf (node-accrued-count from)
806 (round (* (/ (call-count call) (node-count to))
807 (node-accrued-count to)))))))
809 ;;; Return a CALL-GRAPH structure for the current contents of
810 ;;; *SAMPLES*. The result contain a list of nodes sorted by self-time
811 ;;; in the FLAT-NODES slot, and a dag in VERTICES, with call cycles
812 ;;; reduced to CYCLE structures.
813 (defun make-call-graph (depth)
815 (show-progress "~&Computing call graph ")
816 (let ((call-graph (without-gcing (make-call-graph-1 depth))))
817 (setf (call-graph-flat-nodes call-graph)
818 (copy-list (graph-vertices call-graph)))
819 (show-progress "~&Finding cycles")
821 (reduce-call-graph call-graph)
822 (show-progress "~&Propagating counts")
824 (compute-accrued-counts call-graph)
830 (defun print-separator (&key (length 72) (char #\-))
831 (format t "~&~V,,,V<~>~%" length char))
833 (defun samples-percent (call-graph count)
835 (* 100.0 (/ count (call-graph-nsamples call-graph)))
838 (defun print-call-graph-header (call-graph)
839 (let ((nsamples (call-graph-nsamples call-graph))
840 (interval (call-graph-sample-interval call-graph))
841 (ncycles (loop for v in (graph-vertices call-graph)
843 (if (eq (call-graph-sampling-mode call-graph) :alloc)
844 (format t "~2&Number of samples: ~d~%~
845 Sample interval: ~a regions (approximately ~a kB)~%~
846 Total sampling amount: ~a regions (approximately ~a kB)~%~
847 Number of cycles: ~d~2%"
850 (truncate (* interval *alloc-region-size*) 1024)
851 (* nsamples interval)
852 (truncate (* nsamples interval *alloc-region-size*) 1024)
854 (format t "~2&Number of samples: ~d~%~
855 Sample interval: ~f seconds~%~
856 Total sampling time: ~f seconds~%~
857 Number of cycles: ~d~2%"
860 (* nsamples interval)
863 (defun print-flat (call-graph &key (stream *standard-output*) max
864 min-percent (print-header t))
865 (let ((*standard-output* stream)
869 (min-count (if min-percent
870 (round (* (/ min-percent 100.0)
871 (call-graph-nsamples call-graph)))
874 (print-call-graph-header call-graph))
875 (format t "~& Self Total Cumul~%")
876 (format t "~& Nr Count % Count % Count % Function~%")
878 (let ((elsewhere-count (call-graph-elsewhere-count call-graph))
880 (dolist (node (call-graph-flat-nodes call-graph))
881 (when (or (and max (> (incf i) max))
882 (< (node-count node) min-count))
884 (let* ((count (node-count node))
885 (percent (samples-percent call-graph count))
886 (accrued-count (node-accrued-count node))
887 (accrued-percent (samples-percent call-graph accrued-count)))
888 (incf total-count count)
889 (incf total-percent percent)
890 (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~s~%"
901 (format t "~& ~6d ~5,1f elsewhere~%"
903 (samples-percent call-graph elsewhere-count)))))
905 (defun print-cycles (call-graph)
906 (when (some #'cycle-p (graph-vertices call-graph))
907 (format t "~& Cycle~%")
908 (format t "~& Count % Parts~%")
909 (do-vertices (node call-graph)
911 (flet ((print-info (indent index count percent name)
912 (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
913 count percent indent name index)))
915 (format t "~&~6d ~5,1f ~a...~%"
917 (samples-percent call-graph (cycle-count node))
919 (dolist (v (vertex-scc-vertices node))
920 (print-info 4 (node-index v) (node-count v)
921 (samples-percent call-graph (node-count v))
926 (defun print-graph (call-graph &key (stream *standard-output*)
928 (let ((*standard-output* stream)
929 (*print-pretty* nil))
930 (print-call-graph-header call-graph)
931 (print-cycles call-graph)
932 (flet ((find-call (from to)
933 (find to (node-edges from) :key #'call-vertex))
934 (print-info (indent index count percent name)
935 (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
936 count percent indent name index)))
937 (format t "~& Callers~%")
938 (format t "~& Total. Function~%")
939 (format t "~& Count % Count % Callees~%")
940 (do-vertices (node call-graph)
943 ;; Print caller information.
944 (dolist (caller (node-callers node))
945 (let ((call (find-call caller node)))
946 (print-info 4 (node-index caller)
948 (samples-percent call-graph (call-count call))
949 (node-name caller))))
950 ;; Print the node itself.
951 (format t "~&~6d ~5,1f ~6d ~5,1f ~s [~d]~%"
953 (samples-percent call-graph (node-count node))
954 (node-accrued-count node)
955 (samples-percent call-graph (node-accrued-count node))
959 (do-edges (call called node)
960 (print-info 4 (node-index called)
962 (samples-percent call-graph (call-count call))
963 (node-name called))))
966 (print-flat call-graph :stream stream :max max
967 :min-percent min-percent :print-header nil))))
969 (defun report (&key (type :graph) max min-percent call-graph
970 (stream *standard-output*) ((:show-progress *show-progress*)))
971 "Report statistical profiling results. The following keyword
975 Specifies the type of report to generate. If :FLAT, show
976 flat report, if :GRAPH show a call graph and a flat report.
977 If nil, don't print out a report.
980 Specify a stream to print the report on. Default is
984 Don't show more than <max> entries in the flat report.
986 :MIN-PERCENT <min-percent>
987 Don't show functions taking less than <min-percent> of the
988 total time in the flat report.
990 :SHOW-PROGRESS <bool>
991 If true, print progress messages while generating the call graph.
994 Print a report from <graph> instead of the latest profiling
997 Value of this function is a CALL-GRAPH object representing the
998 resulting call-graph."
999 (let ((graph (or call-graph (make-call-graph (1- +sample-depth+)))))
1002 (print-flat graph :stream stream :max max :min-percent min-percent))
1004 (print-graph graph :stream stream :max max :min-percent min-percent))
1008 ;;; Interface to DISASSEMBLE
1010 (defun sample-pc-from-pc-or-offset (sample pc-or-offset)
1012 ;; Assembly routines or foreign functions don't move around, so we've
1014 ((or sb-kernel:code-component string)
1016 ;; Lisp functions might move, so we've stored a offset from the
1017 ;; start of the code component.
1018 (sb-di::compiled-debug-fun
1019 (let* ((component (sb-di::compiled-debug-fun-component sample))
1020 (start-pc (code-start component)))
1021 (+ start-pc pc-or-offset)))))
1023 (defun add-disassembly-profile-note (chunk stream dstate)
1024 (declare (ignore chunk stream))
1025 (unless (zerop (samples-index *samples*))
1026 (let* ((location (+ (sb-disassem::seg-virtual-location
1027 (sb-disassem:dstate-segment dstate))
1028 (sb-disassem::dstate-cur-offs dstate)))
1029 (samples (loop with index = (samples-index *samples*)
1030 for x from 0 below index by +sample-size+
1031 for sample = (aref (samples-vector *samples*) x)
1032 for pc-or-offset = (aref (samples-vector *samples*)
1036 (sample-pc-from-pc-or-offset sample
1038 (unless (zerop samples)
1039 (sb-disassem::note (format nil "~A/~A samples"
1040 samples (/ (samples-index *samples*)
1044 (pushnew 'add-disassembly-profile-note sb-disassem::*default-dstate-hooks*)
1048 (defun test-0 (n &optional (depth 0))
1049 (declare (optimize (debug 3)))
1052 (test-0 n (1+ depth))
1053 (test-0 n (1+ depth)))))
1056 (with-profiling (:reset t :max-samples 1000 :report :graph)