X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=contrib%2Fsb-sprof%2Fsb-sprof.lisp;h=dd1760c0d5164b8706208db6b6c9dbc73610c6ca;hb=f7a78dd3554bd977b006e5da349a11d4e8463bb5;hp=4fa612b7148389c0471d06215a9fe81b4298d912;hpb=88bcaaccbca230278e26b06d7519c66d3d65e3c2;p=sbcl.git diff --git a/contrib/sb-sprof/sb-sprof.lisp b/contrib/sb-sprof/sb-sprof.lisp index 4fa612b..dd1760c 100644 --- a/contrib/sb-sprof/sb-sprof.lisp +++ b/contrib/sb-sprof/sb-sprof.lisp @@ -51,7 +51,7 @@ ;;; 18: pop dword ptr [ebp-8] ;;; 1B: lea esp, [ebp-32] ;;; 1E: mov edi, edx -;;; +;;; ;;; 20: cmp ecx, 4 ;;; 23: jne L4 ;;; 29: mov [ebp-12], edi @@ -82,10 +82,7 @@ ;;; the stack is something x86-call-context can't really cope with, ;;; this is not a general solution. ;;; -;;; Random ideas for implementation: -;;; -;;; * Show a disassembly of a function annotated with sampling -;;; information. +;;; Random ideas for implementation: ;;; ;;; * Space profiler. Sample when new pages are allocated instead of ;;; at SIGPROF. @@ -102,10 +99,12 @@ (defpackage #:sb-sprof (:use #:cl #:sb-ext #:sb-unix #:sb-alien #:sb-sys) - (:export #:*sample-interval* #:*max-samples* - #:start-sampling #:stop-sampling #:with-sampling - #:with-profiling #:start-profiling #:stop-profiling - #:reset #:report)) + (:export #:*sample-interval* #:*max-samples* #:*alloc-interval* + #:*report-sort-by* #:*report-sort-order* + #:start-sampling #:stop-sampling #:with-sampling + #:with-profiling #:start-profiling #:stop-profiling + #:profile-call-counts #:unprofile-call-counts + #:reset #:report)) (in-package #:sb-sprof) @@ -113,7 +112,7 @@ ;;;; Graph Utilities (defstruct (vertex (:constructor make-vertex) - (:constructor make-scc (scc-vertices edges))) + (:constructor make-scc (scc-vertices edges))) (visited nil :type boolean) (root nil :type (or null vertex)) (dfn 0 :type fixnum) @@ -158,42 +157,42 @@ ;;; Tarjan. (defun strong-components (vertices) (let ((in-component (make-array (length vertices) - :element-type 'boolean - :initial-element nil)) - (stack ()) - (components ()) - (dfn -1)) + :element-type 'boolean + :initial-element nil)) + (stack ()) + (components ()) + (dfn -1)) (labels ((min-root (x y) - (let ((rx (vertex-root x)) - (ry (vertex-root y))) - (if (< (vertex-dfn rx) (vertex-dfn ry)) - rx - ry))) - (in-component (v) - (aref in-component (vertex-dfn v))) - ((setf in-component) (in v) - (setf (aref in-component (vertex-dfn v)) in)) - (vertex-> (x y) - (> (vertex-dfn x) (vertex-dfn y))) - (visit (v) - (setf (vertex-dfn v) (incf dfn) - (in-component v) nil - (vertex-root v) v - (vertex-visited v) t) - (do-edges (e w v) - (unless (vertex-visited w) - (visit w)) - (unless (in-component w) - (setf (vertex-root v) (min-root v w)))) - (if (eq v (vertex-root v)) - (loop while (and stack (vertex-> (car stack) v)) - as w = (pop stack) - collect w into this-component - do (setf (in-component w) t) - finally - (setf (in-component v) t) - (push (cons v this-component) components)) - (push v stack)))) + (let ((rx (vertex-root x)) + (ry (vertex-root y))) + (if (< (vertex-dfn rx) (vertex-dfn ry)) + rx + ry))) + (in-component (v) + (aref in-component (vertex-dfn v))) + ((setf in-component) (in v) + (setf (aref in-component (vertex-dfn v)) in)) + (vertex-> (x y) + (> (vertex-dfn x) (vertex-dfn y))) + (visit (v) + (setf (vertex-dfn v) (incf dfn) + (in-component v) nil + (vertex-root v) v + (vertex-visited v) t) + (do-edges (e w v) + (unless (vertex-visited w) + (visit w)) + (unless (in-component w) + (setf (vertex-root v) (min-root v w)))) + (if (eq v (vertex-root v)) + (loop while (and stack (vertex-> (car stack) v)) + as w = (pop stack) + collect w into this-component + do (setf (in-component w) t) + finally + (setf (in-component v) t) + (push (cons v this-component) components)) + (push v stack)))) (map-vertices #'visit vertices) components))) @@ -201,14 +200,14 @@ ;;; topologically, children first. (defun topological-sort (dag) (let ((sorted ()) - (dfn -1)) + (dfn -1)) (labels ((rec-sort (v) - (setf (vertex-visited v) t) - (setf (vertex-dfn v) (incf dfn)) - (dolist (e (vertex-edges v)) - (unless (vertex-visited (edge-vertex e)) - (rec-sort (edge-vertex e)))) - (push v sorted))) + (setf (vertex-visited v) t) + (setf (vertex-dfn v) (incf dfn)) + (dolist (e (vertex-edges v)) + (unless (vertex-visited (edge-vertex e)) + (rec-sort (edge-vertex e)))) + (push v sorted))) (map-vertices #'rec-sort dag) (nreverse sorted)))) @@ -218,215 +217,27 @@ (sb-int:collect ((sccs) (trivial)) (dolist (c (strong-components (graph-vertices graph))) (if (or (cdr c) (self-cycle-p (car c))) - (sb-int:collect ((outgoing)) - (dolist (v c) - (do-edges (e w v) - (unless (member w c) - (outgoing e)))) - (sccs (funcall scc-constructor c (outgoing)))) - (trivial (car c)))) + (sb-int:collect ((outgoing)) + (dolist (v c) + (do-edges (e w v) + (unless (member w c) + (outgoing e)))) + (sccs (funcall scc-constructor c (outgoing)))) + (trivial (car c)))) (dolist (scc (sccs)) (dolist (v (trivial)) - (do-edges (e w v) - (when (member w (vertex-scc-vertices scc)) - (setf (edge-vertex e) scc))))) + (do-edges (e w v) + (when (member w (vertex-scc-vertices scc)) + (setf (edge-vertex e) scc))))) (setf (graph-vertices graph) - (topological-sort (nconc (sccs) (trivial)))))) - - -;;;; AA Trees - -;;; An AA tree is a red-black tree with the extra condition that left -;;; children may not be red. This condition simplifies the red-black -;;; algorithm. It eliminates half of the restructuring cases, and -;;; simplifies the delete algorithm. - -(defstruct (aa-node (:conc-name aa-)) - (left nil :type (or null aa-node)) - (right nil :type (or null aa-node)) - (level 0 :type integer) - (data nil :type t)) - -(defvar *null-node* - (let ((node (make-aa-node))) - (setf (aa-left node) node) - (setf (aa-right node) node) - node)) - -(defstruct aa-tree - (root *null-node* :type aa-node)) - -(declaim (inline skew split rotate-with-left-child rotate-with-right-child)) - -(defun rotate-with-left-child (k2) - (let ((k1 (aa-left k2))) - (setf (aa-left k2) (aa-right k1)) - (setf (aa-right k1) k2) - k1)) - -(defun rotate-with-right-child (k1) - (let ((k2 (aa-right k1))) - (setf (aa-right k1) (aa-left k2)) - (setf (aa-left k2) k1) - k2)) - -(defun skew (aa) - (if (= (aa-level (aa-left aa)) (aa-level aa)) - (rotate-with-left-child aa) - aa)) - -(defun split (aa) - (when (= (aa-level (aa-right (aa-right aa))) - (aa-level aa)) - (setq aa (rotate-with-right-child aa)) - (incf (aa-level aa))) - aa) - -(macrolet ((def (name () &body body) - (let ((name (sb-int::symbolicate 'aa- name))) - `(defun ,name (item tree &key - (test-< #'<) (test-= #'=) - (node-key #'identity) (item-key #'identity)) - (let ((.item-key. (funcall item-key item))) - (flet ((item-< (node) - (funcall test-< .item-key. - (funcall node-key (aa-data node)))) - (item-= (node) - (funcall test-= .item-key. - (funcall node-key (aa-data node))))) - (declare (inline item-< item-=)) - ,@body)))))) - - (def insert () - (labels ((insert-into (aa) - (cond ((eq aa *null-node*) - (setq aa (make-aa-node :data item - :left *null-node* - :right *null-node*))) - ((item-= aa) - (return-from insert-into aa)) - ((item-< aa) - (setf (aa-left aa) (insert-into (aa-left aa)))) - (t - (setf (aa-right aa) (insert-into (aa-right aa))))) - (split (skew aa)))) - (setf (aa-tree-root tree) - (insert-into (aa-tree-root tree))))) - - (def delete () - (let ((deleted-node *null-node*) - (last-node nil)) - (labels ((remove-from (aa) - (unless (eq aa *null-node*) - (setq last-node aa) - (if (item-< aa) - (setf (aa-left aa) (remove-from (aa-left aa))) - (progn - (setq deleted-node aa) - (setf (aa-right aa) (remove-from (aa-right aa))))) - (cond ((eq aa last-node) - ;; - ;; If at the bottom of the tree, and item - ;; is present, delete it. - (when (and (not (eq deleted-node *null-node*)) - (item-= deleted-node)) - (setf (aa-data deleted-node) (aa-data aa)) - (setq deleted-node *null-node*) - (setq aa (aa-right aa)))) - ;; - ;; Otherwise not at bottom of tree; rebalance. - ((or (< (aa-level (aa-left aa)) - (1- (aa-level aa))) - (< (aa-level (aa-right aa)) - (1- (aa-level aa)))) - (decf (aa-level aa)) - (when (> (aa-level (aa-right aa)) (aa-level aa)) - (setf (aa-level (aa-right aa)) (aa-level aa))) - (setq aa (skew aa)) - (setf (aa-right aa) (skew (aa-right aa))) - (setf (aa-right (aa-right aa)) - (skew (aa-right (aa-right aa)))) - (setq aa (split aa)) - (setf (aa-right aa) (split (aa-right aa)))))) - aa)) - (setf (aa-tree-root tree) - (remove-from (aa-tree-root tree)))))) - - (def find () - (let ((current (aa-tree-root tree))) - (setf (aa-data *null-node*) item) - (loop - (cond ((eq current *null-node*) - (return (values nil nil))) - ((item-= current) - (return (values (aa-data current) t))) - ((item-< current) - (setq current (aa-left current))) - (t - (setq current (aa-right current)))))))) - - -;;;; Other Utilities - -;;; Sort the subsequence of Vec in the interval [From To] using -;;; comparison function Test. Assume each element to sort consists of -;;; Element-Size array slots, and that the slot Key-Offset contains -;;; the sort key. -(defun qsort (vec &key (element-size 1) (key-offset 0) - (from 0) (to (- (length vec) element-size))) - (declare (type fixnum to from element-size key-offset)) - (declare (type (simple-array address) vec)) - (labels ((rotate (i j) - (declare (fixnum i j)) - (loop repeat element-size - for i from i and j from j do - (rotatef (aref vec i) (aref vec j)))) - (key (i) - (aref vec (+ i key-offset))) - (rec-sort (from to) - (declare (fixnum to from)) - (when (> to from) - (let* ((mid (* element-size - (round (+ (/ from element-size) - (/ to element-size)) - 2))) - (i from) - (j (+ to element-size)) - (p (key mid))) - (declare (fixnum mid i j)) - (rotate mid from) - (loop - (loop do (incf i element-size) - until (or (> i to) - ;; QSORT used to take a test - ;; parameter which was funcalled - ;; here. This caused some consing, - ;; which is problematic since - ;; QSORT is indirectly called in - ;; an after-gc-hook. So just - ;; hardcode >, which would've been - ;; used for the test anyway. - ;; --JES, 2004-07-09 - (> p (key i)))) - (loop do (decf j element-size) - until (or (<= j from) - ;; As above. - (> (key j) p))) - (when (< j i) (return)) - (rotate i j)) - (rotate from j) - (rec-sort from (- j element-size)) - (rec-sort i to))))) - (rec-sort from to) - vec)) - + (topological-sort (nconc (sccs) (trivial)))))) ;;;; The Profiler (deftype address () "Type used for addresses, for instance, program counters, code start/end locations etc." - '(unsigned-byte #+alpha 64 #-alpha 32)) + '(unsigned-byte #.sb-vm::n-machine-word-bits)) (defconstant +unknown-address+ 0 "Constant representing an address that cannot be determined.") @@ -434,13 +245,18 @@ ;;; A call graph. Vertices are NODE structures, edges are CALL ;;; structures. (defstruct (call-graph (:include graph) - (:constructor %make-call-graph)) - ;; the value of *Sample-Interval* at the time the graph was created + (:constructor %make-call-graph)) + ;; the value of *SAMPLE-INTERVAL* or *ALLOC-INTERVAL* at the time + ;; the graph was created (depending on the current allocation mode) (sample-interval (sb-impl::missing-arg) :type number) + ;; the sampling-mode that was used for the profiling run + (sampling-mode (sb-impl::missing-arg) :type (member :cpu :alloc :time)) ;; number of samples taken - (nsamples (sb-impl::missing-arg) :type sb-impl::index) + (nsamples (sb-impl::missing-arg) :type sb-int:index) + ;; threads that have been sampled + (sampled-threads nil :type list) ;; sample count for samples not in any function - (elsewhere-count (sb-impl::missing-arg) :type sb-impl::index) + (elsewhere-count (sb-impl::missing-arg) :type sb-int:index) ;; a flat list of NODEs, sorted by sample count (flat-nodes () :type list)) @@ -448,22 +264,30 @@ ;;; sampled. The edges of a node are CALL structures that represent ;;; functions called from a given node. (defstruct (node (:include vertex) - (:constructor %make-node)) + (:constructor %make-node)) ;; A numeric label for the node. The most frequently called function ;; gets label 1. This is just for identification purposes in the ;; profiling report. (index 0 :type fixnum) - ;; start and end address of the function's code - (start-pc 0 :type address) - (end-pc 0 :type address) + ;; Start and end address of the function's code. Depending on the + ;; debug-info, this might be either as absolute addresses for things + ;; that won't move around in memory, or as relative offsets from + ;; some point for things that might move. + (start-pc-or-offset 0 :type address) + (end-pc-or-offset 0 :type address) ;; the name of the function (name nil :type t) ;; sample count for this function (count 0 :type fixnum) ;; count including time spent in functions called from this one (accrued-count 0 :type fixnum) + ;; the debug-info that this node was created from + (debug-info nil :type t) ;; list of NODEs for functions calling this one - (callers () :type list)) + (callers () :type list) + ;; the call count for the function that corresponds to this node (or NIL + ;; if call counting wasn't enabled for this function) + (call-count nil :type (or null integer))) ;;; A cycle in a call graph. The functions forming the cycle are ;;; found in the SCC-VERTICES slot of the VERTEX structure. @@ -472,20 +296,45 @@ ;;; An edge in a call graph. EDGE-VERTEX is the function being ;;; called. (defstruct (call (:include edge) - (:constructor make-call (vertex))) + (:constructor make-call (vertex))) ;; number of times the call was sampled - (count 1 :type sb-impl::index)) - -;;; Info about a function in dynamic-space. This is used to track -;;; address changes of functions during GC. -(defstruct (dyninfo (:constructor make-dyninfo (code start end))) - ;; component this info is for - (code (sb-impl::missing-arg) :type sb-kernel::code-component) - ;; current start and end address of the component - (start (sb-impl::missing-arg) :type address) - (end (sb-impl::missing-arg) :type address) - ;; new start address of the component, after GC. - (new-start 0 :type address)) + (count 1 :type sb-int:index)) + +(defvar *sample-interval* 0.01 + "Default number of seconds between samples.") +(declaim (type number *sample-interval*)) + +(defvar *alloc-interval* 4 + "Default number of allocation region openings between samples.") +(declaim (type number *alloc-interval*)) + +(defvar *max-samples* 50000 + "Default number of traces taken. This variable is somewhat misnamed: +each trace may actually consist of an arbitrary number of samples, depending +on the depth of the call stack.") +(declaim (type sb-int:index *max-samples*)) + +;;; Encapsulate all the information about a sampling run +(defstruct (samples) + ;; When this vector fills up, we allocate a new one and copy over + ;; the old contents. + (vector (make-array (* *max-samples* + ;; Arbitrary guess at how many samples we'll be + ;; taking for each trace. The exact amount doesn't + ;; matter, this is just to decrease the amount of + ;; re-allocation that will need to be done. + 10 + ;; Each sample takes two cells in the vector + 2)) + :type simple-vector) + (trace-count 0 :type sb-int:index) + (index 0 :type sb-int:index) + (mode nil :type (member :cpu :alloc :time)) + (sample-interval (sb-int:missing-arg) :type number) + (alloc-interval (sb-int:missing-arg) :type number) + (max-depth most-positive-fixnum :type number) + (max-samples (sb-int:missing-arg) :type sb-int:index) + (sampled-threads nil :type list)) (defmethod print-object ((call-graph call-graph) stream) (print-unreadable-object (call-graph stream :type t :identity t) @@ -498,38 +347,38 @@ (defmethod print-object ((call call) stream) (print-unreadable-object (call stream :type t :identity t) (format stream "~s [~d]" (node-name (call-vertex call)) - (node-index (call-vertex call))))) + (node-index (call-vertex call))))) (deftype report-type () '(member nil :flat :graph)) -(defvar *sample-interval* 0.01 - "Default number of seconds between samples.") -(declaim (number *sample-interval*)) - -(defvar *max-samples* 50000 - "Default number of samples taken.") -(declaim (type sb-impl::index *max-samples*)) +(defvar *sampling-mode* :cpu + "Default sampling mode. :CPU for cpu profiling, :ALLOC for allocation +profiling") +(declaim (type (member :cpu :alloc :time) *sampling-mode*)) -(defconstant +sample-size+ 2) +(defvar *alloc-region-size* + #-gencgc + (get-page-size) + #+gencgc + (max sb-vm:gencgc-alloc-granularity sb-vm:gencgc-card-bytes)) +(declaim (type number *alloc-region-size*)) (defvar *samples* nil) -(declaim (type (or null (vector address)) *samples*)) - -(defvar *samples-index* 0) -(declaim (type sb-impl::index *samples-index*)) +(declaim (type (or null samples) *samples*)) (defvar *profiling* nil) +(declaim (type (member nil :alloc :cpu :time) *profiling*)) (defvar *sampling* nil) -(declaim (type boolean *profiling* *sampling*)) - -(defvar *dynamic-space-code-info* ()) -(declaim (type list *dynamic-space-code-info*)) +(declaim (type boolean *sampling*)) (defvar *show-progress* nil) (defvar *old-sampling* nil) +;; Call count encapsulation information +(defvar *encapsulations* (make-hash-table :test 'equal)) + (defun turn-off-sampling () (setq *old-sampling* *sampling*) (setq *sampling* nil)) @@ -552,67 +401,228 @@ (defmacro with-sampling ((&optional (on t)) &body body) "Evaluate body with statistical sampling turned on or off." - `(let ((*sampling* ,on)) + `(let ((*sampling* ,on) + (sb-vm:*alloc-signal* sb-vm:*alloc-signal*)) ,@body)) -(defun sort-samples (&key (key :pc)) - "Sort *Samples* using comparison Test. Key must be one of - :Pc or :Return-Pc for sorting by pc or return pc." - (declare (type (member :pc :return-pc) key)) - (when (plusp *samples-index*) - (qsort *samples* - :from 0 - :to (- *samples-index* +sample-size+) - :element-size +sample-size+ - :key-offset (if (eq key :pc) 0 1)))) - -(defun record (pc) - (declare (type address pc)) - (setf (aref *samples* *samples-index*) pc) - (incf *samples-index*)) - -;;; SIGPROF handler. Record current PC and return address in -;;; *SAMPLES*. -#+x86 -(defun sigprof-handler (signal code scp) - (declare (ignore signal code) (type system-area-pointer scp)) - (when (and *sampling* - (< *samples-index* (length *samples*))) - (sb-sys:without-gcing - (with-alien ((scp (* os-context-t) :local scp)) - (locally (declare (optimize (inhibit-warnings 2))) - (let* ((pc-ptr (sb-vm:context-pc scp)) - (fp (sb-vm::context-register scp #.sb-vm::ebp-offset)) - (ra (sap-ref-32 (int-sap fp) - (- (* (1+ sb-vm::return-pc-save-offset) - sb-vm::n-word-bytes))))) - (record (sap-int pc-ptr)) - (record ra))))))) - -#-x86 +;;; Return something serving as debug info for address PC. +(declaim (inline debug-info)) +(defun debug-info (pc) + (declare (type system-area-pointer pc) + (muffle-conditions compiler-note)) + (let ((ptr (sb-di::component-ptr-from-pc pc))) + (cond ((sap= ptr (int-sap 0)) + (let ((name (sap-foreign-symbol pc))) + (if name + (values (format nil "foreign function ~a" name) + (sap-int pc)) + (values nil (sap-int pc))))) + (t + (let* ((code (sb-di::component-from-component-ptr ptr)) + (code-header-len (* (sb-kernel:get-header-data code) + sb-vm:n-word-bytes)) + (pc-offset (- (sap-int pc) + (- (sb-kernel:get-lisp-obj-address code) + sb-vm:other-pointer-lowtag) + code-header-len)) + (df (sb-di::debug-fun-from-pc code pc-offset))) + (cond ((typep df 'sb-di::bogus-debug-fun) + (values code (sap-int pc))) + (df + ;; The code component might be moved by the GC. Store + ;; a PC offset, and reconstruct the data in + ;; SAMPLE-PC-FROM-PC-OR-OFFSET. + (values df pc-offset)) + (t + (values nil 0)))))))) + +(defun ensure-samples-vector (samples) + (let ((vector (samples-vector samples)) + (index (samples-index samples))) + ;; Allocate a new sample vector if the old one is full + (if (= (length vector) index) + (let ((new-vector (make-array (* 2 index)))) + (format *trace-output* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%" + (samples-trace-count samples) + (truncate index 2)) + (replace new-vector vector) + (setf (samples-vector samples) new-vector)) + vector))) + +(declaim (inline record)) +(defun record (samples pc) + (declare (type system-area-pointer pc) + (muffle-conditions compiler-note)) + (multiple-value-bind (info pc-or-offset) + (debug-info pc) + (let ((vector (ensure-samples-vector samples)) + (index (samples-index samples))) + (declare (type simple-vector vector)) + ;; Allocate a new sample vector if the old one is full + (when (= (length vector) index) + (let ((new-vector (make-array (* 2 index)))) + (format *trace-output* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%" + (samples-trace-count samples) + (truncate index 2)) + (replace new-vector vector) + (setf vector new-vector + (samples-vector samples) new-vector))) + ;; For each sample, store the debug-info and the PC/offset into + ;; adjacent cells. + (setf (aref vector index) info + (aref vector (1+ index)) pc-or-offset))) + (incf (samples-index samples) 2)) + +(defun record-trace-start (samples) + ;; Mark the start of the trace. + (let ((vector (ensure-samples-vector samples))) + (declare (type simple-vector vector)) + (setf (aref vector (samples-index samples)) + 'trace-start)) + (incf (samples-index samples) 2)) + +;;; List of thread currently profiled, or :ALL for all threads. +(defvar *profiled-threads* nil) +(declaim (type (or list (member :all)) *profiled-threads*)) + +;;; Thread which runs the wallclock timers, if any. +(defvar *timer-thread* nil) + +(defun profiled-threads () + (let ((profiled-threads *profiled-threads*)) + (remove *timer-thread* + (if (eq :all profiled-threads) + (sb-thread:list-all-threads) + profiled-threads)))) + +(defun profiled-thread-p (thread) + (let ((profiled-threads *profiled-threads*)) + (or (and (eq :all profiled-threads) + (not (eq *timer-thread* thread))) + (member thread profiled-threads :test #'eq)))) + +#+(or x86 x86-64) +(progn + ;; Ensure that only one thread at a time will be doing profiling stuff. + (defvar *profiler-lock* (sb-thread:make-mutex :name "Statistical Profiler")) + (defvar *distribution-lock* (sb-thread:make-mutex :name "Wallclock profiling lock")) + + (declaim (inline pthread-kill)) + (define-alien-routine pthread-kill int (os-thread unsigned-long) (signal int)) + + ;;; A random thread will call this in response to either a timer firing, + ;;; This in turn will distribute the notice to those threads we are + ;;; interested using SIGPROF. + (defun thread-distribution-handler () + (declare (optimize speed (space 0))) + (when *sampling* + #+sb-thread + (let ((lock *distribution-lock*)) + ;; Don't flood the system with more interrupts if the last + ;; set is still being delivered. + (unless (sb-thread:mutex-value lock) + (sb-thread::with-system-mutex (lock) + (dolist (thread (profiled-threads)) + ;; This may occasionally fail to deliver the signal, but that + ;; seems better then using kill_thread_safely with it's 1 + ;; second backoff. + (let ((os-thread (sb-thread::thread-os-thread thread))) + (when os-thread + (pthread-kill os-thread sb-unix:sigprof))))))) + #-sb-thread + (unix-kill 0 sb-unix:sigprof))) + + (defun sigprof-handler (signal code scp) + (declare (ignore signal code) (optimize speed (space 0)) + (disable-package-locks sb-di::x86-call-context) + (muffle-conditions compiler-note) + (type system-area-pointer scp)) + (let ((self sb-thread:*current-thread*) + (profiling *profiling*)) + ;; Turn off allocation counter when it is not needed. Doing this in the + ;; signal handler means we don't have to worry about racing with the runtime + (unless (eq :alloc profiling) + (setf sb-vm::*alloc-signal* nil)) + (when (and *sampling* + ;; Normal SIGPROF gets practically speaking delivered to threads + ;; depending on the run time they use, so we need to filter + ;; out those we don't care about. For :ALLOC and :TIME profiling + ;; only the interesting threads get SIGPROF in the first place. + ;; + ;; ...except that Darwin at least doesn't seem to work like we + ;; would want it to, which makes multithreaded :CPU profiling pretty + ;; pointless there -- though it may be that our mach magic is + ;; partially to blame? + (or (not (eq :cpu profiling)) (profiled-thread-p self))) + (sb-thread::with-system-mutex (*profiler-lock* :without-gcing t) + (let ((samples *samples*)) + (when (and samples + (< (samples-trace-count samples) + (samples-max-samples samples))) + (with-alien ((scp (* os-context-t) :local scp)) + (let* ((pc-ptr (sb-vm:context-pc scp)) + (fp (sb-vm::context-register scp #.sb-vm::ebp-offset))) + ;; foreign code might not have a useful frame + ;; pointer in ebp/rbp, so make sure it looks + ;; reasonable before walking the stack + (unless (sb-di::control-stack-pointer-valid-p (sb-sys:int-sap fp)) + (record samples pc-ptr) + (return-from sigprof-handler nil)) + (incf (samples-trace-count samples)) + (pushnew self (samples-sampled-threads samples)) + (let ((fp (int-sap fp)) + (ok t)) + (declare (type system-area-pointer fp pc-ptr)) + ;; FIXME: How annoying. The XC doesn't store enough + ;; type information about SB-DI::X86-CALL-CONTEXT, + ;; even if we declaim the ftype explicitly in + ;; src/code/debug-int. And for some reason that type + ;; information is needed for the inlined version to + ;; be compiled without boxing the returned saps. So + ;; we declare the correct ftype here manually, even + ;; if the compiler should be able to deduce this + ;; exact same information. + (declare (ftype (function (system-area-pointer) + (values (member nil t) + system-area-pointer + system-area-pointer)) + sb-di::x86-call-context)) + (record-trace-start samples) + (dotimes (i (samples-max-depth samples)) + (record samples pc-ptr) + (setf (values ok pc-ptr fp) + (sb-di::x86-call-context fp)) + (unless ok + (return)))))) + ;; Reset thread-local allocation counter before interrupts + ;; are enabled. + (when (eq t sb-vm::*alloc-signal*) + (setf sb-vm:*alloc-signal* (1- (samples-alloc-interval samples))))))))) + nil)) + +;; FIXME: On non-x86 platforms we don't yet walk the call stack deeper +;; than one level. +#-(or x86 x86-64) (defun sigprof-handler (signal code scp) (declare (ignore signal code)) - (when (and *sampling* - (< *samples-index* (length *samples*))) - (sb-sys:without-gcing - (with-alien ((scp (* os-context-t) :local scp)) - (locally (declare (optimize (inhibit-warnings 2))) - (let* ((pc-ptr (sb-vm:context-pc scp)) - (fp (sb-vm::context-register scp #.sb-vm::cfp-offset)) - (ra (sap-ref-32 - (int-sap fp) - (* sb-vm::lra-save-offset sb-vm::n-word-bytes)))) - (record (sap-int pc-ptr)) - (record ra))))))) - -;;; Map function FN over code objects in dynamic-space. FN is called -;;; with two arguments, the object and its size in bytes. -(defun map-dynamic-space-code (fn) - (flet ((call-if-code (obj obj-type size) - (declare (ignore obj-type)) - (when (sb-kernel:code-component-p obj) - (funcall fn obj size)))) - (sb-vm::map-allocated-objects #'call-if-code :dynamic))) + (sb-sys:without-interrupts + (let ((samples *samples*)) + (when (and *sampling* + samples + (< (samples-trace-count samples) + (samples-max-samples samples))) + (sb-sys:without-gcing + (with-alien ((scp (* os-context-t) :local scp)) + (locally (declare (optimize (inhibit-warnings 2))) + (incf (samples-trace-count samples)) + (record-trace-start samples) + (let* ((pc-ptr (sb-vm:context-pc scp)) + (fp (sb-vm::context-register scp #.sb-vm::cfp-offset)) + (ra (sap-ref-word + (int-sap fp) + (* sb-vm::lra-save-offset sb-vm::n-word-bytes)))) + (record samples pc-ptr) + (record samples (int-sap ra)))))))))) ;;; Return the start address of CODE. (defun code-start (code) @@ -623,207 +633,277 @@ (defun code-bounds (code) (declare (type sb-kernel:code-component code)) (let* ((start (code-start code)) - (end (+ start (sb-kernel:%code-code-size code)))) + (end (+ start (sb-kernel:%code-code-size code)))) (values start end))) -;;; Record the addresses of dynamic-space code objects in -;;; *DYNAMIC-SPACE-CODE-INFO*. Call this with GC disabled. -(defun record-dyninfo () - (flet ((record-address (code size) - (declare (ignore size)) - (multiple-value-bind (start end) - (code-bounds code) - (push (make-dyninfo code start end) - *dynamic-space-code-info*)))) - (map-dynamic-space-code #'record-address))) - -;;; Adjust pcs or return-pcs in *SAMPLES* for address changes of -;;; dynamic-space code objects. KEY being :PC means adjust pcs. -(defun adjust-samples (key) - (declare (type (member :pc :return-pc) key)) - (sort-samples :key key) - (let ((sidx 0) - (offset (if (eq key :pc) 0 1))) - (declare (type sb-impl::index sidx)) - (dolist (info *dynamic-space-code-info*) - (unless (= (dyninfo-new-start info) (dyninfo-start info)) - (let ((pos (do ((i sidx (+ i +sample-size+))) - ((= i *samples-index*) nil) - (declare (type sb-impl::index i)) - (when (<= (dyninfo-start info) - (aref *samples* (+ i offset)) - (dyninfo-end info)) - (return i))))) - (when pos - (setq sidx pos) - (loop with delta = (- (dyninfo-new-start info) - (dyninfo-start info)) - for j from sidx below *samples-index* by +sample-size+ - as pc = (aref *samples* (+ j offset)) - while (<= (dyninfo-start info) pc (dyninfo-end info)) do - (incf (aref *samples* (+ j offset)) delta) - (incf sidx +sample-size+)))))))) - -;;; This runs from *AFTER-GC-HOOKS*. Adjust *SAMPLES* for address -;;; changes of dynamic-space code objects. -(defun adjust-samples-for-address-changes () - (sb-sys:without-gcing - (turn-off-sampling) - (setq *dynamic-space-code-info* - (sort *dynamic-space-code-info* #'> :key #'dyninfo-start)) - (dolist (info *dynamic-space-code-info*) - (setf (dyninfo-new-start info) - (code-start (dyninfo-code info)))) - (progn - (adjust-samples :pc) - (adjust-samples :return-pc)) - (dolist (info *dynamic-space-code-info*) - (let ((size (- (dyninfo-end info) (dyninfo-start info)))) - (setf (dyninfo-start info) (dyninfo-new-start info)) - (setf (dyninfo-end info) (+ (dyninfo-new-start info) size)))) - (turn-on-sampling))) - (defmacro with-profiling ((&key (sample-interval '*sample-interval*) - (max-samples '*max-samples*) - (reset nil) - show-progress - (report nil report-p)) - &body body) - "Repeatedly evaluate Body with statistical profiling turned on. + (alloc-interval '*alloc-interval*) + (max-samples '*max-samples*) + (reset nil) + (mode '*sampling-mode*) + (loop t) + (max-depth most-positive-fixnum) + show-progress + (threads '(list sb-thread:*current-thread*)) + (report nil report-p)) + &body body) + "Repeatedly evaluate BODY with statistical profiling turned on. + In multi-threaded operation, only the thread in which WITH-PROFILING + was evaluated will be profiled by default. If you want to profile + multiple threads, invoke the profiler with START-PROFILING. + The following keyword args are recognized: - :Sample-Interval - Take a sample every seconds. Default is - *Sample-Interval*. + :SAMPLE-INTERVAL + Take a sample every seconds. Default is *SAMPLE-INTERVAL*. + + :ALLOC-INTERVAL + Take a sample every time allocation regions (approximately + 8kB) have been allocated since the last sample. Default is + *ALLOC-INTERVAL*. - :Max-Samples + :MODE + If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run the + profiler in allocation profiling mode. If :TIME, run the profiler + in wallclock profiling mode. + + :MAX-SAMPLES Repeat evaluating body until samples are taken. - Default is *Max-Samples*. + Default is *MAX-SAMPLES*. + + :MAX-DEPTH + Maximum call stack depth that the profiler should consider. Only + has an effect on x86 and x86-64. + + :REPORT + If specified, call REPORT with :TYPE at the end. - :Report - If specified, call Report with :Type at the end. + :RESET + It true, call RESET at the beginning. - :Reset - It true, call Reset at the beginning." + :THREADS + Form that evaluates to the list threads to profile, or :ALL to indicate + that all threads should be profiled. Defaults to the current + thread. (Note: START-PROFILING defaults to all threads.) + + :THREADS has no effect on call-counting at the moment. + + On some platforms (eg. Darwin) the signals used by the profiler are + not properly delivered to threads in proportion to their CPU usage + when doing :CPU profiling. If you see empty call graphs, or are obviously + missing several samples from certain threads, you may be falling afoul + of this. + + :LOOP + If true (the default) repeatedly evaluate BODY. If false, evaluate + if only once." (declare (type report-type report)) - `(let ((*sample-interval* ,sample-interval) - (*max-samples* ,max-samples)) + `(let* ((*sample-interval* ,sample-interval) + (*alloc-interval* ,alloc-interval) + (*sampling* nil) + (*sampling-mode* ,mode) + (*max-samples* ,max-samples)) ,@(when reset '((reset))) - (start-profiling) - (loop - (when (>= *samples-index* (length *samples*)) - (return)) - ,@(when show-progress - `((format t "~&===> ~d of ~d samples taken.~%" - (/ *samples-index* +sample-size+) - *max-samples*))) - (let ((.last-index. *samples-index*)) - ,@body - (when (= .last-index. *samples-index*) - (warn "No sampling progress; possibly a profiler bug.") - (return)))) - (stop-profiling) + (unwind-protect + (progn + (start-profiling :max-depth ,max-depth :threads ,threads) + (loop + (when (>= (samples-trace-count *samples*) + (samples-max-samples *samples*)) + (return)) + ,@(when show-progress + `((format t "~&===> ~d of ~d samples taken.~%" + (samples-trace-count *samples*) + (samples-max-samples *samples*)))) + (let ((.last-index. (samples-index *samples*))) + ,@body + (when (= .last-index. (samples-index *samples*)) + (warn "No sampling progress; possibly a profiler bug.") + (return))) + (unless ,loop + (return)))) + (stop-profiling)) ,@(when report-p `((report :type ,report))))) -(defun start-profiling (&key (max-samples *max-samples*) - (sample-interval *sample-interval*) - (sampling t)) - "Start profiling statistically if not already profiling. - The following keyword args are recognized: +(defvar *timer* nil) - :Sample-Interval - Take a sample every seconds. Default is - *Sample-Interval*. +(defvar *old-alloc-interval* nil) +(defvar *old-sample-interval* nil) - :Max-Samples - Maximum number of samples. Default is *Max-Samples*. - - :Sampling +(defun start-profiling (&key (max-samples *max-samples*) + (mode *sampling-mode*) + (sample-interval *sample-interval*) + (alloc-interval *alloc-interval*) + (max-depth most-positive-fixnum) + (threads :all) + (sampling t)) + "Start profiling statistically in the current thread if not already profiling. +The following keyword args are recognized: + + :SAMPLE-INTERVAL + Take a sample every seconds. Default is *SAMPLE-INTERVAL*. + + :ALLOC-INTERVAL + Take a sample every time allocation regions (approximately + 8kB) have been allocated since the last sample. Default is + *ALLOC-INTERVAL*. + + :MODE + If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run + the profiler in allocation profiling mode. If :TIME, run the profiler + in wallclock profiling mode. + + :MAX-SAMPLES + Maximum number of samples. Default is *MAX-SAMPLES*. + + :MAX-DEPTH + Maximum call stack depth that the profiler should consider. Only + has an effect on x86 and x86-64. + + :THREADS + List threads to profile, or :ALL to indicate that all threads should be + profiled. Defaults to :ALL. (Note: WITH-PROFILING defaults to the current + thread.) + + :THREADS has no effect on call-counting at the moment. + + On some platforms (eg. Darwin) the signals used by the profiler are + not properly delivered to threads in proportion to their CPU usage + when doing :CPU profiling. If you see empty call graphs, or are obviously + missing several samples from certain threads, you may be falling afoul + of this. + + :SAMPLING If true, the default, start sampling right away. - If false, Start-Sampling can be used to turn sampling on." + If false, START-SAMPLING can be used to turn sampling on." + #-gencgc + (when (eq mode :alloc) + (error "Allocation profiling is only supported for builds using the generational garbage collector.")) (unless *profiling* (multiple-value-bind (secs usecs) - (multiple-value-bind (secs rest) - (truncate sample-interval) - (values secs (truncate (* rest 1000000)))) - (setq *samples* (make-array (* max-samples +sample-size+) - :element-type 'address)) - (setq *samples-index* 0) - (setq *sampling* sampling) - ;; Disabled for now, since this was causing some problems with the - ;; sampling getting turned off completely. --JES, 2004-06-19 - ;; - ;; BEFORE-GC-HOOKS have exceedingly bad interactions with - ;; threads. -- CSR, 2004-06-21 - ;; - ;; (pushnew 'turn-off-sampling *before-gc-hooks*) - (pushnew 'adjust-samples-for-address-changes *after-gc-hooks*) - (record-dyninfo) - (sb-sys:enable-interrupt sb-unix::sigprof #'sigprof-handler) - (unix-setitimer :profile secs usecs secs usecs) - (setq *profiling* t))) + (multiple-value-bind (secs rest) + (truncate sample-interval) + (values secs (truncate (* rest 1000000)))) + (setf *sampling* sampling + *samples* (make-samples :max-depth max-depth + :max-samples max-samples + :sample-interval sample-interval + :alloc-interval alloc-interval + :mode mode)) + (enable-call-counting) + (setf *profiled-threads* threads) + (sb-sys:enable-interrupt sb-unix:sigprof #'sigprof-handler) + (ecase mode + (:alloc + (let ((alloc-signal (1- alloc-interval))) + #+sb-thread + (progn + (when (eq :all threads) + ;; Set the value new threads inherit. + (sb-thread::with-all-threads-lock + (setf sb-thread::*default-alloc-signal* alloc-signal))) + ;; Turn on allocation profiling in existing threads. + (dolist (thread (profiled-threads)) + (sb-thread::%set-symbol-value-in-thread 'sb-vm::*alloc-signal* thread alloc-signal))) + #-sb-thread + (setf sb-vm:*alloc-signal* alloc-signal))) + (:cpu + (unix-setitimer :profile secs usecs secs usecs)) + (:time + #+sb-thread + (let ((setup (sb-thread:make-semaphore :name "Timer thread setup semaphore"))) + (setf *timer-thread* + (sb-thread:make-thread (lambda () + (sb-thread:wait-on-semaphore setup) + (loop while (eq sb-thread:*current-thread* *timer-thread*) + do (sleep 1.0))) + :name "SB-SPROF wallclock timer thread")) + (sb-thread:signal-semaphore setup)) + #-sb-thread + (setf *timer-thread* nil) + (setf *timer* (make-timer #'thread-distribution-handler :name "SB-PROF wallclock timer" + :thread *timer-thread*)) + (schedule-timer *timer* sample-interval :repeat-interval sample-interval))) + (setq *profiling* mode))) (values)) (defun stop-profiling () "Stop profiling if profiling." - (when *profiling* - (setq *after-gc-hooks* - (delete 'adjust-samples-for-address-changes *after-gc-hooks*)) - (unix-setitimer :profile 0 0 0 0) - (sb-sys:enable-interrupt sb-unix::sigprof :default) - (setq *sampling* nil) - (setq *profiling* nil)) + (let ((profiling *profiling*)) + (when profiling + ;; Even with the timers shut down we cannot be sure that there is no + ;; undelivered sigprof. The handler is also responsible for turning the + ;; *ALLOC-SIGNAL* off in individual threads. + (ecase profiling + (:alloc + #+sb-thread + (setf sb-thread::*default-alloc-signal* nil) + #-sb-thread + (setf sb-vm:*alloc-signal* nil)) + (:cpu + (unix-setitimer :profile 0 0 0 0)) + (:time + (unschedule-timer *timer*) + (setf *timer* nil + *timer-thread* nil))) + (disable-call-counting) + (setf *profiling* nil + *sampling* nil + *profiled-threads* nil))) (values)) (defun reset () "Reset the profiler." (stop-profiling) (setq *sampling* nil) - (setq *dynamic-space-code-info* ()) (setq *samples* nil) - (setq *samples-index* 0) (values)) ;;; Make a NODE for debug-info INFO. (defun make-node (info) - (typecase info - (sb-kernel::code-component - (multiple-value-bind (start end) - (code-bounds info) - (%make-node :name (or (sb-disassem::find-assembler-routine start) - (format nil "~a" info)) - :start-pc start :end-pc end))) - (sb-di::compiled-debug-fun - (let* ((name (sb-di::debug-fun-name info)) - (cdf (sb-di::compiled-debug-fun-compiler-debug-fun info)) - (start-offset (sb-c::compiled-debug-fun-start-pc cdf)) - (end-offset (sb-c::compiled-debug-fun-elsewhere-pc cdf)) - (component (sb-di::compiled-debug-fun-component info)) - (start-pc (code-start component))) - (%make-node :name name - :start-pc (+ start-pc start-offset) - :end-pc (+ start-pc end-offset)))) - (t - (%make-node :name (sb-di::debug-fun-name info))))) - -;;; Return something serving as debug info for address PC. If we can -;;; get something from SB-DI:DEBUG-FUNCTION-FROM-PC, return that. -;;; Otherwise, if we can determine a code component, return that. -;;; Otherwise return nil. -(defun debug-info (pc) - (declare (type address pc)) - (let ((ptr (sb-di::component-ptr-from-pc (int-sap pc)))) - (unless (sap= ptr (int-sap 0)) - (let* ((code (sb-di::component-from-component-ptr ptr)) - (code-header-len (* (sb-kernel:get-header-data code) - sb-vm:n-word-bytes)) - (pc-offset (- pc - (- (sb-kernel:get-lisp-obj-address code) - sb-vm:other-pointer-lowtag) - code-header-len)) - (df (ignore-errors (sb-di::debug-fun-from-pc code - pc-offset)))) - (or df code))))) + (flet ((clean-name (name) + (if (and (consp name) + (member (first name) + '(sb-c::xep sb-c::tl-xep sb-c::&more-processor + sb-c::varargs-entry + sb-c::top-level-form + sb-c::hairy-arg-processor + sb-c::&optional-processor))) + (second name) + name))) + (typecase info + (sb-kernel::code-component + (multiple-value-bind (start end) + (code-bounds info) + (values + (%make-node :name (or (sb-disassem::find-assembler-routine start) + (format nil "~a" info)) + :debug-info info + :start-pc-or-offset start + :end-pc-or-offset end) + info))) + (sb-di::compiled-debug-fun + (let* ((name (sb-di::debug-fun-name info)) + (cdf (sb-di::compiled-debug-fun-compiler-debug-fun info)) + (start-offset (sb-c::compiled-debug-fun-start-pc cdf)) + (end-offset (sb-c::compiled-debug-fun-elsewhere-pc cdf)) + (component (sb-di::compiled-debug-fun-component info)) + (start-pc (code-start component))) + ;; Call graphs are mostly useless unless we somehow + ;; distinguish a gazillion different (LAMBDA ())'s. + (when (equal name '(lambda ())) + (setf name (format nil "Unknown component: #x~x" start-pc))) + (values (%make-node :name (clean-name name) + :debug-info info + :start-pc-or-offset start-offset + :end-pc-or-offset end-offset) + component))) + (sb-di::debug-fun + (%make-node :name (clean-name (sb-di::debug-fun-name info)) + :debug-info info)) + (t + (%make-node :name (coerce info 'string) + :debug-info info))))) ;;; One function can have more than one COMPILED-DEBUG-FUNCTION with ;;; the same name. Reduce the number of calls to Debug-Info by first @@ -831,103 +911,107 @@ ;;; tree, get debug info, and look for a node in a hash-table by ;;; function name. If not found in the hash-table, make a new node. -(defvar *node-tree*) (defvar *name->node*) (defmacro with-lookup-tables (() &body body) - `(let ((*node-tree* (make-aa-tree)) - (*name->node* (make-hash-table :test 'equal))) + `(let ((*name->node* (make-hash-table :test 'equal))) ,@body)) -(defun tree-find (item) - (flet ((pc/node-= (pc node) - (<= (node-start-pc node) pc (node-end-pc node))) - (pc/node-< (pc node) - (< pc (node-start-pc node)))) - (aa-find item *node-tree* :test-= #'pc/node-= :test-< #'pc/node-<))) - -(defun tree-insert (item) - (flet ((node/node-= (x y) - (<= (node-start-pc y) (node-start-pc x) (node-end-pc y))) - (node/node-< (x y) - (< (node-start-pc x) (node-start-pc y)))) - (aa-insert item *node-tree* :test-= #'node/node-= :test-< #'node/node-<))) - -;;; Find or make a new node for address PC. Value is the NODE found -;;; or made; NIL if not enough information exists to make a NODE for -;;; PC. -(defun lookup-node (pc) - (declare (type address pc)) - (or (tree-find pc) - (let ((info (debug-info pc))) - (when info - (let* ((new (make-node info)) - (found (gethash (node-name new) *name->node*))) - (cond (found - (setf (node-start-pc found) - (min (node-start-pc found) (node-start-pc new))) - (setf (node-end-pc found) - (max (node-end-pc found) (node-end-pc new))) - found) - (t - (setf (gethash (node-name new) *name->node*) new) - (tree-insert new) - new))))))) +;;; Find or make a new node for INFO. Value is the NODE found or +;;; made; NIL if not enough information exists to make a NODE for INFO. +(defun lookup-node (info) + (when info + (multiple-value-bind (new key) + (make-node info) + (when (eql (node-name new) 'call-counter) + (return-from lookup-node (values nil nil))) + (let* ((key (cons (node-name new) key)) + (found (gethash key *name->node*))) + (cond (found + (setf (node-start-pc-or-offset found) + (min (node-start-pc-or-offset found) + (node-start-pc-or-offset new))) + (setf (node-end-pc-or-offset found) + (max (node-end-pc-or-offset found) + (node-end-pc-or-offset new))) + found) + (t + (let ((call-count-info (gethash (node-name new) + *encapsulations*))) + (when call-count-info + (setf (node-call-count new) + (car call-count-info)))) + (setf (gethash key *name->node*) new) + new)))))) ;;; Return a list of all nodes created by LOOKUP-NODE. (defun collect-nodes () (loop for node being the hash-values of *name->node* - collect node)) + collect node)) ;;; Value is a CALL-GRAPH for the current contents of *SAMPLES*. -(defun make-call-graph-1 () - (let ((elsewhere-count 0)) +(defun make-call-graph-1 (max-depth) + (let ((elsewhere-count 0) + visited-nodes) (with-lookup-tables () - (loop for i below *samples-index* by +sample-size+ - as pc = (aref *samples* i) - as return-pc = (aref *samples* (1+ i)) - as callee = (lookup-node pc) - as caller = - (when (and callee (/= return-pc +unknown-address+)) - (let ((caller (lookup-node return-pc))) - (when caller - caller))) - when (and *show-progress* (plusp i)) do - (cond ((zerop (mod i 1000)) - (show-progress "~d" i)) - ((zerop (mod i 100)) - (show-progress "."))) - if callee do - (incf (node-count callee)) - else do - (incf elsewhere-count) - when (and callee caller) do - (let ((call (find callee (node-edges caller) - :key #'call-vertex))) - (pushnew caller (node-callers callee)) - (if call - (incf (call-count call)) - (push (make-call callee) (node-edges caller))))) + (loop for i below (- (samples-index *samples*) 2) by 2 + with depth = 0 + for debug-info = (aref (samples-vector *samples*) i) + for next-info = (aref (samples-vector *samples*) + (+ i 2)) + do (if (eq debug-info 'trace-start) + (setf depth 0) + (let ((callee (lookup-node debug-info)) + (caller (unless (eq next-info 'trace-start) + (lookup-node next-info)))) + (when (< depth max-depth) + (when (zerop depth) + (setf visited-nodes nil) + (cond (callee + (incf (node-accrued-count callee)) + (incf (node-count callee))) + (t + (incf elsewhere-count)))) + (incf depth) + (when callee + (push callee visited-nodes)) + (when caller + (unless (member caller visited-nodes) + (incf (node-accrued-count caller))) + (when callee + (let ((call (find callee (node-edges caller) + :key #'call-vertex))) + (pushnew caller (node-callers callee)) + (if call + (unless (member caller visited-nodes) + (incf (call-count call))) + (push (make-call callee) + (node-edges caller)))))))))) (let ((sorted-nodes (sort (collect-nodes) #'> :key #'node-count))) - (loop for node in sorted-nodes and i from 1 do - (setf (node-index node) i)) - (%make-call-graph :nsamples (/ *samples-index* +sample-size+) - :sample-interval *sample-interval* - :elsewhere-count elsewhere-count - :vertices sorted-nodes))))) + (loop for node in sorted-nodes and i from 1 do + (setf (node-index node) i)) + (%make-call-graph :nsamples (samples-trace-count *samples*) + :sample-interval (if (eq (samples-mode *samples*) + :alloc) + (samples-alloc-interval *samples*) + (samples-sample-interval *samples*)) + :sampling-mode (samples-mode *samples*) + :sampled-threads (samples-sampled-threads *samples*) + :elsewhere-count elsewhere-count + :vertices sorted-nodes))))) ;;; Reduce CALL-GRAPH to a dag, creating CYCLE structures for call ;;; cycles. (defun reduce-call-graph (call-graph) (let ((cycle-no 0)) (flet ((make-one-cycle (vertices edges) - (let* ((name (format nil "" (incf cycle-no))) - (count (loop for v in vertices sum (node-count v)))) - (make-cycle :name name - :index cycle-no - :count count - :scc-vertices vertices - :edges edges)))) + (let* ((name (format nil "" (incf cycle-no))) + (count (loop for v in vertices sum (node-count v)))) + (make-cycle :name name + :index cycle-no + :count count + :scc-vertices vertices + :edges edges)))) (reduce-graph call-graph #'make-one-cycle)))) ;;; For all nodes in CALL-GRAPH, compute times including the time @@ -940,22 +1024,24 @@ (setf (node-accrued-count from) (node-count from)) (do-edges (call to from) (incf (node-accrued-count from) - (round (* (/ (call-count call) (node-count to)) - (node-accrued-count to))))))) + (round (* (/ (call-count call) (node-count to)) + (node-accrued-count to))))))) ;;; Return a CALL-GRAPH structure for the current contents of ;;; *SAMPLES*. The result contain a list of nodes sorted by self-time ;;; in the FLAT-NODES slot, and a dag in VERTICES, with call cycles ;;; reduced to CYCLE structures. -(defun make-call-graph () +(defun make-call-graph (max-depth) (stop-profiling) (show-progress "~&Computing call graph ") - (let ((call-graph (without-gcing (make-call-graph-1)))) + (let ((call-graph (without-gcing (make-call-graph-1 max-depth)))) (setf (call-graph-flat-nodes call-graph) - (copy-list (graph-vertices call-graph))) + (copy-list (graph-vertices call-graph))) (show-progress "~&Finding cycles") + #+nil (reduce-call-graph call-graph) (show-progress "~&Propagating counts") + #+nil (compute-accrued-counts call-graph) call-graph)) @@ -966,58 +1052,106 @@ (format t "~&~V,,,V<~>~%" length char)) (defun samples-percent (call-graph count) - (* 100.0 (/ count (call-graph-nsamples call-graph)))) + (if (> count 0) + (* 100.0 (/ count (call-graph-nsamples call-graph))) + 0)) (defun print-call-graph-header (call-graph) (let ((nsamples (call-graph-nsamples call-graph)) - (interval (call-graph-sample-interval call-graph)) - (ncycles (loop for v in (graph-vertices call-graph) - count (scc-p v)))) - (format t "~2&Number of samples: ~d~%~ - Sample interval: ~f seconds~%~ - Total sampling time: ~f seconds~%~ - Number of cycles: ~d~2%" - nsamples - interval - (* nsamples interval) - ncycles))) + (interval (call-graph-sample-interval call-graph)) + (ncycles (loop for v in (graph-vertices call-graph) + count (scc-p v)))) + (if (eq (call-graph-sampling-mode call-graph) :alloc) + (format t "~2&Number of samples: ~d~%~ + Alloc interval: ~a regions (approximately ~a kB)~%~ + Total sampling amount: ~a regions (approximately ~a kB)~%~ + Number of cycles: ~d~%~ + Sampled threads:~{~% ~S~}~2%" + nsamples + interval + (truncate (* interval *alloc-region-size*) 1024) + (* nsamples interval) + (truncate (* nsamples interval *alloc-region-size*) 1024) + ncycles + (call-graph-sampled-threads call-graph)) + (format t "~2&Number of samples: ~d~%~ + Sample interval: ~f seconds~%~ + Total sampling time: ~f seconds~%~ + Number of cycles: ~d~%~ + Sampled threads:~{~% ~S~}~2%" + nsamples + interval + (* nsamples interval) + ncycles + (call-graph-sampled-threads call-graph))))) + +(declaim (type (member :samples :cumulative-samples) *report-sort-by*)) +(defvar *report-sort-by* :samples + "Method for sorting the flat report: either by :SAMPLES or by :CUMULATIVE-SAMPLES.") + +(declaim (type (member :descending :ascending) *report-sort-order*)) +(defvar *report-sort-order* :descending + "Order for sorting the flat report: either :DESCENDING or :ASCENDING.") (defun print-flat (call-graph &key (stream *standard-output*) max - min-percent (print-header t)) + min-percent (print-header t) + (sort-by *report-sort-by*) + (sort-order *report-sort-order*)) + (declare (type (member :descending :ascending) sort-order) + (type (member :samples :cumulative-samples) sort-by)) (let ((*standard-output* stream) - (*print-pretty* nil) - (total-count 0) - (total-percent 0) - (min-count (if min-percent - (round (* (/ min-percent 100.0) - (call-graph-nsamples call-graph))) - 0))) + (*print-pretty* nil) + (total-count 0) + (total-percent 0) + (min-count (if min-percent + (round (* (/ min-percent 100.0) + (call-graph-nsamples call-graph))) + 0))) (when print-header (print-call-graph-header call-graph)) - (format t "~& Self Total~%") - (format t "~& Nr Count % Count % Function~%") + (format t "~& Self Total Cumul~%") + (format t "~& Nr Count % Count % Count % Calls Function~%") (print-separator) (let ((elsewhere-count (call-graph-elsewhere-count call-graph)) - (i 0)) - (dolist (node (call-graph-flat-nodes call-graph)) - (when (or (and max (> (incf i) max)) - (< (node-count node) min-count)) - (return)) - (let* ((count (node-count node)) - (percent (samples-percent call-graph count))) - (incf total-count count) - (incf total-percent percent) - (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~s~%" - (node-index node) - count - percent - total-count - total-percent - (node-name node)))) + (i 0) + (nodes (stable-sort (copy-list (call-graph-flat-nodes call-graph)) + (let ((cmp (if (eq :descending sort-order) #'> #'<))) + (multiple-value-bind (primary secondary) + (if (eq :samples sort-by) + (values #'node-count #'node-accrued-count) + (values #'node-accrued-count #'node-count)) + (lambda (x y) + (let ((cx (funcall primary x)) + (cy (funcall primary y))) + (if (= cx cy) + (funcall cmp (funcall secondary x) (funcall secondary y)) + (funcall cmp cx cy))))))))) + (dolist (node nodes) + (when (or (and max (> (incf i) max)) + (< (node-count node) min-count)) + (return)) + (let* ((count (node-count node)) + (percent (samples-percent call-graph count)) + (accrued-count (node-accrued-count node)) + (accrued-percent (samples-percent call-graph accrued-count))) + (incf total-count count) + (incf total-percent percent) + (format t "~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~8@a ~s~%" + (incf i) + count + percent + accrued-count + accrued-percent + total-count + total-percent + (or (node-call-count node) "-") + (node-name node)) + (finish-output))) (print-separator) - (format t "~& ~6d ~5,1f elsewhere~%" - elsewhere-count - (samples-percent call-graph elsewhere-count))))) + (format t "~& ~6d ~5,1f~36a elsewhere~%" + elsewhere-count + (samples-percent call-graph elsewhere-count) + "")))) (defun print-cycles (call-graph) (when (some #'cycle-p (graph-vertices call-graph)) @@ -1025,104 +1159,241 @@ (format t "~& Count % Parts~%") (do-vertices (node call-graph) (when (cycle-p node) - (flet ((print-info (indent index count percent name) - (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%" - count percent indent name index))) - (print-separator) - (format t "~&~6d ~5,1f ~a...~%" - (node-count node) - (samples-percent call-graph (cycle-count node)) - (node-name node)) - (dolist (v (vertex-scc-vertices node)) - (print-info 4 (node-index v) (node-count v) + (flet ((print-info (indent index count percent name) + (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%" + count percent indent name index))) + (print-separator) + (format t "~&~6d ~5,1f ~a...~%" + (node-count node) + (samples-percent call-graph (cycle-count node)) + (node-name node)) + (dolist (v (vertex-scc-vertices node)) + (print-info 4 (node-index v) (node-count v) (samples-percent call-graph (node-count v)) (node-name v)))))) (print-separator) (format t "~2%"))) (defun print-graph (call-graph &key (stream *standard-output*) - max min-percent) + max min-percent) (let ((*standard-output* stream) - (*print-pretty* nil)) + (*print-pretty* nil)) (print-call-graph-header call-graph) (print-cycles call-graph) (flet ((find-call (from to) - (find to (node-edges from) :key #'call-vertex)) - (print-info (indent index count percent name) - (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%" - count percent indent name index))) + (find to (node-edges from) :key #'call-vertex)) + (print-info (indent index count percent name) + (format t "~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%" + count percent indent name index))) (format t "~& Callers~%") - (format t "~& Cumul. Function~%") + (format t "~& Total. Function~%") (format t "~& Count % Count % Callees~%") (do-vertices (node call-graph) - (print-separator) - ;; - ;; Print caller information. - (dolist (caller (node-callers node)) - (let ((call (find-call caller node))) - (print-info 4 (node-index caller) + (print-separator) + ;; + ;; Print caller information. + (dolist (caller (node-callers node)) + (let ((call (find-call caller node))) + (print-info 4 (node-index caller) (call-count call) (samples-percent call-graph (call-count call)) (node-name caller)))) - ;; Print the node itself. - (format t "~&~6d ~5,1f ~6d ~5,1f ~s [~d]~%" - (node-count node) - (samples-percent call-graph (node-count node)) - (node-accrued-count node) - (samples-percent call-graph (node-accrued-count node)) - (node-name node) - (node-index node)) - ;; Print callees. - (do-edges (call called node) - (print-info 4 (node-index called) + ;; Print the node itself. + (format t "~&~6d ~5,1f ~6d ~5,1f ~s [~d]~%" + (node-count node) + (samples-percent call-graph (node-count node)) + (node-accrued-count node) + (samples-percent call-graph (node-accrued-count node)) + (node-name node) + (node-index node)) + ;; Print callees. + (do-edges (call called node) + (print-info 4 (node-index called) (call-count call) (samples-percent call-graph (call-count call)) (node-name called)))) (print-separator) (format t "~2%") (print-flat call-graph :stream stream :max max - :min-percent min-percent :print-header nil)))) + :min-percent min-percent :print-header nil)))) (defun report (&key (type :graph) max min-percent call-graph - (stream *standard-output*) ((:show-progress *show-progress*))) + ((:sort-by *report-sort-by*) *report-sort-by*) + ((:sort-order *report-sort-order*) *report-sort-order*) + (stream *standard-output*) ((:show-progress *show-progress*))) "Report statistical profiling results. The following keyword args are recognized: - :Type + :TYPE Specifies the type of report to generate. If :FLAT, show flat report, if :GRAPH show a call graph and a flat report. If nil, don't print out a report. - :Stream + :STREAM Specify a stream to print the report on. Default is - *Standard-Output*. + *STANDARD-OUTPUT*. - :Max + :MAX Don't show more than entries in the flat report. - :Min-Percent + :MIN-PERCENT Don't show functions taking less than of the total time in the flat report. - :Show-Progress + :SORT-BY + If :SAMPLES, sort flat report by number of samples taken. + If :CUMULATIVE-SAMPLES, sort flat report by cumulative number of samples + taken (shows how much time each function spent on stack.) Default + is *REPORT-SORT-BY*. + + :SORT-ORDER + If :DESCENDING, sort flat report in descending order. If :ASCENDING, + sort flat report in ascending order. Default is *REPORT-SORT-ORDER*. + + :SHOW-PROGRESS If true, print progress messages while generating the call graph. - :Call-Graph + :CALL-GRAPH Print a report from instead of the latest profiling results. - Value of this function is a Call-Graph object representing the - resulting call-graph." - (declare (type report-type type)) - (let ((graph (or call-graph (make-call-graph)))) - (ecase type - (:flat - (print-flat graph :stream stream :max max :min-percent min-percent)) - (:graph - (print-graph graph :stream stream :max max :min-percent min-percent)) - ((nil))) - graph)) +Value of this function is a CALL-GRAPH object representing the +resulting call-graph, or NIL if there are no samples (eg. right after +calling RESET.) + +Profiling is stopped before the call graph is generated." + (cond (*samples* + (let ((graph (or call-graph (make-call-graph most-positive-fixnum)))) + (ecase type + (:flat + (print-flat graph :stream stream :max max :min-percent min-percent)) + (:graph + (print-graph graph :stream stream :max max :min-percent min-percent)) + ((nil))) + graph)) + (t + (format stream "~&; No samples to report.~%") + nil))) + +;;; Interface to DISASSEMBLE + +(defun sample-pc-from-pc-or-offset (sample pc-or-offset) + (etypecase sample + ;; Assembly routines or foreign functions don't move around, so we've + ;; stored a raw PC + ((or sb-kernel:code-component string) + pc-or-offset) + ;; Lisp functions might move, so we've stored a offset from the + ;; start of the code component. + (sb-di::compiled-debug-fun + (let* ((component (sb-di::compiled-debug-fun-component sample)) + (start-pc (code-start component))) + (+ start-pc pc-or-offset))))) + +(defun add-disassembly-profile-note (chunk stream dstate) + (declare (ignore chunk stream)) + (when *samples* + (let* ((location (+ (sb-disassem::seg-virtual-location + (sb-disassem:dstate-segment dstate)) + (sb-disassem::dstate-cur-offs dstate))) + (samples (loop with index = (samples-index *samples*) + for x from 0 below (- index 2) by 2 + for last-sample = nil then sample + for sample = (aref (samples-vector *samples*) x) + for pc-or-offset = (aref (samples-vector *samples*) + (1+ x)) + when (and sample (eq last-sample 'trace-start)) + count (= location + (sample-pc-from-pc-or-offset sample + pc-or-offset))))) + (unless (zerop samples) + (sb-disassem::note (format nil "~A/~A samples" + samples (samples-trace-count *samples*)) + dstate))))) + +(pushnew 'add-disassembly-profile-note sb-disassem::*default-dstate-hooks*) + +;;;; Call counting + +;;; The following functions tell sb-sprof to do call count profiling +;;; for the named functions in addition to normal statistical +;;; profiling. The benefit of this over using SB-PROFILE is that this +;;; encapsulation is a lot more lightweight, due to not needing to +;;; track cpu usage / consing. (For example, compiling asdf 20 times +;;; took 13s normally, 15s with call counting for all functions in +;;; SB-C, and 94s with SB-PROFILE profiling SB-C). + +(defun profile-call-counts (&rest names) + "Mark the functions named by NAMES as being subject to call counting +during statistical profiling. If a string is used as a name, it will +be interpreted as a package name. In this case call counting will be +done for all functions with names like X or (SETF X), where X is a symbol +with the package as its home package." + (dolist (name names) + (if (stringp name) + (let ((package (find-package name))) + (do-symbols (symbol package) + (when (eql (symbol-package symbol) package) + (dolist (function-name (list symbol (list 'setf symbol))) + (profile-call-counts-for-function function-name))))) + (profile-call-counts-for-function name)))) + +(defun profile-call-counts-for-function (function-name) + (unless (gethash function-name *encapsulations*) + (setf (gethash function-name *encapsulations*) nil))) + +(defun unprofile-call-counts () + "Clear all call counting information. Call counting will be done for no +functions during statistical profiling." + (clrhash *encapsulations*)) + +;;; Called when profiling is started to enable the call counting +;;; encapsulation. Wrap all the call counted functions +(defun enable-call-counting () + (maphash (lambda (k v) + (declare (ignore v)) + (enable-call-counting-for-function k)) + *encapsulations*)) + +;;; Called when profiling is stopped to disable the encapsulation. Restore +;;; the original functions. +(defun disable-call-counting () + (maphash (lambda (k v) + (when v + (assert (cdr v)) + (without-package-locks + (setf (fdefinition k) (cdr v))) + (setf (cdr v) nil))) + *encapsulations*)) + +(defun enable-call-counting-for-function (function-name) + (let ((info (gethash function-name *encapsulations*))) + ;; We should never try to encapsulate an fdefn multiple times. + (assert (or (null info) + (null (cdr info)))) + (when (and (fboundp function-name) + (or (not (symbolp function-name)) + (and (not (special-operator-p function-name)) + (not (macro-function function-name))))) + (let* ((original-fun (fdefinition function-name)) + (info (cons 0 original-fun))) + (setf (gethash function-name *encapsulations*) info) + (without-package-locks + (setf (fdefinition function-name) + (sb-int:named-lambda call-counter (sb-int:&more more-context more-count) + (declare (optimize speed (safety 0))) + ;; 2^59 calls should be enough for anybody, and it + ;; allows using fixnum arithmetic on x86-64. 2^32 + ;; isn't enough, so we can't do that on 32 bit platforms. + (incf (the (unsigned-byte 59) + (car info))) + (multiple-value-call original-fun + (sb-c:%more-arg-values more-context + 0 + more-count))))))))) + + ;;; silly examples (defun test-0 (n &optional (depth 0))