From 23e31980c78d174ef9cb775bf28f970890327fea Mon Sep 17 00:00:00 2001 From: Nikodemus Siivola Date: Wed, 11 May 2011 16:48:37 +0000 Subject: [PATCH] 1.0.48.10: add deadlock detection to spinlocks and mutexes Each thread gets a WAITING-FOR slot which holds the lock it is currently trying to acquire. After initial try at lock acquisition fails, we check for deadlocks: we look at the thread that owns the lock we want -> lock it is waiting for, etc, recursively, until we either bottom out or find someone waiting for us. Print thread as having state WAITING if it is waiting for a lock. Add a PRINT-OBJECT methods for MUTEXes and SPINLOCKs. SB-THREAD:THREAD-DEADLOCK-CYCLE contains a list of the threads and locks involved, so that the situation can be analyzed even afer a handler has broken the deadlock. --- NEWS | 1 + package-data-list.lisp-expr | 3 + src/code/cold-init.lisp | 2 +- src/code/deadline.lisp | 9 +- src/code/target-thread.lisp | 215 ++++++++++++++++++++++++++++++++++--------- src/code/thread.lisp | 3 +- tests/threads.impure.lisp | 35 ++++--- tests/threads.pure.lisp | 88 ++++++++++++++++++ tests/timer.impure.lisp | 25 ++--- version.lisp-expr | 2 +- 10 files changed, 310 insertions(+), 73 deletions(-) diff --git a/NEWS b/NEWS index 48085ff..96b4403 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,7 @@ changes relative to sbcl-1.0.48: streams. * enhancement: more informative compile-time warnings and runtime errors for type-errors detected at compile-time. + * enhancement: deadlock detection for mutexes and spinlocks. * bug fix: blocking reads from FIFOs created by RUN-PROGRAM were uninterruptible, as well as blocking reads from socket streams created with for which :SERVE-EVENTS NIL. (regression from 1.0.42.43) diff --git a/package-data-list.lisp-expr b/package-data-list.lisp-expr index f46895d..4cfabd1 100644 --- a/package-data-list.lisp-expr +++ b/package-data-list.lisp-expr @@ -1884,6 +1884,7 @@ is a good idea, but see SB-SYS re. blurring of boundaries." "!FOREIGN-COLD-INIT" "!FUNCTION-NAMES-COLD-INIT" "!TYPE-CLASS-COLD-INIT" "!TYPEDEFS-COLD-INIT" "!ALIEN-TYPE-COLD-INIT" "!CLASSES-COLD-INIT" + "!DEADLINE-COLD-INIT" "!EARLY-TYPE-COLD-INIT" "!LATE-TYPE-COLD-INIT" "!FIXUP-TYPE-COLD-INIT" "!TARGET-TYPE-COLD-INIT" "!RANDOM-COLD-INIT" "!READER-COLD-INIT" @@ -1948,6 +1949,8 @@ is a good idea, but see SB-SYS re. blurring of boundaries." "SYMBOL-VALUE-IN-THREAD-ERROR" "TERMINATE-THREAD" "THREAD" + "THREAD-DEADLOCK" + "THREAD-DEADLOCK-CYCLE" "THREAD-ERROR" "THREAD-ERROR-THREAD" "THREAD-ALIVE-P" diff --git a/src/code/cold-init.lisp b/src/code/cold-init.lisp index d8c1160..8444234 100644 --- a/src/code/cold-init.lisp +++ b/src/code/cold-init.lisp @@ -111,7 +111,7 @@ ;; I'm not sure where eval is first called, so I put this first. (show-and-call !eval-cold-init) - + (show-and-call !deadline-cold-init) (show-and-call thread-init-or-reinit) (show-and-call !typecheckfuns-cold-init) diff --git a/src/code/deadline.lisp b/src/code/deadline.lisp index 97a147b..43b4fc9 100644 --- a/src/code/deadline.lisp +++ b/src/code/deadline.lisp @@ -12,13 +12,17 @@ (in-package "SB!IMPL") +(!begin-collecting-cold-init-forms) + ;;; Current deadline as internal time units or NIL. -(defvar *deadline* nil) (declaim (type (or unsigned-byte null) *deadline*)) +(defvar *deadline*) +(!cold-init-forms (setq *deadline* nil)) ;;; The relative number of seconds the current deadline corresponds ;;; to. Used for continuing from TIMEOUT conditions. -(defvar *deadline-seconds* nil) +(defvar *deadline-seconds*) +(!cold-init-forms (setq *deadline-seconds* nil)) (declaim (inline seconds-to-internal-time)) (defun seconds-to-internal-time (seconds) @@ -171,3 +175,4 @@ it will signal a timeout condition." (values to-sec to-usec stop-sec stop-usec signalp))) (values nil nil nil nil nil))))))) +(!defun-from-collected-cold-init-forms !deadline-cold-init) diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index bc478fa..4b89ca8 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -21,6 +21,15 @@ The offending thread is initialized by the :THREAD initialization argument and read by the function THREAD-ERROR-THREAD.")) +(define-condition thread-deadlock (thread-error) + ((cycle :initarg :cycle :reader thread-deadlock-cycle)) + (:report + (lambda (condition stream) + (let ((*print-circle* t)) + (format stream "Deadlock cycle detected:~%~@< ~@;~ + ~{~:@_~S~:@_~}~:@>" + (mapcar #'car (thread-deadlock-cycle condition))))))) + #!+sb-doc (setf (fdocumentation 'thread-error-thread 'function) @@ -101,18 +110,41 @@ arbitrary printable objects, and need not be unique.") (multiple-value-list (join-thread thread :default cookie)))) (state (if (eq :running info) - info + (let* ((lock (thread-waiting-for thread))) + (typecase lock + (cons + (list "waiting for:" (cdr lock) + "timeout: " (car lock))) + (null + (list info)) + (t + (list "waiting for:" lock)))) (if (eq cookie (car info)) - :aborted + (list :aborted) :finished))) - (values (when (eq :finished state) info))) + (values (when (eq :finished state) + info)) + (*print-level* 4)) (format stream - "~@[~S ~]~:[~A~;~A~:[ no values~; values: ~:*~{~S~^, ~}~]~]" + "~@[~S ~]~:[~{~I~A~^~2I~_ ~}~_~;~A~:[ no values~; values: ~:*~{~S~^, ~}~]~]" (thread-name thread) (eq :finished state) state values)))) +(defun print-lock (lock name owner stream) + (let ((*print-circle* t)) + (print-unreadable-object (lock stream :type t :identity (not name)) + (if owner + (format stream "~@[~S ~]~2I~_owner: ~S" name owner) + (format stream "~@[~S ~](free)" name))))) + +(def!method print-object ((mutex mutex) stream) + (print-lock mutex (mutex-name mutex) (mutex-owner mutex) stream)) + +(def!method print-object ((spinlock spinlock) stream) + (print-lock spinlock (spinlock-name spinlock) (spinlock-value spinlock) stream)) + (defun thread-alive-p (thread) #!+sb-doc "Return T if THREAD is still alive. Note that the return value is @@ -267,6 +299,27 @@ created and old ones may exit at any time." ;;;; Spinlocks +(defmacro with-deadlocks ((thread lock timeout) &body forms) + (with-unique-names (prev n-thread n-lock n-timeout new) + `(let* ((,n-thread ,thread) + (,n-lock ,lock) + (,n-timeout (or ,timeout + (when sb!impl::*deadline* + sb!impl::*deadline-seconds*))) + ;; If we get interrupted while waiting for a lock, etc. + (,prev (thread-waiting-for ,n-thread)) + (,new (if ,n-timeout + (cons ,n-timeout ,n-lock) + ,n-lock))) + (declare (dynamic-extent ,new)) + ;; No WITHOUT-INTERRUPTS, since WITH-DEADLOCKS is used + ;; in places where interrupts should already be disabled. + (unwind-protect + (progn + (setf (thread-waiting-for ,n-thread) ,new) + ,@forms) + (setf (thread-waiting-for ,n-thread) ,prev))))) + (declaim (inline get-spinlock release-spinlock)) ;;; Should always be called with interrupts disabled. @@ -278,23 +331,28 @@ created and old ones may exit at any time." (when (eq old new) (error "Recursive lock attempt on ~S." spinlock)) #!+sb-thread - (flet ((cas () - (if (sb!ext:compare-and-swap (spinlock-value spinlock) nil new) - (thread-yield) - (return-from get-spinlock t)))) - (if (and (not *interrupts-enabled*) *allow-with-interrupts*) - ;; If interrupts are disabled, but we are allowed to - ;; enabled them, check for pending interrupts every once - ;; in a while. %CHECK-INTERRUPTS is taking shortcuts, make - ;; sure that deferrables are unblocked by doing an empty - ;; WITH-INTERRUPTS once. - (progn - (with-interrupts) - (loop - (loop repeat 128 do (cas)) ; 128 is arbitrary here - (sb!unix::%check-interrupts))) - (loop (cas))))) - t)) + (with-deadlocks (new spinlock nil) + (flet ((cas () + (if (sb!ext:compare-and-swap (spinlock-value spinlock) nil new) + (thread-yield) + (return-from get-spinlock t)))) + ;; Try once. + (cas) + ;; Check deadlocks + (with-interrupts (check-deadlock)) + (if (and (not *interrupts-enabled*) *allow-with-interrupts*) + ;; If interrupts are disabled, but we are allowed to + ;; enabled them, check for pending interrupts every once + ;; in a while. %CHECK-INTERRUPTS is taking shortcuts, make + ;; sure that deferrables are unblocked by doing an empty + ;; WITH-INTERRUPTS once. + (progn + (with-interrupts) + (loop + (loop repeat 128 do (cas)) ; 128 is arbitrary here + (sb!unix::%check-interrupts))) + (loop (cas))))))) + t) (defun release-spinlock (spinlock) (declare (optimize (speed 3) (safety 0))) @@ -345,7 +403,65 @@ HOLDING-MUTEX-P." ;; Make sure to get the current value. (sb!ext:compare-and-swap (mutex-%owner mutex) nil nil)) -(defun get-mutex (mutex &optional (new-owner *current-thread*) +;;; Signals an error if owner of LOCK is waiting on a lock whose release +;;; depends on the current thread. Does not detect deadlocks from sempahores. +(defun check-deadlock () + (let* ((self *current-thread*) + (origin (thread-waiting-for self))) + (labels ((lock-owner (lock) + (etypecase lock + (mutex (mutex-%owner lock)) + (spinlock (spinlock-value lock)))) + (detect-deadlock (lock) + (let ((other-thread (lock-owner lock))) + (cond ((not other-thread)) + ((eq self other-thread) + (let* ((chain (deadlock-chain self origin)) + (barf + (format nil + "~%WARNING: DEADLOCK CYCLE DETECTED:~%~@< ~@;~ + ~{~:@_~S~:@_~}~:@>~ + ~%END OF CYCLE~%" + (mapcar #'car chain)))) + ;; Barf to stderr in case the system is too tied up + ;; to report the error properly -- to avoid cross-talk + ;; build the whole string up first. + (write-string barf sb!sys:*stderr*) + (finish-output sb!sys:*stderr*) + (error 'thread-deadlock + :thread *current-thread* + :cycle chain))) + (t + (let ((other-lock (thread-waiting-for other-thread))) + ;; If the thread is waiting with a timeout OTHER-LOCK + ;; is a cons, and we don't consider it a deadlock -- since + ;; it will time out on its own sooner or later. + (when (and other-lock (not (consp other-lock))) + (detect-deadlock other-lock))))))) + (deadlock-chain (thread lock) + (let* ((other-thread (lock-owner lock)) + (other-lock (thread-waiting-for other-thread))) + (cond ((not other-thread) + ;; The deadlock is gone -- maybe someone timed out? + (return-from check-deadlock nil)) + ((consp other-lock) + ;; There's a timeout -- no deadlock. + (return-from check-deadlock nil)) + ((eq self other-thread) + ;; Done + (list (list thread lock))) + (t + (if other-lock + (cons (list thread lock) + (deadlock-chain other-thread other-lock)) + ;; Again, the deadlock is gone? + (return-from check-deadlock nil))))))) + ;; Timeout means there is no deadlock + (unless (consp origin) + (detect-deadlock origin) + t)))) + +(defun get-mutex (mutex &optional new-owner (waitp t) (timeout nil)) #!+sb-doc "Deprecated in favor of GRAB-MUTEX." @@ -363,7 +479,7 @@ HOLDING-MUTEX-P." #!-sb-thread (setf (mutex-%owner mutex) new-owner) #!+sb-thread - (progn + (with-deadlocks (new-owner mutex timeout) ;; FIXME: Lutexes do not currently support deadlines, as at least ;; on Darwin pthread_foo_timedbar functions are not supported: ;; this means that we probably need to use the Carbon multiprocessing @@ -378,12 +494,23 @@ HOLDING-MUTEX-P." (when timeout (error "Mutex timeouts not supported on this platform.")) (when (zerop (with-lutex-address (lutex (mutex-lutex mutex)) - (if waitp - (with-interrupts (%lutex-lock lutex)) - (%lutex-trylock lutex)))) - (setf (mutex-%owner mutex) new-owner) - (barrier (:write)) - t)) + (if waitp + (let ((once (%lutex-trylock lutex))) + (cond ((zerop once) + ;; No need to wait. + once) + (t + (with-interrupts + ;; Check for deadlocks before waiting + (check-deadlock) + (%lutex-lock lutex))))) + (%lutex-trylock lutex)))) + ;; FIXME: If %LUTEX-LOCK unwinds due to a signal, we may actually + ;; be holding the lock already -- and but neglect to mark ourselves + ;; as the owner here. This is bad. + (setf (mutex-%owner mutex) new-owner) + (barrier (:write)) + t)) #!-sb-lutex ;; This is a direct translation of the Mutex 2 algorithm from ;; "Futexes are Tricky" by Ulrich Drepper. @@ -399,20 +526,22 @@ HOLDING-MUTEX-P." +lock-taken+ +lock-contested+)))) ;; Wait on the contested lock. - (loop - (multiple-value-bind (to-sec to-usec stop-sec stop-usec deadlinep) - (decode-timeout timeout) - (declare (ignore stop-sec stop-usec)) - (case (with-pinned-objects (mutex) - (futex-wait (mutex-state-address mutex) - (get-lisp-obj-address +lock-contested+) - (or to-sec -1) - (or to-usec 0))) - ((1) (if deadlinep - (signal-deadline) - (return-from get-mutex nil))) - ((2)) - (otherwise (return)))))) + (with-interrupts + (check-deadlock) + (loop + (multiple-value-bind (to-sec to-usec stop-sec stop-usec deadlinep) + (decode-timeout timeout) + (declare (ignore stop-sec stop-usec)) + (case (with-pinned-objects (mutex) + (futex-wait (mutex-state-address mutex) + (get-lisp-obj-address +lock-contested+) + (or to-sec -1) + (or to-usec 0))) + ((1) (if deadlinep + (signal-deadline) + (return-from get-mutex nil))) + ((2)) + (otherwise (return))))))) (setf old (sb!ext:compare-and-swap (mutex-state mutex) +lock-free+ +lock-contested+)) diff --git a/src/code/thread.lisp b/src/code/thread.lisp index 7a2e567..85eb1c8 100644 --- a/src/code/thread.lisp +++ b/src/code/thread.lisp @@ -28,7 +28,8 @@ in future versions." :type mutex) (result-lock (make-mutex :name "thread result lock") - :type mutex)) + :type mutex) + waiting-for) (def!struct mutex #!+sb-doc diff --git a/tests/threads.impure.lisp b/tests/threads.impure.lisp index 424a6c2..311e1d5 100644 --- a/tests/threads.impure.lisp +++ b/tests/threads.impure.lisp @@ -344,11 +344,18 @@ (with-test (:name (:grab-mutex :timeout :acquisition-fail)) #+sb-lutex (error "Mutex timeout not supported here.") - (let ((m (make-mutex))) + (let ((m (make-mutex)) + (w (make-semaphore))) (with-mutex (m) - (assert (null (join-thread (make-thread - #'(lambda () - (grab-mutex m :timeout 0.1))))))))) + (let ((th (make-thread + #'(lambda () + (prog1 + (grab-mutex m :timeout 0.1) + (signal-semaphore w)))))) + ;; Wait for it to -- otherwise the detect the deadlock chain + ;; from JOIN-THREAD. + (wait-on-semaphore w) + (assert (null (join-thread th))))))) (with-test (:name (:grab-mutex :timeout :acquisition-success)) #+sb-lutex @@ -363,16 +370,18 @@ (with-test (:name (:grab-mutex :timeout+deadline)) #+sb-lutex (error "Mutex timeout not supported here.") - (let ((m (make-mutex))) + (let ((m (make-mutex)) + (w (make-semaphore))) (with-mutex (m) - (assert (eq (join-thread - (make-thread #'(lambda () - (sb-sys:with-deadline (:seconds 0.0) - (handler-case - (grab-mutex m :timeout 0.0) - (sb-sys:deadline-timeout () - :deadline)))))) - :deadline))))) + (let ((th (make-thread #'(lambda () + (sb-sys:with-deadline (:seconds 0.0) + (handler-case + (grab-mutex m :timeout 0.0) + (sb-sys:deadline-timeout () + (signal-semaphore w) + :deadline))))))) + (wait-on-semaphore w) + (assert (eq (join-thread th) :deadline)))))) (with-test (:name (:grab-mutex :waitp+deadline)) #+sb-lutex diff --git a/tests/threads.pure.lisp b/tests/threads.pure.lisp index 752d230..6fcd67a 100644 --- a/tests/threads.pure.lisp +++ b/tests/threads.pure.lisp @@ -304,3 +304,91 @@ (assert (equal (list :write :thread-dead) (sb-thread::symbol-value-in-thread-error-info e))))) (assert error-occurred))) + +#+sb-thread +(with-test (:name deadlock-detection.1) + (flet ((test (ma mb sa sb) + (lambda () + (handler-case + (sb-thread:with-mutex (ma) + (sb-thread:signal-semaphore sa) + (sb-thread:wait-on-semaphore sb) + (sb-thread:with-mutex (mb) + :ok)) + (sb-thread:thread-deadlock (e) + (princ e) + :deadlock))))) + (let* ((m1 (sb-thread:make-mutex :name "M1")) + (m2 (sb-thread:make-mutex :name "M2")) + (s1 (sb-thread:make-semaphore :name "S1")) + (s2 (sb-thread:make-semaphore :name "S2")) + (t1 (sb-thread:make-thread (test m1 m2 s1 s2) :name "T1")) + (t2 (sb-thread:make-thread (test m2 m1 s2 s1) :name "T2"))) + ;; One will deadlock, and the other will then complete normally + (let ((res (list (sb-thread:join-thread t1) + (sb-thread:join-thread t2)))) + (assert (or (equal '(:deadlock :ok) res) + (equal '(:ok :deadlock) res))))))) + +(with-test (:name deadlock-detection.2) + (let* ((m1 (sb-thread:make-mutex :name "M1")) + (m2 (sb-thread:make-mutex :name "M2")) + (s1 (sb-thread:make-semaphore :name "S1")) + (s2 (sb-thread:make-semaphore :name "S2")) + (t1 (sb-thread:make-thread + (lambda () + (sb-thread:with-mutex (m1) + (sb-thread:signal-semaphore s1) + (sb-thread:wait-on-semaphore s2) + (sb-thread:with-mutex (m2) + :ok))) + :name "T1"))) + (prog (err) + :retry + (handler-bind ((sb-thread:thread-deadlock + (lambda (e) + (unless err + ;; Make sure we can print the condition + ;; while it's active + (let ((*print-circle* nil)) + (setf err (princ-to-string e))) + (go :retry))))) + (when err + (sleep 1)) + (assert (eq :ok (sb-thread:with-mutex (m2) + (unless err + (sb-thread:signal-semaphore s2) + (sb-thread:wait-on-semaphore s1) + (sleep 1)) + (sb-thread:with-mutex (m1) + :ok))))) + (assert (stringp err))) + (assert (eq :ok (sb-thread:join-thread t1))))) + +(with-test (:name deadlock-detection.3) + (let* ((m1 (sb-thread:make-mutex :name "M1")) + (m2 (sb-thread:make-mutex :name "M2")) + (s1 (sb-thread:make-semaphore :name "S1")) + (s2 (sb-thread:make-semaphore :name "S2")) + (t1 (sb-thread:make-thread + (lambda () + (sb-thread:with-mutex (m1) + (sb-thread:signal-semaphore s1) + (sb-thread:wait-on-semaphore s2) + (sb-thread:with-mutex (m2) + :ok))) + :name "T1"))) + ;; Currently we don't consider it a deadlock + ;; if there is a timeout in the chain. + (assert (eq :deadline + (handler-case + (sb-thread:with-mutex (m2) + (sb-thread:signal-semaphore s2) + (sb-thread:wait-on-semaphore s1) + (sleep 1) + (sb-sys:with-deadline (:seconds 0.1) + (sb-thread:with-mutex (m1) + :ok))) + (sb-sys:deadline-timeout () + :deadline)))) + (assert (eq :ok (join-thread t1))))) diff --git a/tests/timer.impure.lisp b/tests/timer.impure.lisp index 150b696..457bfd2 100644 --- a/tests/timer.impure.lisp +++ b/tests/timer.impure.lisp @@ -272,18 +272,19 @@ (sleep (random 0.01)) (loop repeat 10000 do (sb-ext:unschedule-timer timer)))) - (loop repeat 5 - do (mapcar #'sb-thread:join-thread - (loop for i from 1 upto 10 - collect (let* ((thread (sb-thread:make-thread #'flop - :name (format nil "scheduler ~A" i))) - (ticker (make-limited-timer (lambda () 13) - 1000 - :thread (or other thread) - :name (format nil "ticker ~A" i)))) - (setf other thread) - (sb-ext:schedule-timer ticker 0 :repeat-interval 0.00001) - thread))))))) + (sb-sys:with-deadline (:seconds 30) + (loop repeat 5 + do (mapcar #'sb-thread:join-thread + (loop for i from 1 upto 10 + collect (let* ((thread (sb-thread:make-thread #'flop + :name (format nil "scheduler ~A" i))) + (ticker (make-limited-timer (lambda () 13) + 1000 + :thread (or other thread) + :name (format nil "ticker ~A" i)))) + (setf other thread) + (sb-ext:schedule-timer ticker 0 :repeat-interval 0.00001) + thread)))))))) ;;;; FIXME: OS X 10.4 doesn't like these being at all, and gives us a SIGSEGV ;;;; instead of using the Mach expection system! 10.5 on the other tends to diff --git a/version.lisp-expr b/version.lisp-expr index 2072a3d..8c8c9a8 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -20,4 +20,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.48.9" +"1.0.48.10" -- 1.7.10.4