1.0.48.10: add deadlock detection to spinlocks and mutexes
authorNikodemus Siivola <nikodemus@random-state.net>
Wed, 11 May 2011 16:48:37 +0000 (16:48 +0000)
committerNikodemus Siivola <nikodemus@random-state.net>
Wed, 11 May 2011 16:48:37 +0000 (16:48 +0000)
 Each thread gets a WAITING-FOR slot which holds the lock it is
 currently trying to acquire.

 After initial try at lock acquisition fails, we check for deadlocks:
 we look at the thread that owns the lock we want -> lock it is
 waiting for, etc, recursively, until we either bottom out or find
 someone waiting for us.

 Print thread as having state WAITING if it is waiting for a lock.

 Add a PRINT-OBJECT methods for MUTEXes and SPINLOCKs.

 SB-THREAD:THREAD-DEADLOCK-CYCLE contains a list of the
 threads and locks involved, so that the situation can be
 analyzed even afer a handler has broken the deadlock.

NEWS
package-data-list.lisp-expr
src/code/cold-init.lisp
src/code/deadline.lisp
src/code/target-thread.lisp
src/code/thread.lisp
tests/threads.impure.lisp
tests/threads.pure.lisp
tests/timer.impure.lisp
version.lisp-expr

diff --git a/NEWS b/NEWS
index 48085ff..96b4403 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,7 @@ changes relative to sbcl-1.0.48:
     streams.
   * enhancement: more informative compile-time warnings and runtime
     errors for type-errors detected at compile-time.
+  * enhancement: deadlock detection for mutexes and spinlocks.
   * bug fix: blocking reads from FIFOs created by RUN-PROGRAM were
     uninterruptible, as well as blocking reads from socket streams created
     with for which :SERVE-EVENTS NIL. (regression from 1.0.42.43)
index f46895d..4cfabd1 100644 (file)
@@ -1884,6 +1884,7 @@ is a good idea, but see SB-SYS re. blurring of boundaries."
                "!FOREIGN-COLD-INIT" "!FUNCTION-NAMES-COLD-INIT"
                "!TYPE-CLASS-COLD-INIT" "!TYPEDEFS-COLD-INIT"
                "!ALIEN-TYPE-COLD-INIT" "!CLASSES-COLD-INIT"
+               "!DEADLINE-COLD-INIT"
                "!EARLY-TYPE-COLD-INIT" "!LATE-TYPE-COLD-INIT"
                "!FIXUP-TYPE-COLD-INIT" "!TARGET-TYPE-COLD-INIT"
                "!RANDOM-COLD-INIT" "!READER-COLD-INIT"
@@ -1948,6 +1949,8 @@ is a good idea, but see SB-SYS re. blurring of boundaries."
                "SYMBOL-VALUE-IN-THREAD-ERROR"
                "TERMINATE-THREAD"
                "THREAD"
+               "THREAD-DEADLOCK"
+               "THREAD-DEADLOCK-CYCLE"
                "THREAD-ERROR"
                "THREAD-ERROR-THREAD"
                "THREAD-ALIVE-P"
index d8c1160..8444234 100644 (file)
 
   ;; I'm not sure where eval is first called, so I put this first.
   (show-and-call !eval-cold-init)
-
+  (show-and-call !deadline-cold-init)
   (show-and-call thread-init-or-reinit)
   (show-and-call !typecheckfuns-cold-init)
 
index 97a147b..43b4fc9 100644 (file)
 
 (in-package "SB!IMPL")
 
+(!begin-collecting-cold-init-forms)
+
 ;;; Current deadline as internal time units or NIL.
-(defvar *deadline* nil)
 (declaim (type (or unsigned-byte null) *deadline*))
+(defvar *deadline*)
+(!cold-init-forms (setq *deadline* nil))
 
 ;;; The relative number of seconds the current deadline corresponds
 ;;; to. Used for continuing from TIMEOUT conditions.
-(defvar *deadline-seconds* nil)
+(defvar *deadline-seconds*)
+(!cold-init-forms (setq *deadline-seconds* nil))
 
 (declaim (inline seconds-to-internal-time))
 (defun seconds-to-internal-time (seconds)
@@ -171,3 +175,4 @@ it will signal a timeout condition."
                    (values to-sec to-usec stop-sec stop-usec signalp)))
                (values nil nil nil nil nil)))))))
 
+(!defun-from-collected-cold-init-forms !deadline-cold-init)
index bc478fa..4b89ca8 100644 (file)
 The offending thread is initialized by the :THREAD initialization argument and
 read by the function THREAD-ERROR-THREAD."))
 
+(define-condition thread-deadlock (thread-error)
+  ((cycle :initarg :cycle :reader thread-deadlock-cycle))
+  (:report
+   (lambda (condition stream)
+     (let ((*print-circle* t))
+       (format stream "Deadlock cycle detected:~%~@<   ~@;~
+                     ~{~:@_~S~:@_~}~:@>"
+               (mapcar #'car (thread-deadlock-cycle condition)))))))
+
 #!+sb-doc
 (setf
  (fdocumentation 'thread-error-thread 'function)
@@ -101,18 +110,41 @@ arbitrary printable objects, and need not be unique.")
                      (multiple-value-list
                       (join-thread thread :default cookie))))
            (state (if (eq :running info)
-                      info
+                      (let* ((lock (thread-waiting-for thread)))
+                        (typecase lock
+                          (cons
+                           (list "waiting for:" (cdr lock)
+                                 "timeout: " (car lock)))
+                          (null
+                           (list info))
+                          (t
+                           (list "waiting for:" lock))))
                       (if (eq cookie (car info))
-                          :aborted
+                          (list :aborted)
                           :finished)))
-           (values (when (eq :finished state) info)))
+           (values (when (eq :finished state)
+                     info))
+           (*print-level* 4))
       (format stream
-              "~@[~S ~]~:[~A~;~A~:[ no values~; values: ~:*~{~S~^, ~}~]~]"
+              "~@[~S ~]~:[~{~I~A~^~2I~_ ~}~_~;~A~:[ no values~; values: ~:*~{~S~^, ~}~]~]"
               (thread-name thread)
               (eq :finished state)
               state
               values))))
 
+(defun print-lock (lock name owner stream)
+  (let ((*print-circle* t))
+    (print-unreadable-object (lock stream :type t :identity (not name))
+      (if owner
+          (format stream "~@[~S ~]~2I~_owner: ~S" name owner)
+          (format stream "~@[~S ~](free)" name)))))
+
+(def!method print-object ((mutex mutex) stream)
+  (print-lock mutex (mutex-name mutex) (mutex-owner mutex) stream))
+
+(def!method print-object ((spinlock spinlock) stream)
+  (print-lock spinlock (spinlock-name spinlock) (spinlock-value spinlock) stream))
+
 (defun thread-alive-p (thread)
   #!+sb-doc
   "Return T if THREAD is still alive. Note that the return value is
@@ -267,6 +299,27 @@ created and old ones may exit at any time."
 
 ;;;; Spinlocks
 
+(defmacro with-deadlocks ((thread lock timeout) &body forms)
+  (with-unique-names (prev n-thread n-lock n-timeout new)
+    `(let* ((,n-thread ,thread)
+            (,n-lock ,lock)
+            (,n-timeout (or ,timeout
+                            (when sb!impl::*deadline*
+                              sb!impl::*deadline-seconds*)))
+            ;; If we get interrupted while waiting for a lock, etc.
+            (,prev (thread-waiting-for ,n-thread))
+            (,new (if ,n-timeout
+                      (cons ,n-timeout ,n-lock)
+                      ,n-lock)))
+       (declare (dynamic-extent ,new))
+       ;; No WITHOUT-INTERRUPTS, since WITH-DEADLOCKS is used
+       ;; in places where interrupts should already be disabled.
+       (unwind-protect
+            (progn
+              (setf (thread-waiting-for ,n-thread) ,new)
+              ,@forms)
+         (setf (thread-waiting-for ,n-thread) ,prev)))))
+
 (declaim (inline get-spinlock release-spinlock))
 
 ;;; Should always be called with interrupts disabled.
@@ -278,23 +331,28 @@ created and old ones may exit at any time."
       (when (eq old new)
         (error "Recursive lock attempt on ~S." spinlock))
       #!+sb-thread
-      (flet ((cas ()
-               (if (sb!ext:compare-and-swap (spinlock-value spinlock) nil new)
-                   (thread-yield)
-                   (return-from get-spinlock t))))
-        (if (and (not *interrupts-enabled*) *allow-with-interrupts*)
-            ;; If interrupts are disabled, but we are allowed to
-            ;; enabled them, check for pending interrupts every once
-            ;; in a while. %CHECK-INTERRUPTS is taking shortcuts, make
-            ;; sure that deferrables are unblocked by doing an empty
-            ;; WITH-INTERRUPTS once.
-            (progn
-              (with-interrupts)
-              (loop
-               (loop repeat 128 do (cas)) ; 128 is arbitrary here
-               (sb!unix::%check-interrupts)))
-            (loop (cas)))))
-    t))
+      (with-deadlocks (new spinlock nil)
+        (flet ((cas ()
+                 (if (sb!ext:compare-and-swap (spinlock-value spinlock) nil new)
+                     (thread-yield)
+                     (return-from get-spinlock t))))
+          ;; Try once.
+          (cas)
+          ;; Check deadlocks
+          (with-interrupts (check-deadlock))
+          (if (and (not *interrupts-enabled*) *allow-with-interrupts*)
+              ;; If interrupts are disabled, but we are allowed to
+              ;; enabled them, check for pending interrupts every once
+              ;; in a while. %CHECK-INTERRUPTS is taking shortcuts, make
+              ;; sure that deferrables are unblocked by doing an empty
+              ;; WITH-INTERRUPTS once.
+              (progn
+                (with-interrupts)
+                (loop
+                  (loop repeat 128 do (cas)) ; 128 is arbitrary here
+                  (sb!unix::%check-interrupts)))
+              (loop (cas)))))))
+    t)
 
 (defun release-spinlock (spinlock)
   (declare (optimize (speed 3) (safety 0)))
@@ -345,7 +403,65 @@ HOLDING-MUTEX-P."
   ;; Make sure to get the current value.
   (sb!ext:compare-and-swap (mutex-%owner mutex) nil nil))
 
-(defun get-mutex (mutex &optional (new-owner *current-thread*)
+;;; Signals an error if owner of LOCK is waiting on a lock whose release
+;;; depends on the current thread. Does not detect deadlocks from sempahores.
+(defun check-deadlock ()
+  (let* ((self *current-thread*)
+         (origin (thread-waiting-for self)))
+    (labels ((lock-owner (lock)
+               (etypecase lock
+                 (mutex (mutex-%owner lock))
+                 (spinlock (spinlock-value lock))))
+             (detect-deadlock (lock)
+               (let ((other-thread (lock-owner lock)))
+                 (cond ((not other-thread))
+                       ((eq self other-thread)
+                        (let* ((chain (deadlock-chain self origin))
+                               (barf
+                                (format nil
+                                        "~%WARNING: DEADLOCK CYCLE DETECTED:~%~@<   ~@;~
+                                         ~{~:@_~S~:@_~}~:@>~
+                                         ~%END OF CYCLE~%"
+                                        (mapcar #'car chain))))
+                          ;; Barf to stderr in case the system is too tied up
+                          ;; to report the error properly -- to avoid cross-talk
+                          ;; build the whole string up first.
+                          (write-string barf sb!sys:*stderr*)
+                          (finish-output sb!sys:*stderr*)
+                          (error 'thread-deadlock
+                                 :thread *current-thread*
+                                 :cycle chain)))
+                       (t
+                        (let ((other-lock (thread-waiting-for other-thread)))
+                          ;; If the thread is waiting with a timeout OTHER-LOCK
+                          ;; is a cons, and we don't consider it a deadlock -- since
+                          ;; it will time out on its own sooner or later.
+                          (when (and other-lock (not (consp other-lock)))
+                            (detect-deadlock other-lock)))))))
+             (deadlock-chain (thread lock)
+               (let* ((other-thread (lock-owner lock))
+                      (other-lock (thread-waiting-for other-thread)))
+                 (cond ((not other-thread)
+                        ;; The deadlock is gone -- maybe someone timed out?
+                        (return-from check-deadlock nil))
+                       ((consp other-lock)
+                        ;; There's a timeout -- no deadlock.
+                        (return-from check-deadlock nil))
+                       ((eq self other-thread)
+                        ;; Done
+                        (list (list thread lock)))
+                       (t
+                        (if other-lock
+                            (cons (list thread lock)
+                                  (deadlock-chain other-thread other-lock))
+                            ;; Again, the deadlock is gone?
+                            (return-from check-deadlock nil)))))))
+      ;; Timeout means there is no deadlock
+      (unless (consp origin)
+        (detect-deadlock origin)
+        t))))
+
+(defun get-mutex (mutex &optional new-owner
                                   (waitp t) (timeout nil))
   #!+sb-doc
   "Deprecated in favor of GRAB-MUTEX."
@@ -363,7 +479,7 @@ HOLDING-MUTEX-P."
   #!-sb-thread
   (setf (mutex-%owner mutex) new-owner)
   #!+sb-thread
-  (progn
+  (with-deadlocks (new-owner mutex timeout)
     ;; FIXME: Lutexes do not currently support deadlines, as at least
     ;; on Darwin pthread_foo_timedbar functions are not supported:
     ;; this means that we probably need to use the Carbon multiprocessing
@@ -378,12 +494,23 @@ HOLDING-MUTEX-P."
       (when timeout
         (error "Mutex timeouts not supported on this platform."))
       (when (zerop (with-lutex-address (lutex (mutex-lutex mutex))
-                    (if waitp
-                        (with-interrupts (%lutex-lock lutex))
-                        (%lutex-trylock lutex))))
-       (setf (mutex-%owner mutex) new-owner)
-       (barrier (:write))
-       t))
+                     (if waitp
+                         (let ((once (%lutex-trylock lutex)))
+                           (cond ((zerop once)
+                                  ;; No need to wait.
+                                  once)
+                                 (t
+                                  (with-interrupts
+                                    ;; Check for deadlocks before waiting
+                                    (check-deadlock)
+                                    (%lutex-lock lutex)))))
+                         (%lutex-trylock lutex))))
+        ;; FIXME: If %LUTEX-LOCK unwinds due to a signal, we may actually
+        ;; be holding the lock already -- and but neglect to mark ourselves
+        ;; as the owner here. This is bad.
+        (setf (mutex-%owner mutex) new-owner)
+        (barrier (:write))
+        t))
     #!-sb-lutex
     ;; This is a direct translation of the Mutex 2 algorithm from
     ;; "Futexes are Tricky" by Ulrich Drepper.
@@ -399,20 +526,22 @@ HOLDING-MUTEX-P."
                                                         +lock-taken+
                                                         +lock-contested+))))
              ;; Wait on the contested lock.
-             (loop
-              (multiple-value-bind (to-sec to-usec stop-sec stop-usec deadlinep)
-                  (decode-timeout timeout)
-                (declare (ignore stop-sec stop-usec))
-                (case (with-pinned-objects (mutex)
-                        (futex-wait (mutex-state-address mutex)
-                                    (get-lisp-obj-address +lock-contested+)
-                                    (or to-sec -1)
-                                    (or to-usec 0)))
-                  ((1) (if deadlinep
-                           (signal-deadline)
-                           (return-from get-mutex nil)))
-                  ((2))
-                  (otherwise (return))))))
+             (with-interrupts
+               (check-deadlock)
+               (loop
+                 (multiple-value-bind (to-sec to-usec stop-sec stop-usec deadlinep)
+                     (decode-timeout timeout)
+                   (declare (ignore stop-sec stop-usec))
+                   (case (with-pinned-objects (mutex)
+                           (futex-wait (mutex-state-address mutex)
+                                       (get-lisp-obj-address +lock-contested+)
+                                       (or to-sec -1)
+                                       (or to-usec 0)))
+                     ((1) (if deadlinep
+                              (signal-deadline)
+                              (return-from get-mutex nil)))
+                     ((2))
+                     (otherwise (return)))))))
            (setf old (sb!ext:compare-and-swap (mutex-state mutex)
                                               +lock-free+
                                               +lock-contested+))
index 7a2e567..85eb1c8 100644 (file)
@@ -28,7 +28,8 @@ in future versions."
    :type mutex)
   (result-lock
    (make-mutex :name "thread result lock")
-   :type mutex))
+   :type mutex)
+  waiting-for)
 
 (def!struct mutex
   #!+sb-doc
index 424a6c2..311e1d5 100644 (file)
 (with-test (:name (:grab-mutex :timeout :acquisition-fail))
   #+sb-lutex
   (error "Mutex timeout not supported here.")
-  (let ((m (make-mutex)))
+  (let ((m (make-mutex))
+        (w (make-semaphore)))
     (with-mutex (m)
-      (assert (null (join-thread (make-thread
-                                  #'(lambda ()
-                                      (grab-mutex m :timeout 0.1)))))))))
+      (let ((th (make-thread
+                 #'(lambda ()
+                     (prog1
+                         (grab-mutex m :timeout 0.1)
+                       (signal-semaphore w))))))
+        ;; Wait for it to -- otherwise the detect the deadlock chain
+        ;; from JOIN-THREAD.
+        (wait-on-semaphore w)
+        (assert (null (join-thread th)))))))
 
 (with-test (:name (:grab-mutex :timeout :acquisition-success))
   #+sb-lutex
 (with-test (:name (:grab-mutex :timeout+deadline))
   #+sb-lutex
   (error "Mutex timeout not supported here.")
-  (let ((m (make-mutex)))
+  (let ((m (make-mutex))
+        (w (make-semaphore)))
     (with-mutex (m)
-      (assert (eq (join-thread
-                   (make-thread #'(lambda ()
-                                    (sb-sys:with-deadline (:seconds 0.0)
-                                      (handler-case
-                                          (grab-mutex m :timeout 0.0)
-                                        (sb-sys:deadline-timeout ()
-                                          :deadline))))))
-                  :deadline)))))
+      (let ((th (make-thread #'(lambda ()
+                                 (sb-sys:with-deadline (:seconds 0.0)
+                                   (handler-case
+                                       (grab-mutex m :timeout 0.0)
+                                     (sb-sys:deadline-timeout ()
+                                       (signal-semaphore w)
+                                       :deadline)))))))
+        (wait-on-semaphore w)
+        (assert (eq (join-thread th) :deadline))))))
 
 (with-test (:name (:grab-mutex :waitp+deadline))
   #+sb-lutex
index 752d230..6fcd67a 100644 (file)
         (assert (equal (list :write :thread-dead)
                        (sb-thread::symbol-value-in-thread-error-info e)))))
     (assert error-occurred)))
+
+#+sb-thread
+(with-test (:name deadlock-detection.1)
+  (flet ((test (ma mb sa sb)
+           (lambda ()
+             (handler-case
+                 (sb-thread:with-mutex (ma)
+                   (sb-thread:signal-semaphore sa)
+                   (sb-thread:wait-on-semaphore sb)
+                   (sb-thread:with-mutex (mb)
+                     :ok))
+               (sb-thread:thread-deadlock (e)
+                 (princ e)
+                 :deadlock)))))
+    (let* ((m1 (sb-thread:make-mutex :name "M1"))
+           (m2 (sb-thread:make-mutex :name "M2"))
+           (s1 (sb-thread:make-semaphore :name "S1"))
+           (s2 (sb-thread:make-semaphore :name "S2"))
+           (t1 (sb-thread:make-thread (test m1 m2 s1 s2) :name "T1"))
+           (t2 (sb-thread:make-thread (test m2 m1 s2 s1) :name "T2")))
+      ;; One will deadlock, and the other will then complete normally
+      (let ((res (list (sb-thread:join-thread t1)
+                       (sb-thread:join-thread t2))))
+        (assert (or (equal '(:deadlock :ok) res)
+                    (equal '(:ok :deadlock) res)))))))
+
+(with-test (:name deadlock-detection.2)
+  (let* ((m1 (sb-thread:make-mutex :name "M1"))
+         (m2 (sb-thread:make-mutex :name "M2"))
+         (s1 (sb-thread:make-semaphore :name "S1"))
+         (s2 (sb-thread:make-semaphore :name "S2"))
+         (t1 (sb-thread:make-thread
+              (lambda ()
+                (sb-thread:with-mutex (m1)
+                  (sb-thread:signal-semaphore s1)
+                  (sb-thread:wait-on-semaphore s2)
+                  (sb-thread:with-mutex (m2)
+                    :ok)))
+              :name "T1")))
+    (prog (err)
+     :retry
+       (handler-bind ((sb-thread:thread-deadlock
+                       (lambda (e)
+                         (unless err
+                           ;; Make sure we can print the condition
+                           ;; while it's active
+                           (let ((*print-circle* nil))
+                             (setf err (princ-to-string e)))
+                           (go :retry)))))
+         (when err
+           (sleep 1))
+         (assert (eq :ok (sb-thread:with-mutex (m2)
+                           (unless err
+                             (sb-thread:signal-semaphore s2)
+                             (sb-thread:wait-on-semaphore s1)
+                             (sleep 1))
+                           (sb-thread:with-mutex (m1)
+                             :ok)))))
+       (assert (stringp err)))
+    (assert (eq :ok (sb-thread:join-thread t1)))))
+
+(with-test (:name deadlock-detection.3)
+  (let* ((m1 (sb-thread:make-mutex :name "M1"))
+         (m2 (sb-thread:make-mutex :name "M2"))
+         (s1 (sb-thread:make-semaphore :name "S1"))
+         (s2 (sb-thread:make-semaphore :name "S2"))
+         (t1 (sb-thread:make-thread
+              (lambda ()
+                (sb-thread:with-mutex (m1)
+                  (sb-thread:signal-semaphore s1)
+                  (sb-thread:wait-on-semaphore s2)
+                  (sb-thread:with-mutex (m2)
+                    :ok)))
+              :name "T1")))
+    ;; Currently we don't consider it a deadlock
+    ;; if there is a timeout in the chain.
+    (assert (eq :deadline
+                (handler-case
+                    (sb-thread:with-mutex (m2)
+                      (sb-thread:signal-semaphore s2)
+                      (sb-thread:wait-on-semaphore s1)
+                      (sleep 1)
+                      (sb-sys:with-deadline (:seconds 0.1)
+                        (sb-thread:with-mutex (m1)
+                          :ok)))
+                  (sb-sys:deadline-timeout ()
+                    :deadline))))
+    (assert (eq :ok (join-thread t1)))))
index 150b696..457bfd2 100644 (file)
              (sleep (random 0.01))
              (loop repeat 10000
                    do (sb-ext:unschedule-timer timer))))
-      (loop repeat 5
-            do (mapcar #'sb-thread:join-thread
-                       (loop for i from 1 upto 10
-                             collect (let* ((thread (sb-thread:make-thread #'flop
-                                                                           :name (format nil "scheduler ~A" i)))
-                                            (ticker (make-limited-timer (lambda () 13)
-                                                                               1000
-                                                                               :thread (or other thread)
-                                                                               :name (format nil "ticker ~A" i))))
-                                       (setf other thread)
-                                       (sb-ext:schedule-timer ticker 0 :repeat-interval 0.00001)
-                                       thread)))))))
+      (sb-sys:with-deadline (:seconds 30)
+        (loop repeat 5
+              do (mapcar #'sb-thread:join-thread
+                           (loop for i from 1 upto 10
+                                 collect (let* ((thread (sb-thread:make-thread #'flop
+                                                                               :name (format nil "scheduler ~A" i)))
+                                                (ticker (make-limited-timer (lambda () 13)
+                                                                            1000
+                                                                            :thread (or other thread)
+                                                                            :name (format nil "ticker ~A" i))))
+                                           (setf other thread)
+                                           (sb-ext:schedule-timer ticker 0 :repeat-interval 0.00001)
+                                           thread))))))))
 
 ;;;; FIXME: OS X 10.4 doesn't like these being at all, and gives us a SIGSEGV
 ;;;; instead of using the Mach expection system! 10.5 on the other tends to
index 2072a3d..8c8c9a8 100644 (file)
@@ -20,4 +20,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.48.9"
+"1.0.48.10"