more robust deadlock detection
authorNikodemus Siivola <nikodemus@random-state.net>
Mon, 28 Nov 2011 17:45:16 +0000 (19:45 +0200)
committerNikodemus Siivola <nikodemus@random-state.net>
Mon, 5 Dec 2011 08:44:44 +0000 (10:44 +0200)
 Lock around building the deadlock chain using WITH-CAS-LOCK after the
 tentative deadlock has been detected, and break the deadlock chain before
 signaling the error.

 This means that a single deadlock is reported only in a single thread.

 Fixes occasional failures of deadlock-detection.1 due to a bogus vicious
 metacircle. (Two threads detecting the same deadlock, then racing to report
 the error detected as another deadlock, the reporting of which in turn looked
 like a metacircle to CLOS if PRINT-OBJECT didn't yet have the right method in
 cache.)

NEWS
src/code/target-thread.lisp
tests/threads.pure.lisp

diff --git a/NEWS b/NEWS
index 60733ce..122a0ea 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,8 @@ changes relative to sbcl-1.0.54:
     ** --arch option can be used to specify the architecture to build for.
        (Mainly useful for building 32-bit SBCL's on x86-64 hosts, not
        full-blows cross-compilation.)
+  * bug fix: deadlock detection could report the same deadlock twice, for
+    two different threads. Now a single deadlock is reported exactly once.
 
 changes in sbcl-1.0.54 relative to sbcl-1.0.53:
   * minor incompatible changes:
index 7ec3309..4059d14 100644 (file)
@@ -362,6 +362,8 @@ HOLDING-MUTEX-P."
   ;; Make sure to get the current value.
   (sb!ext:compare-and-swap (mutex-%owner mutex) nil nil))
 
+(sb!ext:defglobal **deadlock-lock** nil)
+
 ;;; Signals an error if owner of LOCK is waiting on a lock whose release
 ;;; depends on the current thread. Does not detect deadlocks from sempahores.
 (defun check-deadlock ()
@@ -373,16 +375,36 @@ HOLDING-MUTEX-P."
                (let ((other-thread (mutex-%owner lock)))
                  (cond ((not other-thread))
                        ((eq self other-thread)
-                        (let* ((chain (deadlock-chain self origin))
+                        (let* ((chain
+                                 (with-cas-lock ((symbol-value '**deadlock-lock**))
+                                   (prog1 (deadlock-chain self origin)
+                                     ;; We're now committed to signaling the
+                                     ;; error and breaking the deadlock, so
+                                     ;; mark us as no longer waiting on the
+                                     ;; lock. This ensures that a single
+                                     ;; deadlock is reported in only one
+                                     ;; thread, and that we don't look like
+                                     ;; we're waiting on the lock when print
+                                     ;; stuff -- because that may lead to
+                                     ;; further deadlock checking, in turn
+                                     ;; possibly leading to a bogus vicious
+                                     ;; metacycle on PRINT-OBJECT.
+                                     (setf (thread-waiting-for self) nil))))
                                (barf
-                                (format nil
-                                        "~%WARNING: DEADLOCK CYCLE DETECTED:~%~@<   ~@;~
-                                         ~{~:@_~S~:@_~}~:@>~
-                                         ~%END OF CYCLE~%"
-                                        (mapcar #'car chain))))
+                                 (with-output-to-string (s)
+                                   (funcall (formatter
+                                             "~%WARNING: DEADLOCK CYCLE DETECTED:~%~@<   ~@;~
+                                              ~{~:@_~S~:@_~}~:@>~
+                                              ~%END OF CYCLE~%")
+                                            s
+                                            (mapcar #'car chain)))))
                           ;; Barf to stderr in case the system is too tied up
-                          ;; to report the error properly -- to avoid cross-talk
+                          ;; to report the error properly -- and to avoid cross-talk
                           ;; build the whole string up first.
+                          ;;
+                          ;; ...would be even better if we had
+                          ;; sensible locks on streams, but what can
+                          ;; you do...
                           (write-string barf sb!sys:*stderr*)
                           (finish-output sb!sys:*stderr*)
                           (error 'thread-deadlock
index adcf13a..e58cda7 100644 (file)
                 (t1 (sb-thread:make-thread (test m1 m2 s1 s2) :name "T1"))
                 (t2 (sb-thread:make-thread (test m2 m1 s2 s1) :name "T2")))
            ;; One will deadlock, and the other will then complete normally.
-           ;; ...except sometimes, when we get unlucky, and both will do
-           ;; the deadlock detection in parallel and both signal.
            (let ((res (list (sb-thread:join-thread t1)
                             (sb-thread:join-thread t2))))
              (assert (or (equal '(:deadlock :ok) res)
-                         (equal '(:ok :deadlock) res)
-                         (equal '(:deadlock :deadlock) res))))))))
+                         (equal '(:ok :deadlock) res))))))))
 
 (with-test (:name deadlock-detection.2 :skipped-on '(not :sb-thread))
   (let* ((m1 (sb-thread:make-mutex :name "M1"))