fix bogus deadlocks from interrupts and GCs
authorNikodemus Siivola <nikodemus@sb-studio.net>
Mon, 15 Aug 2011 11:33:49 +0000 (14:33 +0300)
committerNikodemus Siivola <nikodemus@sb-studio.net>
Mon, 15 Aug 2011 13:32:14 +0000 (16:32 +0300)
 lp#807475

 Going in despite the freeze since this is a regression that can
 semi-randomly break correct code. *ouch*

 Thanks to Bart Bortta and #sbcl for the analysis.

 Problem 1:

   T1 holds L1

   T2 is waiting for L1

   T2 is interrupted, interrupt handler grabs L2

   T1 starts waiting on L2

   Prior to this patch, when GET-MUTEX in T2's interrupt handler grabbed
   L2 is marked T2 as still waiting for L1 -- which is not true until
   the interrupt handler returns.

 Problem 2:

   T1 holds L1

   T2 is waiting for L1

   GC is triggered in T2 inside GET-MUTEX

   T2 grabs *ALREADY-IN-GC* lock

   GC is triggered in T1, T1 tries to get *ALREADY-IN-GC* lock.

   Prior to this patch, when T1 detects a bogus deadlock as T2 has
   been marked as waiting for L1 -- which is not true until the GC is
   finished and normal execution resumes.

 Problem 3:

   T1 holds L1

   T2 is waiting for L1

   GC is triggered in T2 inside GET-MUTEX

   T2 grabs lock L2 due to a finalizer or an after-gc-hook

   GC is triggered in T1

   T1 tries to grab L2 due to a finalizer, etc.

   Same as problem 2, but with a user-lock and POST-GC instead of
   *ALREADY-IN-GC* and SUB-GC.

 This patch fixes the issue by saving, clearing, and restoring
 the waiting-for mark in

  1) interrupt handlers

  2) SUB-GC

  3) POST-GC

NEWS
src/code/gc.lisp
src/code/target-signal.lisp
src/code/target-thread.lisp
src/code/thread.lisp
tests/threads.impure.lisp

diff --git a/NEWS b/NEWS
index a0816bb..2bc82ba 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -48,6 +48,8 @@ changes relative to sbcl-1.0.50:
     (lp#819269)
   * bug fix: &REST to &MORE conversion still works in unsafe call to known
     functions; reported by Lutz Euler (lp#826459).
+  * bug fix: bogus deadlocks from interrupts and GCs. (lp#807475, regression
+    since 1.0.48)
 
 changes in sbcl-1.0.50 relative to sbcl-1.0.49:
   * enhancement: errors from FD handlers now provide a restart to remove
index 5ea3669..148547c 100644 (file)
@@ -236,39 +236,40 @@ NIL as the pathname."
            ;; Now, if GET-MUTEX did not cons, that would be enough.
            ;; Because it does, we need the :IN-PROGRESS bit above to
            ;; tell the runtime not to trigger gcs.
-           (let ((sb!impl::*in-without-gcing* t)
-                 (sb!impl::*deadline* nil)
-                 (sb!impl::*deadline-seconds* nil))
-             (sb!thread:with-mutex (*already-in-gc*)
-               (let ((*gc-inhibit* t))
-                 (let ((old-usage (dynamic-usage))
-                       (new-usage 0))
-                   (unsafe-clear-roots gen)
-                   (gc-stop-the-world)
-                   (let ((start-time (get-internal-run-time)))
-                     (collect-garbage gen)
-                     (setf *gc-epoch* (cons nil nil))
-                     (let ((run-time (- (get-internal-run-time) start-time)))
-                       ;; KLUDGE: Sometimes we see the second getrusage() call
-                       ;; return a smaller value than the first, which can
-                       ;; lead to *GC-RUN-TIME* to going negative, which in
-                       ;; turn is a type-error.
-                       (when (plusp run-time)
-                         (incf *gc-run-time* run-time))))
-                   (setf *gc-pending* nil
-                         new-usage (dynamic-usage))
-                   #!+sb-thread
-                   (assert (not *stop-for-gc-pending*))
-                   (gc-start-the-world)
-                   ;; In a multithreaded environment the other threads
-                   ;; will see *n-b-f-o-p* change a little late, but
-                   ;; that's OK.
-                   (let ((freed (- old-usage new-usage)))
-                     ;; GENCGC occasionally reports negative here, but
-                     ;; the current belief is that it is part of the
-                     ;; normal order of things and not a bug.
-                     (when (plusp freed)
-                       (incf *n-bytes-freed-or-purified* freed)))))))
+           (sb!thread::without-thread-waiting-for (:already-without-interrupts t)
+             (let* ((sb!impl::*in-without-gcing* t)
+                    (sb!impl::*deadline* nil)
+                    (sb!impl::*deadline-seconds* nil))
+               (sb!thread:with-mutex (*already-in-gc*)
+                 (let ((*gc-inhibit* t))
+                   (let ((old-usage (dynamic-usage))
+                         (new-usage 0))
+                     (unsafe-clear-roots gen)
+                     (gc-stop-the-world)
+                     (let ((start-time (get-internal-run-time)))
+                       (collect-garbage gen)
+                       (setf *gc-epoch* (cons nil nil))
+                       (let ((run-time (- (get-internal-run-time) start-time)))
+                         ;; KLUDGE: Sometimes we see the second getrusage() call
+                         ;; return a smaller value than the first, which can
+                         ;; lead to *GC-RUN-TIME* to going negative, which in
+                         ;; turn is a type-error.
+                         (when (plusp run-time)
+                           (incf *gc-run-time* run-time))))
+                     (setf *gc-pending* nil
+                           new-usage (dynamic-usage))
+                     #!+sb-thread
+                     (assert (not *stop-for-gc-pending*))
+                     (gc-start-the-world)
+                     ;; In a multithreaded environment the other threads
+                     ;; will see *n-b-f-o-p* change a little late, but
+                     ;; that's OK.
+                     (let ((freed (- old-usage new-usage)))
+                       ;; GENCGC occasionally reports negative here, but
+                       ;; the current belief is that it is part of the
+                       ;; normal order of things and not a bug.
+                       (when (plusp freed)
+                         (incf *n-bytes-freed-or-purified* freed))))))))
            ;; While holding the mutex we were protected from
            ;; SIG_STOP_FOR_GC and recursive GCs. Now, in order to
            ;; preserve the invariant (*GC-PENDING* ->
@@ -299,9 +300,10 @@ NIL as the pathname."
   ;; finalizers and after-gc hooks.
   (when (sb!thread:thread-alive-p sb!thread:*current-thread*)
     (when *allow-with-interrupts*
-      (with-interrupts
-        (run-pending-finalizers)
-        (call-hooks "after-GC" *after-gc-hooks* :on-error :warn)))))
+      (sb!thread::without-thread-waiting-for ()
+        (with-interrupts
+          (run-pending-finalizers)
+          (call-hooks "after-GC" *after-gc-hooks* :on-error :warn))))))
 
 ;;; This is the user-advertised garbage collection function.
 (defun gc (&key (gen 0) (full nil) &allow-other-keys)
index bea8892..2ae4a88 100644 (file)
     (let ((*unblock-deferrables-on-enabling-interrupts-p* t))
       (with-interrupt-bindings
         (let ((sb!debug:*stack-top-hint*
-               (nth-value 1 (sb!kernel:find-interrupted-name-and-frame))))
-          (allow-with-interrupts
-            (nlx-protect (funcall function)
-                         ;; We've been running with deferrables
-                         ;; blocked in Lisp called by a C signal
-                         ;; handler. If we return normally the sigmask
-                         ;; in the interrupted context is restored.
-                         ;; However, if we do an nlx the operating
-                         ;; system will not restore it for us.
-                         (when *unblock-deferrables-on-enabling-interrupts-p*
-                           ;; This means that storms of interrupts
-                           ;; doing an nlx can still run out of stack.
-                           (unblock-deferrable-signals)))))))))
+                (nth-value 1 (sb!kernel:find-interrupted-name-and-frame))))
+          (sb!thread::without-thread-waiting-for (:already-without-interrupts t)
+              (allow-with-interrupts
+                (nlx-protect (funcall function)
+                             ;; We've been running with deferrables
+                             ;; blocked in Lisp called by a C signal
+                             ;; handler. If we return normally the sigmask
+                             ;; in the interrupted context is restored.
+                             ;; However, if we do an nlx the operating
+                             ;; system will not restore it for us.
+                             (when *unblock-deferrables-on-enabling-interrupts-p*
+                               ;; This means that storms of interrupts
+                               ;; doing an nlx can still run out of stack.
+                               (unblock-deferrable-signals))))))))))
 
 (defmacro in-interruption ((&key) &body body)
   #!+sb-doc
index 18bec35..038dad0 100644 (file)
@@ -301,7 +301,7 @@ created and old ones may exit at any time."
 
 (defmacro with-deadlocks ((thread lock &optional timeout) &body forms)
   (declare (ignorable timeout))
-  (with-unique-names (prev n-thread n-lock n-timeout new)
+  (with-unique-names (n-thread n-lock n-timeout new)
     `(let* ((,n-thread ,thread)
             (,n-lock ,lock)
             (,n-timeout #!-sb-lutex
@@ -309,8 +309,6 @@ created and old ones may exit at any time."
                            `(or ,timeout
                                 (when sb!impl::*deadline*
                                   sb!impl::*deadline-seconds*))))
-            ;; If we get interrupted while waiting for a lock, etc.
-            (,prev (thread-waiting-for ,n-thread))
             (,new (if ,n-timeout
                       (cons ,n-timeout ,n-lock)
                       ,n-lock)))
@@ -321,7 +319,9 @@ created and old ones may exit at any time."
             (progn
               (setf (thread-waiting-for ,n-thread) ,new)
               ,@forms)
-         (setf (thread-waiting-for ,n-thread) ,prev)))))
+         ;; Interrupt handlers and GC save and restore any
+         ;; previous wait marks using WITHOUT-DEADLOCKS below.
+         (setf (thread-waiting-for ,n-thread) nil)))))
 
 (declaim (inline get-spinlock release-spinlock))
 
index 85eb1c8..5dfb84e 100644 (file)
@@ -71,6 +71,26 @@ stale value, use MUTEX-OWNER instead."
   (name  nil :type (or null thread-name))
   (value nil))
 
+(sb!xc:defmacro without-thread-waiting-for ((&key already-without-interrupts) &body body)
+  (with-unique-names (thread prev)
+    (let ((without (if already-without-interrupts
+                       'progn
+                       'without-interrupts))
+          (with (if already-without-interrupts
+                    'progn
+                    'with-local-interrupts)))
+      `(let* ((,thread *current-thread*)
+              (,prev (thread-waiting-for ,thread)))
+         (flet ((exec () ,@body))
+           (if ,prev
+               (,without
+                (unwind-protect
+                     (progn
+                       (setf (thread-waiting-for ,thread) nil)
+                       (,with (exec)))
+                  (setf (thread-waiting-for ,thread) ,prev)))
+               (exec)))))))
+
 (sb!xc:defmacro with-mutex ((mutex &key (value '*current-thread*) (wait-p t))
                             &body body)
   #!+sb-doc
index 311e1d5..e3db63e 100644 (file)
             (format t "~%joined ~S~%" (sb-thread:thread-name th)))
           (list d1 d2 d3 i))))
 (format t "parallel defclass test done~%")
+
+(with-test (:name (:deadlock-detection :interrupts))
+  (let* ((m1 (sb-thread:make-mutex :name "M1"))
+         (m2 (sb-thread:make-mutex :name "M2"))
+         (t1 (sb-thread:make-thread
+              (lambda ()
+                (sb-thread:with-mutex (m1)
+                  (sleep 0.3)
+                  :ok))
+              :name "T1"))
+         (t2 (sb-thread:make-thread
+              (lambda ()
+                (sleep 0.1)
+                (sb-thread:with-mutex (m1 :wait-p t)
+                  (sleep 0.2)
+                  :ok))
+              :name "T2")))
+    (sleep 0.2)
+    (sb-thread:interrupt-thread t2 (lambda ()
+                                     (sb-thread:with-mutex (m2 :wait-p t)
+                                       (sleep 0.3))))
+    (sleep 0.05)
+    (sb-thread:interrupt-thread t1 (lambda ()
+                                     (sb-thread:with-mutex (m2 :wait-p t)
+                                       (sleep 0.3))))
+    ;; both threads should finish without a deadlock or deadlock
+    ;; detection error
+    (let ((res (list (sb-thread:join-thread t1)
+                     (sb-thread:join-thread t2))))
+      (assert (equal '(:ok :ok) res)))))
+
+(with-test (:name (:deadlock-detection :gc))
+  ;; To semi-reliably trigger the error (in SBCL's where)
+  ;; it was present you had to run this for > 30 seconds,
+  ;; but that's a bit long for a single test.
+  (let* ((stop (+ 5 (get-universal-time)))
+         (m1 (sb-thread:make-mutex :name "m1"))
+         (t1 (sb-thread:make-thread
+              (lambda ()
+                (loop until (> (get-universal-time) stop)
+                      do (sb-thread:with-mutex (m1)
+                           (eval `(make-array 24))))
+                :ok)))
+         (t2 (sb-thread:make-thread
+              (lambda ()
+                (loop until (> (get-universal-time) stop)
+                      do (sb-thread:with-mutex (m1)
+                           (eval `(make-array 24))))
+                :ok))))
+    (let ((res (list (sb-thread:join-thread t1)
+                     (sb-thread:join-thread t2))))
+      (assert (equal '(:ok :ok) res)))))