From 5abf3b4b94c8c2315777e63729293395dc54992c Mon Sep 17 00:00:00 2001 From: Nikodemus Siivola Date: Mon, 15 Aug 2011 14:33:49 +0300 Subject: [PATCH] fix bogus deadlocks from interrupts and GCs lp#807475 Going in despite the freeze since this is a regression that can semi-randomly break correct code. *ouch* Thanks to Bart Bortta and #sbcl for the analysis. Problem 1: T1 holds L1 T2 is waiting for L1 T2 is interrupted, interrupt handler grabs L2 T1 starts waiting on L2 Prior to this patch, when GET-MUTEX in T2's interrupt handler grabbed L2 is marked T2 as still waiting for L1 -- which is not true until the interrupt handler returns. Problem 2: T1 holds L1 T2 is waiting for L1 GC is triggered in T2 inside GET-MUTEX T2 grabs *ALREADY-IN-GC* lock GC is triggered in T1, T1 tries to get *ALREADY-IN-GC* lock. Prior to this patch, when T1 detects a bogus deadlock as T2 has been marked as waiting for L1 -- which is not true until the GC is finished and normal execution resumes. Problem 3: T1 holds L1 T2 is waiting for L1 GC is triggered in T2 inside GET-MUTEX T2 grabs lock L2 due to a finalizer or an after-gc-hook GC is triggered in T1 T1 tries to grab L2 due to a finalizer, etc. Same as problem 2, but with a user-lock and POST-GC instead of *ALREADY-IN-GC* and SUB-GC. This patch fixes the issue by saving, clearing, and restoring the waiting-for mark in 1) interrupt handlers 2) SUB-GC 3) POST-GC --- NEWS | 2 ++ src/code/gc.lisp | 74 ++++++++++++++++++++++--------------------- src/code/target-signal.lisp | 27 ++++++++-------- src/code/target-thread.lisp | 8 ++--- src/code/thread.lisp | 20 ++++++++++++ tests/threads.impure.lisp | 52 ++++++++++++++++++++++++++++++ 6 files changed, 130 insertions(+), 53 deletions(-) diff --git a/NEWS b/NEWS index a0816bb..2bc82ba 100644 --- a/NEWS +++ b/NEWS @@ -48,6 +48,8 @@ changes relative to sbcl-1.0.50: (lp#819269) * bug fix: &REST to &MORE conversion still works in unsafe call to known functions; reported by Lutz Euler (lp#826459). + * bug fix: bogus deadlocks from interrupts and GCs. (lp#807475, regression + since 1.0.48) changes in sbcl-1.0.50 relative to sbcl-1.0.49: * enhancement: errors from FD handlers now provide a restart to remove diff --git a/src/code/gc.lisp b/src/code/gc.lisp index 5ea3669..148547c 100644 --- a/src/code/gc.lisp +++ b/src/code/gc.lisp @@ -236,39 +236,40 @@ NIL as the pathname." ;; Now, if GET-MUTEX did not cons, that would be enough. ;; Because it does, we need the :IN-PROGRESS bit above to ;; tell the runtime not to trigger gcs. - (let ((sb!impl::*in-without-gcing* t) - (sb!impl::*deadline* nil) - (sb!impl::*deadline-seconds* nil)) - (sb!thread:with-mutex (*already-in-gc*) - (let ((*gc-inhibit* t)) - (let ((old-usage (dynamic-usage)) - (new-usage 0)) - (unsafe-clear-roots gen) - (gc-stop-the-world) - (let ((start-time (get-internal-run-time))) - (collect-garbage gen) - (setf *gc-epoch* (cons nil nil)) - (let ((run-time (- (get-internal-run-time) start-time))) - ;; KLUDGE: Sometimes we see the second getrusage() call - ;; return a smaller value than the first, which can - ;; lead to *GC-RUN-TIME* to going negative, which in - ;; turn is a type-error. - (when (plusp run-time) - (incf *gc-run-time* run-time)))) - (setf *gc-pending* nil - new-usage (dynamic-usage)) - #!+sb-thread - (assert (not *stop-for-gc-pending*)) - (gc-start-the-world) - ;; In a multithreaded environment the other threads - ;; will see *n-b-f-o-p* change a little late, but - ;; that's OK. - (let ((freed (- old-usage new-usage))) - ;; GENCGC occasionally reports negative here, but - ;; the current belief is that it is part of the - ;; normal order of things and not a bug. - (when (plusp freed) - (incf *n-bytes-freed-or-purified* freed))))))) + (sb!thread::without-thread-waiting-for (:already-without-interrupts t) + (let* ((sb!impl::*in-without-gcing* t) + (sb!impl::*deadline* nil) + (sb!impl::*deadline-seconds* nil)) + (sb!thread:with-mutex (*already-in-gc*) + (let ((*gc-inhibit* t)) + (let ((old-usage (dynamic-usage)) + (new-usage 0)) + (unsafe-clear-roots gen) + (gc-stop-the-world) + (let ((start-time (get-internal-run-time))) + (collect-garbage gen) + (setf *gc-epoch* (cons nil nil)) + (let ((run-time (- (get-internal-run-time) start-time))) + ;; KLUDGE: Sometimes we see the second getrusage() call + ;; return a smaller value than the first, which can + ;; lead to *GC-RUN-TIME* to going negative, which in + ;; turn is a type-error. + (when (plusp run-time) + (incf *gc-run-time* run-time)))) + (setf *gc-pending* nil + new-usage (dynamic-usage)) + #!+sb-thread + (assert (not *stop-for-gc-pending*)) + (gc-start-the-world) + ;; In a multithreaded environment the other threads + ;; will see *n-b-f-o-p* change a little late, but + ;; that's OK. + (let ((freed (- old-usage new-usage))) + ;; GENCGC occasionally reports negative here, but + ;; the current belief is that it is part of the + ;; normal order of things and not a bug. + (when (plusp freed) + (incf *n-bytes-freed-or-purified* freed)))))))) ;; While holding the mutex we were protected from ;; SIG_STOP_FOR_GC and recursive GCs. Now, in order to ;; preserve the invariant (*GC-PENDING* -> @@ -299,9 +300,10 @@ NIL as the pathname." ;; finalizers and after-gc hooks. (when (sb!thread:thread-alive-p sb!thread:*current-thread*) (when *allow-with-interrupts* - (with-interrupts - (run-pending-finalizers) - (call-hooks "after-GC" *after-gc-hooks* :on-error :warn))))) + (sb!thread::without-thread-waiting-for () + (with-interrupts + (run-pending-finalizers) + (call-hooks "after-GC" *after-gc-hooks* :on-error :warn)))))) ;;; This is the user-advertised garbage collection function. (defun gc (&key (gen 0) (full nil) &allow-other-keys) diff --git a/src/code/target-signal.lisp b/src/code/target-signal.lisp index bea8892..2ae4a88 100644 --- a/src/code/target-signal.lisp +++ b/src/code/target-signal.lisp @@ -47,19 +47,20 @@ (let ((*unblock-deferrables-on-enabling-interrupts-p* t)) (with-interrupt-bindings (let ((sb!debug:*stack-top-hint* - (nth-value 1 (sb!kernel:find-interrupted-name-and-frame)))) - (allow-with-interrupts - (nlx-protect (funcall function) - ;; We've been running with deferrables - ;; blocked in Lisp called by a C signal - ;; handler. If we return normally the sigmask - ;; in the interrupted context is restored. - ;; However, if we do an nlx the operating - ;; system will not restore it for us. - (when *unblock-deferrables-on-enabling-interrupts-p* - ;; This means that storms of interrupts - ;; doing an nlx can still run out of stack. - (unblock-deferrable-signals))))))))) + (nth-value 1 (sb!kernel:find-interrupted-name-and-frame)))) + (sb!thread::without-thread-waiting-for (:already-without-interrupts t) + (allow-with-interrupts + (nlx-protect (funcall function) + ;; We've been running with deferrables + ;; blocked in Lisp called by a C signal + ;; handler. If we return normally the sigmask + ;; in the interrupted context is restored. + ;; However, if we do an nlx the operating + ;; system will not restore it for us. + (when *unblock-deferrables-on-enabling-interrupts-p* + ;; This means that storms of interrupts + ;; doing an nlx can still run out of stack. + (unblock-deferrable-signals)))))))))) (defmacro in-interruption ((&key) &body body) #!+sb-doc diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index 18bec35..038dad0 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -301,7 +301,7 @@ created and old ones may exit at any time." (defmacro with-deadlocks ((thread lock &optional timeout) &body forms) (declare (ignorable timeout)) - (with-unique-names (prev n-thread n-lock n-timeout new) + (with-unique-names (n-thread n-lock n-timeout new) `(let* ((,n-thread ,thread) (,n-lock ,lock) (,n-timeout #!-sb-lutex @@ -309,8 +309,6 @@ created and old ones may exit at any time." `(or ,timeout (when sb!impl::*deadline* sb!impl::*deadline-seconds*)))) - ;; If we get interrupted while waiting for a lock, etc. - (,prev (thread-waiting-for ,n-thread)) (,new (if ,n-timeout (cons ,n-timeout ,n-lock) ,n-lock))) @@ -321,7 +319,9 @@ created and old ones may exit at any time." (progn (setf (thread-waiting-for ,n-thread) ,new) ,@forms) - (setf (thread-waiting-for ,n-thread) ,prev))))) + ;; Interrupt handlers and GC save and restore any + ;; previous wait marks using WITHOUT-DEADLOCKS below. + (setf (thread-waiting-for ,n-thread) nil))))) (declaim (inline get-spinlock release-spinlock)) diff --git a/src/code/thread.lisp b/src/code/thread.lisp index 85eb1c8..5dfb84e 100644 --- a/src/code/thread.lisp +++ b/src/code/thread.lisp @@ -71,6 +71,26 @@ stale value, use MUTEX-OWNER instead." (name nil :type (or null thread-name)) (value nil)) +(sb!xc:defmacro without-thread-waiting-for ((&key already-without-interrupts) &body body) + (with-unique-names (thread prev) + (let ((without (if already-without-interrupts + 'progn + 'without-interrupts)) + (with (if already-without-interrupts + 'progn + 'with-local-interrupts))) + `(let* ((,thread *current-thread*) + (,prev (thread-waiting-for ,thread))) + (flet ((exec () ,@body)) + (if ,prev + (,without + (unwind-protect + (progn + (setf (thread-waiting-for ,thread) nil) + (,with (exec))) + (setf (thread-waiting-for ,thread) ,prev))) + (exec))))))) + (sb!xc:defmacro with-mutex ((mutex &key (value '*current-thread*) (wait-p t)) &body body) #!+sb-doc diff --git a/tests/threads.impure.lisp b/tests/threads.impure.lisp index 311e1d5..e3db63e 100644 --- a/tests/threads.impure.lisp +++ b/tests/threads.impure.lisp @@ -1385,3 +1385,55 @@ (format t "~%joined ~S~%" (sb-thread:thread-name th))) (list d1 d2 d3 i)))) (format t "parallel defclass test done~%") + +(with-test (:name (:deadlock-detection :interrupts)) + (let* ((m1 (sb-thread:make-mutex :name "M1")) + (m2 (sb-thread:make-mutex :name "M2")) + (t1 (sb-thread:make-thread + (lambda () + (sb-thread:with-mutex (m1) + (sleep 0.3) + :ok)) + :name "T1")) + (t2 (sb-thread:make-thread + (lambda () + (sleep 0.1) + (sb-thread:with-mutex (m1 :wait-p t) + (sleep 0.2) + :ok)) + :name "T2"))) + (sleep 0.2) + (sb-thread:interrupt-thread t2 (lambda () + (sb-thread:with-mutex (m2 :wait-p t) + (sleep 0.3)))) + (sleep 0.05) + (sb-thread:interrupt-thread t1 (lambda () + (sb-thread:with-mutex (m2 :wait-p t) + (sleep 0.3)))) + ;; both threads should finish without a deadlock or deadlock + ;; detection error + (let ((res (list (sb-thread:join-thread t1) + (sb-thread:join-thread t2)))) + (assert (equal '(:ok :ok) res))))) + +(with-test (:name (:deadlock-detection :gc)) + ;; To semi-reliably trigger the error (in SBCL's where) + ;; it was present you had to run this for > 30 seconds, + ;; but that's a bit long for a single test. + (let* ((stop (+ 5 (get-universal-time))) + (m1 (sb-thread:make-mutex :name "m1")) + (t1 (sb-thread:make-thread + (lambda () + (loop until (> (get-universal-time) stop) + do (sb-thread:with-mutex (m1) + (eval `(make-array 24)))) + :ok))) + (t2 (sb-thread:make-thread + (lambda () + (loop until (> (get-universal-time) stop) + do (sb-thread:with-mutex (m1) + (eval `(make-array 24)))) + :ok)))) + (let ((res (list (sb-thread:join-thread t1) + (sb-thread:join-thread t2)))) + (assert (equal '(:ok :ok) res))))) -- 1.7.10.4