0.9.17.2: fix two potential GC deadlocks
authorNikodemus Siivola <nikodemus@random-state.net>
Wed, 27 Sep 2006 11:03:17 +0000 (11:03 +0000)
committerNikodemus Siivola <nikodemus@random-state.net>
Wed, 27 Sep 2006 11:03:17 +0000 (11:03 +0000)
 * Dying threads used to grab session and all-threads locks
   with GC inhibited, which was bad:
    1. T1 has the lock, GC not inhibited
       T2 in HANDLE-THREAD-EXIT waiting for the lock, GC inhibited
    2. GC is triggered
    3. T1 stopped while holding the lock
       T2 deadlocks waiting for T1 to release the lock.
 * Mark threads dead while holding the *ALL-THREADS-LOCK*, so that
     (unless (thread-alive-p th)
       (assert (not (member th (list-all-threads)))))
   cannot fail.
 * Since dying threads can now trigger GCs, don't run after-gc hooks
   and finalizers if the thread has been marked as dead.
 * Move all thread cleanup logic to HANDLE-THREAD exit.

NEWS
src/code/gc.lisp
src/code/target-thread.lisp
src/runtime/interrupt.c
src/runtime/thread.c
tests/threads.impure.lisp
version.lisp-expr

diff --git a/NEWS b/NEWS
index 67243b4..499e5be 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,7 @@
 ;;;; -*- coding: utf-8; -*-
+changes in sbcl-0.9.18 (1.0.beta?) relative to sbcl-0.9.16:
+  * bug fix: two potential GC deadlocks affecting threaded builds.
+
 changes in sbcl-0.9.17 (0.9.99?) relative to sbcl-0.9.16:
   * feature: weak hash tables, see MAKE-HASH-TABLE documentation
   * incompatible change: External-format support for FFI calls.  The
index 04562f0..fbd4e4b 100644 (file)
@@ -139,8 +139,9 @@ and submit it as a patch."
 ;;;; GC hooks
 
 (defvar *after-gc-hooks* nil
-  "Called after each garbage collection. In a multithreaded
-environment these hooks may run in any thread.")
+  "Called after each garbage collection, except for garbage collections
+triggered during thread exits. In a multithreaded environment these hooks may
+run in any thread.")
 
 \f
 ;;;; internal GC
@@ -179,7 +180,7 @@ environment these hooks may run in any thread.")
   (sb!thread:make-mutex :name "GC lock") "ID of thread running SUB-GC")
 
 (defun sub-gc (&key (gen 0))
-  (unless (eq sb!thread:*current-thread*
+  (unless (eq sb!thread:*current-thread* 
               (sb!thread::mutex-value *already-in-gc*))
     ;; With gencgc, unless *GC-PENDING* every allocation in this
     ;; function triggers another gc, potentially exceeding maximum
@@ -216,12 +217,19 @@ environment these hooks may run in any thread.")
       ;;
       ;; Can that be avoided by having the finalizers and hooks run only
       ;; from the outermost SUB-GC?
-      (run-pending-finalizers)
-      (dolist (hook *after-gc-hooks*)
-        (handler-case
-            (funcall hook)
-          (error (c)
-            (warn "Error calling after GC hook ~S:~%  ~S" hook c)))))))
+      ;;
+      ;; KLUDGE: Don't run the hooks in GC's triggered by dying threads,
+      ;; so that user-code never runs with 
+      ;;   (thread-alive-p *current-thread*) => nil
+      ;; The long-term solution will be to keep a separate thread for
+      ;; finalizers and after-gc hooks.
+      (when (sb!thread:thread-alive-p sb!thread:*current-thread*)
+        (run-pending-finalizers)
+        (dolist (hook *after-gc-hooks*)
+          (handler-case
+              (funcall hook)
+            (error (c)
+              (warn "Error calling after-GC hook ~S:~% ~A" hook c))))))))
 
 ;;; This is the user-advertised garbage collection function.
 (defun gc (&key (gen 0) (full nil) &allow-other-keys)
index 302a9fa..ed47314 100644 (file)
@@ -57,10 +57,18 @@ in future versions."
 (defvar *all-threads* ())
 (defvar *all-threads-lock* (make-mutex :name "all threads lock"))
 
+(defmacro with-all-threads-lock (&body body)
+  #!-sb-thread
+  `(locally ,@body)
+  #!+sb-thread
+  `(without-interrupts
+     (with-mutex (*all-threads-lock*)
+       ,@body)))
+
 (defun list-all-threads ()
   #!+sb-doc
   "Return a list of the live threads."
-  (with-mutex (*all-threads-lock*)
+  (with-all-threads-lock
     (copy-list *all-threads*)))
 
 (declaim (inline current-thread-sap))
@@ -97,7 +105,7 @@ in future versions."
   (define-alien-routine "signal_interrupt_thread"
       integer (os-thread unsigned-long))
 
-  (define-alien-routine "block_blockable_signals"
+  (define-alien-routine "block_deferrable_signals"
       void)
 
   #!+sb-lutex
@@ -472,16 +480,22 @@ this semaphore, then N of them is woken up."
 ;;; Remove thread from its session, if it has one.
 #!+sb-thread
 (defun handle-thread-exit (thread)
-  (with-mutex (*all-threads-lock*)
-    (/show0 "HANDLING THREAD EXIT")
-    #!+sb-lutex
-    (when (thread-interruptions-lock thread)
-      (/show0 "FREEING MUTEX LUTEX")
-      (with-lutex-address (lutex (mutex-lutex (thread-interruptions-lock thread)))
-        (%lutex-destroy lutex)))
-    (setq *all-threads* (delete thread *all-threads*)))
-  (when *session*
-    (%delete-thread-from-session thread *session*)))
+  (/show0 "HANDLING THREAD EXIT")
+  ;; We're going down, can't handle interrupts sanely anymore.
+  ;; GC remains enabled.
+  (block-deferrable-signals)
+  ;; Lisp-side cleanup
+  (with-all-threads-lock
+    (setf (thread-%alive-p thread) nil)
+    (setf (thread-os-thread thread) nil)
+    (setq *all-threads* (delete thread *all-threads*))
+    (when *session*
+      (%delete-thread-from-session thread *session*)))
+  #!+sb-lutex
+  (when (thread-interruptions-lock thread)
+    (/show0 "FREEING MUTEX LUTEX")
+    (with-lutex-address (lutex (mutex-lutex (thread-interruptions-lock thread)))
+      (%lutex-destroy lutex))))
 
 (defun terminate-session ()
   #!+sb-doc
@@ -621,7 +635,7 @@ returns the thread exits."
                   (sb!impl::*internal-symbol-output-fun* nil)
                   (sb!impl::*descriptor-handlers* nil)) ; serve-event
               (setf (thread-os-thread thread) (current-thread-sap-id))
-              (with-mutex (*all-threads-lock*)
+              (with-all-threads-lock
                 (push thread *all-threads*))
               (with-session-lock (*session*)
                 (push thread (session-threads *session*)))
@@ -644,15 +658,7 @@ returns the thread exits."
                            ;; threads, it's time to enable signals
                            (sb!unix::reset-signal-mask)
                            (funcall real-function))
-                      ;; we're going down, can't handle
-                      ;; interrupts sanely anymore
-                      (let ((sb!impl::*gc-inhibit* t))
-                        (block-blockable-signals)
-                        (setf (thread-%alive-p thread) nil)
-                        (setf (thread-os-thread thread) nil)
-                        ;; and remove what can be the last
-                        ;; reference to this thread
-                        (handle-thread-exit thread)))))))
+                      (handle-thread-exit thread))))))
             (values))))
     ;; Keep INITIAL-FUNCTION pinned until the child thread is
     ;; initialized properly.
index e010971..0d219b0 100644 (file)
@@ -186,6 +186,14 @@ block_blockable_signals(void)
 #endif
 }
 
+void
+block_deferrable_signals(void)
+{
+#ifndef LISP_FEATURE_WIN32
+    thread_sigmask(SIG_BLOCK, &deferrable_sigset, 0);
+#endif
+}
+
 \f
 /*
  * utility routines used by various signal handlers
index a08c2ad..262e642 100644 (file)
@@ -224,6 +224,7 @@ new_thread_trampoline(struct thread *th)
 {
     lispobj function;
     int result, lock_ret;
+
     FSHOW((stderr,"/creating thread %lu\n", thread_self()));
     function = th->no_tls_value_marker;
     th->no_tls_value_marker = NO_TLS_VALUE_MARKER_WIDETAG;
@@ -245,6 +246,9 @@ new_thread_trampoline(struct thread *th)
     gc_assert(lock_ret == 0);
 
     result = funcall0(function);
+
+    /* Block GC */
+    block_blockable_signals();
     th->state=STATE_DEAD;
 
     /* SIG_STOP_FOR_GC is blocked and GC might be waiting for this
index 43a60f3..8eb97ae 100644 (file)
     (wait-for-threads threads)))
 
 (format t "backtrace test done~%")
+
+(format t "~&starting gc deadlock test: WARNING: THIS TEST WILL HANG ON FAILURE!~%")
+
+(with-test (:name (:gc-deadlock))
+  ;; Prior to 0.9.16.46 thread exit potentially deadlocked the
+  ;; GC due to *all-threads-lock* and session lock. On earlier
+  ;; versions and at least on one specific box this test is good enough
+  ;; to catch that typically well before the 1500th iteration.
+  (loop
+     with i = 0
+     with n = 3000
+     while (< i n)
+     do
+       (incf i)
+       (when (zerop (mod i 100))
+         (write-char #\.)
+         (force-output))
+       (handler-case
+           (if (oddp i)
+               (sb-thread:make-thread
+                (lambda ()
+                  (sleep (random 0.001)))
+                :name (list :sleep i))
+               (sb-thread:make-thread
+                (lambda ()
+                  ;; KLUDGE: what we are doing here is explicit,
+                  ;; but the same can happen because of a regular
+                  ;; MAKE-THREAD or LIST-ALL-THREADS, and various
+                  ;; session functions.
+                  (sb-thread:with-mutex (sb-thread::*all-threads-lock*)
+                    (sb-thread::with-session-lock (sb-thread::*session*)
+                      (sb-ext:gc))))
+                :name (list :gc i)))
+         (error (e)
+           (format t "~%error creating thread ~D: ~A -- backing off for retry~%" i e)
+           (sleep 0.1)
+           (incf i)))))
+
+(format t "~&gc deadlock test done~%")
index ca4393a..94ed619 100644 (file)
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.9.17.1"
+"0.9.17.2"