From: Nikodemus Siivola Date: Wed, 27 Sep 2006 11:03:17 +0000 (+0000) Subject: 0.9.17.2: fix two potential GC deadlocks X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=bfb7c2d573bacfd9c5f3f243b7c1589f81f11406;p=sbcl.git 0.9.17.2: fix two potential GC deadlocks * Dying threads used to grab session and all-threads locks with GC inhibited, which was bad: 1. T1 has the lock, GC not inhibited T2 in HANDLE-THREAD-EXIT waiting for the lock, GC inhibited 2. GC is triggered 3. T1 stopped while holding the lock T2 deadlocks waiting for T1 to release the lock. * Mark threads dead while holding the *ALL-THREADS-LOCK*, so that (unless (thread-alive-p th) (assert (not (member th (list-all-threads))))) cannot fail. * Since dying threads can now trigger GCs, don't run after-gc hooks and finalizers if the thread has been marked as dead. * Move all thread cleanup logic to HANDLE-THREAD exit. --- diff --git a/NEWS b/NEWS index 67243b4..499e5be 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,7 @@ ;;;; -*- coding: utf-8; -*- +changes in sbcl-0.9.18 (1.0.beta?) relative to sbcl-0.9.16: + * bug fix: two potential GC deadlocks affecting threaded builds. + changes in sbcl-0.9.17 (0.9.99?) relative to sbcl-0.9.16: * feature: weak hash tables, see MAKE-HASH-TABLE documentation * incompatible change: External-format support for FFI calls. The diff --git a/src/code/gc.lisp b/src/code/gc.lisp index 04562f0..fbd4e4b 100644 --- a/src/code/gc.lisp +++ b/src/code/gc.lisp @@ -139,8 +139,9 @@ and submit it as a patch." ;;;; GC hooks (defvar *after-gc-hooks* nil - "Called after each garbage collection. In a multithreaded -environment these hooks may run in any thread.") + "Called after each garbage collection, except for garbage collections +triggered during thread exits. In a multithreaded environment these hooks may +run in any thread.") ;;;; internal GC @@ -179,7 +180,7 @@ environment these hooks may run in any thread.") (sb!thread:make-mutex :name "GC lock") "ID of thread running SUB-GC") (defun sub-gc (&key (gen 0)) - (unless (eq sb!thread:*current-thread* + (unless (eq sb!thread:*current-thread* (sb!thread::mutex-value *already-in-gc*)) ;; With gencgc, unless *GC-PENDING* every allocation in this ;; function triggers another gc, potentially exceeding maximum @@ -216,12 +217,19 @@ environment these hooks may run in any thread.") ;; ;; Can that be avoided by having the finalizers and hooks run only ;; from the outermost SUB-GC? - (run-pending-finalizers) - (dolist (hook *after-gc-hooks*) - (handler-case - (funcall hook) - (error (c) - (warn "Error calling after GC hook ~S:~% ~S" hook c))))))) + ;; + ;; KLUDGE: Don't run the hooks in GC's triggered by dying threads, + ;; so that user-code never runs with + ;; (thread-alive-p *current-thread*) => nil + ;; The long-term solution will be to keep a separate thread for + ;; finalizers and after-gc hooks. + (when (sb!thread:thread-alive-p sb!thread:*current-thread*) + (run-pending-finalizers) + (dolist (hook *after-gc-hooks*) + (handler-case + (funcall hook) + (error (c) + (warn "Error calling after-GC hook ~S:~% ~A" hook c)))))))) ;;; This is the user-advertised garbage collection function. (defun gc (&key (gen 0) (full nil) &allow-other-keys) diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index 302a9fa..ed47314 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -57,10 +57,18 @@ in future versions." (defvar *all-threads* ()) (defvar *all-threads-lock* (make-mutex :name "all threads lock")) +(defmacro with-all-threads-lock (&body body) + #!-sb-thread + `(locally ,@body) + #!+sb-thread + `(without-interrupts + (with-mutex (*all-threads-lock*) + ,@body))) + (defun list-all-threads () #!+sb-doc "Return a list of the live threads." - (with-mutex (*all-threads-lock*) + (with-all-threads-lock (copy-list *all-threads*))) (declaim (inline current-thread-sap)) @@ -97,7 +105,7 @@ in future versions." (define-alien-routine "signal_interrupt_thread" integer (os-thread unsigned-long)) - (define-alien-routine "block_blockable_signals" + (define-alien-routine "block_deferrable_signals" void) #!+sb-lutex @@ -472,16 +480,22 @@ this semaphore, then N of them is woken up." ;;; Remove thread from its session, if it has one. #!+sb-thread (defun handle-thread-exit (thread) - (with-mutex (*all-threads-lock*) - (/show0 "HANDLING THREAD EXIT") - #!+sb-lutex - (when (thread-interruptions-lock thread) - (/show0 "FREEING MUTEX LUTEX") - (with-lutex-address (lutex (mutex-lutex (thread-interruptions-lock thread))) - (%lutex-destroy lutex))) - (setq *all-threads* (delete thread *all-threads*))) - (when *session* - (%delete-thread-from-session thread *session*))) + (/show0 "HANDLING THREAD EXIT") + ;; We're going down, can't handle interrupts sanely anymore. + ;; GC remains enabled. + (block-deferrable-signals) + ;; Lisp-side cleanup + (with-all-threads-lock + (setf (thread-%alive-p thread) nil) + (setf (thread-os-thread thread) nil) + (setq *all-threads* (delete thread *all-threads*)) + (when *session* + (%delete-thread-from-session thread *session*))) + #!+sb-lutex + (when (thread-interruptions-lock thread) + (/show0 "FREEING MUTEX LUTEX") + (with-lutex-address (lutex (mutex-lutex (thread-interruptions-lock thread))) + (%lutex-destroy lutex)))) (defun terminate-session () #!+sb-doc @@ -621,7 +635,7 @@ returns the thread exits." (sb!impl::*internal-symbol-output-fun* nil) (sb!impl::*descriptor-handlers* nil)) ; serve-event (setf (thread-os-thread thread) (current-thread-sap-id)) - (with-mutex (*all-threads-lock*) + (with-all-threads-lock (push thread *all-threads*)) (with-session-lock (*session*) (push thread (session-threads *session*))) @@ -644,15 +658,7 @@ returns the thread exits." ;; threads, it's time to enable signals (sb!unix::reset-signal-mask) (funcall real-function)) - ;; we're going down, can't handle - ;; interrupts sanely anymore - (let ((sb!impl::*gc-inhibit* t)) - (block-blockable-signals) - (setf (thread-%alive-p thread) nil) - (setf (thread-os-thread thread) nil) - ;; and remove what can be the last - ;; reference to this thread - (handle-thread-exit thread))))))) + (handle-thread-exit thread)))))) (values)))) ;; Keep INITIAL-FUNCTION pinned until the child thread is ;; initialized properly. diff --git a/src/runtime/interrupt.c b/src/runtime/interrupt.c index e010971..0d219b0 100644 --- a/src/runtime/interrupt.c +++ b/src/runtime/interrupt.c @@ -186,6 +186,14 @@ block_blockable_signals(void) #endif } +void +block_deferrable_signals(void) +{ +#ifndef LISP_FEATURE_WIN32 + thread_sigmask(SIG_BLOCK, &deferrable_sigset, 0); +#endif +} + /* * utility routines used by various signal handlers diff --git a/src/runtime/thread.c b/src/runtime/thread.c index a08c2ad..262e642 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -224,6 +224,7 @@ new_thread_trampoline(struct thread *th) { lispobj function; int result, lock_ret; + FSHOW((stderr,"/creating thread %lu\n", thread_self())); function = th->no_tls_value_marker; th->no_tls_value_marker = NO_TLS_VALUE_MARKER_WIDETAG; @@ -245,6 +246,9 @@ new_thread_trampoline(struct thread *th) gc_assert(lock_ret == 0); result = funcall0(function); + + /* Block GC */ + block_blockable_signals(); th->state=STATE_DEAD; /* SIG_STOP_FOR_GC is blocked and GC might be waiting for this diff --git a/tests/threads.impure.lisp b/tests/threads.impure.lisp index 43a60f3..8eb97ae 100644 --- a/tests/threads.impure.lisp +++ b/tests/threads.impure.lisp @@ -670,3 +670,42 @@ (wait-for-threads threads))) (format t "backtrace test done~%") + +(format t "~&starting gc deadlock test: WARNING: THIS TEST WILL HANG ON FAILURE!~%") + +(with-test (:name (:gc-deadlock)) + ;; Prior to 0.9.16.46 thread exit potentially deadlocked the + ;; GC due to *all-threads-lock* and session lock. On earlier + ;; versions and at least on one specific box this test is good enough + ;; to catch that typically well before the 1500th iteration. + (loop + with i = 0 + with n = 3000 + while (< i n) + do + (incf i) + (when (zerop (mod i 100)) + (write-char #\.) + (force-output)) + (handler-case + (if (oddp i) + (sb-thread:make-thread + (lambda () + (sleep (random 0.001))) + :name (list :sleep i)) + (sb-thread:make-thread + (lambda () + ;; KLUDGE: what we are doing here is explicit, + ;; but the same can happen because of a regular + ;; MAKE-THREAD or LIST-ALL-THREADS, and various + ;; session functions. + (sb-thread:with-mutex (sb-thread::*all-threads-lock*) + (sb-thread::with-session-lock (sb-thread::*session*) + (sb-ext:gc)))) + :name (list :gc i))) + (error (e) + (format t "~%error creating thread ~D: ~A -- backing off for retry~%" i e) + (sleep 0.1) + (incf i))))) + +(format t "~&gc deadlock test done~%") diff --git a/version.lisp-expr b/version.lisp-expr index ca4393a..94ed619 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.9.17.1" +"0.9.17.2"