Fix deadlocks in GC on Windows.
authorStas Boukarev <stassats@gmail.com>
Fri, 22 Nov 2013 18:38:31 +0000 (22:38 +0400)
committerStas Boukarev <stassats@gmail.com>
Fri, 22 Nov 2013 19:09:27 +0000 (23:09 +0400)
When multiple threads cons and end up in the GC, they could deadlock
trying to send messages using safepoints and trying to acquire the
*already-in-gc* lock.

Also happens to stop gc.impure.lisp / BUG-936304 test from failing on linux-x86.

Thanks to Paul Khuong for the patch.

NEWS
src/code/gc.lisp
src/runtime/gc-common.c
tests/dynamic-extent.impure.lisp

diff --git a/NEWS b/NEWS
index 0458bbf..3c11c72 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,7 @@ changes relative to sbcl-1.1.13:
     Munksgaard. (lp#455381)
   * bug fix: DEFSTRUCTs with NIL as a slot name no longer cause strange
     CLOS-related errors.  (lp#633911)
+  * bug fix: GC deadlocks caused by concurrent consing on Windows.
 
 changes in sbcl-1.1.13 relative to sbcl-1.1.12:
   * optimization: better distribution of SXHASH over small conses of related
index 61c351c..f869788 100644 (file)
@@ -220,65 +220,92 @@ statistics are appended to it."
          (setf *gc-pending* t)
          nil)
         (t
-         (without-interrupts
-           (setf *gc-pending* :in-progress)
-           ;; Tricks to to prevent triggerring a recursive gc. This is
-           ;; like a WITHOUT-GCING inside the lock except that we
-           ;; cannot call MAYBE-HANDLE-PENDING-GC at the end, because
-           ;; that would lead to a recursive attempt on the lock. In
-           ;; case you are wondering, wrapping the lock in a
-           ;; WITHOUT-GCING would also deadlock. The
-           ;; *IN-WITHOUT-GCING* part is used to tell the runtime that
-           ;; it's ok to have a pending gc even though *GC-INHIBIT* is
-           ;; NIL.
-           ;;
-           ;; Now, if GET-MUTEX did not cons, that would be enough.
-           ;; Because it does, we need the :IN-PROGRESS bit above to
-           ;; tell the runtime not to trigger gcs.
-           (sb!thread::without-thread-waiting-for (:already-without-interrupts t)
-             (let* ((sb!impl::*in-without-gcing* t)
-                    (sb!impl::*deadline* nil)
-                    (sb!impl::*deadline-seconds* nil))
-               (sb!thread:with-mutex (*already-in-gc*)
-                 (let ((*gc-inhibit* t))
-                   (let ((old-usage (dynamic-usage))
-                         (new-usage 0))
-                     (unsafe-clear-roots gen)
-                     (gc-stop-the-world)
-                     (let ((start-time (get-internal-run-time)))
-                       (collect-garbage gen)
-                       (setf *gc-epoch* (cons nil nil))
-                       (let ((run-time (- (get-internal-run-time) start-time)))
-                         ;; KLUDGE: Sometimes we see the second getrusage() call
-                         ;; return a smaller value than the first, which can
-                         ;; lead to *GC-RUN-TIME* to going negative, which in
-                         ;; turn is a type-error.
-                         (when (plusp run-time)
-                           (incf *gc-run-time* run-time))))
-                     #!+sb-safepoint
-                     (setf *stop-for-gc-pending* nil)
-                     (setf *gc-pending* nil
-                           new-usage (dynamic-usage))
-                     #!+sb-thread
-                     (assert (not *stop-for-gc-pending*))
-                     (gc-start-the-world)
-                     ;; In a multithreaded environment the other threads
-                     ;; will see *n-b-f-o-p* change a little late, but
-                     ;; that's OK.
-                     (let ((freed (- old-usage new-usage)))
-                       ;; GENCGC occasionally reports negative here, but
-                       ;; the current belief is that it is part of the
-                       ;; normal order of things and not a bug.
-                       (when (plusp freed)
-                         (incf *n-bytes-freed-or-purified* freed))))))))
-           ;; While holding the mutex we were protected from
-           ;; SIG_STOP_FOR_GC and recursive GCs. Now, in order to
-           ;; preserve the invariant (*GC-PENDING* ->
-           ;; pseudo-atomic-interrupted or *GC-INHIBIT*), let's check
-           ;; explicitly for a pending gc before interrupts are
-           ;; enabled again.
-           (maybe-handle-pending-gc))
-         t)))
+         (flet ((perform-gc ()
+                  ;; Called from WITHOUT-GCING and WITHOUT-INTERRUPTS
+                  ;; after the world has been stopped, but it's an
+                  ;; awkwardly long piece of code to nest so deeply.
+                  (let ((old-usage (dynamic-usage))
+                        (new-usage 0)
+                        (start-time (get-internal-run-time)))
+                    (collect-garbage gen)
+                    (setf *gc-epoch* (cons nil nil))
+                    (let ((run-time (- (get-internal-run-time) start-time)))
+                      ;; KLUDGE: Sometimes we see the second getrusage() call
+                      ;; return a smaller value than the first, which can
+                      ;; lead to *GC-RUN-TIME* to going negative, which in
+                      ;; turn is a type-error.
+                      (when (plusp run-time)
+                        (incf *gc-run-time* run-time)))
+                    #!+sb-safepoint
+                    (setf *stop-for-gc-pending* nil)
+                    (setf *gc-pending* nil
+                          new-usage (dynamic-usage))
+                    #!+sb-thread
+                    (assert (not *stop-for-gc-pending*))
+                    (gc-start-the-world)
+                    ;; In a multithreaded environment the other threads
+                    ;; will see *n-b-f-o-p* change a little late, but
+                    ;; that's OK.
+                    ;; N.B. the outer without-gcing prevents this
+                    ;; function from being entered, so no need for
+                    ;; locking.
+                    (let ((freed (- old-usage new-usage)))
+                      ;; GENCGC occasionally reports negative here, but
+                      ;; the current belief is that it is part of the
+                      ;; normal order of things and not a bug.
+                      (when (plusp freed)
+                        (incf *n-bytes-freed-or-purified* freed))))))
+           (declare (inline perform-gc))
+           ;; Let's make sure we're not interrupted and that none of
+           ;; the deadline or deadlock detection stuff triggers.
+           (without-interrupts
+             (sb!thread::without-thread-waiting-for
+                 (:already-without-interrupts t)
+               (let ((sb!impl::*deadline* nil)
+                     (sb!impl::*deadline-seconds* nil)
+                     (epoch *gc-epoch*))
+                 (loop
+                  ;; GCing must be done without-gcing to avoid
+                  ;; recursive GC... but we can't block on
+                  ;; *already-in-gc* inside without-gcing: that would
+                  ;; cause a deadlock.
+                  (without-gcing
+                    ;; Try to grab that mutex.  On acquisition, stop
+                    ;; the world from with the mutex held, and then
+                    ;; execute the remainder of the GC: stopping the
+                    ;; world with interrupts disabled is the mother of
+                    ;; all critical sections.
+                    (cond ((sb!thread:with-mutex (*already-in-gc* :wait-p nil)
+                             (unsafe-clear-roots gen)
+                             (gc-stop-the-world)
+                             t)
+                           ;; Success! GC.
+                           (perform-gc)
+                           ;; Return, but leave *gc-pending* as is: we
+                           ;; did allocate a tiny bit after GCing.  In
+                           ;; theory, this could lead to a long chain
+                           ;; of tail-recursive (but not in explicit
+                           ;; tail position) GCs, but that doesn't
+                           ;; seem likely to happen too often... And
+                           ;; the old code already suffered from this
+                           ;; problem.
+                           (return t))
+                          (t
+                           ;; Some other thread is trying to GC. Clear
+                           ;; *gc-pending* (we already know we want a
+                           ;; GC to happen) and either let
+                           ;; without-gcing figure out that the world
+                           ;; is stopping, or try again.
+                           (setf *gc-pending* nil))))
+                  ;; we just wanted a minor GC, and a GC has
+                  ;; occurred. Leave, but don't execute after-gc
+                  ;; hooks.
+                  ;;
+                  ;; Return a 0 for easy ternary logic in the C
+                  ;; runtime.
+                  (when (and (eql gen 0)
+                             (neq epoch *gc-pending*))
+                    (return 0))))))))))
 
 (defun post-gc ()
   ;; Outside the mutex, interrupts may be enabled: these may cause
@@ -329,7 +356,7 @@ which may in turn trigger a collection of one or more older
 generations as well. If FULL is true, all generations are collected.
 If GEN is provided, it can be used to specify the oldest generation
 guaranteed to be collected."
-  (when (sub-gc :gen (if full sb!vm:+pseudo-static-generation+ gen))
+  (when (eq t (sub-gc :gen (if full sb!vm:+pseudo-static-generation+ gen)))
     (post-gc)))
 
 (define-alien-routine scrub-control-stack sb!alien:void)
index 490e4d1..7add951 100644 (file)
@@ -2661,8 +2661,24 @@ maybe_gc(os_context_t *context)
      * we may even be in a WITHOUT-INTERRUPTS. */
     gc_happened = funcall0(StaticSymbolFunction(SUB_GC));
     FSHOW((stderr, "/maybe_gc: gc_happened=%s\n",
-           (gc_happened == NIL) ? "NIL" : "T"));
-    if ((gc_happened != NIL) &&
+           (gc_happened == NIL)
+           ? "NIL"
+           : ((gc_happened == T)
+              ? "T"
+              : "0")));
+    /* gc_happened can take three values: T, NIL, 0.
+     *
+     * T means that the thread managed to trigger a GC, and post-gc
+     * must be called.
+     *
+     * NIL means that the thread is within without-gcing, and no GC
+     * has occurred.
+     *
+     * Finally, 0 means that *a* GC has occurred, but it wasn't
+     * triggered by this thread; success, but post-gc doesn't have
+     * to be called.
+     */
+    if ((gc_happened == T) &&
         /* See if interrupts are enabled or it's possible to enable
          * them. POST-GC has a similar check, but we don't want to
          * unlock deferrables in that case and get a pending interrupt
index 5111404..9b06fd7 100644 (file)
     (declare (dynamic-extent x))
     (unless (equalp (caar x) (make-nested-good :bar *bar*))
       (error "got ~S, wanted ~S" (caar x) (make-nested-good :bar *bar*)))
-    (caar x)))
+    ;; the NESTED instance itself *should* be DX!
+    (copy-nested (caar x))))
 
 (with-test (:name :conservative-nested-dx)
   ;; NESTED-BAD should not stack-allocate :BAR due to the SETF.