From ce73c961ea089afea8ec523e1a5556c27d1aa6d2 Mon Sep 17 00:00:00 2001 From: Gabor Melis Date: Sun, 22 Mar 2009 22:06:03 +0000 Subject: [PATCH] 1.0.26.17: fix GC/SIG_STOP_FOR_GC race Consider this: in a PA section GC is requested: GC_PENDING, pseudo_atomic_interrupted and gc_blocked_deferrables are set, deferrables are blocked then pseudo_atomic_atomic is cleared, but a SIG_STOP_FOR_GC arrives before trapping to interrupt_handle_pending. In sig_stop_for_gc_handler, GC_PENDING is cleared but pseudo_atomic_interrupted is not and we go on running with pseudo_atomic_interrupted but without a pending interrupt or GC. GC_BLOCKED_DEFERRABLES is also left at 1. Add more checks, fix comments. --- NEWS | 4 ++++ src/runtime/alloc.c | 7 +++---- src/runtime/backtrace.c | 2 ++ src/runtime/dynbind.c | 5 +++++ src/runtime/interrupt.c | 46 +++++++++++++++++++++++++++++++++++++--------- src/runtime/interrupt.h | 6 +++--- version.lisp-expr | 2 +- 7 files changed, 55 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 6728d45..3ef75e6 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,10 @@ changes in sbcl-1.0.27 relative to 1.0.26: * new port: support added for x86-64 OpenBSD. (thanks to Josh Elsasser) * bug fix: a type error is signaled for attempts to use the LOOP keyword ACROSS for a NIL value. (thanks to Daniel Lowe) + * bug fix: fix gc related interrupt handling bug on ppc (regression from + 1.0.25.37, reported by Harald Hanche-Olsen) + * bug fix: work around signal delivery bug in darwin (regression from + 1.0.25.44, reported by Sidney Markowitz) changes in sbcl-1.0.26 relative to 1.0.25: * incompatible change: an interruption (be it a function passed to diff --git a/src/runtime/alloc.c b/src/runtime/alloc.c index b84df30..eeb827f 100644 --- a/src/runtime/alloc.c +++ b/src/runtime/alloc.c @@ -40,10 +40,9 @@ pa_alloc(int bytes, int page_type_flag) lispobj *result; struct thread *th = arch_os_get_current_thread(); - /* SIG_STOP_FOR_GC needs to be enabled before we can call lisp: - * otherwise two threads racing here may deadlock: the other will - * wait on the GC lock, and the other cannot stop the first - * one... */ + /* SIG_STOP_FOR_GC must be unblocked: else two threads racing here + * may deadlock: one will wait on the GC lock, and the other + * cannot stop the first one... */ check_gc_signals_unblocked_or_lose(0); /* FIXME: OOAO violation: see arch_pseudo_* */ diff --git a/src/runtime/backtrace.c b/src/runtime/backtrace.c index 2d7dfca..b6dfc44 100644 --- a/src/runtime/backtrace.c +++ b/src/runtime/backtrace.c @@ -533,6 +533,7 @@ describe_thread_state(void) { sigset_t mask; struct thread *thread = arch_os_get_current_thread(); + struct interrupt_data *data = thread->interrupt_data; #ifndef LISP_FEATURE_WIN32 get_current_sigmask(&mask); printf("Signal mask:\n"); @@ -553,6 +554,7 @@ describe_thread_state(void) #ifdef STOP_FOR_GC_PENDING printf(" *STOP-FOR-GC-PENDING* = %s\n", (SymbolValue(STOP_FOR_GC_PENDING, thread) == T) ? "T" : "NIL"); #endif + printf("Pending handler = %p\n", data->pending_handler); } /* This function has been split from backtrace() to enable Lisp diff --git a/src/runtime/dynbind.c b/src/runtime/dynbind.c index bb964bf..9a64a0a 100644 --- a/src/runtime/dynbind.c +++ b/src/runtime/dynbind.c @@ -13,6 +13,9 @@ * files for more information. */ +#include +#include + #include "sbcl.h" #include "runtime.h" #include "globals.h" @@ -44,6 +47,7 @@ void bind_variable(lispobj symbol, lispobj value, void *th) if(!sym->tls_index) { lispobj *tls_index_lock= &((struct symbol *)native_pointer(TLS_INDEX_LOCK))->value; + FSHOW_SIGNAL((stderr, "entering dynbind tls alloc\n")); set_pseudo_atomic_atomic(th); get_spinlock(tls_index_lock,(long)th); if(!sym->tls_index) { @@ -55,6 +59,7 @@ void bind_variable(lispobj symbol, lispobj value, void *th) } } release_spinlock(tls_index_lock); + FSHOW_SIGNAL((stderr, "exiting dynbind tls alloc\n")); clear_pseudo_atomic_atomic(th); if (get_pseudo_atomic_interrupted(th)) do_pending_interrupt(); diff --git a/src/runtime/interrupt.c b/src/runtime/interrupt.c index 8c87054..acd1d04 100644 --- a/src/runtime/interrupt.c +++ b/src/runtime/interrupt.c @@ -498,7 +498,14 @@ check_interrupt_context_or_lose(os_context_t *context) if (stop_for_gc_pending) if (!(pseudo_atomic_interrupted || gc_inhibit || in_race_p)) lose("STOP_FOR_GC_PENDING, but why?\n"); + if (pseudo_atomic_interrupted) + if (!(gc_pending || stop_for_gc_pending || interrupt_deferred_p)) + lose("pseudo_atomic_interrupted, but why?\n"); } +#else + if (pseudo_atomic_interrupted) + if (!(gc_pending || interrupt_deferred_p)) + lose("pseudo_atomic_interrupted, but why?\n"); #endif #endif if (interrupt_pending && !interrupt_deferred_p) @@ -756,7 +763,7 @@ interrupt_handle_pending(os_context_t *context) struct interrupt_data *data = thread->interrupt_data; if (arch_pseudo_atomic_atomic(context)) { - lose("Handling pending interrupt in pseduo atomic."); + lose("Handling pending interrupt in pseudo atomic."); } FSHOW_SIGNAL((stderr, "/entering interrupt_handle_pending\n")); @@ -876,11 +883,15 @@ interrupt_handle_pending(os_context_t *context) sigcopyset(os_context_sigmask_addr(context), &data->pending_mask); run_deferred_handler(data, context); } +#endif +#ifdef LISP_FEATURE_GENCGC + if (get_pseudo_atomic_interrupted(thread)) + lose("pseudo_atomic_interrupted after interrupt_handle_pending\n"); +#endif /* It is possible that the end of this function was reached * without never actually doing anything, the tests in Lisp for * when to call receive-pending-interrupt are not exact. */ FSHOW_SIGNAL((stderr, "/exiting interrupt_handle_pending\n")); -#endif } @@ -1023,12 +1034,12 @@ maybe_defer_handler(void *handler, struct interrupt_data *data, */ if ((SymbolValue(INTERRUPTS_ENABLED,thread) == NIL) || in_leaving_without_gcing_race_p(thread)) { - store_signal_data_for_later(data,handler,signal,info,context); - SetSymbolValue(INTERRUPT_PENDING, T,thread); FSHOW_SIGNAL((stderr, "/maybe_defer_handler(%x,%d): deferred (RACE=%d)\n", (unsigned int)handler,signal, in_leaving_without_gcing_race_p(thread))); + store_signal_data_for_later(data,handler,signal,info,context); + SetSymbolValue(INTERRUPT_PENDING, T,thread); check_interrupt_context_or_lose(context); return 1; } @@ -1036,11 +1047,11 @@ maybe_defer_handler(void *handler, struct interrupt_data *data, * actually use its argument for anything on x86, so this branch * may succeed even when context is null (gencgc alloc()) */ if (arch_pseudo_atomic_atomic(context)) { - store_signal_data_for_later(data,handler,signal,info,context); - arch_set_pseudo_atomic_interrupted(context); FSHOW_SIGNAL((stderr, "/maybe_defer_handler(%x,%d): deferred(PA)\n", (unsigned int)handler,signal)); + store_signal_data_for_later(data,handler,signal,info,context); + arch_set_pseudo_atomic_interrupted(context); check_interrupt_context_or_lose(context); return 1; } @@ -1128,15 +1139,15 @@ sig_stop_for_gc_handler(int signal, siginfo_t *info, os_context_t *context) /* Test for GC_INHIBIT _first_, else we'd trap on every single * pseudo atomic until gc is finally allowed. */ if (SymbolValue(GC_INHIBIT,thread) != NIL) { - SetSymbolValue(STOP_FOR_GC_PENDING,T,thread); FSHOW_SIGNAL((stderr, "sig_stop_for_gc deferred (*GC-INHIBIT*)\n")); + SetSymbolValue(STOP_FOR_GC_PENDING,T,thread); return; } else if (arch_pseudo_atomic_atomic(context)) { + FSHOW_SIGNAL((stderr,"sig_stop_for_gc deferred (PA)\n")); SetSymbolValue(STOP_FOR_GC_PENDING,T,thread); arch_set_pseudo_atomic_interrupted(context); maybe_save_gc_mask_and_block_deferrables (os_context_sigmask_addr(context)); - FSHOW_SIGNAL((stderr,"sig_stop_for_gc deferred (PA)\n")); return; } @@ -1151,6 +1162,23 @@ sig_stop_for_gc_handler(int signal, siginfo_t *info, os_context_t *context) SetSymbolValue(GC_PENDING,NIL,thread); SetSymbolValue(STOP_FOR_GC_PENDING,NIL,thread); + /* Consider this: in a PA section GC is requested: GC_PENDING, + * pseudo_atomic_interrupted and gc_blocked_deferrables are set, + * deferrables are blocked then pseudo_atomic_atomic is cleared, + * but a SIG_STOP_FOR_GC arrives before trapping to + * interrupt_handle_pending. Here, GC_PENDING is cleared but + * pseudo_atomic_interrupted is not and we go on running with + * pseudo_atomic_interrupted but without a pending interrupt or + * GC. GC_BLOCKED_DEFERRABLES is also left at 1. So let's tidy it + * up. */ + if (thread->interrupt_data->gc_blocked_deferrables) { + FSHOW_SIGNAL((stderr,"cleaning up after gc_blocked_deferrables\n")); + clear_pseudo_atomic_interrupted(thread); + sigcopyset(os_context_sigmask_addr(context), + &thread->interrupt_data->pending_mask); + thread->interrupt_data->gc_blocked_deferrables = 0; + } + if(thread_state(thread)!=STATE_RUNNING) { lose("sig_stop_for_gc_handler: wrong thread state: %ld\n", fixnum_value(thread->state)); @@ -1476,7 +1504,7 @@ handle_guard_page_triggered(os_context_t *context,os_vm_address_t addr) else if (addr >= undefined_alien_address && addr < undefined_alien_address + os_vm_page_size) { arrange_return_to_lisp_function - (context, StaticSymbolFunction(UNDEFINED_ALIEN_VARIABLE_ERROR)); + (context, StaticSymbolFunction(UNDEFINED_ALIEN_VARIABLE_ERROR)); return 1; } else return 0; diff --git a/src/runtime/interrupt.h b/src/runtime/interrupt.h index 0c3984e..93a5387 100644 --- a/src/runtime/interrupt.h +++ b/src/runtime/interrupt.h @@ -109,9 +109,9 @@ struct interrupt_data { siginfo_t pending_info; sigset_t pending_mask; /* Was pending mask saved for gc request? True if GC_PENDING or - * SIG_STOP_FOR_GC happened in a pseudo atomic with no GC_INHIBIT - * NIL. Both deferrable interrupt handlers and gc are careful not - * to clobber each other's pending_mask. */ + * SIG_STOP_FOR_GC happened in a pseudo atomic with GC_INHIBIT NIL + * and with no pending handler. Both deferrable interrupt handlers + * and gc are careful not to clobber each other's pending_mask. */ boolean gc_blocked_deferrables; #ifdef LISP_FEATURE_PPC /* On PPC when consing wants to turn to alloc(), it does so via a diff --git a/version.lisp-expr b/version.lisp-expr index f46932c..01dab68 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.26.16" +"1.0.26.17" -- 1.7.10.4