From d1873cc3f7a09f9891bb9c05f206af1774876c0c Mon Sep 17 00:00:00 2001 From: Gabor Melis Date: Mon, 16 Feb 2009 21:36:13 +0000 Subject: [PATCH] 1.0.25.21: handling of potential corruptions - add corruption_warning_and_maybe_lose that prints a warning and loses depending on lose_on_corruption_p (false by default) - use corruption_warning_and_maybe_lose when the control stack is exhausted and on memory faults - use corruption_warning_and_maybe_lose on the lisp handlers of SIGILL, SIGBUS and SIGEMT, as invoking them is surely not a good sign. - add --lose-on-corruption as a runtime option - add --disable-ldb as a runtime option - update the man page and the user manual - HEAP-EXHAUSTED fixes: - exit pseudo atomic properly and do pending interrupt if needed - signalling HEAP-EXHAUSTED in a WITHOUT-INTERRUPTS is dangerous - use --lose-on-corruption in make-target*.sh Also, block blockable signals on lose() to prevent other threads, timers and such from interfering. If only all threads could be stopped somehow. --- NEWS | 8 ++++++ doc/manual/start-stop.texinfo | 13 +++++++++ doc/sbcl.1 | 19 +++++++++++--- make-target-2.sh | 2 ++ make-target-contrib.sh | 3 ++- src/runtime/gencgc.c | 14 +++++++++- src/runtime/interr.c | 58 ++++++++++++++++++++++++++++++++++++----- src/runtime/interr.h | 2 ++ src/runtime/interrupt.c | 11 ++++++++ src/runtime/runtime.c | 12 ++++++++- version.lisp-expr | 2 +- 11 files changed, 130 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 85422c1..a35e241 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,12 @@ ;;;; -*- coding: utf-8; fill-column: 78 -*- +changes in sbcl-1.0.26 relative to 1.0.25: + * new feature: runtime option --disable-ldb + * new feature: runtime option --lose-on-corruption to die at the + slightest hint of possibly non-recoverable errors: running out of + memory, stack, alien stack, binding stack, encountering a memory + fault, etc. In the absence of --lose-on-corruption a warning is + printed to stderr.
+ changes in sbcl-1.0.25 relative to 1.0.24: * incompatible change: SB-INTROSPECT:FUNCTION-ARGLIST is deprecated, to be removed later. Please use SB-INTROSPECT:FUNCTION-LAMBDA-LIST instead. diff --git a/doc/manual/start-stop.texinfo b/doc/manual/start-stop.texinfo index be2a88a..1ec355a 100644 --- a/doc/manual/start-stop.texinfo +++ b/doc/manual/start-stop.texinfo @@ -214,8 +214,21 @@ startup. This makes it easier to write Lisp programs which work cleanly in Unix pipelines. See also the @code{--noprint} and @code{--disable-debugger} options. +@item --disable-ldb +Disable the low-level debugger. Only effective if SBCL is compiled +with LDB. + +@item --lose-on-corruption +There are some dangerous low level errors (for instance, control stack +exhausted, memory fault) that (or whose handlers) can corrupt the +image. By default SBCL prints a warning, then tries to continue and +handle the error in Lisp, but this will not always work and SBCL may +malfunction or even hang. With this option, upon encountering such an +error SBCL will invoke ldb (if present and enabled) or else exit. + @item --script @var{filename} As a runtime option this is equivalent to @code{--noinform} +@code{--disable-ldb} @code{--lose-on-corruption} @code{--end-runtime-options} @code{--script} @var{filename}. See the description of @code{--script} as a toplevel option below. diff --git a/doc/sbcl.1 b/doc/sbcl.1 index 2f86938..62010f9 100644 --- a/doc/sbcl.1 +++ b/doc/sbcl.1 @@ -105,10 +105,22 @@ startup. (This makes it easier to write Lisp programs which work cleanly in Unix pipelines. See also the "\-\-noprint" and "\-\-disable\-debugger" options.) .TP 3 +.B \-\-disable\-ldb +Disable the low-level debugger. Only effective if SBCL is compiled with LDB. +.TP 3 +.B \-\-lose\-on\-corruption +There are some dangerous low level errors (for instance, control stack +exhausted, memory fault) that (or whose handlers) can corrupt the +image. 
By default SBCL prints a warning, then tries to continue and +handle the error in Lisp, but this will not always work and SBCL may +malfunction or even hang. With this option, upon encountering such an +error SBCL will invoke ldb (if present and enabled) or else exit. +.TP 3 .B \-\-script -As a runtime option equivalent to \-\-noinform -\-\-end\-toplevel\-options \-\-script . See the description -of \-\-script as a toplevel option below. +As a runtime option equivalent to \-\-noinform \-\-disable\-ldb +\-\-lose\-on\-corruption \-\-end\-runtime\-options \-\-script +. See the description of \-\-script as a toplevel option +below. .TP 3 .B \-\-help Print some basic information about SBCL, then exit. @@ -166,6 +178,7 @@ debugger, allowing interactive diagnosis and possible intercession. This option disables the debugger, causing errors to print a backtrace and exit with status 1 instead -- which is a mode of operation better suited for batch processing. See the User Manual on \f(CRSB\-EXT:DISABLE\-DEBUGGER\fR for details. +.TP 3 .B \-\-script Implies \-\-no-sysinit \-\-no-userinit \-\-disable-debugger \-\-end\-toplevel\-options.
diff --git a/make-target-2.sh b/make-target-2.sh index 48b7a1a..7c6cc83 100644 --- a/make-target-2.sh +++ b/make-target-2.sh @@ -32,8 +32,10 @@ export LANG LC_ALL echo //doing warm init - compilation phase ./src/runtime/sbcl \ --core output/cold-sbcl.core \ +--lose-on-corruption \ --no-sysinit --no-userinit < make-target-2.lisp echo //doing warm init - load and dump phase ./src/runtime/sbcl \ --core output/cold-sbcl.core \ +--lose-on-corruption \ --no-sysinit --no-userinit < make-target-2-load.lisp diff --git a/make-target-contrib.sh b/make-target-contrib.sh index 934fb9a..bd9f640 100644 --- a/make-target-contrib.sh +++ b/make-target-contrib.sh @@ -32,7 +32,8 @@ if [ "$OSTYPE" = "cygwin" ] ; then SBCL_PWD=`echo $SBCL_PWD | sed s/\ /\\\\\\\\\ /g` fi -SBCL="$SBCL_PWD/src/runtime/sbcl --noinform --core $SBCL_PWD/output/sbcl.core --disable-debugger --no-sysinit --no-userinit" +SBCL="$SBCL_PWD/src/runtime/sbcl --noinform --core $SBCL_PWD/output/sbcl.core \ +--lose-on-corruption --disable-debugger --no-sysinit --no-userinit" SBCL_BUILDING_CONTRIB=1 export SBCL SBCL_BUILDING_CONTRIB diff --git a/src/runtime/gencgc.c b/src/runtime/gencgc.c index abd4918..9680743 100644 --- a/src/runtime/gencgc.c +++ b/src/runtime/gencgc.c @@ -1154,6 +1154,7 @@ static page_index_t gencgc_alloc_start_page = -1; void gc_heap_exhausted_error_or_lose (long available, long requested) { + struct thread *thread = arch_os_get_current_thread(); /* Write basic information before doing anything else: if we don't * call to lisp this is a must, and even if we do there is always * the danger that we bounce back here before the error has been @@ -1166,7 +1167,6 @@ gc_heap_exhausted_error_or_lose (long available, long requested) /* If we are in GC, or totally out of memory there is no way * to sanely transfer control to the lisp-side of things. 
*/ - struct thread *thread = arch_os_get_current_thread(); print_generation_stats(1); fprintf(stderr, "GC control variables:\n"); fprintf(stderr, " *GC-INHIBIT* = %s\n *GC-PENDING* = %s\n", @@ -1181,6 +1181,18 @@ gc_heap_exhausted_error_or_lose (long available, long requested) else { /* FIXME: assert free_pages_lock held */ (void)thread_mutex_unlock(&free_pages_lock); + gc_assert(get_pseudo_atomic_atomic(thread)); + clear_pseudo_atomic_atomic(thread); + if (get_pseudo_atomic_interrupted(thread)) + do_pending_interrupt(); + /* Another issue is that signalling HEAP-EXHAUSTED error leads + * to running user code at arbitrary places, even in a + * WITHOUT-INTERRUPTS which may lead to a deadlock without + * running out of the heap. So at this point all bets are + * off. */ + if (SymbolValue(INTERRUPTS_ENABLED,thread) == NIL) + corruption_warning_and_maybe_lose + ("Signalling HEAP-EXHAUSTED in a WITHOUT-INTERRUPTS."); funcall2(StaticSymbolFunction(HEAP_EXHAUSTED_ERROR), alloc_number(available), alloc_number(requested)); lose("HEAP-EXHAUSTED-ERROR fell through"); diff --git a/src/runtime/interr.c b/src/runtime/interr.c index 5f7869c..c13be45 100644 --- a/src/runtime/interr.c +++ b/src/runtime/interr.c @@ -48,26 +48,70 @@ void disable_lossage_handler(void) lossage_handler = default_lossage_handler; } -void -lose(char *fmt, ...) +static +void print_message(char *fmt, va_list ap) { - va_list ap; - fprintf(stderr, "fatal error encountered in SBCL pid %d",getpid()); + fprintf(stderr, " in SBCL pid %d",getpid()); #if defined(LISP_FEATURE_SB_THREAD) fprintf(stderr, "(tid %lu)", (unsigned long) thread_self()); #endif if (fmt) { fprintf(stderr, ":\n"); - va_start(ap, fmt); vfprintf(stderr, fmt, ap); - va_end(ap); } fprintf(stderr, "\n"); - fflush(stderr); +} + +static inline void +call_lossage_handler() never_returns; + +static inline void +call_lossage_handler() +{ lossage_handler(); fprintf(stderr, "Argh! 
lossage_handler() returned, total confusion..\n"); exit(1); } + +void +lose(char *fmt, ...) +{ + va_list ap; + /* Block signals to prevent other threads, timers and such from + * interfering. If only all threads could be stopped somehow. */ + block_blockable_signals(); + fprintf(stderr, "fatal error encountered"); + va_start(ap, fmt); + print_message(fmt, ap); + va_end(ap); + fprintf(stderr, "\n"); + fflush(stderr); + call_lossage_handler(); +} + +boolean lose_on_corruption_p = 0; + +void +corruption_warning_and_maybe_lose(char *fmt, ...) +{ + va_list ap; + sigset_t oldset; + thread_sigmask(SIG_BLOCK, &blockable_sigset, &oldset); + fprintf(stderr, "CORRUPTION WARNING"); + va_start(ap, fmt); + print_message(fmt, ap); + va_end(ap); + fprintf(stderr, "The integrity of this image is possibly compromised.\n"); + if (lose_on_corruption_p) + fprintf(stderr, "Exiting.\n"); + else + fprintf(stderr, "Continuing with fingers crossed.\n"); + fflush(stderr); + if (lose_on_corruption_p) + call_lossage_handler(); + else + thread_sigmask(SIG_SETMASK,&oldset,0); +} /* internal error handler for when the Lisp error system doesn't exist * diff --git a/src/runtime/interr.h b/src/runtime/interr.h index e527fcb..02c8b3c 100644 --- a/src/runtime/interr.h +++ b/src/runtime/interr.h @@ -13,6 +13,8 @@ #define _INTERR_H_ extern void lose(char *fmt, ...) 
never_returns; +extern boolean lose_on_corruption_p; +extern void corruption_warning_and_maybe_lose(char *fmt, ...); extern void enable_lossage_handler(void); extern void disable_lossage_handler(void); extern void describe_internal_error(os_context_t *context); diff --git a/src/runtime/interrupt.c b/src/runtime/interrupt.c index 95f50be..88126e7 100644 --- a/src/runtime/interrupt.c +++ b/src/runtime/interrupt.c @@ -810,6 +810,14 @@ interrupt_handle_now_handler(int signal, siginfo_t *info, void *void_context) os_context_t *context = arch_os_get_context(&void_context); #if defined(LISP_FEATURE_LINUX) || defined(RESTORE_FP_CONTROL_FROM_CONTEXT) os_restore_fp_control(context); +#ifndef LISP_FEATURE_WIN32 + if ((signal == SIGILL) || (signal == SIGBUS) +#ifndef LISP_FEATURE_LINUX + || (signal == SIGEMT) +#endif + ) + corruption_warning_and_maybe_lose("Signal %d recieved", signal); +#endif #endif interrupt_handle_now(signal, info, context); } @@ -1045,6 +1053,7 @@ handle_guard_page_triggered(os_context_t *context,os_vm_address_t addr) * protection so the error handler has some headroom, protect the * previous page so that we can catch returns from the guard page * and restore it. */ + corruption_warning_and_maybe_lose("Control stack exhausted"); protect_control_stack_guard_page(0); protect_control_stack_return_guard_page(1); @@ -1300,6 +1309,8 @@ lisp_memory_fault_error(os_context_t *context, os_vm_address_t addr) * now -- some address is better then no address in this case. */ current_memory_fault_address = addr; + /* To allow debugging memory faults in signal handlers and such. 
*/ + corruption_warning_and_maybe_lose("Memory fault"); arrange_return_to_lisp_function(context, StaticSymbolFunction(MEMORY_FAULT_ERROR)); } diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c index fba25c0..23c323f 100644 --- a/src/runtime/runtime.c +++ b/src/runtime/runtime.c @@ -229,6 +229,7 @@ main(int argc, char *argv[], char *envp[]) /* other command line options */ boolean noinform = 0; boolean end_runtime_options = 0; + boolean disable_lossage_handler_p = 0; lispobj initial_function; const char *sbcl_home = getenv("SBCL_HOME"); @@ -275,6 +276,8 @@ main(int argc, char *argv[], char *envp[]) * TOPLEVEL-INIT sees the option. */ noinform = 1; end_runtime_options = 1; + disable_lossage_handler_p = 1; + lose_on_corruption_p = 1; break; } else if (0 == strcmp(arg, "--noinform")) { noinform = 1; @@ -337,6 +340,12 @@ main(int argc, char *argv[], char *envp[]) ++n; } ++argi; + } else if (0 == strcmp(arg, "--disable-ldb")) { + disable_lossage_handler_p = 1; + ++argi; + } else if (0 == strcmp(arg, "--lose-on-corruption")) { + lose_on_corruption_p = 1; + ++argi; } else if (0 == strcmp(arg, "--end-runtime-options")) { end_runtime_options = 1; ++argi; @@ -426,7 +435,8 @@ main(int argc, char *argv[], char *envp[]) define_var("nil", NIL, 1); define_var("t", T, 1); - enable_lossage_handler(); + if (!disable_lossage_handler_p) + enable_lossage_handler(); globals_init(); diff --git a/version.lisp-expr b/version.lisp-expr index 40dbfee..a1997bc 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.25.20" +"1.0.25.21" -- 1.7.10.4