From d0511d2a94e7d2d346e2f4acc38ff84cd99a74b1 Mon Sep 17 00:00:00 2001 From: Daniel Barlow Date: Mon, 16 Aug 2004 18:19:14 +0000 Subject: [PATCH] 0.8.13.65 Fix/bandaid for some more threading problems (though regrettably only the easy ones) ... when stopping threads for GC, if kill() fails assume this is because the thread died after outliving its parent so nothing was available to set th->state=STATE_DEAD when it exited. Mark said thread dead, Ted. (Note: still doesn't cope with pid recycling. In the unlikely-unless-malefactor-involved case that a pid previously used for a thread died after its parent and now belongs to an unrelated process which the current user has permission to kill it, it will probably die in the next GC. Exploits on a postcasrd to the usual address, please) ... interrupt_thread() now checks that the thread is one we know about (instead of just some random other process) and refuses to kill if it's not. INTERRUPT-THREAD now catches this and other errors from kill() and signals a more useful condition than the provious "unexpected system call failure" ... only the parent of a thread can identify whether it is a zombie, which it should usually do when it gets a SIG_THREAD_EXIT. It's possible that this signal may go missing if delivered during pseudoatomic or similar, so for the sake of being able to GC we ask each thread to check its children again in the stop_for_gc_handler before going to sleep Thread documentation update based on the LSM lightning talk --- doc/manual/beyond-ansi.texinfo | 143 +++++++++++++++++++++++++++++++--------- src/code/debug.lisp | 3 +- src/code/target-thread.lisp | 26 ++++++-- src/runtime/interrupt.c | 18 ++--- src/runtime/thread.c | 29 +++++++- version.lisp-expr | 2 +- 6 files changed, 169 insertions(+), 52 deletions(-) diff --git a/doc/manual/beyond-ansi.texinfo b/doc/manual/beyond-ansi.texinfo index 03a5082..2fb97ab 100644 --- a/doc/manual/beyond-ansi.texinfo +++ b/doc/manual/beyond-ansi.texinfo @@ -175,39 +175,125 @@ requested order from a user-supplied primary method. @comment node-name, next, previous, up @subsection Threading (a.k.a Multiprocessing) -SBCL (as of version 0.8.3, on Linux x86 only) supports a fairly -low-level threading interface that maps onto the host operating -system's concept of threads or lightweight processes. +SBCL supports a fairly low-level threading interface that maps onto +the host operating system's concept of threads or lightweight +processes. This means that threads may take advantage of hardware +multiprocessing on machines that have more than one CPU, but it does +not allow Lisp control of the scheduler. This is found in the +SB-THREAD package. -@subsubsection Lisp-level view +This requires x86 and Linux kernel 2.6 or systems with NPTL backports. -A rudimentary interface to creating and managing multiple threads can -be found in the @dfn{sb-thread} package. This is intended for public -consumption, so look at the exported symbols and their documentation -strings. +@subsubsection Special variables -Dynamic bindings to symbols are per-thread. Signal handlers are -per-thread. - -Mutexes and condition variables are available for managing access to -shared data: see +The interaction of special variables with multiple threads is mostly +as one would expect, but users of other Lisps are warned that the +behaviour of locally bound specials differs in places from what they +may expect. @itemize - -@item -@code{(apropos "mutex" :sb-thread)} - +@item +global special values are visible across all threads; @item -@code{(apropos "condition" :sb-thread)} - +bindings (e.g. using LET) are local to the thread; @item -and the @code{waitqueue} structure +initial values in a new thread are taken from the thread that created it. +@end itemize + +@subsubsection Mutex support + +Mutexes are used for controlling access to a shared resource. One +thread is allowed to hold the mutex, others which attempt to take it +will be made to wait until it's free. Threads are woken in the order +that they go to sleep. +There isn't a timeout on mutex acquisition, but the usual WITH-TIMEOUT +macro (which throws a TIMEOUT condition after n seconds) can be used +if you want a bounded wait. + +@lisp +(defpackage :demo (:use "CL" "SB-THREAD" "SB-EXT")) + +(in-package :demo) + +(defvar *a-mutex* (make-mutex :name "my lock")) + +(defun thread-fn () + (let ((id (current-thread-id))) + (format t "Thread ~A running ~%" id) + (with-mutex (*a-mutex*) + (format t "Thread ~A got the lock~%" id) + (sleep (random 5))) + (format t "Thread ~A dropped lock, dying now~%" id))) + +(make-thread #'thread-fn) +(make-thread #'thread-fn) + +@end lisp + +@subsubsection Waitqueue/condition variables + +These are based on the POSIX condition variable design, hence the +annoyingly CL-conflicting name. For use when you want to check a +condition and sleep until it's true. For example: you have a shared +queue, a writer process checking ``queue is empty'' and one or more +readers that need to know when ``queue is not empty''. It sounds +simple, but is astonishingly easy to deadlock if another process runs +when you weren't expecting it to. + +There are three components: + +@itemize +@item the condition itself (not represented in code) +@item the condition variable (a.k.a waitqueue) which proxies for it +@item a lock to hold while testing the condition @end itemize -and poke around in their documentation strings. +Important stuff to be aware of: + +@itemize +@item when calling condition-wait, you must hold the mutex. condition-wait will drop the mutex while it waits, and obtain it again before returning for whatever reason; + +@item likewise, you must be holding the mutex around calls to condition-notify; + +@item a process may return from condition-wait in several circumstances: it is not guaranteed that the underlying condition has become true. You must check that the resource is ready for whatever you want to do to it. + +@end itemize + +@lisp +(defvar *buffer-queue* (make-waitqueue)) +(defvar *buffer-lock* (make-mutex :name "buffer lock")) + +(defvar *buffer* (list nil)) + +(defun reader () + (with-mutex (*buffer-lock*) + (loop + (condition-wait *buffer-queue* *buffer-lock*) + (loop + (unless *buffer* (return)) + (let ((head (car *buffer*))) + (setf *buffer* (cdr *buffer*)) + (format t "reader ~A woke, read ~A~%" + (current-thread-id) head)))))) + +(defun writer () + (loop + (sleep (random 5)) + (with-mutex (*buffer-lock*) + (let ((el (intern + (string (code-char + (+ (char-code #\A) (random 26))))))) + (setf *buffer* (cons el *buffer*))) + (condition-notify *buffer-queue*)))) + +(make-thread #'writer) +(make-thread #'reader) +(make-thread #'reader) + +@end lisp -@subsubsection Sessions +@subsubsection Sessions/Debugging If the user has multiple views onto the same Lisp image (for example, using multiple terminals, or a windowing system, or network access) @@ -216,7 +302,8 @@ view has its own collection of foreground/background/stopped threads. A thread which wishes to create a new session can use @code{sb-thread:with-new-session} to remove itself from the current session (which it shares with its parent and siblings) and create a -fresh one. See also @code{sb-thread:make-listener-thread}. +fresh one. +# See also @code{sb-thread:make-listener-thread}. Within a single session, threads arbitrate between themselves for the user's attention. A thread may be in one of three notional states: @@ -248,13 +335,9 @@ cause interesting results if you link to foreign code that expects threading or creates new threads, and the thread library in question uses %fs in an incompatible way. -There are two implementation mechanisms for queueing. If SBCL was -built on an NPTL-capable Linux system (2.6 or some vendor 2.4 ports) -with the @code{:SB-FUTEX} feature, queuing will be done using the -@code{sys_futex()} system call if it's available at runtime. -Otherwise it will fall back to using @code{sigtimedwait()} to sleep -and a signal (@code{SIG_DEQUEUE}, one of the POSIX RT signals) to -wake. +Queues require the @code{sys_futex()} system call to be available: +this is the reason for the NPTL requirement. We test at runtime that +this system call exists. Garbage collection is done with the existing Conservative Generational GC. Allocation is done in small (typically 8k) regions: each thread diff --git a/src/code/debug.lisp b/src/code/debug.lisp index b7c1306..1ec043b 100644 --- a/src/code/debug.lisp +++ b/src/code/debug.lisp @@ -874,8 +874,9 @@ reset to ~S." (handler-case (progn (format *error-output* - "~&~@~2%" + "~&~@~2%" (type-of condition) + (sb!thread:current-thread-id) condition) ;; Flush *ERROR-OUTPUT* even before the BACKTRACE, so that ;; even if we hit an error within BACKTRACE (e.g. a bug in diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index 07a0031..b2fe232 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -306,15 +306,29 @@ time we reacquire LOCK and return to the caller." ;;; locks, you probably won't like the effect. Used with thought ;;; though, it's a good deal gentler than the last-resort functions above +(define-condition interrupt-thread-error (error) + ((thread :reader interrupt-thread-error-thread :initarg :thread) + (errno :reader interrupt-thread-error-errno :initarg :errno)) + (:report (lambda (c s) + (format s "interrupt thread ~A failed (~A: ~A)" + (interrupt-thread-error-thread c) + (interrupt-thread-error-errno c) + (strerror (interrupt-thread-error-errno c)))))) + (defun interrupt-thread (thread function) "Interrupt THREAD and make it run FUNCTION." (let ((function (coerce function 'function))) - (sb!sys:with-pinned-objects (function) - (sb!unix::syscall* ("interrupt_thread" - sb!alien:unsigned-long sb!alien:unsigned-long) - thread - thread - (sb!kernel:get-lisp-obj-address function))))) + (sb!sys:with-pinned-objects + (function) + (multiple-value-bind (res err) + (sb!unix::syscall ("interrupt_thread" + sb!alien:unsigned-long sb!alien:unsigned-long) + thread + thread + (sb!kernel:get-lisp-obj-address function)) + (unless res + (error 'interrupt-thread-error :thread thread :errno err)))))) + (defun terminate-thread (thread-id) "Terminate the thread identified by THREAD-ID, by causing it to run diff --git a/src/runtime/interrupt.c b/src/runtime/interrupt.c index 2be9e0f..25b8943 100644 --- a/src/runtime/interrupt.c +++ b/src/runtime/interrupt.c @@ -562,6 +562,12 @@ sig_stop_for_gc_handler(int signal, siginfo_t *info, void *void_context) for(i=1;istate=STATE_STOPPED; sigemptyset(&ss); sigaddset(&ss,SIG_STOP_FOR_GC); @@ -687,17 +693,7 @@ void interrupt_thread_handler(int num, siginfo_t *info, void *v_context) void thread_exit_handler(int num, siginfo_t *info, void *v_context) { /* called when a child thread exits */ - pid_t kid; - int status; - - while(1) { - kid=waitpid(-1,&status,__WALL|WNOHANG); - if(kid<=0) break; - if(WIFEXITED(status) || WIFSIGNALED(status)) { - struct thread *th=find_thread_by_pid(kid); - if(th) th->state=STATE_DEAD; - } - } + mark_dead_threads(); } diff --git a/src/runtime/thread.c b/src/runtime/thread.c index e5003f9..8662682 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -289,6 +289,21 @@ struct thread *find_thread_by_pid(pid_t pid) #if defined LISP_FEATURE_SB_THREAD /* This is not needed unless #+SB-THREAD, as there's a trivial null * unithread definition. */ + +void mark_dead_threads() +{ + pid_t kid; + int status; + while(1) { + kid=waitpid(-1,&status,__WALL|WNOHANG); + if(kid<=0) break; + if(WIFEXITED(status) || WIFSIGNALED(status)) { + struct thread *th=find_thread_by_pid(kid); + if(th) th->state=STATE_DEAD; + } + } +} + void reap_dead_threads() { struct thread *th,*next,*prev=0; @@ -345,9 +360,12 @@ void unblock_sigcont_and_sleep(void) int interrupt_thread(pid_t pid, lispobj function) { union sigval sigval; + struct thread *th; sigval.sival_int=function; - - return sigqueue(pid, SIG_INTERRUPT_THREAD, sigval); + for_each_thread(th) + if((th->pid==pid) && (th->state != STATE_DEAD)) + return sigqueue(pid, SIG_INTERRUPT_THREAD, sigval); + errno=EPERM; return -1; } int signal_thread_to_dequeue (pid_t pid) @@ -374,7 +392,12 @@ void gc_stop_the_world() if(p==th) continue; if(p->state==STATE_RUNNING) { p->state=STATE_STOPPING; - kill(p->pid,SIG_STOP_FOR_GC); + if(kill(p->pid,SIG_STOP_FOR_GC)==-1) { + /* we can't kill the process; assume because it + * died already (and its parent is dead so never + * saw the SIGCHLD) */ + p->state=STATE_DEAD; + } } if((p->state!=STATE_STOPPED) && (p->state!=STATE_DEAD)) { diff --git a/version.lisp-expr b/version.lisp-expr index 733488e..08a2b06 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.8.13.64" +"0.8.13.65" -- 1.7.10.4