From a7c2f2622f1ceeeb3459cb6bbcf261bda1ff2327 Mon Sep 17 00:00:00 2001 From: Daniel Barlow Date: Mon, 19 Jul 2004 23:44:44 +0000 Subject: [PATCH] 0.8.12.42 Fix the threading problems revealed by Edi's CL-PPCRE tests ... If target-arch.h is included before genesis/config.h, some runtime files with inline expansion of get_spinlock get the wrong version Significantly change handling of thread exit: the SIG_THREAD_EXIT handler just makes th->state=STATE_DEAD, does not do all_threads surgery, does not call Lisp code and is no longer deferrable, eliminating a problem where threads die at the start of GC and become zombified so cannot be stopped for GC Instead we call thread exit handlers from reap_dead_threads(), necessitating further changes in HANDLE-THREAD-EXIT, as it can now be called from threads other than the parent of the dead one stop_the_world doesn't actually need to hold all_threads_lock(), as it doesn't modify the all_threads list. Likewise sig_stop_for_gc_handler(), which means the sched_yield() kludge can go away --- src/code/gc.lisp | 3 +- src/code/target-thread.lisp | 55 +++++++++++++++----- src/code/target-unithread.lisp | 2 + src/compiler/generic/objdef.lisp | 6 +-- src/runtime/alloc.c | 2 +- src/runtime/interrupt.c | 31 +++--------- src/runtime/thread.c | 104 +++++++++++++++++++++++++++----------- src/runtime/thread.h | 1 + src/runtime/x86-arch.h | 12 ++++- src/runtime/x86-linux-os.c | 4 +- version.lisp-expr | 2 +- 11 files changed, 145 insertions(+), 77 deletions(-) diff --git a/src/code/gc.lisp b/src/code/gc.lisp index 1a221d6..7c548fd 100644 --- a/src/code/gc.lisp +++ b/src/code/gc.lisp @@ -244,7 +244,8 @@ and submit it as a patch." (scrub-control-stack) (setf *need-to-collect-garbage* nil) (dolist (h *after-gc-hooks*) (carefully-funcall h)) - (gc-start-the-world))))))) + (gc-start-the-world)) + (sb!thread::reap-dead-threads)))))) ;;; This is the user-advertised garbage collection function. (defun gc (&key (gen 0) (full nil) &allow-other-keys) diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index 8678534..07a0031 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -23,6 +23,8 @@ unsigned-int (thread-id unsigned-long)) +(define-alien-routine reap-dead-threads void) + (defvar *session* nil) ;;;; queues, locks @@ -121,6 +123,9 @@ ;;; i suspect there may be a race still in this: the futex version requires ;;; the old mutex value before sleeping, so how do we get away without it (defun get-mutex (lock &optional new-value (wait-p t)) + "Acquire LOCK, setting it to NEW-VALUE or some suitable default value +if NIL. If WAIT-P is non-NIL and the lock is in use, sleep until it +is available" (declare (type mutex lock) (optimize (speed 3))) (let ((pid (current-thread-id))) (unless new-value (setf new-value pid)) @@ -144,7 +149,8 @@ (let ((pid (current-thread-id)) old) (unless new-value (setf new-value pid)) - (assert (not (eql new-value (mutex-value lock)))) + (when (eql new-value (mutex-value lock)) + (warn "recursive lock attempt ~S~%" lock)) (loop (unless (setf old (sb!vm::%instance-set-conditional lock 4 nil new-value)) @@ -333,6 +339,27 @@ SB-EXT:QUIT - the usual cleanup forms will be evaluated" until (sb!sys:sap= thread (sb!sys:int-sap 0)) collect (funcall function thread)))) +(defun thread-sap-from-id (id) + (let ((thread (alien-sap (extern-alien "all_threads" (* t))))) + (loop + (when (sb!sys:sap= thread (sb!sys:int-sap 0)) (return nil)) + (let ((pid (sb!sys:sap-ref-32 thread (* 4 sb!vm::thread-pid-slot)))) + (when (= pid id) (return thread)) + (setf thread (sb!sys:sap-ref-sap thread (* 4 sb!vm::thread-next-slot))))))) + +;;; internal use only. If you think you need to use this, either you +;;; are an SBCL developer, are doing something that you should discuss +;;; with an SBCL developer first, or are doing something that you +;;; should probably discuss with a professional psychiatrist first +(defun symbol-value-in-thread (symbol thread-id) + (let ((thread (thread-sap-from-id thread-id))) + (when thread + (let* ((index (sb!vm::symbol-tls-index symbol)) + (tl-val (sb!sys:sap-ref-32 thread (* 4 index)))) + (if (eql tl-val sb!vm::unbound-marker-widetag) + (sb!vm::symbol-global-value symbol) + (sb!kernel:make-lisp-obj tl-val)))))) + ;;;; job control, independent listeners (defstruct session @@ -349,15 +376,15 @@ SB-EXT:QUIT - the usual cleanup forms will be evaluated" (defun init-job-control () (setf *session* (new-session))) -(defun %delete-thread-from-session (tid) - (with-mutex ((session-lock *session*)) - (setf (session-threads *session*) - (delete tid (session-threads *session*)) - (session-interactive-threads *session*) - (delete tid (session-interactive-threads *session*))))) +(defun %delete-thread-from-session (tid session) + (with-mutex ((session-lock session)) + (setf (session-threads session) + (delete tid (session-threads session)) + (session-interactive-threads session) + (delete tid (session-interactive-threads session))))) (defun call-with-new-session (fn) - (%delete-thread-from-session (current-thread-id)) + (%delete-thread-from-session (current-thread-id) *session*) (let ((*session* (new-session))) (funcall fn))) (defmacro with-new-session (args &body forms) @@ -366,14 +393,18 @@ SB-EXT:QUIT - the usual cleanup forms will be evaluated" `(labels ((,fb-name () ,@forms)) (call-with-new-session (function ,fb-name))))) -;;; this is called from a C signal handler: some signals may be masked +;;; Remove thread id TID from its session, if it has one. This is +;;; called from C reap_dead_threads() so is run in the context of +;;; whichever thread called that (usually after a GC), which may not have +;;; any meaningful parent/child/sibling relationship with the dead thread (defun handle-thread-exit (tid) - "Remove thread id TID from the session, if it's there" - (%delete-thread-from-session tid)) + (let ((session (symbol-value-in-thread '*session* tid))) + (and session (%delete-thread-from-session tid session)))) (defun terminate-session () - "Kill all threads in session exept for this one. Does nothing if current + "Kill all threads in session except for this one. Does nothing if current thread is not the foreground thread" + (reap-dead-threads) (let* ((tid (current-thread-id)) (to-kill (with-mutex ((session-lock *session*)) diff --git a/src/code/target-unithread.lisp b/src/code/target-unithread.lisp index 0eddbbb..4fc86d5 100644 --- a/src/code/target-unithread.lisp +++ b/src/code/target-unithread.lisp @@ -22,6 +22,8 @@ (sb!sys:sap-ref-32 (alien-sap (extern-alien "all_threads" (* t))) (* sb!vm::thread-pid-slot 4))) +(defun reap-dead-threads ()) + ;;;; queues, locks ;; spinlocks use 0 as "free" value: higher-level locks use NIL diff --git a/src/compiler/generic/objdef.lisp b/src/compiler/generic/objdef.lisp index 5de4603..0659a28 100644 --- a/src/compiler/generic/objdef.lisp +++ b/src/compiler/generic/objdef.lisp @@ -327,7 +327,7 @@ ;; first data slot, and if you subtract 7 you get a symbol header. ;; also the CAR of NIL-as-end-of-list - (value :init :unbound) + (value :init :unbound :ref-known (flushable) :ref-trans symbol-global-value) ;; also the CDR of NIL-as-end-of-list. Its reffer needs special ;; care for this reason, as hash values must be fixnums. (hash :set-trans %set-symbol-hash) @@ -339,7 +339,7 @@ (package :ref-trans symbol-package :set-trans %set-symbol-package :init :null) - #!+sb-thread (tls-index)) + #!+sb-thread (tls-index :ref-known (flushable) :ref-trans symbol-tls-index)) (define-primitive-object (complex-single-float :lowtag other-pointer-lowtag @@ -375,7 +375,7 @@ (tls-cookie) ; on x86, the LDT index (this :c-type "struct thread *" :length #!+alpha 2 #!-alpha 1) (next :c-type "struct thread *" :length #!+alpha 2 #!-alpha 1) - (state) ; running, stopping, stopped + (state) ; running, stopping, stopped, dead #!+x86 (pseudo-atomic-atomic) #!+x86 (pseudo-atomic-interrupted) (interrupt-data :c-type "struct interrupt_data *" diff --git a/src/runtime/alloc.c b/src/runtime/alloc.c index ed6ec54..9fcafd3 100644 --- a/src/runtime/alloc.c +++ b/src/runtime/alloc.c @@ -17,10 +17,10 @@ #include #include +#include "sbcl.h" #include "genesis/config.h" #include "runtime.h" #include "os.h" -#include "sbcl.h" #include "alloc.h" #include "globals.h" #include "gc.h" diff --git a/src/runtime/interrupt.c b/src/runtime/interrupt.c index e277dde..2be9e0f 100644 --- a/src/runtime/interrupt.c +++ b/src/runtime/interrupt.c @@ -108,7 +108,6 @@ void sigaddset_blockable(sigset_t *s) #ifdef LISP_FEATURE_SB_THREAD sigaddset(s, SIG_STOP_FOR_GC); sigaddset(s, SIG_INTERRUPT_THREAD); - sigaddset(s, SIG_THREAD_EXIT); #endif } @@ -552,16 +551,6 @@ sig_stop_for_gc_handler(int signal, siginfo_t *info, void *void_context) sigset_t ss; int i; - /* KLUDGE: at least on Linux, the kernel apparently schedules a - thread immediately it is signalled. However, we signal - SIG_STOP_FOR_GC while holding the spinlock, and consequently we - can easily end up with a kind of thundering herd of threads all - wanting to acquire the lock at the same time so that they can - tell the system that they've gone to sleep. So we yield here. - Whether this is the right fix or not is unknown. -- CSR, - 2004-07-16 */ - sched_yield(); - if(maybe_defer_handler(sig_stop_for_gc_handler,data, signal,info,context)) { return; @@ -573,9 +562,7 @@ sig_stop_for_gc_handler(int signal, siginfo_t *info, void *void_context) for(i=1;ipid); thread->state=STATE_STOPPED; - release_spinlock(&all_threads_lock); sigemptyset(&ss); sigaddset(&ss,SIG_STOP_FOR_GC); sigwaitinfo(&ss,0); @@ -700,26 +687,20 @@ void interrupt_thread_handler(int num, siginfo_t *info, void *v_context) void thread_exit_handler(int num, siginfo_t *info, void *v_context) { /* called when a child thread exits */ - os_context_t *context = (os_context_t*)arch_os_get_context(&v_context); - struct thread *th=arch_os_get_current_thread(); pid_t kid; - int *status; - struct interrupt_data *data= - th ? th->interrupt_data : global_interrupt_data; - if(maybe_defer_handler(thread_exit_handler,data,num,info,context)){ - return ; - } + int status; + while(1) { kid=waitpid(-1,&status,__WALL|WNOHANG); - if(kid<1) break; + if(kid<=0) break; if(WIFEXITED(status) || WIFSIGNALED(status)) { struct thread *th=find_thread_by_pid(kid); - if(!th) continue; - funcall1(SymbolFunction(HANDLE_THREAD_EXIT),make_fixnum(kid)); - destroy_thread(th); + if(th) th->state=STATE_DEAD; } } } + + #endif boolean handle_control_stack_guard_triggered(os_context_t *context,void *addr){ diff --git a/src/runtime/thread.c b/src/runtime/thread.c index 42a8b07..0d03ae2 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -5,6 +5,9 @@ #include #include #include +#include +#include + #include "runtime.h" #include "sbcl.h" #include "validate.h" /* for CONTROL_STACK_SIZE etc */ @@ -15,6 +18,7 @@ #include "globals.h" #include "dynbind.h" #include "genesis/cons.h" +#include "genesis/fdefn.h" #define ALIEN_STACK_SIZE (1*1024*1024) /* 1Mb size chosen at random */ int dynamic_values_bytes=4096*sizeof(lispobj); /* same for all threads */ @@ -22,7 +26,7 @@ struct thread *all_threads; volatile lispobj all_threads_lock; extern struct interrupt_data * global_interrupt_data; -void get_spinlock(lispobj *word,int value); +void get_spinlock(volatile lispobj *word,int value); int initial_thread_trampoline(struct thread *th) @@ -195,6 +199,11 @@ struct thread * create_thread_struct(lispobj initial_function) { void link_thread(struct thread *th,pid_t kid_pid) { + sigset_t newset,oldset; + sigemptyset(&newset); + sigaddset_blockable(&newset); + sigprocmask(SIG_BLOCK, &newset, &oldset); + get_spinlock(&all_threads_lock,kid_pid); th->next=all_threads; all_threads=th; @@ -203,6 +212,8 @@ void link_thread(struct thread *th,pid_t kid_pid) */ protect_control_stack_guard_page(th->pid,1); release_spinlock(&all_threads_lock); + + sigprocmask(SIG_SETMASK,&oldset,0); th->pid=kid_pid; /* child will not start until this is set */ } @@ -218,30 +229,38 @@ void create_initial_thread(lispobj initial_function) { #ifdef LISP_FEATURE_SB_THREAD pid_t create_thread(lispobj initial_function) { struct thread *th=create_thread_struct(initial_function); - pid_t kid_pid=clone(new_thread_trampoline, - (((void*)th->control_stack_start)+ - THREAD_CONTROL_STACK_SIZE-4), - CLONE_FILES|SIG_THREAD_EXIT|CLONE_VM,th); - - if(th && kid_pid>0) { + pid_t kid_pid=0; + + if(th==0) return 0; + kid_pid=clone(new_thread_trampoline, + (((void*)th->control_stack_start)+ + THREAD_CONTROL_STACK_SIZE-4), + CLONE_FILES|SIG_THREAD_EXIT|CLONE_VM,th); + + if(kid_pid>0) { link_thread(th,kid_pid); return th->pid; } else { - destroy_thread(th); + os_invalidate((os_vm_address_t) th->control_stack_start, + ((sizeof (lispobj)) + * (th->control_stack_end-th->control_stack_start)) + + BINDING_STACK_SIZE+ALIEN_STACK_SIZE+dynamic_values_bytes+ + 32*SIGSTKSZ); return 0; } } #endif +/* unused */ void destroy_thread (struct thread *th) { /* precondition: the unix task has already been killed and exited. - * This is called by the parent */ + * This is called by the parent or some other thread */ #ifdef LISP_FEATURE_GENCGC gc_alloc_update_page_tables(0, &th->alloc_region); #endif get_spinlock(&all_threads_lock,th->pid); - th->state=STATE_STOPPED; + th->unbound_marker=0; /* for debugging */ if(th==all_threads) all_threads=th->next; else { @@ -258,6 +277,33 @@ void destroy_thread (struct thread *th) 32*SIGSTKSZ); } +void reap_dead_threads() +{ + struct thread *th,*next,*prev=0; + th=all_threads; + while(th) { + next=th->next; + if(th->state==STATE_DEAD) { + funcall1(SymbolFunction(HANDLE_THREAD_EXIT),make_fixnum(th->pid)); +#ifdef LISP_FEATURE_GENCGC + gc_alloc_update_page_tables(0, &th->alloc_region); +#endif + get_spinlock(&all_threads_lock,th->pid); + if(prev) prev->next=next; + else all_threads=next; + release_spinlock(&all_threads_lock); + if(th->tls_cookie>=0) arch_os_thread_cleanup(th); + os_invalidate((os_vm_address_t) th->control_stack_start, + ((sizeof (lispobj)) + * (th->control_stack_end-th->control_stack_start)) + + BINDING_STACK_SIZE+ALIEN_STACK_SIZE+dynamic_values_bytes+ + 32*SIGSTKSZ); + } else + prev=th; + th=next; + } +} + struct thread *find_thread_by_pid(pid_t pid) { @@ -309,7 +355,7 @@ int signal_thread_to_dequeue (pid_t pid) /* stopping the world is a two-stage process. From this thread we signal - * all the others with SIG_STOP_FOR_GC. The handler for this thread does + * all the others with SIG_STOP_FOR_GC. The handler for this signal does * the usual pseudo-atomic checks (we don't want to stop a thread while * it's in the middle of allocation) then kills _itself_ with SIGSTOP. */ @@ -319,26 +365,22 @@ void gc_stop_the_world() /* stop all other threads by sending them SIG_STOP_FOR_GC */ struct thread *p,*th=arch_os_get_current_thread(); pid_t old_pid; - int finished=0; + int finished; do { - get_spinlock(&all_threads_lock,th->pid); + finished=1; for(p=all_threads,old_pid=p->pid; p; p=p->next) { if(p==th) continue; - if(p->state!=STATE_RUNNING) continue; - p->state=STATE_STOPPING; - kill(p->pid,SIG_STOP_FOR_GC); + if(p->state==STATE_RUNNING) { + p->state=STATE_STOPPING; + kill(p->pid,SIG_STOP_FOR_GC); + } + if((p->state!=STATE_STOPPED) && + (p->state!=STATE_DEAD)) { + finished=0; + } } - release_spinlock(&all_threads_lock); - sched_yield(); - /* if everything has stopped, and there is no possibility that - * a new thread has been created, we're done. Otherwise go - * round again and signal anything that sprang up since last - * time */ - if(old_pid==all_threads->pid) { - finished=1; - for_each_thread(p) - finished = finished && - ((p==th) || (p->state==STATE_STOPPED)); + if(old_pid!=all_threads->pid) { + finished=0; } } while(!finished); } @@ -346,12 +388,14 @@ void gc_stop_the_world() void gc_start_the_world() { struct thread *p,*th=arch_os_get_current_thread(); - get_spinlock(&all_threads_lock,th->pid); + /* if a resumed thread creates a new thread before we're done with + * this loop, the new thread will get consed on the front of * + * all_threads_lock, but it won't have been stopped so won't need + * restarting */ for(p=all_threads;p;p=p->next) { - if(p==th) continue; + if((p==th) || (p->state==STATE_DEAD)) continue; p->state=STATE_RUNNING; kill(p->pid,SIG_STOP_FOR_GC); } - release_spinlock(&all_threads_lock); } #endif diff --git a/src/runtime/thread.h b/src/runtime/thread.h index d503c27..825c412 100644 --- a/src/runtime/thread.h +++ b/src/runtime/thread.h @@ -21,6 +21,7 @@ struct alloc_region { }; #define STATE_RUNNING (make_fixnum(0)) #define STATE_STOPPING (make_fixnum(1)) #define STATE_STOPPED (make_fixnum(2)) +#define STATE_DEAD (make_fixnum(3)) #define THREAD_SLOT_OFFSET_WORDS(c) \ (offsetof(struct thread,c)/(sizeof (struct thread *))) diff --git a/src/runtime/x86-arch.h b/src/runtime/x86-arch.h index e2b3e25..eea03a4 100644 --- a/src/runtime/x86-arch.h +++ b/src/runtime/x86-arch.h @@ -4,6 +4,9 @@ * namespace that we control. */ #ifndef _X86_ARCH_H #define _X86_ARCH_H +#ifndef SBCL_GENESIS_CONFIG +#error genesis/config.h (or sbcl.h) must be incuded before this file +#endif #define ARCH_HAS_STACK_POINTER @@ -14,10 +17,15 @@ * 2002-02-15) */ #ifdef LISP_FEATURE_SB_THREAD + +extern never_returns lose(char *fmt, ...); + static inline void -get_spinlock(lispobj *word,int value) +get_spinlock(volatile lispobj *word,int value) { u32 eax=0; + if(*word==value) + lose("recursive get_spinlock: 0x%x,%d\n",word,value); do { asm ("xor %0,%0\n\ lock cmpxchg %1,%2" @@ -28,7 +36,7 @@ get_spinlock(lispobj *word,int value) } static inline void -release_spinlock(lispobj *word) +release_spinlock(volatile lispobj *word) { *word=0; } diff --git a/src/runtime/x86-linux-os.c b/src/runtime/x86-linux-os.c index 927d3c4..25d9cd7 100644 --- a/src/runtime/x86-linux-os.c +++ b/src/runtime/x86-linux-os.c @@ -22,6 +22,7 @@ #include #include +#include "sbcl.h" #include "./signal.h" #include "os.h" #include "arch.h" @@ -29,7 +30,6 @@ #include "interrupt.h" #include "interr.h" #include "lispregs.h" -#include "sbcl.h" #include #include @@ -65,7 +65,7 @@ void debug_get_ldt() printf("%d bytes in ldt: print/x local_ldt_copy\n", n); } -lispobj modify_ldt_lock; /* protect all calls to modify_ldt */ +volatile lispobj modify_ldt_lock; /* protect all calls to modify_ldt */ int arch_os_thread_init(struct thread *thread) { stack_t sigstack; diff --git a/version.lisp-expr b/version.lisp-expr index 1623278..016bbd1 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.8.12.41" +"0.8.12.42" -- 1.7.10.4