From 93ed7777d86296768a1e98acd5b08873c5aec8e4 Mon Sep 17 00:00:00 2001 From: "Tobias C. Rittweiler" Date: Sat, 3 Apr 2010 18:06:37 +0000 Subject: [PATCH] 1.0.37.34: Fix lost wakeup bug between CONDITION-WAIT and CONDITION-NOTIFY. (Patch and write up below are actually due to Nikodemus.) Use the waitqueue object as the data item in CONDITION-NOTIFY, and the thread in CONDITION-WAIT. It doesn't matter if multiple notifies use the same object -- single thread notifying multiple times has the same meaning as multiple threads sending one notification each, but multiple waiters must use different objects as per following scenario: 1) Thread A calls CONDITION-WAIT on a condition variable CVAR. Thread A sets CVAR's waitqueue-data to token A. Thread A is preempted after the (RELEASE-MUTEX), before the FUTEX-WAIT. 2) Thread B is run. Thread B invokes CONDITION-NOTIFY. Thread B sets CVAR's waitqueue-data to a token B. Thread B invokes FUTEX-WAKE which is NO-OP because no thread is yet waiting on the futex. 3) Thread C is run. Thread C invokes CONDITION-WAIT. Thread C sets CVAR's waitqueue-data to token C. 4) Thread A is finally run again. The call to FUTEX-WAIT compares CVAR's waitqueue-data with token A. If token A == token C, the wakeup is lost. Similarly, if step 3 does not happen, and token A == token B, the wakeup is lost. Furthermore, consider if thread A == thread B, and step 3 does not happen. (This can occur due to an interrupt.) Hence CONDITION-NOTIFY cannot use the thread object as the token. --- NEWS | 4 ++ src/code/target-thread.lisp | 109 ++++++++++++++++++++++--------------------- version.lisp-expr | 2 +- 3 files changed, 60 insertions(+), 55 deletions(-) diff --git a/NEWS b/NEWS index 19e4cde..9e9f18c 100644 --- a/NEWS +++ b/NEWS @@ -42,6 +42,10 @@ changes relative to sbcl-1.0.36: * bug fix: misoptimization of multiplication by one in (SB-C::FLOAT-ACCURACY 0) policies. * bug fix: miscounts in SB-PROFILE. + * bug fix: Fix lost wakeup bug between SB-THREAD:CONDITION-WAIT and + CONDITION-NOTIFY on Linux. See threads "lost wakeup in condition-wait / + condition-notify" (Feb 2010) and "Condition-Wait, Deadline handler, waking + up itself" (March 2010) for further details. changes in sbcl-1.0.37 relative to sbcl-1.0.36: * enhancement: Backtrace from THROW to uncaught tag on x86oids now shows diff --git a/src/code/target-thread.lisp b/src/code/target-thread.lisp index 47d07d2..914882c 100644 --- a/src/code/target-thread.lisp +++ b/src/code/target-thread.lisp @@ -588,51 +588,50 @@ returning normally, it may do so without holding the mutex." ;; Need to disable interrupts so that we don't miss grabbing the ;; mutex on our way out. (without-interrupts - (let ((me nil)) - ;; This setf becomes visible to other CPUS due to the usual - ;; memory barrier semantics of lock acquire/release. This must - ;; not be moved into the loop else wakeups may be lost upon - ;; continuing after a deadline or EINTR. - (setf (waitqueue-data queue) me) - (loop - (multiple-value-bind (to-sec to-usec) - (allow-with-interrupts (decode-timeout nil)) - (case (unwind-protect - (with-pinned-objects (queue me) - ;; RELEASE-MUTEX is purposefully as close to - ;; FUTEX-WAIT as possible to reduce the size - ;; of the window where WAITQUEUE-DATA may be - ;; set by a notifier. - (release-mutex mutex) - ;; Now we go to sleep using futex-wait. If - ;; anyone else manages to grab MUTEX and call - ;; CONDITION-NOTIFY during this comment, it - ;; will change queue->data, and so futex-wait - ;; returns immediately instead of sleeping. - ;; Ergo, no lost wakeup. We may get spurious - ;; wakeups, but that's ok. - (allow-with-interrupts - (futex-wait (waitqueue-data-address queue) - (get-lisp-obj-address me) - ;; our way of saying "no - ;; timeout": - (or to-sec -1) - (or to-usec 0)))) - ;; If we are interrupted while waiting, we should - ;; do these things before returning. Ideally, in - ;; the case of an unhandled signal, we should do - ;; them before entering the debugger, but this is - ;; better than nothing. - (allow-with-interrupts (get-mutex mutex))) - ;; ETIMEDOUT; we know it was a timeout, yet we cannot - ;; signal a deadline unconditionally here because the - ;; call to GET-MUTEX may already have signaled it. - ((1)) - ;; EINTR - ((2)) - ;; EWOULDBLOCK, -1 here, is the possible spurious wakeup - ;; case. 0 is the normal wakeup. - (otherwise (return))))))))) + ;; This setf becomes visible to other CPUS due to the usual + ;; memory barrier semantics of lock acquire/release. This must + ;; not be moved into the loop else wakeups may be lost upon + ;; continuing after a deadline or EINTR. + (setf (waitqueue-data queue) me) + (loop + (multiple-value-bind (to-sec to-usec) + (allow-with-interrupts (decode-timeout nil)) + (case (unwind-protect + (with-pinned-objects (queue me) + ;; RELEASE-MUTEX is purposefully as close to + ;; FUTEX-WAIT as possible to reduce the size + ;; of the window where WAITQUEUE-DATA may be + ;; set by a notifier. + (release-mutex mutex) + ;; Now we go to sleep using futex-wait. If + ;; anyone else manages to grab MUTEX and call + ;; CONDITION-NOTIFY during this comment, it + ;; will change queue->data, and so futex-wait + ;; returns immediately instead of sleeping. + ;; Ergo, no lost wakeup. We may get spurious + ;; wakeups, but that's ok. + (allow-with-interrupts + (futex-wait (waitqueue-data-address queue) + (get-lisp-obj-address me) + ;; our way of saying "no + ;; timeout": + (or to-sec -1) + (or to-usec 0)))) + ;; If we are interrupted while waiting, we should + ;; do these things before returning. Ideally, in + ;; the case of an unhandled signal, we should do + ;; them before entering the debugger, but this is + ;; better than nothing. + (allow-with-interrupts (get-mutex mutex))) + ;; ETIMEDOUT; we know it was a timeout, yet we cannot + ;; signal a deadline unconditionally here because the + ;; call to GET-MUTEX may already have signaled it. + ((1)) + ;; EINTR + ((2)) + ;; EWOULDBLOCK, -1 here, is the possible spurious wakeup + ;; case. 0 is the normal wakeup. + (otherwise (return)))))))) (defun condition-notify (queue &optional (n 1)) #!+sb-doc @@ -649,17 +648,19 @@ this call." #!+sb-lutex (with-lutex-address (lutex (waitqueue-lutex queue)) (%lutex-wake lutex n)) - ;; no problem if >1 thread notifies during the comment in - ;; condition-wait: as long as the value in queue-data isn't the - ;; waiting thread's id, it matters not what it is + ;; No problem if >1 thread notifies during the comment in condition-wait: + ;; as long as the value in queue-data isn't the waiting thread's id, it + ;; matters not what it is -- using the queue object itself is handy. + ;; ;; XXX we should do something to ensure that the result of this setf - ;; is visible to all CPUs + ;; is visible to all CPUs. + ;; + ;; ^-- surely futex_wake() involves a memory barrier? #!-sb-lutex - (let ((me *current-thread*)) - (progn - (setf (waitqueue-data queue) me) - (with-pinned-objects (queue) - (futex-wake (waitqueue-data-address queue) n)))))) + (progn + (setf (waitqueue-data queue) queue) + (with-pinned-objects (queue) + (futex-wake (waitqueue-data-address queue) n))))) (defun condition-broadcast (queue) #!+sb-doc diff --git a/version.lisp-expr b/version.lisp-expr index a21d614..0d57c83 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.37.33" +"1.0.37.34" -- 1.7.10.4