1.0.42.50: workaround a Darwin nanosleep() bug
authorNikodemus Siivola <nikodemus@random-state.net>
Tue, 21 Sep 2010 13:10:37 +0000 (13:10 +0000)
committerNikodemus Siivola <nikodemus@random-state.net>
Tue, 21 Sep 2010 13:10:37 +0000 (13:10 +0000)
 Fixes lp#640516.

 It turns out that on Darwin, if a nanosleep() call is interrupted,
 and the signal handler takes longer than the requested sleep time
 was, then the call will return with EINTR and (unsigned)-1 in the
 remaining seconds.

 Since we call nanosleep() again when it returns with EINTR with the
 remaining time, this would cause us to sleep ~136 years...

 So, check that the remainder is not increasing before calling
 nanosleep() again.

 Many, many thanks to Joe Lobraco who reported and diagnosed the
 issue.

NEWS
src/code/unix.lisp
tests/signals.impure.lisp
version.lisp-expr

diff --git a/NEWS b/NEWS
index feef931..bc45c8a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -59,6 +59,8 @@ changes relative to sbcl-1.0.42
     (lp#316068)
   * bug fix: (SETF DOCUMENTATION) of a macro works properly. (lp#643958, thanks
     to Stas Boukarev)
+  * bug fix: interrupt taking longer than the requested period caused SLEEP
+    to hang on Darwin. (lp#640516, thanks to Joe Lebroco for the analysis)
 
 changes in sbcl-1.0.42 relative to sbcl-1.0.41
   * build changes
index 37d715f..47eba3d 100644 (file)
@@ -916,12 +916,27 @@ corresponds to NAME, or NIL if there is none."
                (rem (struct timespec)))
     (setf (slot req 'tv-sec) secs)
     (setf (slot req 'tv-nsec) nsecs)
-    (loop while (eql sb!unix:eintr
-                     (nth-value 1
-                                (int-syscall ("nanosleep" (* (struct timespec))
-                                                          (* (struct timespec)))
-                                             (addr req) (addr rem))))
-       do (rotatef req rem))))
+    (loop while (and (eql sb!unix:eintr
+                          (nth-value 1
+                                     (int-syscall ("nanosleep" (* (struct timespec))
+                                                               (* (struct timespec)))
+                                                  (addr req) (addr rem))))
+                     ;; KLUDGE: On Darwin, if an interrupt cases nanosleep to
+                     ;; take longer than the requested time, the call will
+                     ;; return with EINT and (unsigned)-1 seconds in the
+                     ;; remainder timespec, which would cause us to enter
+                     ;; nanosleep again for ~136 years. So, we check that the
+                     ;; remainder time is actually decreasing. Since the cost
+                     ;; of this check is neglible, do it on all platforms.
+                     ;; http://osdir.com/ml/darwin-kernel/2010-03/msg00007.html
+                     (let ((rem-sec (slot rem 'tv-sec))
+                           (rem-nsec (slot rem 'tv-nsec)))
+                       (when (or (> secs rem-sec)
+                                 (and (= secs rem-sec) (>= nsecs rem-nsec)))
+                         (setf secs rem-sec
+                               nsecs rem-nsec)
+                         t)))
+          do (rotatef req rem))))
 
 (defun unix-get-seconds-west (secs)
   (multiple-value-bind (ignore seconds dst) (get-timezone secs)
index 7be8bac..093bf23 100644 (file)
                    sb-unix:sigint)
                 (sb-sys:interactive-interrupt ()
                   :condition)))))
+
+(with-test (:name :bug-640516)
+  ;; On Darwin interrupting a SLEEP so that it took longer than
+  ;; the requested amount caused it to hang.
+  (assert
+   (handler-case
+       (sb-ext:with-timeout 10
+         (let (to)
+           (handler-bind ((sb-ext:timeout (lambda (c)
+                                            (unless to
+                                              (setf to t)
+                                              (sleep 2)
+                                              (continue c)))))
+             (sb-ext:with-timeout 0.1 (sleep 1) t))))
+     (sb-ext:timeout ()
+       nil))))
index 5ec2273..7c64c3e 100644 (file)
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.42.49"
+"1.0.42.50"