gencgc: bogus memory fault handling
author Nikodemus Siivola <nikodemus@random-state.net>
Wed, 7 Dec 2011 10:24:54 +0000 (12:24 +0200)
committer Nikodemus Siivola <nikodemus@random-state.net>
Wed, 7 Dec 2011 17:31:28 +0000 (19:31 +0200)
 Add more debugging information when a bogus memory fault occurs,
 and two control variables:

  continue_after_memoryfault_on_unprotected_pages
  ignore_memoryfaults_on_unprotected_pages

 The first just prevents us from lose()ing. Set it in darwin_init() as we
 genuinely appear to get bogus memory faults in threaded code from the kernel,
 which point to unprotected boxed pages. They are possibly something we could
 filter out in catch_exception_raise... but I don't see how. Experimentally,
 continuing after such faults seems harmless, so let's try that but make it
 noisy.

 The second one silences the debugging output completely. Not set currently at
 all.

src/runtime/gencgc-internal.h
src/runtime/gencgc.c

index dfe9089..978bb92 100644 (file)
@@ -34,10 +34,12 @@ int gencgc_handle_wp_violation(void *);
 # if GENCGC_CARD_BYTES > UINT_MAX
 #   error "GENCGC_CARD_BYTES unexpectedly large."
 # else
-typedef unsigned int page_bytes_t;
+#   define PAGE_BYTES_FMT "u"
+    typedef unsigned int page_bytes_t;
 # endif
 #else
-typedef unsigned short page_bytes_t;
+# define PAGE_BYTES_FMT "hu"
+  typedef unsigned short page_bytes_t;
 #endif
 
 /* Note that this structure is also used from Lisp-side in
index acc79f5..c143054 100644 (file)
@@ -4247,7 +4247,17 @@ void unhandled_sigmemoryfault(void* addr);
  *
  * Return true if this signal is a normal generational GC thing that
  * we were able to handle, or false if it was abnormal and control
- * should fall through to the general SIGSEGV/SIGBUS/whatever logic. */
+ * should fall through to the general SIGSEGV/SIGBUS/whatever logic.
+ *
+ * We have two control flags for this: one causes us to ignore faults
+ * on unprotected pages completely, and the second complains to stderr
+ * but allows us to continue without losing.
+ */
+extern boolean ignore_memoryfaults_on_unprotected_pages;
+boolean ignore_memoryfaults_on_unprotected_pages = 0;
+
+extern boolean continue_after_memoryfault_on_unprotected_pages;
+boolean continue_after_memoryfault_on_unprotected_pages = 0;
 
 int
 gencgc_handle_wp_violation(void* fault_addr)
@@ -4278,17 +4288,39 @@ gencgc_handle_wp_violation(void* fault_addr)
             os_protect(page_address(page_index), GENCGC_CARD_BYTES, OS_VM_PROT_ALL);
             page_table[page_index].write_protected_cleared = 1;
             page_table[page_index].write_protected = 0;
-        } else {
+        } else if (!ignore_memoryfaults_on_unprotected_pages) {
             /* The only acceptable reason for this signal on a heap
              * access is that GENCGC write-protected the page.
              * However, if two CPUs hit a wp page near-simultaneously,
              * we had better not have the second one lose here if it
              * does this test after the first one has already set wp=0
              */
-            if(page_table[page_index].write_protected_cleared != 1)
-                lose("fault in heap page %d not marked as write-protected\nboxed_region.first_page: %d, boxed_region.last_page %d\n",
-                     page_index, boxed_region.first_page,
-                     boxed_region.last_page);
+            if(page_table[page_index].write_protected_cleared != 1) {
+                void lisp_backtrace(int frames);
+                lisp_backtrace(10);
+                fprintf(stderr,
+                        "Fault @ %p, page %"PAGE_INDEX_FMT" not marked as write-protected:\n"
+                        "  boxed_region.first_page: %"PAGE_INDEX_FMT","
+                        "  boxed_region.last_page %"PAGE_INDEX_FMT"\n"
+                        "  page.region_start_offset: %"OS_VM_SIZE_FMT"\n"
+                        "  page.bytes_used: %"PAGE_BYTES_FMT"\n"
+                        "  page.allocated: %d\n"
+                        "  page.write_protected: %d\n"
+                        "  page.write_protected_cleared: %d\n"
+                        "  page.generation: %d\n",
+                        fault_addr,
+                        page_index,
+                        boxed_region.first_page,
+                        boxed_region.last_page,
+                        page_table[page_index].region_start_offset,
+                        page_table[page_index].bytes_used,
+                        page_table[page_index].allocated,
+                        page_table[page_index].write_protected,
+                        page_table[page_index].write_protected_cleared,
+                        page_table[page_index].gen);
+                if (!continue_after_memoryfault_on_unprotected_pages)
+                    lose("Feh.\n");
+            }
         }
         ret = thread_mutex_unlock(&free_pages_lock);
         gc_assert(ret == 0);