From 97423182206cfe8c078eff105fea00dceb03be99 Mon Sep 17 00:00:00 2001
From: Juho Snellman
Date: Sat, 7 Jan 2006 18:53:39 +0000
Subject: [PATCH] 0.9.8.19: Changes to GENCGC memory zeroing behaviour that
 give a big performance boost, especially on modern processors.

* Instead of zeroing memory by remapping it with munmap/mmap at GC
  time, pages are just marked as needing zeroing and zeroed with
  memset when they're added to a new allocation region. This reduces
  GC latency for both the common and the worst cases.
* To keep the memory footprint down, clear the pages by remapping
  after major GCs (arbitrarily defined as a collection of generation 2
  or older). The memory freed by a minor GC is just going to get used
  again immediately, so releasing it back to the OS would make little
  sense.
* Add a GENCGC mode (#define READ_PROTECT_FREE_PAGES) for catching
  attempts to read unallocated pages.
* See sbcl-devel "Changes to GENCGC memory zeroing" in 2005-12 for
  more details and performance measurements. (Note that many parts of
  this patch have already been committed piecemeal over the last
  month; this is just the most significant chunk.)
* The performance effect on the BSDs (which used a different zeroing
  strategy from Linux before this change) is unknown.
---
 NEWS                          |    1 +
 src/runtime/gencgc-internal.h |    4 +-
 src/runtime/gencgc.c          |  197 ++++++++++++++++++++++++++++++++---------
 version.lisp-expr             |    2 +-
 4 files changed, 158 insertions(+), 46 deletions(-)

diff --git a/NEWS b/NEWS
index 45b0300..9b03a93 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,7 @@ changes in sbcl-0.9.9 relative to sbcl-0.9.8:
   * bug fix: interrupt handling on NetBSD (thanks to Richard M Kreuter)
   * bug fix: saving a core corrupted callbacks on x86/x86-64
+  * optimization: major improvements to GC efficiency on GENCGC platforms
   * optimization: faster implementation of EQUAL
   * optimization: emit more efficient opcodes for some common
     immediate->register MOV instructions on x86-64. (thanks to Lutz Euler)
diff --git a/src/runtime/gencgc-internal.h b/src/runtime/gencgc-internal.h
index b2d2f25..b8beae0 100644
--- a/src/runtime/gencgc-internal.h
+++ b/src/runtime/gencgc-internal.h
@@ -75,7 +75,9 @@ struct page {
         /* If the page is part of a large object then this flag is
          * set. No other objects should be allocated to these pages.
          * This is only valid when the page is allocated. */
-        large_object :1;
+        large_object :1,
+        /* True unless the page is known to contain only zeroes. */
+        need_to_zero :1;
 
     /* the generation that this page belongs to. This should be valid
      * for all pages that may have objects allocated, even current
diff --git a/src/runtime/gencgc.c b/src/runtime/gencgc.c
index a6891d0..3d1a3e4 100644
--- a/src/runtime/gencgc.c
+++ b/src/runtime/gencgc.c
@@ -70,22 +70,6 @@ enum {
  * that don't have pointers to younger generations? */
 boolean enable_page_protection = 1;
 
-/* Should we unmap a page and re-mmap it to have it zero filled? */
-#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__sun)
-/* comment from cmucl-2.4.8: This can waste a lot of swap on FreeBSD
- * so don't unmap there.
- *
- * The CMU CL comment didn't specify a version, but was probably an
- * old version of FreeBSD (pre-4.0), so this might no longer be true.
- * OTOH, if it is true, this behavior might exist on OpenBSD too, so
- * for now we don't unmap there either. -- WHN 2001-04-07 */
-/* Apparently this flag is required to be 0 for SunOS/x86, as there
- * are reports of heap corruption otherwise. */
-boolean gencgc_unmap_zero = 0;
-#else
-boolean gencgc_unmap_zero = 1;
-#endif
-
 /* the minimum size (in bytes) for a large object*/
 unsigned long large_object_size = 4 * PAGE_BYTES;
 
@@ -139,6 +123,13 @@ boolean gencgc_zero_check_during_free_heap = 0;
  * contained a pagetable entry). */
 boolean gencgc_partial_pickup = 0;
+
+/* If defined, free pages are read-protected to ensure that nothing
+ * accesses them.
+ */
+
+/* #define READ_PROTECT_FREE_PAGES */
+
 
 /*
  * GC structures and variables
@@ -429,9 +420,60 @@ print_generation_stats(int verbose) /* FIXME: should take FILE argument */
     fpu_restore(fpu_state);
 }
 
-/*
- * allocation routines
+/* Zero the pages from START to END (inclusive), but use mmap/munmap instead
+ * of zeroing them ourselves, i.e. in practice give the memory back to the
+ * OS. Generally done after a large GC.
  */
+void zero_pages_with_mmap(page_index_t start, page_index_t end) {
+    int i;
+    void *addr = (void *) page_address(start), *new_addr;
+    size_t length = PAGE_BYTES*(1+end-start);
+
+    if (start > end)
+        return;
+
+    os_invalidate(addr, length);
+    new_addr = os_validate(addr, length);
+    if (new_addr == NULL || new_addr != addr) {
+        lose("remap_free_pages: page moved, 0x%08x ==> 0x%08x", start, new_addr);
+    }
+
+    for (i = start; i <= end; i++) {
+        page_table[i].need_to_zero = 0;
+    }
+}
+
+/* Zero the pages from START to END (inclusive). Generally done just after
+ * a new region has been allocated.
+ */
+static void
+zero_pages(page_index_t start, page_index_t end) {
+    if (start > end)
+        return;
+
+    memset(page_address(start), 0, PAGE_BYTES*(1+end-start));
+}
+
+/* Zero the pages from START to END (inclusive), except for those
+ * pages that are known to already be zeroed. Mark all pages in the
+ * range as non-zeroed.
+ */
+static void
+zero_dirty_pages(page_index_t start, page_index_t end) {
+    page_index_t i;
+
+    for (i = start; i <= end; i++) {
+        if (page_table[i].need_to_zero == 1) {
+            zero_pages(start, end);
+            break;
+        }
+    }
+
+    for (i = start; i <= end; i++) {
+        page_table[i].need_to_zero = 1;
+    }
+}
+
 
 /*
  * To support quick and inline allocation, regions of memory can be
@@ -606,6 +648,22 @@ gc_alloc_new_region(long nbytes, int unboxed, struct alloc_region *alloc_region)
             }
         }
     }
+
+#ifdef READ_PROTECT_FREE_PAGES
+    os_protect(page_address(first_page),
+               PAGE_BYTES*(1+last_page-first_page),
+               OS_VM_PROT_ALL);
+#endif
+
+    /* If the first page was only partial, don't check whether it's
+     * zeroed (it won't be) and don't zero it (since the parts that
+     * we're interested in are guaranteed to be zeroed).
+     */
+    if (page_table[first_page].bytes_used) {
+        first_page++;
+    }
+
+    zero_dirty_pages(first_page, last_page);
 }
 
 /* If the record_new_objects flag is 2 then all new regions created
@@ -952,7 +1010,15 @@ gc_alloc_large(long nbytes, int unboxed, struct alloc_region *alloc_region)
     }
     thread_mutex_unlock(&free_pages_lock);
 
-    return((void *)(page_address(first_page)+orig_first_page_bytes_used));
+#ifdef READ_PROTECT_FREE_PAGES
+    os_protect(page_address(first_page),
+               PAGE_BYTES*(1+last_page-first_page),
+               OS_VM_PROT_ALL);
+#endif
+
+    zero_dirty_pages(first_page, last_page);
+
+    return page_address(first_page);
 }
 
 static page_index_t gencgc_alloc_start_page = -1;
@@ -3080,31 +3146,12 @@ free_oldspace(void)
                  && (page_table[last_page].bytes_used != 0)
                  && (page_table[last_page].gen == from_space));
 
-        /* Zero pages from first_page to (last_page-1).
-         *
-         * FIXME: Why not use os_zero(..) function instead of
-         * hand-coding this again? (Check other gencgc_unmap_zero
-         * stuff too. */
-        if (gencgc_unmap_zero) {
-            void *page_start, *addr;
-
-            page_start = (void *)page_address(first_page);
-
-            os_invalidate(page_start, PAGE_BYTES*(last_page-first_page));
-            addr = os_validate(page_start, PAGE_BYTES*(last_page-first_page));
-            if (addr == NULL || addr != page_start) {
-                lose("free_oldspace: page moved, 0x%08x ==> 0x%08x\n",
-                     page_start, addr);
-            }
-        } else {
-            long *page_start;
-
-            page_start = (long *)page_address(first_page);
-            memset(page_start, 0,PAGE_BYTES*(last_page-first_page));
-        }
-
+#ifdef READ_PROTECT_FREE_PAGES
+        os_protect(page_address(first_page),
+                   PAGE_BYTES*(last_page-first_page),
+                   OS_VM_PROT_NONE);
+#endif
         first_page = last_page;
-
     } while (first_page < last_free_page);
 
     bytes_allocated -= bytes_freed;
@@ -3816,6 +3863,32 @@ update_dynamic_space_free_pointer(void)
     return 0; /* dummy value: return something ... */
 }
 
+static void
+remap_free_pages (page_index_t from, page_index_t to)
+{
+    page_index_t first_page, last_page;
+
+    for (first_page = from; first_page <= to; first_page++) {
+        if (page_table[first_page].allocated != FREE_PAGE_FLAG ||
+            page_table[first_page].need_to_zero == 0) {
+            continue;
+        }
+
+        last_page = first_page + 1;
+        while (page_table[last_page].allocated == FREE_PAGE_FLAG &&
+               last_page < to &&
+               page_table[last_page].need_to_zero == 1) {
+            last_page++;
+        }
+
+        zero_pages_with_mmap(first_page, last_page-1);
+
+        first_page = last_page;
+    }
+}
+
+generation_index_t small_generation_limit = 1;
+
 /* GC all generations newer than last_gen, raising the objects in each
  * to the next older generation - we finish when all generations below
  * last_gen are empty. Then if last_gen is due for a GC, or if
@@ -3824,13 +3897,15 @@ update_dynamic_space_free_pointer(void)
  *
  * We stop collecting at gencgc_oldest_gen_to_gc, even if this is less than
  * last_gen (oh, and note that by default it is NUM_GENERATIONS-1) */
-
 void
 collect_garbage(generation_index_t last_gen)
 {
     generation_index_t gen = 0, i;
     int raise;
     int gen_to_wp;
+    /* The largest value of last_free_page seen since the time
+     * remap_free_pages was called. */
+    static page_index_t high_water_mark = 0;
 
     FSHOW((stderr, "/entering collect_garbage(%d)\n", last_gen));
 
@@ -3932,11 +4007,25 @@ collect_garbage(generation_index_t last_gen)
     gc_assert((boxed_region.free_pointer - boxed_region.start_addr) == 0);
     gc_alloc_generation = 0;
 
+    /* Save the high-water mark before updating last_free_page */
+    if (last_free_page > high_water_mark)
+        high_water_mark = last_free_page;
     update_dynamic_space_free_pointer();
     auto_gc_trigger = bytes_allocated + bytes_consed_between_gcs;
     if(gencgc_verbose)
         fprintf(stderr,"Next gc when %ld bytes have been consed\n",
                 auto_gc_trigger);
+
+    /* If we did a big GC (arbitrarily defined as gen > 1), release memory
+     * back to the OS.
+     */
+    if (gen > small_generation_limit) {
+        if (last_free_page > high_water_mark)
+            high_water_mark = last_free_page;
+        remap_free_pages(0, high_water_mark);
+        high_water_mark = 0;
+    }
+
     SHOW("returning from collect_garbage");
 }
 
@@ -4105,6 +4194,7 @@ gencgc_pickup_dynamic(void)
         page_table[page].write_protected = 0;
         page_table[page].write_protected_cleared = 0;
         page_table[page].dont_move = 0;
+        page_table[page].need_to_zero = 1;
 
         if (!gencgc_partial_pickup) {
             first=gc_search_space(prev,(ptr+2)-prev,ptr);
@@ -4294,6 +4384,23 @@ gc_set_region_empty(struct alloc_region *region)
     region->end_addr = page_address(0);
 }
 
+static void
+zero_all_free_pages()
+{
+    page_index_t i;
+
+    for (i = 0; i < last_free_page; i++) {
+        if (page_table[i].allocated == FREE_PAGE_FLAG) {
+#ifdef READ_PROTECT_FREE_PAGES
+            os_protect(page_address(i),
+                       PAGE_BYTES,
+                       OS_VM_PROT_ALL);
+#endif
+            zero_pages(i, i);
+        }
+    }
+}
+
 /* Things to do before doing a final GC before saving a core (without
  * purify).
  *
@@ -4348,6 +4455,8 @@ gc_and_save(char *filename)
 
     gencgc_alloc_start_page = -1;
     collect_garbage(HIGHEST_NORMAL_GENERATION+1);
+    /* The dumper doesn't know that pages need to be zeroed before use. */
+    zero_all_free_pages();
     save_to_filehandle(file, filename, SymbolValue(RESTART_LISP_FUNCTION,0));
     /* Oops. Save still managed to fail. Since we've mangled the stack
      * beyond hope, there's not much we can do.
diff --git a/version.lisp-expr b/version.lisp-expr
index 80c7db2..39130ea 100644
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.9.8.18"
+"0.9.8.19"
-- 
1.7.10.4
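
For readers who want the shape of the new scheme without tracing through the whole diff, the stand-alone sketch below illustrates the idea from the commit message under some assumptions; it is not SBCL's runtime code. The toy page table, NPAGES, PAGE_BYTES and the function release_free_pages are invented for illustration (the corresponding functions in the patch are zero_dirty_pages, zero_pages_with_mmap and remap_free_pages), and a plain anonymous mmap with MAP_FIXED stands in for os_invalidate/os_validate. Free pages start out known-zero; handing them to an allocation region costs at most one memset; after a major GC, dirty free pages are remapped so the memory goes back to the OS and the pages become known-zero again.

#include <assert.h>
#include <string.h>
#include <sys/mman.h>

#define NPAGES     256
#define PAGE_BYTES 4096          /* assumed page size, for illustration only */

static char *heap;               /* base of the toy heap */
static struct { int free_p, need_to_zero; } pages[NPAGES];

static void *page_address(int i) { return heap + (size_t)i * PAGE_BYTES; }

/* Called when pages [start,end] are handed to a new allocation region:
 * memset only if some page in the range might be dirty, then mark the
 * whole range as needing zeroing again once it has been used. */
static void zero_dirty_pages(int start, int end)
{
    int i;
    for (i = start; i <= end; i++)
        if (pages[i].need_to_zero) {
            memset(page_address(start), 0,
                   (size_t)(end - start + 1) * PAGE_BYTES);
            break;
        }
    for (i = start; i <= end; i++)
        pages[i].need_to_zero = 1;
}

/* Called after a major GC: hand dirty free pages back to the OS by
 * remapping them, so they stop counting against the resident set and
 * come back zero-filled on their next use. */
static void release_free_pages(int start, int end)
{
    int i;
    for (i = start; i <= end; i++) {
        if (!pages[i].free_p || !pages[i].need_to_zero)
            continue;
        void *addr = mmap(page_address(i), PAGE_BYTES,
                          PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        assert(addr == page_address(i));
        pages[i].need_to_zero = 0;      /* a fresh mapping is known-zero */
    }
}

int main(void)
{
    int i;

    heap = mmap(NULL, (size_t)NPAGES * PAGE_BYTES, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    assert(heap != MAP_FAILED);
    for (i = 0; i < NPAGES; i++) {
        pages[i].free_p = 1;
        pages[i].need_to_zero = 0;      /* fresh mappings start out zeroed */
    }

    /* Reusing pages 0..3 for a new region: no syscall, and here not even
     * a memset, because the pages are still known-zero. */
    zero_dirty_pages(0, 3);
    for (i = 0; i < 4; i++)
        pages[i].free_p = 0;                        /* now in use */
    memset(page_address(0), 0xaa, 4 * PAGE_BYTES);  /* "allocate" into them */

    /* A major GC frees them again and returns the memory to the OS. */
    for (i = 0; i < 4; i++)
        pages[i].free_p = 1;
    release_free_pages(0, NPAGES - 1);

    /* The next reuse finds need_to_zero == 0, so no memset is needed. */
    zero_dirty_pages(0, 3);
    return 0;
}

The point of keeping both paths is the trade-off the commit message describes: the memset path avoids system calls on every minor GC, while the occasional remap after a major GC keeps the resident set from growing without bound.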
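
The READ_PROTECT_FREE_PAGES debugging mode can be pictured in the same way. The sketch below is again only an illustration under assumptions, not the patch's code: mprotect stands in for os_protect, a fixed 4096-byte page size is assumed, and the function names are invented. Freed pages are made inaccessible so that a stale pointer into them faults immediately instead of silently reading garbage, and the protection is lifted again when a page is handed to a new allocation region (the patch uses OS_VM_PROT_NONE and OS_VM_PROT_ALL for the two states).

#include <assert.h>
#include <sys/mman.h>

#define PAGE_BYTES 4096   /* assumed page size, for illustration only */

/* When the GC frees a page, make it inaccessible so that any stale
 * pointer into it faults immediately instead of reading garbage. */
static void protect_free_page(void *page)
{
    int ret = mprotect(page, PAGE_BYTES, PROT_NONE);
    assert(ret == 0);
}

/* Before the page is handed to a new allocation region, make it
 * accessible again. */
static void unprotect_page(void *page)
{
    int ret = mprotect(page, PAGE_BYTES, PROT_READ | PROT_WRITE);
    assert(ret == 0);
}

int main(void)
{
    char *page = mmap(NULL, PAGE_BYTES, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    assert(page != MAP_FAILED);

    page[0] = 1;                /* fine: the page is live */
    protect_free_page(page);    /* the page is freed by the GC */
    /* Reading or writing page[0] here would fault with SIGSEGV,
     * which is exactly the point of the debugging mode. */
    unprotect_page(page);       /* the page is reallocated */
    page[0] = 2;                /* fine again */
    return 0;
}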