src/runtime/linux-os.c

   1 /*
   2  * the Linux incarnation of OS-dependent routines.  See also
   3  * $(sbcl_arch)-linux-os.c
   4  *
   5  * This file (along with os.h) exports an OS-independent interface to
   6  * the operating system VM facilities. Surprise surprise, this
   7  * interface looks a lot like the Mach interface (but simpler in some
   8  * places). For some operating systems, a subset of these functions
   9  * will have to be emulated.
  10  */
  11
  12 /*
  13  * This software is part of the SBCL system. See the README file for
  14  * more information.
  15  *
  16  * This software is derived from the CMU CL system, which was
  17  * written at Carnegie Mellon University and released into the
  18  * public domain. The software is in the public domain and is
  19  * provided with absolutely no warranty. See the COPYING and CREDITS
  20  * files for more information.
  21  */
  22
  23 #include <stdio.h>
  24 #include <sys/param.h>
  25 #include <sys/file.h>
  26 #include "./signal.h"
  27 #include "os.h"
  28 #include "arch.h"
  29 #include "globals.h"
  30 #include "interrupt.h"
  31 #include "interr.h"
  32 #include "lispregs.h"
  33 #include "sbcl.h"
  34 #include <sys/socket.h>
  35 #include <sys/utsname.h>
  36
  37 #include <sys/types.h>
  38 #include <signal.h>
  39 /* #include <sys/sysinfo.h> */
  40 #include <sys/time.h>
  41 #include <sys/stat.h>
  42 #include <unistd.h>
  43
  44 #include "validate.h"
  45 #include "thread.h"
  46 size_t os_vm_page_size;
  47
  48 #include "gc.h"
  49 \f
  50
  51 #ifdef sparc
  52 int early_kernel = 0;
  53 #endif
  54 void os_init(void)
  55 {
  56     /* Early versions of Linux don't support the mmap(..) functionality
  57      * that we need. */
  58     {
  59         struct utsname name;
  60         int major_version;
  61 #ifdef sparc
  62         int minor_version;
  63 #endif
  64         uname(&name);
  65         major_version = atoi(name.release);
  66         if (major_version < 2) {
  67             lose("linux major version=%d (can't run in version < 2.0.0)",
  68                  major_version);
  69         }
  70 #ifdef sparc
  71         /* KLUDGE: This will break if Linux moves to a uname() version number
  72          * that has more than one digit initially -- CSR, 2002-02-12 */
  73         minor_version = atoi(name.release+2);
  74         if (minor_version < 4) {
  75             FSHOW((stderr,"linux minor version=%d;\n enabling workarounds for SPARC kernel bugs in signal handling.\n", minor_version));
  76             early_kernel = 1;
  77         }
  78 #endif
  79     }
  80
  81     os_vm_page_size = getpagesize();
  82     /* This could just as well be in arch_init(), but it's not. */
  83 #ifdef __i386__
  84     /* FIXME: This used to be here.  However, I have just removed it
  85        with no apparent ill effects (it may be that earlier kernels
  86        started up a process with a different set of traps, or
  87        something?) Find out what this was meant to do, and reenable it
  88        or delete it if possible. -- CSR, 2002-07-15 */
  89     /* SET_FPU_CONTROL_WORD(0x1372|4|8|16|32);  no interrupts */
  90 #endif
  91 }
  92
  93 /* In Debian CMU CL ca. 2.4.9, it was possible to get an infinite
  94  * cascade of errors from do_mmap(..). This variable is a counter to
  95  * prevent that; when it counts down to zero, an error in do_mmap
  96  * causes the low-level monitor to be called. */
  97 int n_do_mmap_ignorable_errors = 3;
  98
  99 /* Return 0 for success. */
 100 static int
 101 do_mmap(os_vm_address_t *addr, os_vm_size_t len, int flags)
 102 {
 103     /* We *must* have the memory where we expect it. */
 104     os_vm_address_t old_addr = *addr;
 105
 106     *addr = mmap(*addr, len, OS_VM_PROT_ALL, flags, -1, 0);
 107     if (*addr == MAP_FAILED ||
 108         ((old_addr != NULL) && (*addr != old_addr))) {
 109         FSHOW((stderr,
 110                "/retryable error in allocating memory from the OS\n"
 111                "(addr=0x%lx, len=0x%lx, flags=0x%lx)\n",
 112                (long) addr,
 113                (long) len,
 114                (long) flags));
 115         if (n_do_mmap_ignorable_errors > 0) {
 116             --n_do_mmap_ignorable_errors;
 117         } else {
 118             lose("too many errors in allocating memory from the OS");
 119         }
 120         perror("mmap");
 121         return 1;
 122     }
 123     return 0;
 124 }
 125
 126 os_vm_address_t
 127 os_validate(os_vm_address_t addr, os_vm_size_t len)
 128 {
 129     if (addr) {
 130         int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
 131         os_vm_address_t base_addr = addr;
 132         do {
 133             /* KLUDGE: It looks as though this code allocates memory
 134              * in chunks of size no larger than 'magic', but why? What
 135              * is the significance of 0x1000000 here? Also, can it be
 136              * right that if the first few 'do_mmap' calls succeed,
 137              * then one fails, we leave the memory allocated by the
 138              * first few in place even while we return a code for
 139              * complete failure? -- WHN 19991020
 140              *
 141              * Peter Van Eynde writes (20000211)
 142              *     This was done because the kernel would only check for
 143              *   overcommit for every allocation seperately. So if you
 144              *   had 16MB of free mem+swap you could allocate 16M. And
 145              *   again, and again, etc.
 146              *     This in [Linux] 2.X could be bad as they changed the memory
 147              *   system. A side effect was/is (I don't really know) that
 148              *   programs with a lot of memory mappings run slower. But
 149              *   of course for 2.2.2X we now have the NO_RESERVE flag that
 150              *   helps...
 151              *
 152              * FIXME: The logic is also flaky w.r.t. failed
 153              * allocations. If we make one or more successful calls to
 154              * do_mmap(..) before one fails, then we've allocated
 155              * memory, and we should ensure that it gets deallocated
 156              * sometime somehow. If this function's response to any
 157              * failed do_mmap(..) is to give up and return NULL (as in
 158              * sbcl-0.6.7), then any failed do_mmap(..) after any
 159              * successful do_mmap(..) causes a memory leak. */
 160             int magic = 0x1000000;
 161             if (len <= magic) {
 162                 if (do_mmap(&addr, len, flags)) {
 163                     return NULL;
 164                 }
 165                 len = 0;
 166             } else {
 167                 if (do_mmap(&addr, magic, flags)) {
 168                     return NULL;
 169                 }
 170                 addr += magic;
 171                 len = len - magic;
 172             }
 173         } while (len > 0);
 174         return base_addr;
 175     } else {
 176         int flags = MAP_PRIVATE | MAP_ANONYMOUS;
 177         if (do_mmap(&addr, len, flags)) {
 178             return NULL;
 179         } else {
 180             return addr;
 181         }
 182     }
 183 }
 184
 185 void
 186 os_invalidate(os_vm_address_t addr, os_vm_size_t len)
 187 {
 188     if (munmap(addr,len) == -1) {
 189         perror("munmap");
 190     }
 191 }
 192
 193 os_vm_address_t
 194 os_map(int fd, int offset, os_vm_address_t addr, os_vm_size_t len)
 195 {
 196     addr = mmap(addr, len,
 197                 OS_VM_PROT_ALL,
 198                 MAP_PRIVATE | MAP_FILE | MAP_FIXED,
 199                 fd, (off_t) offset);
 200
 201     if (addr == MAP_FAILED) {
 202         perror("mmap");
 203         lose("unexpected mmap(..) failure");
 204     }
 205
 206     return addr;
 207 }
 208
 209 void
 210 os_protect(os_vm_address_t address, os_vm_size_t length, os_vm_prot_t prot)
 211 {
 212     if (mprotect(address, length, prot) == -1) {
 213         perror("mprotect");
 214     }
 215 }
 216 \f
 217 /* FIXME: Now that FOO_END, rather than FOO_SIZE, is the fundamental
 218  * description of a space, we could probably punt this and just do
 219  * (FOO_START <= x && x < FOO_END) everywhere it's called. */
 220 static boolean
 221 in_range_p(os_vm_address_t a, lispobj sbeg, size_t slen)
 222 {
 223     char* beg = (char*)((long)sbeg);
 224     char* end = (char*)((long)sbeg) + slen;
 225     char* adr = (char*)a;
 226     return (adr >= beg && adr < end);
 227 }
 228
 229 boolean
 230 is_valid_lisp_addr(os_vm_address_t addr)
 231 {
 232     struct thread *th;
 233     if(in_range_p(addr, READ_ONLY_SPACE_START, READ_ONLY_SPACE_SIZE) ||
 234        in_range_p(addr, STATIC_SPACE_START   , STATIC_SPACE_SIZE) ||
 235        in_range_p(addr, DYNAMIC_SPACE_START  , DYNAMIC_SPACE_SIZE))
 236         return 1;
 237     for_each_thread(th) {
 238         if((th->control_stack_start <= addr) && (addr < th->control_stack_end))
 239             return 1;
 240         if(in_range_p(addr, th->binding_stack_start, BINDING_STACK_SIZE))
 241             return 1;
 242     }
 243     return 0;
 244 }
 245 \f
 246 /*
 247  * any OS-dependent special low-level handling for signals
 248  */
 249
 250
 251 #if defined LISP_FEATURE_GENCGC
 252
 253 /*
 254  * The GENCGC needs to be hooked into whatever signal is raised for
 255  * page fault on this OS.
 256  */
 257 void
 258 sigsegv_handler(int signal, siginfo_t *info, void* void_context)
 259 {
 260     os_context_t *context = arch_os_get_context(&void_context);
 261     void* fault_addr = (void*)context->uc_mcontext.cr2;
 262     if (!gencgc_handle_wp_violation(fault_addr))
 263         if(!handle_control_stack_guard_triggered(context,fault_addr))
 264             interrupt_handle_now(signal, info, void_context);
 265 }
 266
 267 #else
 268
 269 static void
 270 sigsegv_handler(int signal, siginfo_t *info, void* void_context)
 271 {
 272     os_context_t *context = arch_os_get_context(&void_context);
 273     os_vm_address_t addr;
 274
 275     addr = arch_get_bad_addr(signal,info,context);
 276     if (addr != NULL &&
 277         *os_context_register_addr(context,reg_ALLOC) & (1L<<63)){
 278
 279         /* Alpha stuff: This is the end of a pseudo-atomic section
 280          * during which a signal was received.  We must deal with the
 281          * pending interrupt (see also interrupt.c,
 282          * ../code/interrupt.lisp)
 283          */
 284         /* (how we got here: when interrupting, we set bit 63 in
 285          * reg_Alloc.  At the end of the atomic section we tried to
 286          * write to reg_ALLOC, got a SIGSEGV (there's nothing mapped
 287          * there) so ended up here
 288          */
 289         *os_context_register_addr(context,reg_ALLOC) -= (1L<<63);
 290         interrupt_handle_pending(context);
 291     } else {
 292         if(!interrupt_maybe_gc(signal, info, context))
 293             if(!handle_control_stack_guard_triggered(context,addr))
 294                 interrupt_handle_now(signal, info, context);
 295     }
 296 }
 297 #endif
 298
 299 void sigcont_handler(int signal, siginfo_t *info, void *void_context)
 300 {
 301     /* we need to have a handler installed for this signal so that
 302      * sigwaitinfo() for it actually returns at the appropriate time
 303      */
 304 }
 305
 306 void
 307 os_install_interrupt_handlers(void)
 308 {
 309     undoably_install_low_level_interrupt_handler(SIG_MEMORY_FAULT,
 310                                                  sigsegv_handler);
 311     undoably_install_low_level_interrupt_handler(SIGCONT,
 312                                                  sigcont_handler);
 313 }
 314