+ SIZE(GNAME(post_signal_tramp))
+\f
+ .text
+ .align align_16byte,0x90
+ .globl GNAME(fast_bzero)
+ TYPE(GNAME(fast_bzero))
+
+ #ifdef LISP_FEATURE_WIN32
+ #define xmmreg xmm7
+ #define redsave(reg,off) movups reg,-off(%rsp)
+ #define redrestore(reg,off) movups -off(%rsp),reg
+ #else
+ #define xmmreg xmm0
+ #define redsave(reg,off)
+ #define redrestore(reg,off)
+ #endif
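+ /* Under the Microsoft x64 calling convention xmm6-xmm15 are callee-saved,
+ * so the Windows build stashes xmm7 just below the stack pointer around
+ * its use; elsewhere xmm0 is caller-saved and the save/restore macros
+ * expand to nothing. */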
+
+GNAME(fast_bzero):
+ /* A fast routine for zero-filling blocks of memory that are
+ * guaranteed to start and end at a 4096-byte aligned address.
+ */
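+ /* On entry RDI holds the start address and RSI the length in bytes. */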
+ shr $6, %rsi /* Number of 64-byte blocks to zero */
+ jz Lend /* If none, stop */
+ mov %rdi, %rcx /* Save start address */
+ redsave(%xmmreg,16)
+ xorps %xmmreg, %xmmreg /* Zero the XMM register */
+ jmp Lloop
+ .align align_16byte
+Lloop:
+
+ /* Copy the 16 zeroes from the XMM register to memory, 4 times. MOVNTDQ is the
+ * non-caching double-quadword moving variant, i.e. the memory areas
+ * we're touching are not fetched into the L1 cache, since we're just
+ * going to overwrite the memory soon anyway.
+ */
+ movntdq %xmmreg, 0(%rdi)
+ movntdq %xmmreg, 16(%rdi)
+ movntdq %xmmreg, 32(%rdi)
+ movntdq %xmmreg, 48(%rdi)
+
+ add $64, %rdi /* Advance pointer */
+ dec %rsi /* Decrement 64-byte block count */
+ jnz Lloop
+ mfence /* Ensure that the writes are globally visible, since
+ * MOVNTDQ is weakly ordered */
+ redrestore(%xmmreg,16)
+ prefetcht0 0(%rcx) /* Prefetch the start of the block into cache,
+ * since it's likely to be used immediately. */
+Lend:
+ ret
+ SIZE(GNAME(fast_bzero))
+
+\f
+/* When LISP_FEATURE_C_STACK_IS_CONTROL_STACK, we cannot safely scrub
+ * the control stack from C, largely due to not knowing where the
+ * active stack frame ends. On such platforms, we reimplement the
+ * core scrubbing logic in assembly, in this case here:
+ */
+ .text
+ .align align_16byte,0x90
+ .globl GNAME(arch_scrub_control_stack)
+ TYPE(GNAME(arch_scrub_control_stack))
+GNAME(arch_scrub_control_stack):
+ /* We are passed three parameters:
+ * A (struct thread *) in RDI,
+ * the address of the guard page in RSI, and
+ * the address of the hard guard page in RDX.
+ * We may trash RAX, RCX, and R8-R11 with impunity.
+ * [RSP] is our return address, [RSP-8] is the first
+ * stack slot to scrub. */
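+ /* The control stack grows downward, so we scrub from [RSP-8] toward
+ * lower addresses, one 4KiB page at a time, stopping when we reach a
+ * guard page that must not be touched or a page that is already
+ * entirely zero. */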
+
+ /* We start by setting up our scrub pointer in RAX, our
+ * guard page upper bound in R8, and our hard guard
+ * page upper bound in R9. */
+ lea -8(%rsp), %rax
+#ifdef LISP_FEATURE_DARWIN
+ mov GSYM(GNAME(os_vm_page_size)),%r9
+#else
+ mov os_vm_page_size,%r9
+#endif
+ lea (%rsi,%r9), %r8
+ lea (%rdx,%r9), %r9
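+ /* R8 and R9 now point one OS page past the guard page and the hard
+ * guard page, respectively (exclusive upper bounds). */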
+
+ /* Now we begin our main scrub loop. */
+ascs_outer_loop:
+
+ /* If we're about to scrub the hard guard page, exit. */
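+ /* That is, exit if RDX <= RAX < R9, i.e. the scrub pointer lies
+ * within the hard guard page. */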
+ cmp %r9, %rax
+ jae ascs_check_guard_page
+ cmp %rax, %rdx
+ jbe ascs_finished
+
+ascs_check_guard_page:
+ /* If we're about to scrub the guard page, and the guard
+ * page is protected, exit. */
+ cmp %r8, %rax
+ jae ascs_clear_loop
+ cmp %rax, %rsi
+ ja ascs_clear_loop
+ cmpq $(NIL), THREAD_CONTROL_STACK_GUARD_PAGE_PROTECTED_OFFSET(%rdi)
+ jne ascs_finished
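+ /* Falling through to the clear loop here means RAX lies within the
+ * guard page but the page is not currently protected, so scrubbing
+ * it is safe. */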
+
+ /* Clear memory backwards to the start of the (4KiB) page */
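+ /* LEA does not modify flags, so the JNZ below tests whether RAX was
+ * page-aligned before the 8-byte decrement; the loop exits only after
+ * the word at the start of the page has been cleared. */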
+ascs_clear_loop:
+ movq $0, (%rax)
+ test $0xfff, %rax
+ lea -8(%rax), %rax
+ jnz ascs_clear_loop
+
+ /* If we're about to hit the hard guard page, exit. */
+ cmp %r9, %rax
+ jae ascs_finished
+
+ /* If the next (lower) 4KiB page contains a non-zero
+ * word, continue scrubbing. */
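+ /* TESTQ with an all-ones mask sets ZF exactly when the examined word
+ * is zero; as in the clear loop, LEA preserves the alignment test's
+ * flags for the final JNZ. */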
+ascs_check_loop:
+ testq $-1, (%rax)
+ jnz ascs_outer_loop
+ test $0xfff, %rax
+ lea -8(%rax), %rax
+ jnz ascs_check_loop
+
+ascs_finished:
+ ret
+ SIZE(GNAME(arch_scrub_control_stack))