/*
 * Copyright (c) 2022, Daniel Bertalan
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
//   qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
//   a large overhead

.intel_syntax noprefix

.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4
memset_sse2_erms:
    // Fill all bytes of esi and xmm0 with the given character.
    movzx esi, sil
    imul esi, 0x01010101
    movd xmm0, esi
    pshufd xmm0, xmm0, 0

    // Store the original address for the return value.
    mov rax, rdi

    cmp rdx, 64
    jb .Lunder_64

    // Limit taken from the article. Could be lower (256 or 512) if we want to
    // tune it for the latest CPUs.
    cmp rdx, 800
    jb .Lbig

.Lerms:
    // We're going to align the pointer to 64 bytes, and then use REP STOSB.

    // Fill the first 64 bytes of the memory using SSE stores.
    movups [rdi], xmm0
    movups [rdi + 16], xmm0
    movups [rdi + 32], xmm0
    movups [rdi + 48], xmm0

    // Store the address of the last byte in r8.
    lea r8, [rdi + rdx]

    // Align the start pointer to 64 bytes.
    add rdi, 63
    and rdi, ~63

    // Calculate the number of remaining bytes to store.
    mov rcx, r8
    sub rcx, rdi

    // Use REP STOSB to fill the rest. This is implemented in microcode on
    // recent Intel and AMD CPUs, and can automatically use the widest stores
    // available in the CPU, so it's strictly faster than SSE for sizes of more
    // than a couple hundred bytes.
    xchg rax, rsi
    rep stosb
    mov rax, rsi

    ret

.global memset_sse2
.type memset_sse2, @function
.p2align 4
memset_sse2:
    // Fill all bytes of esi and xmm0 with the given character.
    movzx esi, sil
    imul esi, 0x01010101
    movd xmm0, esi
    pshufd xmm0, xmm0, 0

    // Store the original address for the return value.
    mov rax, rdi

    cmp rdx, 64
    jb .Lunder_64

.Lbig:
    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
    // loop, and then fill the last 48-64 bytes separately to take care of any
    // trailing bytes.

    // Fill the first 16 bytes, which might be unaligned.
    movups [rdi], xmm0

    // Calculate the first 16 byte aligned address for the SSE stores.
    lea rsi, [rdi + 16]
    and rsi, ~15

    // Calculate the number of remaining bytes.
    sub rdi, rsi
    add rdx, rdi

    // Calculate the last aligned address for trailing stores such that
    // 48-64 bytes are left.
    lea rcx, [rsi + rdx - 48]
    and rcx, ~15

    // Calculate the address 16 bytes from the end.
    lea r8, [rsi + rdx - 16]

    cmp rdx, 64
    jb .Ltrailing

.Lbig_loop:
    // Fill 4*16 bytes in a loop.
    movaps [rsi], xmm0
    movaps [rsi + 16], xmm0
    movaps [rsi + 32], xmm0
    movaps [rsi + 48], xmm0
    add rsi, 64
    cmp rsi, rcx
    jb .Lbig_loop

.Ltrailing:
    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
    movaps [rcx], xmm0
    movaps [rcx + 16], xmm0
    movaps [rcx + 32], xmm0
    movups [r8], xmm0

    ret

.Lunder_64:
    cmp rdx, 16
    jb .Lunder_16

    // We're going to fill 16-63 bytes using variable sized branchless stores.
    // Although this means that we might set the same byte up to 4 times, we
    // can avoid branching, which is expensive compared to straight-line code.

    // Calculate the address of the last SSE store.
    lea r8, [rdi + rdx - 16]

    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
    and rdx, 32

    // Fill the first 16 bytes.
    movups [rdi], xmm0

    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
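    // For example, for a 40-byte fill r8 = rdi + 24 and the AND above left
    // rdx = 32; after the shift, the four stores cover bytes 0-15, 24-39,
    // 16-31 and 8-23, so every byte is written at least once without a
    // single branch.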
    shr rdx, 1

    // Fill the last 16 bytes.
    movups [r8], xmm0

    // Fill bytes 16-31 if there are at least 32 bytes, otherwise fill the first 16 again.
    movups [rdi + rdx], xmm0

    // Fill bytes (n-32)-(n-17) if the size n is at least 32, otherwise fill the last 16 again.
    neg rdx
    movups [r8 + rdx], xmm0

    ret

.Lunder_16:
    cmp rdx, 4
    jb .Lunder_4

    // We're going to fill 4-15 bytes using variable sized branchless stores
    // like above, just with 4-byte stores from esi instead of SSE stores.

    // Calculate the address of the last 4-byte store.
    lea r8, [rdi + rdx - 4]

    // Set rdx to 8 if there are >= 8 bytes, otherwise let its value be 0.
    and rdx, 8

    // Fill the first 4 bytes.
    mov [rdi], esi

    // Set rdx to 4 if there are >= 8 bytes, otherwise let its value be 0.
    shr rdx, 1

    // Fill the last 4 bytes.
    mov [r8], esi

    // Fill bytes 4-7 if there are >= 8 bytes, otherwise fill the first 4 again.
    mov [rdi + rdx], esi

    // Fill bytes (n-8)-(n-5) if the size n is at least 8, otherwise fill the last 4 again.
    neg rdx
    mov [r8 + rdx], esi

    ret

.Lunder_4:
    cmp rdx, 1
    jb .Lend

    // Fill the first byte.
    mov [rdi], sil

    // MOV does not modify flags, so this still tests the `cmp rdx, 1` above:
    // we're done if the size is exactly 1.
    jbe .Lend

    // The size is 2 or 3 bytes. Fill the second and the last one.
    mov [rdi + 1], sil
    mov [rdi + rdx - 1], sil

.Lend:
    ret
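
// This file deliberately exposes two entry points: memset_sse2_erms for CPUs
// where REP STOSB is fast, and memset_sse2 for the rest (e.g. qemu's TCG
// backend mentioned above). A minimal runtime dispatch sketch in C, assuming
// a hypothetical cpu_supports_erms() helper that checks the CPUID ERMS bit:
//
//     void* (*memset_impl)(void*, int, size_t) =
//         cpu_supports_erms() ? memset_sse2_erms : memset_sse2;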