serenity/Userland/Libraries/LibC/arch/x86_64/memset.S
Daniel Bertalan bcf124c07d LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple
and elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, which makes it
  impractical to use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP
  MOVSB/STOSB", so REP STOSB performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend. The implementation is based on
the following article from Microsoft:

https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS
extension mentioned above, and one that performs plain SSE stores. The
version appropriate for the CPU is selected at load time using an IFUNC.
2022-05-01 12:42:01 +02:00
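
For illustration, load-time dispatch via an IFUNC can be wired up roughly as
follows. This is a minimal sketch assuming a GCC/Clang toolchain on an ELF
target; the resolver body and symbol wiring are illustrative rather than copied
from the SerenityOS sources, but ERMS really is reported in CPUID leaf 7,
sub-leaf 0, EBX bit 9.

    #include <cpuid.h>
    #include <stddef.h>

    /* The two routines defined in the assembly file below. */
    void* memset_sse2(void*, int, size_t);
    void* memset_sse2_erms(void*, int, size_t);

    /* Resolver: runs once at load time and returns the routine to bind. */
    static void* (*resolve_memset(void))(void*, int, size_t)
    {
        unsigned int eax, ebx, ecx, edx;
        /* ERMS ("Enhanced REP MOVSB/STOSB") is CPUID leaf 7, sub-leaf 0, EBX bit 9. */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 9)))
            return memset_sse2_erms;
        return memset_sse2;
    }

    /* The exported memset symbol is bound to whichever routine the resolver picks. */
    void* memset(void*, int, size_t) __attribute__((ifunc("resolve_memset")));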

/*
 * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
//   qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB
//   would have a large overhead
.intel_syntax noprefix
.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4
memset_sse2_erms:
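// System V AMD64 calling convention: rdi = destination, esi = fill character
// (only the low byte, sil, is significant), rdx = byte count.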
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul esi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
// Store the original address for the return value.
mov rax, rdi
cmp rdx, 64
jb .Lunder_64
// Limit taken from the article. Could be lower (256 or 512) if we want to
// tune it for the latest CPUs.
cmp rdx, 800
jb .Lbig
.Lerms:
// We're going to align the pointer to 64 bytes, and then use REP STOSB.
// Fill the first 64 bytes of the memory using SSE stores.
movups [rdi], xmm0
movups [rdi + 16], xmm0
movups [rdi + 32], xmm0
movups [rdi + 48], xmm0
// Store the one-past-the-end address in r8.
lea r8, [rdi + rdx]
// Round the start pointer up to the next 64-byte boundary. The bytes skipped
// by this rounding were already filled by the four stores above.
add rdi, 63
and rdi, ~63
// Calculate the number of remaining bytes to store.
mov rcx, r8
sub rcx, rdi
// Use REP STOSB to fill the rest. This is implemented in microcode on
// recent Intel and AMD CPUs, and can automatically use the widest stores
// available in the CPU, so it's strictly faster than SSE for sizes of more
// than a couple hundred bytes.
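// REP STOSB fills with the byte in AL, but rax currently holds the return
// value and the replicated fill byte sits in the low byte of rsi. Swap the
// two registers around the fill, then restore rax from rsi afterwards.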
xchg rax, rsi
rep stosb
mov rax, rsi
ret
.global memset_sse2
.type memset_sse2, @function
.p2align 4
memset_sse2:
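// Same calling convention as above: rdi = destination, sil = fill byte, rdx = byte count.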
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul esi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
// Store the original address for the return value.
mov rax, rdi
cmp rdx, 64
jb .Lunder_64
.Lbig:
// We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
// loop, and then fill the last 48-64 bytes separately to take care of any
// trailing bytes.
// Fill the first 16 bytes, which might be unaligned.
movups [rdi], xmm0
// Calculate the first 16 byte aligned address for the SSE stores.
lea rsi, [rdi + 16]
and rsi, ~15
// Calculate the number of remaining bytes.
sub rdi, rsi
add rdx, rdi
// Calculate the aligned address at which the hot loop stops, so that
// 48-64 bytes are left for the trailing stores.
lea rcx, [rsi + rdx - 48]
and rcx, ~15
// Calculate the address 16 bytes from the end.
lea r8, [rsi + rdx - 16]
cmp rdx, 64
jb .Ltrailing
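// Whenever rsi < rcx, at least 64 bytes remain until the end of the buffer,
// so each 64-byte iteration below stays in bounds even if it overshoots rcx.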
.Lbig_loop:
// Fill 4*16 bytes in a loop.
movaps [rsi], xmm0
movaps [rsi + 16], xmm0
movaps [rsi + 32], xmm0
movaps [rsi + 48], xmm0
add rsi, 64
cmp rsi, rcx
jb .Lbig_loop
.Ltrailing:
// We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
movaps [rcx], xmm0
movaps [rcx + 16], xmm0
movaps [rcx + 32], xmm0
movups [r8], xmm0
ret
.Lunder_64:
cmp rdx, 16
jb .Lunder_16
// We're going to fill 16-63 bytes using variable-sized branchless stores.
// Although this means that we might set the same byte up to 4 times, we
// avoid branches, which are expensive compared to straight-line code.
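// For example, with n = 40 the four stores below cover [0, 16), [24, 40),
// [16, 32) and [8, 24), which together cover all 40 bytes.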
// Calculate the address of the last SSE store.
lea r8, [rdi + rdx - 16]
// Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
and rdx, 32
// Fill the first 16 bytes.
movups [rdi], xmm0
// Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
shr rdx, 1
// Fill the last 16 bytes.
movups [r8], xmm0
// Fill bytes 16 - 32 if there are at least 32 bytes, otherwise fill the first 16 again.
movups [rdi + rdx], xmm0
// Fill bytes (n-32) - (n-16) if the size n is at least 32 bytes, otherwise fill the last 16 again.
neg rdx
movups [r8 + rdx], xmm0
ret
.Lunder_16:
cmp rdx, 4
jb .Lunder_4
// We're going to fill 4-15 bytes using variable-sized branchless stores, just like above.
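// For example, with n = 10 the four stores below cover [0, 4), [6, 10),
// [4, 8) and [2, 6), which together cover all 10 bytes.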
lea r8, [rdi + rdx - 4]
and rdx, 8
mov [rdi], esi
shr rdx, 1
mov [r8], esi
mov [rdi + rdx], esi
neg rdx
mov [r8 + rdx], esi
ret
.Lunder_4:
cmp rdx, 1
jb .Lend
// Fill the first byte.
mov [rdi], sil
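// The flags from the cmp above are still intact (mov does not modify them),
// so this branch is taken exactly when the size is 1.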
jbe .Lend
// The size is 2 or 3 bytes. Fill the second and the last one.
mov [rdi + 1], sil
mov [rdi + rdx - 1], sil
.Lend:
ret