
LibC: Implement a faster memset routine for x86-64 in assembly

This commit addresses the following shortcomings of our current, simple
and elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, which makes it
  impractical to use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP
  MOVSB/STOSB", so REP STOSB performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend. The implementation is based on
the following article from Microsoft:

https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS
extension mentioned above, and one that performs plain SSE stores. The
version appropriate for the CPU is selected at load time using an IFUNC.
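
As a rough illustration (not part of this commit), a micro-benchmark along
these lines could compare the two routines directly across sizes. The harness
below is a hypothetical sketch: it assumes the memset_sse2 and
memset_sse2_erms symbols declared in arch/x86_64/memset.cpp are linked in, and
the helper name nanoseconds_per_call is made up for the example.

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

extern "C" void* memset_sse2(void*, int, size_t);
extern "C" void* memset_sse2_erms(void*, int, size_t);

// Measure the average time of one call to the given fill routine.
static double nanoseconds_per_call(void* (*fill)(void*, int, size_t), unsigned char* buffer, size_t size, size_t iterations)
{
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iterations; ++i)
        fill(buffer, 0xab, size);
    auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration<double, std::nano>(elapsed).count() / iterations;
}

int main()
{
    std::vector<unsigned char> buffer(1 << 20);
    // Sizes chosen to straddle the < 64 byte fast path and the 800-byte ERMS cutoff.
    for (size_t size : { 8, 32, 200, 800, 4096, 1 << 20 }) {
        double sse2 = nanoseconds_per_call(memset_sse2, buffer.data(), size, 100'000);
        double erms = nanoseconds_per_call(memset_sse2_erms, buffer.data(), size, 100'000);
        std::printf("%8zu bytes: sse2 %.1f ns/call, erms %.1f ns/call\n", size, sse2, erms);
    }
}
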
Daniel Bertalan 2022-03-24 16:41:18 +01:00 committed by Andreas Kling
parent 484f70fb43
commit bcf124c07d
5 changed files with 262 additions and 10 deletions

@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
endif()

@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
set(CRTI_SOURCE "arch/i386/crti.S")
set(CRTN_SOURCE "arch/i386/crtn.S")
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
set(ASM_SOURCES "arch/x86_64/setjmp.S")
set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
set(CRTI_SOURCE "arch/x86_64/crti.S")
set(CRTN_SOURCE "arch/x86_64/crtn.S")

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
// qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
// a large overhead
.intel_syntax noprefix
.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4
memset_sse2_erms:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
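// Multiplying by 0x01010101 replicates the low byte into all four bytes of esi.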
imul esi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
// Store the original address for the return value.
mov rax, rdi
cmp rdx, 64
jb .Lunder_64
// Limit taken from the article. Could be lower (256 or 512) if we want to
// tune it for the latest CPUs.
cmp rdx, 800
jb .Lbig
.Lerms:
// We're going to align the pointer to 64 bytes, and then use REP STOSB.
// Fill the first 64 bytes of the memory using SSE stores.
movups [rdi], xmm0
movups [rdi + 16], xmm0
movups [rdi + 32], xmm0
movups [rdi + 48], xmm0
// Store the address of the last byte in r8.
lea r8, [rdi + rdx]
// Align the start pointer to 64 bytes.
add rdi, 63
and rdi, ~63
// Calculate the number of remaining bytes to store.
mov rcx, r8
sub rcx, rdi
// Use REP STOSB to fill the rest. This is implemented in microcode on
// recent Intel and AMD CPUs, and can automatically use the widest stores
// available in the CPU, so it's strictly faster than SSE for sizes of more
// than a couple hundred bytes.
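// REP STOSB takes the fill byte in AL, but RAX currently holds the return value,
// so swap RAX with RSI (which holds the expanded fill pattern) and restore RAX afterwards.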
xchg rax, rsi
rep stosb
mov rax, rsi
ret
.global memset_sse2
.type memset_sse2, @function
.p2align 4
memset_sse2:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul rsi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
// Store the original address for the return value.
mov rax, rdi
cmp rdx, 64
jb .Lunder_64
.Lbig:
// We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
// loop, and then fill the last 48-64 bytes separately to take care of any
// trailing bytes.
// Fill the first 16 bytes, which might be unaligned.
movups [rdi], xmm0
// Calculate the first 16 byte aligned address for the SSE stores.
lea rsi, [rdi + 16]
and rsi, ~15
// Calculate the number of remaining bytes.
sub rdi, rsi
add rdx, rdi
// Calculate the last aligned address for trailing stores such that
// 48-64 bytes are left.
lea rcx, [rsi + rdx - 48]
and rcx, ~15
// Calculate the address 16 bytes from the end.
lea r8, [rsi + rdx - 16]
cmp rdx, 64
jb .Ltrailing
.Lbig_loop:
// Fill 4*16 bytes in a loop.
movaps [rsi], xmm0
movaps [rsi + 16], xmm0
movaps [rsi + 32], xmm0
movaps [rsi + 48], xmm0
add rsi, 64
cmp rsi, rcx
jb .Lbig_loop
.Ltrailing:
// We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
movaps [rcx], xmm0
movaps [rcx + 16], xmm0
movaps [rcx + 32], xmm0
movups [r8], xmm0
ret
.Lunder_64:
cmp rdx, 16
jb .Lunder_16
// We're going to fill 16-63 bytes using variable sized branchless stores.
// Although this means that we might set the same byte up to 4 times, we
// can avoid branching which is expensive compared to straight-line code.
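// For example, with n = 40 the four stores below cover bytes [0, 16), [24, 40),
// [16, 32) and [8, 24), which together span the whole 40-byte range.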
// Calculate the address of the last SSE store.
lea r8, [rdi + rdx - 16]
// Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
and rdx, 32
// Fill the first 16 bytes.
movups [rdi], xmm0
// Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
shr rdx, 1
// Fill the last 16 bytes.
movups [r8], xmm0
// Fill bytes 16-32 if there are at least 32 bytes, otherwise fill the first 16 again.
movups [rdi + rdx], xmm0
// Fill bytes (n-32)-(n-16) if n >= 32, otherwise fill the last 16 again.
neg rdx
movups [r8 + rdx], xmm0
ret
.Lunder_16:
cmp rdx, 4
jb .Lunder_4
// We're going to fill 4-15 bytes using variable sized branchless stores like above.
lea r8, [rdi + rdx - 4]
and rdx, 8
mov [rdi], esi
shr rdx, 1
mov [r8], esi
mov [rdi + rdx], esi
neg rdx
mov [r8 + rdx], esi
ret
.Lunder_4:
cmp rdx, 1
jb .Lend
// Fill the first byte.
mov [rdi], sil
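// If the size is exactly 1, we are done. (The flags set by the CMP above are
// still valid, as MOV does not modify them.)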
jbe .Lend
// The size is 2 or 3 bytes. Fill the second and the last one.
mov [rdi + 1], sil
mov [rdi + rdx - 1], sil
.Lend:
ret

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Types.h>
#include <cpuid.h>
#include <string.h>
extern "C" {
extern void* memset_sse2(void*, int, size_t);
extern void* memset_sse2_erms(void*, int, size_t);
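// CPUID leaf 0x40000000 is the hypervisor vendor leaf; QEMU's TCG backend reports
// the signature "TCGTCGTCGTCG" in ebx, ecx and edx.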
constexpr u32 tcg_signature_ebx = 0x54474354;
constexpr u32 tcg_signature_ecx = 0x43544743;
constexpr u32 tcg_signature_edx = 0x47435447;
// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;
namespace {
[[gnu::used]] decltype(&memset) resolve_memset()
{
u32 eax, ebx, ecx, edx;
__cpuid(0x40000000, eax, ebx, ecx, edx);
bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;
// Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
// SSE copies on all data sizes except <= 4 bytes.
if (is_tcg)
return memset_sse2;
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & cpuid_7_ebx_bit_erms)
return memset_sse2_erms;
return memset_sse2;
}
}
#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
#else
// DynamicLoader can't self-relocate IFUNCs.
// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
void* memset(void* dest_ptr, int c, size_t n)
{
static decltype(&memset) s_impl = nullptr;
if (s_impl == nullptr)
s_impl = resolve_memset();
return s_impl(dest_ptr, c, n);
}
#endif
}

@@ -137,7 +137,10 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
return original_dest;
}
#if ARCH(I386)
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
//
// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
void* memset(void* dest_ptr, int c, size_t n)
{
size_t dest = (size_t)dest_ptr;
@@ -145,19 +148,11 @@ void* memset(void* dest_ptr, int c, size_t n)
if (!(dest & 0x3) && n >= 12) {
size_t size_ts = n / sizeof(size_t);
size_t expanded_c = explode_byte((u8)c);
#if ARCH(I386)
asm volatile(
"rep stosl\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
#else
asm volatile(
"rep stosq\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
#endif
n -= size_ts * sizeof(size_t);
if (n == 0)
return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
: "memory");
return dest_ptr;
}
#endif
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
void* memmove(void* dest, void const* src, size_t n)