From e6df1c998804cca1848c634411b42d277d9ff265 Mon Sep 17 00:00:00 2001 From: Owen Smith Date: Fri, 23 Jul 2021 21:52:25 +0100 Subject: [PATCH] Kernel: Implement and use the syscall/sysret instruction pair on x86_64 --- Kernel/API/Syscall.h | 35 +++++++++++ Kernel/Arch/x86/Processor.h | 20 ++++++ Kernel/Arch/x86/RegisterState.h | 3 +- Kernel/Arch/x86/common/Processor.cpp | 23 +++++++ Kernel/Arch/x86/x86_64/SyscallEntry.cpp | 82 +++++++++++++++++++++++++ Kernel/CMakeLists.txt | 7 +++ 6 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 Kernel/Arch/x86/x86_64/SyscallEntry.cpp diff --git a/Kernel/API/Syscall.h b/Kernel/API/Syscall.h index 6167244b6e..6372efb165 100644 --- a/Kernel/API/Syscall.h +++ b/Kernel/API/Syscall.h @@ -483,10 +483,17 @@ int sync(); inline uintptr_t invoke(Function function) { uintptr_t result; +# if ARCH(I386) asm volatile("int $0x82" : "=a"(result) : "a"(function) : "memory"); +# else + asm volatile("syscall" + : "=a"(result) + : "a"(function) + : "rcx", "r11", "memory"); +# endif return result; } @@ -494,10 +501,17 @@ template inline uintptr_t invoke(Function function, T1 arg1) { uintptr_t result; +# if ARCH(I386) asm volatile("int $0x82" : "=a"(result) : "a"(function), "d"((uintptr_t)arg1) : "memory"); +# else + asm volatile("syscall" + : "=a"(result) + : "a"(function), "d"((uintptr_t)arg1) + : "rcx", "r11", "memory"); +# endif return result; } @@ -505,10 +519,17 @@ template inline uintptr_t invoke(Function function, T1 arg1, T2 arg2) { uintptr_t result; +# if ARCH(I386) asm volatile("int $0x82" : "=a"(result) : "a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2) : "memory"); +# else + asm volatile("syscall" + : "=a"(result) + : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2) + : "rcx", "r11", "memory"); +# endif return result; } @@ -516,10 +537,17 @@ template inline uintptr_t invoke(Function function, T1 arg1, T2 arg2, T3 arg3) { uintptr_t result; +# if ARCH(I386) asm volatile("int $0x82" : "=a"(result) : 
"a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2), "b"((uintptr_t)arg3) : "memory"); +# else + asm volatile("syscall" + : "=a"(result) + : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2), "b"((uintptr_t)arg3) + : "rcx", "r11", "memory"); +# endif return result; } @@ -527,10 +555,17 @@ template inline uintptr_t invoke(Function function, T1 arg1, T2 arg2, T3 arg3, T4 arg4) { uintptr_t result; +# if ARCH(I386) asm volatile("int $0x82" : "=a"(result) : "a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2), "b"((uintptr_t)arg3), "S"((uintptr_t)arg4) : "memory"); +# else + asm volatile("syscall" + : "=a"(result) + : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2), "b"((uintptr_t)arg3), "S"((uintptr_t)arg4) + : "memory"); +# endif return result; } # endif diff --git a/Kernel/Arch/x86/Processor.h b/Kernel/Arch/x86/Processor.h index c2f91e1773..2aab603edb 100644 --- a/Kernel/Arch/x86/Processor.h +++ b/Kernel/Arch/x86/Processor.h @@ -30,6 +30,10 @@ struct ProcessorMessage; struct ProcessorMessageEntry; #if ARCH(X86_64) +# define MSR_EFER 0xc0000080 +# define MSR_STAR 0xc0000081 +# define MSR_LSTAR 0xc0000082 +# define MSR_SFMASK 0xc0000084 # define MSR_FS_BASE 0xc0000100 # define MSR_GS_BASE 0xc0000101 #endif @@ -58,6 +62,11 @@ class Processor { Processor* m_self; +#if ARCH(X86_64) + // Saved user stack for the syscall instruction. 
+ void* m_user_stack; +#endif + DescriptorTablePointer m_gdtr; Descriptor m_gdt[256]; u32 m_gdt_length; @@ -205,6 +214,17 @@ public: static bool is_smp_enabled(); +#if ARCH(X86_64) + static constexpr u64 user_stack_offset() + { + return __builtin_offsetof(Processor, m_user_stack); + } + static constexpr u64 kernel_stack_offset() + { + return __builtin_offsetof(Processor, m_tss) + __builtin_offsetof(TSS, rsp0l); + } +#endif + ALWAYS_INLINE static Processor& current() { return *(Processor*)read_gs_ptr(__builtin_offsetof(Processor, m_self)); diff --git a/Kernel/Arch/x86/RegisterState.h b/Kernel/Arch/x86/RegisterState.h index 28c3452112..4588726143 100644 --- a/Kernel/Arch/x86/RegisterState.h +++ b/Kernel/Arch/x86/RegisterState.h @@ -110,9 +110,10 @@ struct [[gnu::packed]] RegisterState { arg3 = ebx; arg4 = esi; #else + // The syscall instruction clobbers rcx, so we must use a different calling convention to 32-bit. function = rax; arg1 = rdx; - arg2 = rcx; + arg2 = rdi; arg3 = rbx; arg4 = rsi; #endif diff --git a/Kernel/Arch/x86/common/Processor.cpp b/Kernel/Arch/x86/common/Processor.cpp index c6c5b3174c..b8c56fa1dc 100644 --- a/Kernel/Arch/x86/common/Processor.cpp +++ b/Kernel/Arch/x86/common/Processor.cpp @@ -45,6 +45,7 @@ Atomic Processor::s_idle_cpu_mask { 0 }; extern "C" void context_first_init(Thread* from_thread, Thread* to_thread, TrapFrame* trap) __attribute__((used)); extern "C" void enter_thread_context(Thread* from_thread, Thread* to_thread) __attribute__((used)); extern "C" FlatPtr do_init_context(Thread* thread, u32 flags) __attribute__((used)); +extern "C" void syscall_entry(); bool Processor::is_smp_enabled() { @@ -220,6 +221,28 @@ UNMAP_AFTER_INIT void Processor::cpu_setup() write_xcr0(read_xcr0() | 0x7); } } + +#if ARCH(X86_64) + // x86_64 processors must have the syscall feature. + VERIFY(has_feature(CPUFeature::SYSCALL)); + MSR efer_msr(MSR_EFER); + efer_msr.set(efer_msr.get() | 1u); + + // Write code and stack selectors to the STAR MSR. 
The first value stored in bits 63:48 controls the sysret CS (value + 0x10) and SS (value + 0x8), + // and the value stored in bits 47:32 controls the syscall CS (value) and SS (value + 0x8). + u64 star = 0; + star |= 0x13ul << 48u; + star |= 0x08ul << 32u; + MSR star_msr(MSR_STAR); + star_msr.set(star); + + // Write the syscall entry point to the LSTAR MSR, and write the SFMASK MSR to clear rflags upon entry. + // The userspace rflags will be preserved in r11. + MSR lstar_msr(MSR_LSTAR); + MSR sfmask_msr(MSR_SFMASK); + lstar_msr.set(reinterpret_cast<u64>(&syscall_entry)); + sfmask_msr.set(~0x2); +#endif } String Processor::features_string() const diff --git a/Kernel/Arch/x86/x86_64/SyscallEntry.cpp b/Kernel/Arch/x86/x86_64/SyscallEntry.cpp new file mode 100644 index 0000000000..ab12809f49 --- /dev/null +++ b/Kernel/Arch/x86/x86_64/SyscallEntry.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, Owen Smith + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +extern "C" void syscall_entry(); +extern "C" [[gnu::naked]] void syscall_entry() +{ + // clang-format off + asm( + // Store the user stack, then switch to the kernel stack. + " movq %%rsp, %%gs:%c[user_stack] \n" + " movq %%gs:%c[kernel_stack], %%rsp \n" + + // Build RegisterState. 
+ " pushq $0x1b \n" // User ss + " pushq %%gs:%c[user_stack] \n" // User rsp + " sti \n" // It's now safe to enable interrupts, but we can't index into gs after this point + " pushq %%r11 \n" // The CPU preserves the user rflags in r11 + " pushq $0x23 \n" // User cs + " pushq %%rcx \n" // The CPU preserves the user IP in rcx + " pushq $0 \n" + " pushq %%r15 \n" + " pushq %%r14 \n" + " pushq %%r13 \n" + " pushq %%r12 \n" + " pushq %%r11 \n" + " pushq %%r10 \n" + " pushq %%r9 \n" + " pushq %%r8 \n" + " pushq %%rax \n" + " pushq %%rcx \n" + " pushq %%rdx \n" + " pushq %%rbx \n" + " pushq %%rsp \n" + " pushq %%rbp \n" + " pushq %%rsi \n" + " pushq %%rdi \n" + + " pushq %%rsp \n" // TrapFrame::regs + " subq $" __STRINGIFY(TRAP_FRAME_SIZE - 8) ", %%rsp \n" + " movq %%rsp, %%rdi \n" + " call enter_trap_no_irq \n" + " movq %%rsp, %%rdi \n" + " call syscall_handler \n" + " movq %%rsp, %%rdi \n" + " call exit_trap \n" + " addq $" __STRINGIFY(TRAP_FRAME_SIZE) ", %%rsp \n" // Pop TrapFrame + + " popq %%rdi \n" + " popq %%rsi \n" + " popq %%rbp \n" + " addq $8, %%rsp \n" // Skip restoring kernel rsp + " popq %%rbx \n" + " popq %%rdx \n" + " popq %%rcx \n" + " popq %%rax \n" + " popq %%r8 \n" + " popq %%r9 \n" + " popq %%r10 \n" + " popq %%r11 \n" + " popq %%r12 \n" + " popq %%r13 \n" + " popq %%r14 \n" + " popq %%r15 \n" + " addq $8, %%rsp \n" + " popq %%rcx \n" + " addq $16, %%rsp \n" + + // Disable interrupts before we restore the user stack pointer. sysret will re-enable interrupts when it restores + // rflags. 
+ " cli \n" + " popq %%rsp \n" + " sysretq \n" + :: [user_stack] "i"(Kernel::Processor::user_stack_offset()), [kernel_stack] "i"(Kernel::Processor::kernel_stack_offset())); + // clang-format on +} diff --git a/Kernel/CMakeLists.txt b/Kernel/CMakeLists.txt index ab48d2dc5e..bce726aa8a 100644 --- a/Kernel/CMakeLists.txt +++ b/Kernel/CMakeLists.txt @@ -311,6 +311,13 @@ if ("${SERENITY_ARCH}" STREQUAL "i686" OR "${SERENITY_ARCH}" STREQUAL "x86_64") ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/common/SafeMem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/common/TrapFrame.cpp ) + + if("${SERENITY_ARCH}" STREQUAL "x86_64") + set(KERNEL_SOURCES + ${KERNEL_SOURCES} + ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/${KERNEL_ARCH}/SyscallEntry.cpp + ) + endif() endif() set(AK_SOURCES