
LibC: Implement posix_memalign(3) and aligned_alloc(3)

Some ports linked against posix_memalign but didn't use it, and others
used it only if it was available. So I decided to implement posix_memalign.

My implementation adds almost no overhead to regular mallocs. However,
if an alignment is specified, it will use the smallest ChunkedBlock size
class for which aligned chunks exist, and simply hand out one of the
aligned chunks. If it cannot use a ChunkedBlock, for size or alignment
reasons, it will use a BigAllocationBlock and return a pointer to the
first aligned address past the start of the block. This implementation
supports alignments up to 32768 bytes, due to the limitations of the
BigAllocationBlock technique.
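
As a usage sketch (not part of the commit, assuming only the standard
signatures): posix_memalign(3) reports failure through its return value
and leaves errno untouched, while aligned_alloc(3) follows the usual
malloc convention of returning a null pointer and setting errno.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        // posix_memalign() returns 0 on success or an error number directly.
        void* p = NULL;
        int rc = posix_memalign(&p, 4096, 1000);
        if (rc != 0) {
            fprintf(stderr, "posix_memalign: %s\n", strerror(rc));
            return 1;
        }

        // aligned_alloc() returns NULL and sets errno on failure.
        void* q = aligned_alloc(4096, 4096);
        if (q == NULL) {
            perror("aligned_alloc");
            free(p);
            return 1;
        }

        free(p);
        free(q);
        return 0;
    }

Both entry points funnel into the same malloc_impl(); the default
16-byte alignment keeps plain malloc() on its fast path.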
Peter Elliott 2022-05-04 23:08:57 -06:00 committed by Andreas Kling
parent 830f5c610d
commit 4e0adb638d
4 changed files with 172 additions and 60 deletions

@@ -1,5 +1,6 @@
/*
* Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
+ * Copyright (c) 2022, Peter Elliott <pelliott@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@@ -153,12 +154,75 @@ static inline BigAllocator (&big_allocators())[1]
return reinterpret_cast<BigAllocator(&)[1]>(g_big_allocators_storage);
}
-static Allocator* allocator_for_size(size_t size, size_t& good_size)
+// --- BEGIN MATH ---
+// This stuff is only used for checking if there exists an aligned block in a
+// chunk. It has no bearing on the rest of the allocator, especially for
+// regular malloc.
+static inline unsigned long modulo(long a, long b)
+{
+    return (b + (a % b)) % b;
+}
+
+struct EuclideanResult {
+    long x;
+    long y;
+    long gcd;
+};
+
+// Returns x, y, gcd.
+static inline EuclideanResult extended_euclid(long a, long b)
+{
+    EuclideanResult old = { 1, 0, a };
+    EuclideanResult current = { 0, 1, b };
+
+    while (current.gcd != 0) {
+        long quotient = old.gcd / current.gcd;
+
+        EuclideanResult next = {
+            old.x - quotient * current.x,
+            old.y - quotient * current.y,
+            old.gcd - quotient * current.gcd,
+        };
+
+        old = current;
+        current = next;
+    }
+
+    return old;
+}
+
+static inline bool block_has_aligned_chunk(long align, long bytes_per_chunk, long chunk_capacity)
+{
+    // Never do math on a normal malloc.
+    if ((size_t)align <= sizeof(ChunkedBlock))
+        return true;
+
+    // Solve the linear congruence n*bytes_per_chunk = -sizeof(ChunkedBlock) (mod align).
+    auto [x, y, gcd] = extended_euclid(bytes_per_chunk % align, align);
+    long constant = modulo(-sizeof(ChunkedBlock), align);
+    if (constant % gcd != 0)
+        // No solution. Chunk size is probably a multiple of align.
+        return false;
+
+    long n = modulo(x * (constant / gcd), align);
+    if (x < 0)
+        n = (n + align / gcd) % align;
+
+    // Don't ask me to prove this.
+    VERIFY(n > 0);
+
+    return n < chunk_capacity;
+}
+// --- END MATH ---
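// [Annotation, not part of the commit: a worked example with hypothetical
// sizes. Let align = 64, sizeof(ChunkedBlock) = 40, bytes_per_chunk = 24.
// We need n with n*24 = -40 = 24 (mod 64). extended_euclid(24, 64) returns
// x = 3, gcd = 8 (since 3*24 - 1*64 = 8), and constant = modulo(-40, 64) = 24
// is divisible by gcd, so n = modulo(3 * (24 / 8), 64) = 9. Check:
// 40 + 9*24 = 256, a multiple of 64, so chunk 9 is aligned whenever
// chunk_capacity > 9.]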
+static Allocator* allocator_for_size(size_t size, size_t& good_size, size_t align = 1)
{
for (size_t i = 0; size_classes[i]; ++i) {
-        if (size <= size_classes[i]) {
+        auto& allocator = allocators()[i];
+        if (size <= size_classes[i] && block_has_aligned_chunk(align, allocator.size, (ChunkedBlock::block_size - sizeof(ChunkedBlock)) / allocator.size)) {
good_size = size_classes[i];
-            return &allocators()[i];
+            return &allocator;
}
}
good_size = PAGE_ROUND_UP(size);
@@ -176,7 +240,7 @@ static BigAllocator* big_allocator_for_size(size_t size)
extern "C" {
-static void* os_alloc(size_t size, char const* name)
+static ErrorOr<void*> os_alloc(size_t size, char const* name)
{
int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_PURGEABLE;
#if ARCH(X86_64)
@@ -185,8 +249,7 @@ static void* os_alloc(size_t size, char const* name)
auto* ptr = serenity_mmap(nullptr, size, PROT_READ | PROT_WRITE, flags, 0, 0, ChunkedBlock::block_size, name);
-    VERIFY(ptr != nullptr);
if (ptr == MAP_FAILED) {
-        errno = ENOMEM;
-        return nullptr;
+        return ENOMEM;
}
return ptr;
}
@@ -197,6 +260,31 @@ static void os_free(void* ptr, size_t size)
assert(rc == 0);
}
+static void* try_allocate_chunk_aligned(size_t align, ChunkedBlock& block)
+{
+    // These loops are guaranteed to run only once for a standard-aligned malloc.
+    for (FreelistEntry** entry = &(block.m_freelist); *entry != nullptr; entry = &((*entry)->next)) {
+        if ((reinterpret_cast<uintptr_t>(*entry) & (align - 1)) == 0) {
+            --block.m_free_chunks;
+            void* ptr = *entry;
+            *entry = (*entry)->next; // Delete the entry.
+            return ptr;
+        }
+    }
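    // [Annotation, not part of the commit: no suitably aligned entry was found
    // on the freelist, so the loop below lazily carves fresh chunks, parking
    // unaligned ones on the freelist where ordinary mallocs can still use them.]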
+    for (; block.m_next_lazy_freelist_index < block.chunk_capacity(); block.m_next_lazy_freelist_index++) {
+        void* ptr = block.m_slot + block.m_next_lazy_freelist_index * block.m_size;
+        if ((reinterpret_cast<uintptr_t>(ptr) & (align - 1)) == 0) {
+            --block.m_free_chunks;
+            block.m_next_lazy_freelist_index++;
+            return ptr;
+        }
+
+        auto* entry = (FreelistEntry*)ptr;
+        entry->next = block.m_freelist;
+        block.m_freelist = entry;
+    }
+
+    return nullptr;
+}
enum class CallerWillInitializeMemory {
No,
Yes,
@@ -208,12 +296,20 @@ enum class CallerWillInitializeMemory {
__thread bool s_allocation_enabled;
#endif
-static void* malloc_impl(size_t size, CallerWillInitializeMemory caller_will_initialize_memory)
+static ErrorOr<void*> malloc_impl(size_t size, size_t align, CallerWillInitializeMemory caller_will_initialize_memory)
{
#ifndef NO_TLS
VERIFY(s_allocation_enabled);
#endif
+    // Align must be a power of 2.
+    if (popcount(align) != 1)
+        return EINVAL;
+
+    // FIXME: Support larger than 32KiB alignments (if you dare).
+    if (sizeof(BigAllocationBlock) + align >= ChunkedBlock::block_size)
+        return EINVAL;
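    // [Annotation, not part of the commit: this check is the 32 KiB ceiling
    // mentioned in the commit message. The big-allocation path over-allocates
    // by 'align' bytes, and that slack plus the BigAllocationBlock header must
    // still fit inside a single ChunkedBlock::block_size-sized mapping.]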
if (s_log_malloc)
dbgln("LibC: malloc({})", size);
@@ -226,16 +322,15 @@ static void* malloc_impl(size_t size, CallerWillInitializeMemory caller_will_ini
g_malloc_stats.number_of_malloc_calls++;
size_t good_size;
-    auto* allocator = allocator_for_size(size, good_size);
+    auto* allocator = allocator_for_size(size, good_size, align);
PthreadMutexLocker locker(s_malloc_mutex);
if (!allocator) {
-        size_t real_size = round_up_to_power_of_two(sizeof(BigAllocationBlock) + size, ChunkedBlock::block_size);
+        size_t real_size = round_up_to_power_of_two(sizeof(BigAllocationBlock) + size + ((align > 16) ? align : 0), ChunkedBlock::block_size);
if (real_size < size) {
dbgln_if(MALLOC_DEBUG, "LibC: Detected overflow trying to do big allocation of size {} for {}", real_size, size);
-            errno = ENOMEM;
-            return nullptr;
+            return ENOMEM;
}
#ifdef RECYCLE_BIG_ALLOCATIONS
if (auto* allocator = big_allocator_for_size(real_size)) {
@@ -257,27 +352,31 @@ static void* malloc_impl(size_t size, CallerWillInitializeMemory caller_will_ini
new (block) BigAllocationBlock(real_size);
}
-                ue_notify_malloc(&block->m_slot[0], size);
-                return &block->m_slot[0];
+                void* ptr = reinterpret_cast<void*>(round_up_to_power_of_two(reinterpret_cast<uintptr_t>(&block->m_slot[0]), align));
+                ue_notify_malloc(ptr, size);
+                return ptr;
}
}
#endif
-        auto* block = (BigAllocationBlock*)os_alloc(real_size, "malloc: BigAllocationBlock");
-        if (block == nullptr) {
-            dbgln_if(MALLOC_DEBUG, "LibC: Failed to do big allocation of size {} for {}", real_size, size);
-            return nullptr;
-        }
+        auto* block = (BigAllocationBlock*)TRY(os_alloc(real_size, "malloc: BigAllocationBlock"));
g_malloc_stats.number_of_big_allocs++;
new (block) BigAllocationBlock(real_size);
-        ue_notify_malloc(&block->m_slot[0], size);
-        return &block->m_slot[0];
+        void* ptr = reinterpret_cast<void*>(round_up_to_power_of_two(reinterpret_cast<uintptr_t>(&block->m_slot[0]), align));
+        ue_notify_malloc(ptr, size);
+        return ptr;
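        // [Annotation, not part of the commit: because real_size was padded by
        // 'align' bytes above (for align > 16), rounding &m_slot[0] up to a
        // multiple of align cannot run past the end of the allocation.]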
}
ChunkedBlock* block = nullptr;
+    void* ptr = nullptr;
for (auto& current : allocator->usable_blocks) {
if (current.free_chunks()) {
-            block = &current;
-            break;
+            ptr = try_allocate_chunk_aligned(align, current);
+            if (ptr) {
+                block = &current;
+                break;
+            }
}
}
@@ -321,23 +420,16 @@ static void* malloc_impl(size_t size, CallerWillInitializeMemory caller_will_ini
g_malloc_stats.number_of_block_allocs++;
char buffer[64];
snprintf(buffer, sizeof(buffer), "malloc: ChunkedBlock(%zu)", good_size);
-        block = (ChunkedBlock*)os_alloc(ChunkedBlock::block_size, buffer);
-        if (block == nullptr) {
-            return nullptr;
-        }
+        block = (ChunkedBlock*)TRY(os_alloc(ChunkedBlock::block_size, buffer));
new (block) ChunkedBlock(good_size);
allocator->usable_blocks.append(*block);
++allocator->block_count;
}
-    --block->m_free_chunks;
-    void* ptr = block->m_freelist;
-    if (ptr) {
-        block->m_freelist = block->m_freelist->next;
-    } else {
-        ptr = block->m_slot + block->m_next_lazy_freelist_index * block->m_size;
-        block->m_next_lazy_freelist_index++;
+    if (!ptr) {
+        ptr = try_allocate_chunk_aligned(align, *block);
}
VERIFY(ptr);
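    // [Annotation, not part of the commit: the --m_free_chunks bookkeeping that
    // previously lived here now happens inside try_allocate_chunk_aligned().]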
if (block->is_full()) {
g_malloc_stats.number_of_blocks_full++;
@@ -451,14 +543,21 @@ static void free_impl(void* ptr)
void* malloc(size_t size)
{
MemoryAuditingSuppressor suppressor;
-    void* ptr = malloc_impl(size, CallerWillInitializeMemory::No);
+    auto ptr_or_error = malloc_impl(size, 16, CallerWillInitializeMemory::No);
+    if (ptr_or_error.is_error()) {
+        errno = ptr_or_error.error().code();
+        return nullptr;
+    }
+
if (s_profiling)
-        perf_event(PERF_EVENT_MALLOC, size, reinterpret_cast<FlatPtr>(ptr));
-    return ptr;
+        perf_event(PERF_EVENT_MALLOC, size, reinterpret_cast<FlatPtr>(ptr_or_error.value()));
+    return ptr_or_error.value();
}
// This is a Microsoft extension, and is not found on other Unix-like systems.
-// FIXME: Implement aligned_alloc() instead
+// FIXME: Remove this when all patches have been switched to aligned_alloc()
//
// This is used in libc++ to implement C++17 aligned new/delete.
//
@@ -513,10 +612,41 @@ void* calloc(size_t count, size_t size)
return nullptr;
}
size_t new_size = count * size;
-    auto* ptr = malloc_impl(new_size, CallerWillInitializeMemory::Yes);
-    if (ptr)
-        memset(ptr, 0, new_size);
-    return ptr;
+    auto ptr_or_error = malloc_impl(new_size, 16, CallerWillInitializeMemory::Yes);
+    if (ptr_or_error.is_error()) {
+        errno = ptr_or_error.error().code();
+        return nullptr;
+    }
+    memset(ptr_or_error.value(), 0, new_size);
+    return ptr_or_error.value();
}
+// https://pubs.opengroup.org/onlinepubs/9699919799/functions/posix_memalign.html
+int posix_memalign(void** memptr, size_t alignment, size_t size)
+{
+    MemoryAuditingSuppressor suppressor;
+    auto ptr_or_error = malloc_impl(size, alignment, CallerWillInitializeMemory::No);
+    if (ptr_or_error.is_error())
+        return ptr_or_error.error().code();
+
+    *memptr = ptr_or_error.value();
+    return 0;
+}
+
+void* aligned_alloc(size_t alignment, size_t size)
+{
+    MemoryAuditingSuppressor suppressor;
+    auto ptr_or_error = malloc_impl(size, alignment, CallerWillInitializeMemory::No);
+    if (ptr_or_error.is_error()) {
+        errno = ptr_or_error.error().code();
+        return nullptr;
+    }
+    return ptr_or_error.value();
+}
size_t malloc_size(void const* ptr)