From 082ed6f417db7f74ba7f09f4d5e82ae288105d46 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Fri, 23 Jul 2021 02:40:16 +0200 Subject: [PATCH] Kernel: Simplify VMObject locking & page fault handlers This patch greatly simplifies VMObject locking by doing two things: 1. Giving VMObject an IntrusiveList of all its mapping Region objects. 2. Removing VMObject::m_paging_lock in favor of VMObject::m_lock Before (1), VMObject::for_each_region() was forced to acquire the global MM lock (since it worked by walking MemoryManager's list of all regions and checking for regions that pointed to itself.) With each VMObject having its own list of Regions, VMObject's own m_lock is all we need. Before (2), page fault handlers used a separate mutex for preventing overlapping work. This design required multiple temporary unlocks and was generally extremely hard to reason about. Instead, page fault handlers now use VMObject's own m_lock as well. --- Kernel/Syscalls/purge.cpp | 2 - Kernel/VM/AnonymousVMObject.cpp | 60 ++++++++------------- Kernel/VM/AnonymousVMObject.h | 2 - Kernel/VM/InodeVMObject.cpp | 16 +++--- Kernel/VM/InodeVMObject.h | 2 - Kernel/VM/MemoryManager.cpp | 6 +-- Kernel/VM/MemoryManager.h | 21 ++------ Kernel/VM/Region.cpp | 92 ++++++++++++--------------------- Kernel/VM/Region.h | 27 ++++------ Kernel/VM/VMObject.h | 43 +++++++++++++-- 10 files changed, 116 insertions(+), 155 deletions(-) diff --git a/Kernel/Syscalls/purge.cpp b/Kernel/Syscalls/purge.cpp index 838459c24c..f12bbe6dda 100644 --- a/Kernel/Syscalls/purge.cpp +++ b/Kernel/Syscalls/purge.cpp @@ -24,7 +24,6 @@ KResultOr Process::sys$purge(int mode) NonnullRefPtrVector vmobjects; { KResult result(KSuccess); - InterruptDisabler disabler; MM.for_each_vmobject([&](auto& vmobject) { if (vmobject.is_anonymous()) { // In the event that the append fails, only attempt to continue @@ -48,7 +47,6 @@ KResultOr Process::sys$purge(int mode) NonnullRefPtrVector vmobjects; { KResult result(KSuccess); - InterruptDisabler disabler; MM.for_each_vmobject([&](auto& vmobject) { if (vmobject.is_inode()) { // In the event that the append fails, only attempt to continue diff --git a/Kernel/VM/AnonymousVMObject.cpp b/Kernel/VM/AnonymousVMObject.cpp index 3823bda74a..918f9ca099 100644 --- a/Kernel/VM/AnonymousVMObject.cpp +++ b/Kernel/VM/AnonymousVMObject.cpp @@ -153,27 +153,6 @@ AnonymousVMObject::~AnonymousVMObject() } int AnonymousVMObject::purge() -{ - MutexLocker locker(m_paging_lock); - return purge_impl(); -} - -int AnonymousVMObject::purge_with_interrupts_disabled(Badge) -{ - VERIFY_INTERRUPTS_DISABLED(); - if (m_paging_lock.is_locked()) - return 0; - return purge_impl(); -} - -void AnonymousVMObject::set_was_purged(VolatilePageRange const& range) -{ - VERIFY(m_lock.is_locked()); - for (auto* purgeable_ranges : m_purgeable_ranges) - purgeable_ranges->set_was_purged(range); -} - -int AnonymousVMObject::purge_impl() { int purged_page_count = 0; ScopedSpinLock lock(m_lock); @@ -193,30 +172,35 @@ int AnonymousVMObject::purge_impl() purged_page_count += purged_in_range; set_was_purged(range); for_each_region([&](auto& region) { - if (®ion.vmobject() == this) { - if (auto owner = region.get_owner()) { - // we need to hold a reference the process here (if there is one) as we may not own this region - dmesgln("Purged {} pages from region {} owned by {} at {} - {}", - purged_in_range, - region.name(), - *owner, - region.vaddr_from_page_index(range.base), - region.vaddr_from_page_index(range.base + range.count)); - } else { - dmesgln("Purged {} pages from region {} (no ownership) at {} - {}", - purged_in_range, - region.name(), - region.vaddr_from_page_index(range.base), - region.vaddr_from_page_index(range.base + range.count)); - } - region.remap_vmobject_page_range(range.base, range.count); + if (auto owner = region.get_owner()) { + // we need to hold a reference the process here (if there is one) as we may not own this region + dmesgln("Purged {} pages from region {} owned by {} at {} - {}", + purged_in_range, + region.name(), + *owner, + region.vaddr_from_page_index(range.base), + region.vaddr_from_page_index(range.base + range.count)); + } else { + dmesgln("Purged {} pages from region {} (no ownership) at {} - {}", + purged_in_range, + region.name(), + region.vaddr_from_page_index(range.base), + region.vaddr_from_page_index(range.base + range.count)); } + region.remap_vmobject_page_range(range.base, range.count); }); } }); return purged_page_count; } +void AnonymousVMObject::set_was_purged(VolatilePageRange const& range) +{ + VERIFY(m_lock.is_locked()); + for (auto* purgeable_ranges : m_purgeable_ranges) + purgeable_ranges->set_was_purged(range); +} + void AnonymousVMObject::register_purgeable_page_ranges(PurgeablePageRanges& purgeable_page_ranges) { ScopedSpinLock lock(m_lock); diff --git a/Kernel/VM/AnonymousVMObject.h b/Kernel/VM/AnonymousVMObject.h index 7ea9ea75ea..1b1a721695 100644 --- a/Kernel/VM/AnonymousVMObject.h +++ b/Kernel/VM/AnonymousVMObject.h @@ -36,7 +36,6 @@ public: void unregister_purgeable_page_ranges(PurgeablePageRanges&); int purge(); - int purge_with_interrupts_disabled(Badge); bool is_any_volatile() const; @@ -122,7 +121,6 @@ private: virtual StringView class_name() const override { return "AnonymousVMObject"sv; } - int purge_impl(); void update_volatile_cache(); void set_was_purged(VolatilePageRange const&); size_t remove_lazy_commit_pages(VolatilePageRange const&); diff --git a/Kernel/VM/InodeVMObject.cpp b/Kernel/VM/InodeVMObject.cpp index 7315828e01..3ad2ac3d7d 100644 --- a/Kernel/VM/InodeVMObject.cpp +++ b/Kernel/VM/InodeVMObject.cpp @@ -4,7 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include #include #include @@ -53,23 +52,20 @@ size_t InodeVMObject::amount_dirty() const int InodeVMObject::release_all_clean_pages() { - MutexLocker locker(m_paging_lock); - return release_all_clean_pages_impl(); -} + ScopedSpinLock locker(m_lock); -int InodeVMObject::release_all_clean_pages_impl() -{ int count = 0; - InterruptDisabler disabler; for (size_t i = 0; i < page_count(); ++i) { if (!m_dirty_pages.get(i) && m_physical_pages[i]) { m_physical_pages[i] = nullptr; ++count; } } - for_each_region([](auto& region) { - region.remap(); - }); + if (count) { + for_each_region([](auto& region) { + region.remap(); + }); + } return count; } diff --git a/Kernel/VM/InodeVMObject.h b/Kernel/VM/InodeVMObject.h index 63fee76f0e..85cde2ca1b 100644 --- a/Kernel/VM/InodeVMObject.h +++ b/Kernel/VM/InodeVMObject.h @@ -37,8 +37,6 @@ protected: virtual bool is_inode() const final { return true; } - int release_all_clean_pages_impl(); - NonnullRefPtr m_inode; Bitmap m_dirty_pages; }; diff --git a/Kernel/VM/MemoryManager.cpp b/Kernel/VM/MemoryManager.cpp index e9d7ec6a3b..36693404e8 100644 --- a/Kernel/VM/MemoryManager.cpp +++ b/Kernel/VM/MemoryManager.cpp @@ -684,7 +684,6 @@ Region* MemoryManager::find_region_from_vaddr(VirtualAddress vaddr) PageFaultResponse MemoryManager::handle_page_fault(PageFault const& fault) { VERIFY_INTERRUPTS_DISABLED(); - ScopedSpinLock lock(s_mm_lock); if (Processor::current().in_irq()) { dbgln("CPU[{}] BUG! Page fault while handling IRQ! code={}, vaddr={}, irq level: {}", Processor::id(), fault.code(), fault.vaddr(), Processor::current().in_irq()); @@ -696,8 +695,7 @@ PageFaultResponse MemoryManager::handle_page_fault(PageFault const& fault) if (!region) { return PageFaultResponse::ShouldCrash; } - - return region->handle_fault(fault, lock); + return region->handle_fault(fault); } OwnPtr MemoryManager::allocate_contiguous_kernel_region(size_t size, StringView name, Region::Access access, Region::Cacheable cacheable) @@ -878,7 +876,7 @@ RefPtr MemoryManager::allocate_user_physical_page(ShouldZeroFill s for_each_vmobject([&](auto& vmobject) { if (!vmobject.is_anonymous()) return IterationDecision::Continue; - int purged_page_count = static_cast(vmobject).purge_with_interrupts_disabled({}); + int purged_page_count = static_cast(vmobject).purge(); if (purged_page_count) { dbgln("MM: Purge saved the day! Purged {} pages from AnonymousVMObject", purged_page_count); page = find_free_user_physical_page(false); diff --git a/Kernel/VM/MemoryManager.h b/Kernel/VM/MemoryManager.h index f96a2273da..72fb442b81 100644 --- a/Kernel/VM/MemoryManager.h +++ b/Kernel/VM/MemoryManager.h @@ -175,6 +175,7 @@ public: template Callback> static void for_each_vmobject(Callback callback) { + ScopedSpinLock locker(s_mm_lock); for (auto& vmobject : MM.m_vmobjects) { if (callback(vmobject) == IterationDecision::Break) break; @@ -255,8 +256,8 @@ private: PhysicalPageEntry* m_physical_page_entries { nullptr }; size_t m_physical_page_entries_count { 0 }; - Region::List m_user_regions; - Region::List m_kernel_regions; + Region::ListInMemoryManager m_user_regions; + Region::ListInMemoryManager m_kernel_regions; Vector m_used_memory_ranges; Vector m_physical_memory_ranges; Vector m_reserved_memory_ranges; @@ -264,22 +265,6 @@ private: VMObject::List m_vmobjects; }; -template -void VMObject::for_each_region(Callback callback) -{ - ScopedSpinLock lock(s_mm_lock); - // FIXME: Figure out a better data structure so we don't have to walk every single region every time an inode changes. - // Perhaps VMObject could have a Vector with all of his mappers? - for (auto& region : MM.m_user_regions) { - if (®ion.vmobject() == this) - callback(region); - } - for (auto& region : MM.m_kernel_regions) { - if (®ion.vmobject() == this) - callback(region); - } -} - inline bool is_user_address(VirtualAddress vaddr) { return vaddr.get() < USER_RANGE_CEILING; diff --git a/Kernel/VM/Region.cpp b/Kernel/VM/Region.cpp index aa43495abf..b9c7cd020a 100644 --- a/Kernel/VM/Region.cpp +++ b/Kernel/VM/Region.cpp @@ -33,14 +33,14 @@ Region::Region(Range const& range, NonnullRefPtr vmobject, size_t offs VERIFY(m_range.size()); VERIFY((m_range.size() % PAGE_SIZE) == 0); - m_vmobject->ref_region(); + m_vmobject->add_region(*this); register_purgeable_page_ranges(); MM.register_region(*this); } Region::~Region() { - m_vmobject->unref_region(); + m_vmobject->remove_region(*this); unregister_purgeable_page_ranges(); // Make sure we disable interrupts so we don't get interrupted between unmapping and unregistering. @@ -130,9 +130,9 @@ void Region::set_vmobject(NonnullRefPtr&& obj) if (m_vmobject.ptr() == obj.ptr()) return; unregister_purgeable_page_ranges(); - m_vmobject->unref_region(); + m_vmobject->remove_region(*this); m_vmobject = move(obj); - m_vmobject->ref_region(); + m_vmobject->add_region(*this); register_purgeable_page_ranges(); } @@ -255,6 +255,9 @@ bool Region::map_individual_page_impl(size_t page_index) PANIC("About to map mmap'ed page at a kernel address"); } + // NOTE: We have to take the MM lock for PTE's to stay valid while we use them. + ScopedSpinLock mm_locker(s_mm_lock); + auto* pte = MM.ensure_pte(*m_page_directory, page_vaddr); if (!pte) return false; @@ -279,7 +282,6 @@ bool Region::map_individual_page_impl(size_t page_index) bool Region::do_remap_vmobject_page_range(size_t page_index, size_t page_count) { bool success = true; - VERIFY(s_mm_lock.own_lock()); if (!m_page_directory) return success; // not an error, region may have not yet mapped it if (!translate_vmobject_page_range(page_index, page_count)) @@ -301,7 +303,6 @@ bool Region::do_remap_vmobject_page_range(size_t page_index, size_t page_count) bool Region::remap_vmobject_page_range(size_t page_index, size_t page_count) { bool success = true; - ScopedSpinLock lock(s_mm_lock); auto& vmobject = this->vmobject(); if (vmobject.is_shared_by_multiple_regions()) { vmobject.for_each_region([&](auto& region) { @@ -317,7 +318,7 @@ bool Region::remap_vmobject_page_range(size_t page_index, size_t page_count) bool Region::do_remap_vmobject_page(size_t page_index, bool with_flush) { - ScopedSpinLock lock(s_mm_lock); + ScopedSpinLock lock(vmobject().m_lock); if (!m_page_directory) return true; // not an error, region may have not yet mapped it if (!translate_vmobject_page(page_index)) @@ -333,8 +334,8 @@ bool Region::do_remap_vmobject_page(size_t page_index, bool with_flush) bool Region::remap_vmobject_page(size_t page_index, bool with_flush) { bool success = true; - ScopedSpinLock lock(s_mm_lock); auto& vmobject = this->vmobject(); + ScopedSpinLock lock(vmobject.m_lock); if (vmobject.is_shared_by_multiple_regions()) { vmobject.for_each_region([&](auto& region) { if (!region.do_remap_vmobject_page(page_index, with_flush)) @@ -406,7 +407,7 @@ void Region::remap() map(*m_page_directory); } -PageFaultResponse Region::handle_fault(PageFault const& fault, ScopedSpinLock& mm_lock) +PageFaultResponse Region::handle_fault(PageFault const& fault) { auto page_index_in_region = page_index_from_address(fault.vaddr()); if (fault.type() == PageFault::Type::PageNotPresent) { @@ -420,7 +421,7 @@ PageFaultResponse Region::handle_fault(PageFault const& fault, ScopedSpinLockis_shared_zero_page() || phys_page->is_lazy_committed_page()) { dbgln_if(PAGE_FAULT_DEBUG, "NP(zero) fault in Region({})[{}] at {}", this, page_index_in_region, fault.vaddr()); - return handle_zero_fault(page_index_in_region, mm_lock); + return handle_zero_fault(page_index_in_region); } return handle_cow_fault(page_index_in_region); } @@ -448,33 +449,16 @@ PageFaultResponse Region::handle_fault(PageFault const& fault, ScopedSpinLock& mm_lock) +PageFaultResponse Region::handle_zero_fault(size_t page_index_in_region) { VERIFY_INTERRUPTS_DISABLED(); VERIFY(vmobject().is_anonymous()); - bool can_lock = Thread::current() && !g_scheduler_lock.own_lock(); - if (can_lock) { - // TODO: This seems rather weird. If we don't have a current thread - // then we're in the Kernel in early initialization still. So we - // can't actually wait on the paging lock. And if we currently - // own the scheduler lock and we trigger zero faults, we also - // can't really wait. But do we actually need to wait here? - mm_lock.unlock(); - VERIFY(!s_mm_lock.own_lock()); - - vmobject().m_paging_lock.lock(); - - mm_lock.lock(); - } - ScopeGuard guard([&]() { - if (can_lock) - vmobject().m_paging_lock.unlock(); - }); - auto& page_slot = physical_page_slot(page_index_in_region); auto page_index_in_vmobject = translate_to_vmobject_page(page_index_in_region); + ScopedSpinLock locker(vmobject().m_lock); + if (!page_slot.is_null() && !page_slot->is_shared_zero_page() && !page_slot->is_lazy_committed_page()) { dbgln_if(PAGE_FAULT_DEBUG, "MM: zero_page() but page already present. Fine with me!"); if (!remap_vmobject_page(page_index_in_vmobject)) @@ -523,33 +507,21 @@ PageFaultResponse Region::handle_cow_fault(size_t page_index_in_region) return response; } -PageFaultResponse Region::handle_inode_fault(size_t page_index_in_region, ScopedSpinLock& mm_lock) +PageFaultResponse Region::handle_inode_fault(size_t page_index_in_region) { VERIFY_INTERRUPTS_DISABLED(); VERIFY(vmobject().is_inode()); - - mm_lock.unlock(); VERIFY(!s_mm_lock.own_lock()); VERIFY(!g_scheduler_lock.own_lock()); - MutexLocker locker(vmobject().m_paging_lock); - - mm_lock.lock(); - - VERIFY_INTERRUPTS_DISABLED(); auto& inode_vmobject = static_cast(vmobject()); + auto page_index_in_vmobject = translate_to_vmobject_page(page_index_in_region); auto& vmobject_physical_page_entry = inode_vmobject.physical_pages()[page_index_in_vmobject]; + VERIFY(vmobject_physical_page_entry.is_null()); dbgln_if(PAGE_FAULT_DEBUG, "Inode fault in {} page index: {}", name(), page_index_in_region); - if (!vmobject_physical_page_entry.is_null()) { - dbgln_if(PAGE_FAULT_DEBUG, "MM: page_in_from_inode() but page already present. Fine with me!"); - if (!remap_vmobject_page(page_index_in_vmobject)) - return PageFaultResponse::OutOfMemory; - return PageFaultResponse::Continue; - } - auto current_thread = Thread::current(); if (current_thread) current_thread->did_inode_fault(); @@ -557,29 +529,33 @@ PageFaultResponse Region::handle_inode_fault(size_t page_index_in_region, Scoped u8 page_buffer[PAGE_SIZE]; auto& inode = inode_vmobject.inode(); - // Reading the page may block, so release the MM lock temporarily - mm_lock.unlock(); - - KResultOr result(KSuccess); - { - ScopedLockRelease release_paging_lock(vmobject().m_paging_lock); - auto buffer = UserOrKernelBuffer::for_kernel_buffer(page_buffer); - result = inode.read_bytes(page_index_in_vmobject * PAGE_SIZE, PAGE_SIZE, buffer, nullptr); - } - - mm_lock.lock(); + auto buffer = UserOrKernelBuffer::for_kernel_buffer(page_buffer); + auto result = inode.read_bytes(page_index_in_vmobject * PAGE_SIZE, PAGE_SIZE, buffer, nullptr); if (result.is_error()) { - dmesgln("MM: handle_inode_fault had error ({}) while reading!", result.error()); + dmesgln("handle_inode_fault: Error ({}) while reading from inode", result.error()); return PageFaultResponse::ShouldCrash; } + auto nread = result.value(); if (nread < PAGE_SIZE) { // If we read less than a page, zero out the rest to avoid leaking uninitialized data. memset(page_buffer + nread, 0, PAGE_SIZE - nread); } + ScopedSpinLock locker(inode_vmobject.m_lock); + + if (!vmobject_physical_page_entry.is_null()) { + // Someone else faulted in this page while we were reading from the inode. + // No harm done (other than some duplicate work), remap the page here and return. + dbgln_if(PAGE_FAULT_DEBUG, "handle_inode_fault: Page faulted in by someone else, remapping."); + if (!remap_vmobject_page(page_index_in_vmobject)) + return PageFaultResponse::OutOfMemory; + return PageFaultResponse::Continue; + } + vmobject_physical_page_entry = MM.allocate_user_physical_page(MemoryManager::ShouldZeroFill::No); + if (vmobject_physical_page_entry.is_null()) { dmesgln("MM: handle_inode_fault was unable to allocate a physical page"); return PageFaultResponse::OutOfMemory; diff --git a/Kernel/VM/Region.h b/Kernel/VM/Region.h index a3ccb93116..34e76802bc 100644 --- a/Kernel/VM/Region.h +++ b/Kernel/VM/Region.h @@ -15,10 +15,10 @@ #include #include #include +#include #include #include #include -#include namespace Kernel { @@ -88,7 +88,7 @@ public: bool is_user() const { return !is_kernel(); } bool is_kernel() const { return vaddr().get() < 0x00800000 || vaddr().get() >= kernel_base; } - PageFaultResponse handle_fault(PageFault const&, ScopedSpinLock&); + PageFaultResponse handle_fault(PageFault const&); OwnPtr clone(Process&); @@ -165,17 +165,8 @@ public: return size() / PAGE_SIZE; } - PhysicalPage const* physical_page(size_t index) const - { - VERIFY(index < page_count()); - return vmobject().physical_pages()[first_page_index() + index]; - } - - RefPtr& physical_page_slot(size_t index) - { - VERIFY(index < page_count()); - return vmobject().physical_pages()[first_page_index() + index]; - } + PhysicalPage const* physical_page(size_t index) const; + RefPtr& physical_page_slot(size_t index); size_t offset_in_vmobject() const { @@ -242,8 +233,8 @@ private: bool remap_vmobject_page(size_t index, bool with_flush = true); PageFaultResponse handle_cow_fault(size_t page_index); - PageFaultResponse handle_inode_fault(size_t page_index, ScopedSpinLock&); - PageFaultResponse handle_zero_fault(size_t page_index, ScopedSpinLock&); + PageFaultResponse handle_inode_fault(size_t page_index); + PageFaultResponse handle_zero_fault(size_t page_index); bool map_individual_page_impl(size_t page_index); @@ -262,10 +253,12 @@ private: bool m_mmap : 1 { false }; bool m_syscall_region : 1 { false }; WeakPtr m_owner; - IntrusiveListNode m_list_node; + IntrusiveListNode m_memory_manager_list_node; + IntrusiveListNode m_vmobject_list_node; public: - using List = IntrusiveList, &Region::m_list_node>; + using ListInMemoryManager = IntrusiveList, &Region::m_memory_manager_list_node>; + using ListInVMObject = IntrusiveList, &Region::m_vmobject_list_node>; }; AK_ENUM_BITWISE_OPERATORS(Region::Access) diff --git a/Kernel/VM/VMObject.h b/Kernel/VM/VMObject.h index 6497d10493..3528eda0b4 100644 --- a/Kernel/VM/VMObject.h +++ b/Kernel/VM/VMObject.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace Kernel { @@ -48,8 +49,20 @@ public: virtual StringView class_name() const = 0; - ALWAYS_INLINE void ref_region() { m_regions_count++; } - ALWAYS_INLINE void unref_region() { m_regions_count--; } + ALWAYS_INLINE void add_region(Region& region) + { + ScopedSpinLock locker(m_lock); + m_regions_count++; + m_regions.append(region); + } + + ALWAYS_INLINE void remove_region(Region& region) + { + ScopedSpinLock locker(m_lock); + m_regions_count--; + m_regions.remove(region); + } + ALWAYS_INLINE bool is_shared_by_multiple_regions() const { return m_regions_count > 1; } void register_on_deleted_handler(VMObjectDeletedHandler& handler) @@ -70,9 +83,8 @@ protected: IntrusiveListNode m_list_node; FixedArray> m_physical_pages; - Mutex m_paging_lock { "VMObject" }; - mutable SpinLock m_lock; + mutable RecursiveSpinLock m_lock; private: VMObject& operator=(VMObject const&) = delete; @@ -83,8 +95,31 @@ private: HashTable m_on_deleted; SpinLock m_on_deleted_lock; + Region::ListInVMObject m_regions; + public: using List = IntrusiveList, &VMObject::m_list_node>; }; +template +inline void VMObject::for_each_region(Callback callback) +{ + ScopedSpinLock lock(m_lock); + for (auto& region : m_regions) { + callback(region); + } +} + +inline PhysicalPage const* Region::physical_page(size_t index) const +{ + VERIFY(index < page_count()); + return vmobject().physical_pages()[first_page_index() + index]; +} + +inline RefPtr& Region::physical_page_slot(size_t index) +{ + VERIFY(index < page_count()); + return vmobject().physical_pages()[first_page_index() + index]; +} + }