From 0096eadf40802eb0ac436ff324666541fd280bfd Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Sun, 26 Mar 2023 23:16:50 +0200
Subject: [PATCH] Kernel/NVMe: Redesign the tracking of requests in an NVMe Queue

There was a private variable named m_current_request which was used to
track a single request at a time. That guarantee is provided by the
block layer, which waits on each IO before issuing the next one. This
design will break down in the driver once the block layer removes that
constraint.

Redesign the IO handling in a completely asynchronous way by keeping
track of requests up to the queue depth. An NVMeIO struct is introduced
to track a submitted IO along with other information, such as whether
the IO is still being processed, and an endio callback which will be
called at the end of a request. A private hashmap member is added,
keyed on the command id of a request with an NVMeIO value.

The endio handler will come in handy if we are doing a sync request and
want to wake up the wait queue at the end.

This change also simplifies the code by removing some special
conditions in the submit_sqe function, etc., that had been marked as
FIXME for a long time.
---
 Kernel/Storage/NVMe/NVMeInterruptQueue.cpp | 48 ++++++++++++++------
 Kernel/Storage/NVMe/NVMeInterruptQueue.h   |  2 +-
 Kernel/Storage/NVMe/NVMePollQueue.cpp      | 29 +++++++++---
 Kernel/Storage/NVMe/NVMePollQueue.h        |  2 +-
 Kernel/Storage/NVMe/NVMeQueue.cpp          | 53 ++++++++++++++--------
 Kernel/Storage/NVMe/NVMeQueue.h            | 12 ++++-
 6 files changed, 103 insertions(+), 43 deletions(-)

diff --git a/Kernel/Storage/NVMe/NVMeInterruptQueue.cpp b/Kernel/Storage/NVMe/NVMeInterruptQueue.cpp
index 7b4aeaf46b..68a77d962f 100644
--- a/Kernel/Storage/NVMe/NVMeInterruptQueue.cpp
+++ b/Kernel/Storage/NVMe/NVMeInterruptQueue.cpp
@@ -29,34 +29,54 @@ void NVMeInterruptQueue::submit_sqe(NVMeSubmission& sub)
     NVMeQueue::submit_sqe(sub);
 }
 
-void NVMeInterruptQueue::complete_current_request(u16 status)
+void NVMeInterruptQueue::complete_current_request(u16 cmdid, u16 status)
 {
-    VERIFY(m_request_lock.is_locked());
-
-    auto work_item_creation_result = g_io_work->try_queue([this, status]() {
+    auto work_item_creation_result = g_io_work->try_queue([this, cmdid, status]() {
         SpinlockLocker lock(m_request_lock);
-        auto current_request = m_current_request;
-        m_current_request.clear();
-        if (status) {
+        auto& request_pdu = m_requests.get(cmdid).release_value();
+        auto current_request = request_pdu.request;
+        AsyncDeviceRequest::RequestResult req_result = AsyncDeviceRequest::Success;
+
+        ScopeGuard guard = [&req_result, status, &request_pdu, &lock] {
+            // FIXME: We should unlock at the end of this function to make sure no new request is inserted
+            // before we complete the request and call end_io_handler, but that results in a deadlock.
+            // For now this is avoided by asserting the `used` field while inserting.
             lock.unlock();
-            current_request->complete(AsyncBlockDeviceRequest::Failure);
+            if (request_pdu.request)
+                request_pdu.request->complete(req_result);
+            if (request_pdu.end_io_handler)
+                request_pdu.end_io_handler(status);
+            request_pdu.used = false;
+        };
+
+        // There can be a submission without any request associated with it, such as with
+        // admin queue commands during init. If there is no request, we are done.
+        if (!current_request)
+            return;
+
+        if (status) {
+            req_result = AsyncBlockDeviceRequest::Failure;
             return;
         }
+
         if (current_request->request_type() == AsyncBlockDeviceRequest::RequestType::Read) {
             if (auto result = current_request->write_to_buffer(current_request->buffer(), m_rw_dma_region->vaddr().as_ptr(), current_request->buffer_size()); result.is_error()) {
-                lock.unlock();
-                current_request->complete(AsyncDeviceRequest::MemoryFault);
+                req_result = AsyncBlockDeviceRequest::MemoryFault;
                 return;
             }
         }
-        lock.unlock();
-        current_request->complete(AsyncDeviceRequest::Success);
         return;
     });
+
     if (work_item_creation_result.is_error()) {
-        auto current_request = m_current_request;
-        m_current_request.clear();
+        SpinlockLocker lock(m_request_lock);
+        auto& request_pdu = m_requests.get(cmdid).release_value();
+        auto current_request = request_pdu.request;
+
         current_request->complete(AsyncDeviceRequest::OutOfMemory);
+        if (request_pdu.end_io_handler)
+            request_pdu.end_io_handler(status);
+        request_pdu.used = false;
     }
 }
 }
diff --git a/Kernel/Storage/NVMe/NVMeInterruptQueue.h b/Kernel/Storage/NVMe/NVMeInterruptQueue.h
index f8b3632af2..79e401420f 100644
--- a/Kernel/Storage/NVMe/NVMeInterruptQueue.h
+++ b/Kernel/Storage/NVMe/NVMeInterruptQueue.h
@@ -18,7 +18,7 @@ public:
     virtual ~NVMeInterruptQueue() override {};
 
 private:
-    virtual void complete_current_request(u16 status) override;
+    virtual void complete_current_request(u16 cmdid, u16 status) override;
     bool handle_irq(RegisterState const&) override;
 };
 }
diff --git a/Kernel/Storage/NVMe/NVMePollQueue.cpp b/Kernel/Storage/NVMe/NVMePollQueue.cpp
index e8ab70e18a..314654be06 100644
--- a/Kernel/Storage/NVMe/NVMePollQueue.cpp
+++ b/Kernel/Storage/NVMe/NVMePollQueue.cpp
@@ -24,21 +24,38 @@ void NVMePollQueue::submit_sqe(NVMeSubmission& sub)
     }
 }
 
-void NVMePollQueue::complete_current_request(u16 status)
+void NVMePollQueue::complete_current_request(u16 cmdid, u16 status)
 {
-    auto current_request = m_current_request;
-    m_current_request.clear();
+    SpinlockLocker lock(m_request_lock);
+    auto& request_pdu = m_requests.get(cmdid).release_value();
+    auto current_request = request_pdu.request;
+    AsyncDeviceRequest::RequestResult req_result = AsyncDeviceRequest::Success;
+
+    ScopeGuard guard = [&req_result, status, &request_pdu] {
+        if (request_pdu.request)
+            request_pdu.request->complete(req_result);
+        if (request_pdu.end_io_handler)
+            request_pdu.end_io_handler(status);
+        request_pdu.used = false;
+    };
+
+    // There can be a submission without any request associated with it, such as with
+    // admin queue commands during init. If there is no request, we are done.
+    if (!current_request)
+        return;
+
     if (status) {
-        current_request->complete(AsyncBlockDeviceRequest::Failure);
+        req_result = AsyncBlockDeviceRequest::Failure;
         return;
     }
+
     if (current_request->request_type() == AsyncBlockDeviceRequest::RequestType::Read) {
         if (auto result = current_request->write_to_buffer(current_request->buffer(), m_rw_dma_region->vaddr().as_ptr(), current_request->buffer_size()); result.is_error()) {
-            current_request->complete(AsyncDeviceRequest::MemoryFault);
+            req_result = AsyncBlockDeviceRequest::MemoryFault;
             return;
         }
     }
-    current_request->complete(AsyncDeviceRequest::Success);
+
     return;
 }
 }
diff --git a/Kernel/Storage/NVMe/NVMePollQueue.h b/Kernel/Storage/NVMe/NVMePollQueue.h
index 4080d6c76f..6abe627a03 100644
--- a/Kernel/Storage/NVMe/NVMePollQueue.h
+++ b/Kernel/Storage/NVMe/NVMePollQueue.h
@@ -17,6 +17,6 @@ public:
    virtual ~NVMePollQueue() override {};
 
 private:
-    virtual void complete_current_request(u16 status) override;
+    virtual void complete_current_request(u16 cmdid, u16 status) override;
 };
 }
diff --git a/Kernel/Storage/NVMe/NVMeQueue.cpp b/Kernel/Storage/NVMe/NVMeQueue.cpp
index 6736cf8043..a7bf981825 100644
--- a/Kernel/Storage/NVMe/NVMeQueue.cpp
+++ b/Kernel/Storage/NVMe/NVMeQueue.cpp
@@ -26,8 +26,7 @@ ErrorOr<NonnullLockRefPtr<NVMeQueue>> NVMeQueue::try_create(u16 qid, Optional<u8
 }
 
 UNMAP_AFTER_INIT NVMeQueue::NVMeQueue(NonnullOwnPtr<Memory::Region> rw_dma_region, Memory::PhysicalPage const& rw_dma_page, u16 qid, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, Vector<NonnullRefPtr<Memory::PhysicalPage>> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, Vector<NonnullRefPtr<Memory::PhysicalPage>> sq_dma_page, Memory::TypedMapping<DoorbellRegister volatile> db_regs)
-    : m_current_request(nullptr)
-    , m_rw_dma_region(move(rw_dma_region))
+    : m_rw_dma_region(move(rw_dma_region))
     , m_qid(qid)
     , m_admin_queue(qid == 0)
     , m_qdepth(q_depth)
@@ -39,6 +38,7 @@ UNMAP_AFTER_INIT NVMeQueue::NVMeQueue(NonnullOwnPtr<Memory::Region> rw_dma_regio
     , m_rw_dma_page(rw_dma_page)
 
 {
+    m_requests.try_ensure_capacity(q_depth).release_value_but_fixme_should_propagate_errors();
     m_sqe_array = { reinterpret_cast<NVMeSubmission*>(m_sq_dma_region->vaddr().as_ptr()), m_qdepth };
     m_cqe_array = { reinterpret_cast<NVMeCompletion*>(m_cq_dma_region->vaddr().as_ptr()), m_qdepth };
 }
@@ -70,15 +70,12 @@ u32 NVMeQueue::process_cq()
         status = CQ_STATUS_FIELD(m_cqe_array[m_cq_head].status);
         cmdid = m_cqe_array[m_cq_head].command_id;
         dbgln_if(NVME_DEBUG, "NVMe: Completion with status {:x} and command identifier {}. CQ_HEAD: {}", status, cmdid, m_cq_head);
-        // TODO: We don't use AsyncBlockDevice requests for admin queue as it is only applicable for a block device (NVMe namespace)
-        // But admin commands precedes namespace creation. Unify requests to avoid special conditions
-        if (m_admin_queue == false) {
-            // As the block layer calls are now sync (as we wait on each requests),
-            // everything is operated on a single request similar to BMIDE driver.
-            if (m_current_request) {
-                complete_current_request(status);
-            }
+
+        if (!m_requests.contains(cmdid)) {
+            dmesgln("Bogus cmd id: {}", cmdid);
+            VERIFY_NOT_REACHED();
         }
+        complete_current_request(cmdid, status);
         update_cqe_head();
     }
     if (nr_of_processed_cqes) {
@@ -112,6 +109,14 @@ u16 NVMeQueue::submit_sync_sqe(NVMeSubmission& sub)
     u16 cid = get_request_cid();
     sub.cmdid = cid;
 
+    {
+        SpinlockLocker req_lock(m_request_lock);
+
+        if (m_requests.contains(sub.cmdid) && m_requests.get(sub.cmdid).release_value().used)
+            VERIFY_NOT_REACHED();
+        m_requests.set(sub.cmdid, { nullptr, true, nullptr });
+    }
+
     submit_sqe(sub);
     do {
         int index;
@@ -132,9 +137,6 @@ u16 NVMeQueue::submit_sync_sqe(NVMeSubmission& sub)
 void NVMeQueue::read(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count)
 {
     NVMeSubmission sub {};
-    SpinlockLocker m_lock(m_request_lock);
-    m_current_request = request;
-
     sub.op = OP_NVME_READ;
     sub.rw.nsid = nsid;
     sub.rw.slba = AK::convert_between_host_and_little_endian(index);
@@ -143,6 +145,13 @@ void NVMeQueue::read(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32
     sub.rw.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(m_rw_dma_page->paddr().as_ptr()));
     sub.cmdid = get_request_cid();
 
+    {
+        SpinlockLocker req_lock(m_request_lock);
+        if (m_requests.contains(sub.cmdid) && m_requests.get(sub.cmdid).release_value().used)
+            VERIFY_NOT_REACHED();
+        m_requests.set(sub.cmdid, { request, true, nullptr });
+    }
+
     full_memory_barrier();
     submit_sqe(sub);
 }
@@ -150,13 +159,7 @@ void NVMeQueue::read(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32
 void NVMeQueue::write(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count)
 {
     NVMeSubmission sub {};
-    SpinlockLocker m_lock(m_request_lock);
-    m_current_request = request;
 
-    if (auto result = m_current_request->read_from_buffer(m_current_request->buffer(), m_rw_dma_region->vaddr().as_ptr(), m_current_request->buffer_size()); result.is_error()) {
-        complete_current_request(AsyncDeviceRequest::MemoryFault);
-        return;
-    }
     sub.op = OP_NVME_WRITE;
     sub.rw.nsid = nsid;
     sub.rw.slba = AK::convert_between_host_and_little_endian(index);
@@ -165,6 +168,18 @@ void NVMeQueue::write(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32
     sub.rw.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(m_rw_dma_page->paddr().as_ptr()));
     sub.cmdid = get_request_cid();
 
+    {
+        SpinlockLocker req_lock(m_request_lock);
+        if (m_requests.contains(sub.cmdid) && m_requests.get(sub.cmdid).release_value().used)
+            VERIFY_NOT_REACHED();
+        m_requests.set(sub.cmdid, { request, true, nullptr });
+    }
+
+    if (auto result = request.read_from_buffer(request.buffer(), m_rw_dma_region->vaddr().as_ptr(), request.buffer_size()); result.is_error()) {
+        complete_current_request(sub.cmdid, AsyncDeviceRequest::MemoryFault);
+        return;
+    }
+
     full_memory_barrier();
     submit_sqe(sub);
 }
diff --git a/Kernel/Storage/NVMe/NVMeQueue.h b/Kernel/Storage/NVMe/NVMeQueue.h
index 617dfdfa7f..e1d5b39029 100644
--- a/Kernel/Storage/NVMe/NVMeQueue.h
+++ b/Kernel/Storage/NVMe/NVMeQueue.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <AK/AtomicRefCounted.h>
+#include <AK/HashMap.h>
 #include 
 #include 
 #include 
@@ -26,6 +27,13 @@ struct DoorbellRegister {
 };
 
 class AsyncBlockDeviceRequest;
+
+struct NVMeIO {
+    RefPtr<AsyncBlockDeviceRequest> request;
+    bool used = false;
+    Function<void(u16 status)> end_io_handler;
+};
+
 class NVMeQueue : public AtomicRefCounted<NVMeQueue> {
 public:
     static ErrorOr<NonnullLockRefPtr<NVMeQueue>> try_create(u16 qid, Optional<u8> irq, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, Vector<NonnullRefPtr<Memory::PhysicalPage>> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, Vector<NonnullRefPtr<Memory::PhysicalPage>> sq_dma_page, Memory::TypedMapping<DoorbellRegister volatile> db_regs);
@@ -60,7 +68,7 @@ protected:
 private:
     bool cqe_available();
     void update_cqe_head();
-    virtual void complete_current_request(u16 status) = 0;
+    virtual void complete_current_request(u16 cmdid, u16 status) = 0;
     void update_cq_doorbell()
     {
         m_db_regs->cq_head = m_cq_head;
@@ -68,7 +76,7 @@ protected:
     Spinlock m_cq_lock {};
-    LockRefPtr<AsyncBlockDeviceRequest> m_current_request;
+    HashMap<u16, NVMeIO> m_requests;
     NonnullOwnPtr<Memory::Region> m_rw_dma_region;
     Spinlock m_request_lock {};
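
As a rough illustration of the request-tracking scheme described in the commit
message (a table keyed on the NVMe command id holding an in-flight flag, an
optional block-layer request, and an end-of-IO callback that a sync submission
can use to wake a waiter), here is a minimal stand-alone C++ sketch. It is not
code from the patch: IoSlot and RequestTable, and the std:: containers, are
illustrative stand-ins for the kernel's NVMeIO, HashMap<u16, NVMeIO> and
AK::Function.

    // User-space model of per-command-id IO tracking; names and types are
    // assumptions for this sketch, not part of the patch.
    #include <cassert>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <unordered_map>

    struct IoSlot {
        int request_id { -1 };                        // stands in for RefPtr<AsyncBlockDeviceRequest>; -1 means "no request"
        bool used { false };                          // slot currently owned by an in-flight command
        std::function<void(uint16_t)> end_io_handler; // invoked with the NVMe status on completion
    };

    class RequestTable {
    public:
        // Submission path: claim the slot for this command id before ringing the doorbell.
        void track(uint16_t cmdid, int request_id, std::function<void(uint16_t)> end_io = {})
        {
            auto& slot = m_slots[cmdid];
            assert(!slot.used && "command id reused while still in flight");
            slot = { request_id, true, std::move(end_io) };
        }

        // Completion path: finish the request (if any), run the end-of-IO callback, free the slot.
        void complete(uint16_t cmdid, uint16_t status)
        {
            auto it = m_slots.find(cmdid);
            assert(it != m_slots.end() && "completion for a command id that was never submitted");
            auto& slot = it->second;
            if (slot.request_id >= 0)
                std::cout << "request " << slot.request_id << " completed with status " << status << '\n';
            if (slot.end_io_handler)
                slot.end_io_handler(status); // e.g. wake whoever is blocked on a sync submission
            slot.used = false;
        }

    private:
        std::unordered_map<uint16_t, IoSlot> m_slots;
    };

    int main()
    {
        RequestTable table;

        // An async block-layer request: tracked by command id, completed later.
        table.track(3, /* request_id */ 42);

        // A sync command (e.g. an admin command): no request attached, only a wake-up callback.
        bool done = false;
        table.track(7, /* request_id */ -1, [&](uint16_t) { done = true; });

        table.complete(7, 0); // completions may arrive in any order
        table.complete(3, 0);

        std::cout << "sync command done: " << std::boolalpha << done << '\n';
    }

In the driver itself these roles are played by the NVMeIO struct, the
m_requests hashmap guarded by m_request_lock, and the used/end_io_handler
fields set on the submission paths above.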