
Before this patch, we supported two methods of addressing a boot device:

1. Specifying root=/dev/hdXY, where X is a letter from a to z that corresponds to a boot device, and Y is a number from 1 to 16 that indicates the partition number. Y can be omitted to tell the kernel to use the raw device rather than a partition on it.
2. Specifying root=PARTUUID: followed by the GUID string of a GPT partition. On a storage device with GPT partitions, this is most likely the safest option to ensure booting from persistent storage.

While option 2 is more advanced and reliable, option 1 has two caveats:

1. The string prefix "/dev/hd" doesn't mean anything beyond being a convention on Linux installations that was adopted in Serenity. In Serenity we don't mount DevTmpFS before we mount the boot device on /, so the kernel doesn't actually access /dev at this point; the convention is only a misleading relic that can easily lead the user to assume we access /dev early in boot.
2. Although this convention resembles the simple Linux convention, it is quite limited for specifying the correct boot device across hardware setup changes, which is why option 2 was recommended to ensure the system is always bootable.

With these caveats in mind, this commit tries to fix the problem by adding more addressing options and removing the first addressing option mentioned above. To sum it up, there are now 4 addressing options:

1. Hardware relative address - Each StorageController instance is assigned an index number relative to the type of hardware it handles. This makes it possible to address storage devices with a prefix for the command set ("ata" for ATA, "nvme" for NVMe, "ramdisk" for plain memory), followed by the parent controller's relative hardware index, another number for the LUN target_id, and a third number for the LUN disk_id.
2. LUN address - Similar to the previous option, but the first number is the parent controller's absolute index instead.
3. Block device major and minor numbers - By specifying the major and minor numbers, the kernel can simply try to get the corresponding block device and use it as the boot device.
4. GUID string - In the same fashion as before, the user uses the "PARTUUID:" prefix followed by the GUID of the GPT partition.

For the new addressing modes 1 and 2, the user can also specify a partition on the selected boot device. To do that, append a semicolon and then the string "partX", where X is the partition number. We count from 0, so the first partition is number 0 and not 1 in the kernel boot argument.
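As an illustration only, boot arguments under the scheme described above could look like the following. The "ata"/"nvme" prefixes, the "PARTUUID:" prefix and the ";partX" suffix are taken from this message; the colon separators between the three index numbers and the placeholder GUID are assumptions and should be checked against the actual command line parser:

    root=nvme0:0:0        (first NVMe controller, LUN target 0, disk 0, used as a raw device)
    root=ata0:0:0;part0   (first partition, index 0, of the first disk on the first ATA controller)
    root=PARTUUID:00000000-0000-0000-0000-000000000000   (a GPT partition addressed by its GUID)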
/*
 * Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
 * Copyright (c) 2022, the SerenityOS developers.
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include "NVMeController.h"
#include "AK/Format.h"
#include <AK/Types.h>
#include <Kernel/Arch/SafeMem.h>
#include <Kernel/Arch/x86/IO.h>
#include <Kernel/Arch/x86/Processor.h>
#include <Kernel/Bus/PCI/API.h>
#include <Kernel/CommandLine.h>
#include <Kernel/Devices/Device.h>
#include <Kernel/FileSystem/ProcFS.h>
#include <Kernel/Library/LockRefPtr.h>
#include <Kernel/Sections.h>
#include <Kernel/Storage/StorageManagement.h>

namespace Kernel {

UNMAP_AFTER_INIT ErrorOr<NonnullLockRefPtr<NVMeController>> NVMeController::try_initialize(Kernel::PCI::DeviceIdentifier const& device_identifier, bool is_queue_polled)
{
    auto controller = TRY(adopt_nonnull_lock_ref_or_enomem(new NVMeController(device_identifier, StorageManagement::generate_relative_nvme_controller_id({}))));
    TRY(controller->initialize(is_queue_polled));
    return controller;
}

UNMAP_AFTER_INIT NVMeController::NVMeController(const PCI::DeviceIdentifier& device_identifier, u32 hardware_relative_controller_id)
    : PCI::Device(device_identifier.address())
    , StorageController(hardware_relative_controller_id)
    , m_pci_device_id(device_identifier)
{
}

UNMAP_AFTER_INIT ErrorOr<void> NVMeController::initialize(bool is_queue_polled)
{
    // Nr of queues = one queue per core
    auto nr_of_queues = Processor::count();
    auto irq = is_queue_polled ? Optional<u8> {} : m_pci_device_id.interrupt_line().value();

    PCI::enable_memory_space(m_pci_device_id.address());
    PCI::enable_bus_mastering(m_pci_device_id.address());
    m_bar = PCI::get_BAR0(m_pci_device_id.address()) & BAR_ADDR_MASK;
    static_assert(sizeof(ControllerRegister) == REG_SQ0TDBL_START);
    static_assert(sizeof(NVMeSubmission) == (1 << SQ_WIDTH));

    // Map only until doorbell register for the controller
    // Queues will individually map the doorbell register respectively
    m_controller_regs = TRY(Memory::map_typed_writable<ControllerRegister volatile>(PhysicalAddress(m_bar)));

    auto caps = m_controller_regs->cap;
    m_ready_timeout = Time::from_milliseconds((CAP_TO(caps) + 1) * 500); // CAP.TO is in 500ms units

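    // CAP.DSTRD describes the spacing between doorbell registers as a power-of-two multiple of 4 bytes;
    // cache it so per-queue doorbell offsets can be computed later.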
    calculate_doorbell_stride();
    TRY(create_admin_queue(irq));
    VERIFY(m_admin_queue_ready == true);

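    // CAP.MQES is the largest I/O queue size (0-based) the controller supports, so our fixed IO_QUEUE_SIZE has to fit within it.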
    VERIFY(IO_QUEUE_SIZE < MQES(caps));
    dbgln_if(NVME_DEBUG, "NVMe: IO queue depth is: {}", IO_QUEUE_SIZE);

    // Create an IO queue per core
    for (u32 cpuid = 0; cpuid < nr_of_queues; ++cpuid) {
        // qid 0 is used for the admin queue
        TRY(create_io_queue(cpuid + 1, irq));
    }
    TRY(identify_and_init_namespaces());
    return {};
}

bool NVMeController::wait_for_ready(bool expected_ready_bit_value)
{
    constexpr size_t one_ms_io_delay = 1000;
    auto wait_iterations = m_ready_timeout.to_milliseconds();

    u32 expected_rdy = expected_ready_bit_value ? 1 : 0;
    while (((m_controller_regs->csts >> CSTS_RDY_BIT) & 0x1) != expected_rdy) {
        IO::delay(one_ms_io_delay);

        if (--wait_iterations == 0) {
            if (((m_controller_regs->csts >> CSTS_RDY_BIT) & 0x1) != expected_rdy) {
                dbgln_if(NVME_DEBUG, "NVMEController: CSTS.RDY still not set to {} after {} ms", expected_rdy, m_ready_timeout.to_milliseconds());
                return false;
            }
            break;
        }
    }
    return true;
}

bool NVMeController::reset_controller()
{
    if ((m_controller_regs->cc & (1 << CC_EN_BIT)) != 0) {
        // If the EN bit is already set, we need to wait
        // until the RDY bit is 1, otherwise the behavior is undefined
        if (!wait_for_ready(true))
            return false;
    }

    auto cc = m_controller_regs->cc;

    cc = cc & ~(1 << CC_EN_BIT);

    m_controller_regs->cc = cc;

    full_memory_barrier();

    // Wait until the RDY bit is cleared
    if (!wait_for_ready(false))
        return false;

    return true;
}

bool NVMeController::start_controller()
{
    if (!(m_controller_regs->cc & (1 << CC_EN_BIT))) {
        // If the EN bit is not already set, we need to wait
        // until the RDY bit is 0, otherwise the behavior is undefined
        if (!wait_for_ready(false))
            return false;
    }

    auto cc = m_controller_regs->cc;

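    // CC.IOSQES and CC.IOCQES take log2 of the I/O submission and completion queue entry sizes
    // (64-byte and 16-byte entries in the base spec), which SQ_WIDTH and CQ_WIDTH encode.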
    cc = cc | (1 << CC_EN_BIT);
    cc = cc | (CQ_WIDTH << CC_IOCQES_BIT);
    cc = cc | (SQ_WIDTH << CC_IOSQES_BIT);

    m_controller_regs->cc = cc;

    full_memory_barrier();

    // Wait until the RDY bit is set
    if (!wait_for_ready(true))
        return false;

    return true;
}

UNMAP_AFTER_INIT u32 NVMeController::get_admin_q_dept()
{
    u32 aqa = m_controller_regs->aqa;
    // Queue depth is 0 based
    u32 q_depth = min(ACQ_SIZE(aqa), ASQ_SIZE(aqa)) + 1;
    dbgln_if(NVME_DEBUG, "NVMe: Admin queue depth is {}", q_depth);
    return q_depth;
}

UNMAP_AFTER_INIT ErrorOr<void> NVMeController::identify_and_init_namespaces()
{

    RefPtr<Memory::PhysicalPage> prp_dma_buffer;
    OwnPtr<Memory::Region> prp_dma_region;
    auto namespace_data_struct = TRY(ByteBuffer::create_zeroed(NVMe_IDENTIFY_SIZE));
    u32 active_namespace_list[NVMe_IDENTIFY_SIZE / sizeof(u32)];

    {
        auto buffer = TRY(MM.allocate_dma_buffer_page("Identify PRP"sv, Memory::Region::Access::ReadWrite, prp_dma_buffer));
        prp_dma_region = move(buffer);
    }

    // Get the active namespace
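    // An Identify command with CNS 0x02 fills the buffer with a list of active namespace IDs;
    // unused entries are zero, so a zero NSID terminates the list.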
    {
        NVMeSubmission sub {};
        u16 status = 0;
        sub.op = OP_ADMIN_IDENTIFY;
        sub.identify.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(prp_dma_buffer->paddr().as_ptr()));
        sub.identify.cns = NVMe_CNS_ID_ACTIVE_NS & 0xff;
        status = submit_admin_command(sub, true);
        if (status) {
            dmesgln("Failed to identify active namespace command");
            return EFAULT;
        }
        if (void* fault_at; !safe_memcpy(active_namespace_list, prp_dma_region->vaddr().as_ptr(), NVMe_IDENTIFY_SIZE, fault_at)) {
            return EFAULT;
        }
    }
    // Get the NAMESPACE attributes
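    // For every active NSID, an Identify command with CNS 0x00 returns that namespace's
    // Identify Namespace data structure, which tells us its capacity and LBA format.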
    {
        NVMeSubmission sub {};
        IdentifyNamespace id_ns {};
        u16 status = 0;
        for (auto nsid : active_namespace_list) {
            memset(prp_dma_region->vaddr().as_ptr(), 0, NVMe_IDENTIFY_SIZE);
            // Invalid NS
            if (nsid == 0)
                break;
            sub.op = OP_ADMIN_IDENTIFY;
            sub.identify.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(prp_dma_buffer->paddr().as_ptr()));
            sub.identify.cns = NVMe_CNS_ID_NS & 0xff;
            sub.identify.nsid = nsid;
            status = submit_admin_command(sub, true);
            if (status) {
                dmesgln("Failed to identify namespace with nsid {}", nsid);
                return EFAULT;
            }
            static_assert(sizeof(IdentifyNamespace) == NVMe_IDENTIFY_SIZE);
            if (void* fault_at; !safe_memcpy(&id_ns, prp_dma_region->vaddr().as_ptr(), NVMe_IDENTIFY_SIZE, fault_at)) {
                return EFAULT;
            }
            auto val = get_ns_features(id_ns);
            auto block_counts = val.get<0>();
            auto block_size = 1 << val.get<1>();

            dbgln_if(NVME_DEBUG, "NVMe: Block count is {} and Block size is {}", block_counts, block_size);

            m_namespaces.append(TRY(NVMeNameSpace::try_create(*this, m_queues, nsid, block_counts, block_size)));
            m_device_count++;
            dbgln_if(NVME_DEBUG, "NVMe: Initialized namespace with NSID: {}", nsid);
        }
    }
    return {};
}

UNMAP_AFTER_INIT Tuple<u64, u8> NVMeController::get_ns_features(IdentifyNamespace& identify_data_struct)
{
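    // FLBAS (bits 3:0) picks which LBA format table entry is in use; the entry's LBADS field
    // (bits 23:16) is log2 of the LBA data size, and NSZE is the namespace size in logical blocks.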
    auto flbas = identify_data_struct.flbas & FLBA_SIZE_MASK;
    auto namespace_size = identify_data_struct.nsze;
    auto lba_format = identify_data_struct.lbaf[flbas];

    auto lba_size = (lba_format & LBA_SIZE_MASK) >> 16;
    return Tuple<u64, u8>(namespace_size, lba_size);
}

LockRefPtr<StorageDevice> NVMeController::device(u32 index) const
{
    return m_namespaces.at(index);
}

size_t NVMeController::devices_count() const
{
    return m_device_count;
}

bool NVMeController::reset()
{
    if (!reset_controller())
        return false;
    if (!start_controller())
        return false;
    return true;
}

bool NVMeController::shutdown()
{
    TODO();
    return false;
}

void NVMeController::complete_current_request([[maybe_unused]] AsyncDeviceRequest::RequestResult result)
{
    VERIFY_NOT_REACHED();
}

UNMAP_AFTER_INIT ErrorOr<void> NVMeController::create_admin_queue(Optional<u8> irq)
{
    auto qdepth = get_admin_q_dept();
    OwnPtr<Memory::Region> cq_dma_region;
    NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_pages;
    OwnPtr<Memory::Region> sq_dma_region;
    NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_pages;
    auto cq_size = round_up_to_power_of_two(CQ_SIZE(qdepth), 4096);
    auto sq_size = round_up_to_power_of_two(SQ_SIZE(qdepth), 4096);
    if (!reset_controller()) {
        dmesgln("Failed to reset the NVMe controller");
        return EFAULT;
    }
    {
        auto buffer = TRY(MM.allocate_dma_buffer_pages(cq_size, "Admin CQ queue"sv, Memory::Region::Access::ReadWrite, cq_dma_pages));
        cq_dma_region = move(buffer);
    }

    // Phase bit is important to determine completion, so zero out the space
    // so that we don't get any garbage phase bit value
    memset(cq_dma_region->vaddr().as_ptr(), 0, cq_size);

    {
        auto buffer = TRY(MM.allocate_dma_buffer_pages(sq_size, "Admin SQ queue"sv, Memory::Region::Access::ReadWrite, sq_dma_pages));
        sq_dma_region = move(buffer);
    }
    auto doorbell_regs = TRY(Memory::map_typed_writable<DoorbellRegister volatile>(PhysicalAddress(m_bar + REG_SQ0TDBL_START)));

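    // ASQ and ACQ hold the physical base addresses of the admin submission and completion queues;
    // the spec only allows changing them while the controller is disabled (CC.EN = 0), which the reset above guarantees.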
    m_controller_regs->acq = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(cq_dma_pages.first().paddr().as_ptr()));
    m_controller_regs->asq = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(sq_dma_pages.first().paddr().as_ptr()));

    if (!start_controller()) {
        dmesgln("Failed to restart the NVMe controller");
        return EFAULT;
    }
    set_admin_queue_ready_flag();
    m_admin_queue = TRY(NVMeQueue::try_create(0, irq, qdepth, move(cq_dma_region), cq_dma_pages, move(sq_dma_region), sq_dma_pages, move(doorbell_regs)));

    dbgln_if(NVME_DEBUG, "NVMe: Admin queue created");
    return {};
}

UNMAP_AFTER_INIT ErrorOr<void> NVMeController::create_io_queue(u8 qid, Optional<u8> irq)
{
    OwnPtr<Memory::Region> cq_dma_region;
    NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_pages;
    OwnPtr<Memory::Region> sq_dma_region;
    NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_pages;
    auto cq_size = round_up_to_power_of_two(CQ_SIZE(IO_QUEUE_SIZE), 4096);
    auto sq_size = round_up_to_power_of_two(SQ_SIZE(IO_QUEUE_SIZE), 4096);

    {
        auto buffer = TRY(MM.allocate_dma_buffer_pages(cq_size, "IO CQ queue"sv, Memory::Region::Access::ReadWrite, cq_dma_pages));
        cq_dma_region = move(buffer);
    }

    // Phase bit is important to determine completion, so zero out the space
    // so that we don't get any garbage phase bit value
    memset(cq_dma_region->vaddr().as_ptr(), 0, cq_size);

    {
        auto buffer = TRY(MM.allocate_dma_buffer_pages(sq_size, "IO SQ queue"sv, Memory::Region::Access::ReadWrite, sq_dma_pages));
        sq_dma_region = move(buffer);
    }

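    // Create the completion queue first: the Create I/O Submission Queue command below refers to it
    // by cqid, so the CQ has to exist before the SQ.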
    {
        NVMeSubmission sub {};
        sub.op = OP_ADMIN_CREATE_COMPLETION_QUEUE;
        sub.create_cq.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(cq_dma_pages.first().paddr().as_ptr()));
        sub.create_cq.cqid = qid;
        // The queue size is 0 based
        sub.create_cq.qsize = AK::convert_between_host_and_little_endian(IO_QUEUE_SIZE - 1);
        auto flags = irq.has_value() ? QUEUE_IRQ_ENABLED : QUEUE_IRQ_DISABLED;
        flags |= QUEUE_PHY_CONTIGUOUS;
        // TODO: Eventually move to MSI.
        // For now using pin based interrupts. Clear the first 16 bits
        // to use pin-based interrupts.
        sub.create_cq.cq_flags = AK::convert_between_host_and_little_endian(flags & 0xFFFF);
        submit_admin_command(sub, true);
    }
    {
        NVMeSubmission sub {};
        sub.op = OP_ADMIN_CREATE_SUBMISSION_QUEUE;
        sub.create_sq.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(sq_dma_pages.first().paddr().as_ptr()));
        sub.create_sq.sqid = qid;
        // The queue size is 0 based
        sub.create_sq.qsize = AK::convert_between_host_and_little_endian(IO_QUEUE_SIZE - 1);
        auto flags = QUEUE_PHY_CONTIGUOUS;
        sub.create_sq.cqid = qid;
        sub.create_sq.sq_flags = AK::convert_between_host_and_little_endian(flags);
        submit_admin_command(sub, true);
    }

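    // Per the NVMe spec, the tail doorbell for submission queue y sits at the doorbell base plus
    // (2 * y) * (4 << CAP.DSTRD), with the matching completion queue head doorbell right after it;
    // REG_SQ0TDBL_START marks that base.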
    auto queue_doorbell_offset = REG_SQ0TDBL_START + ((2 * qid) * (4 << m_dbl_stride));
    auto doorbell_regs = TRY(Memory::map_typed_writable<DoorbellRegister volatile>(PhysicalAddress(m_bar + queue_doorbell_offset)));

    m_queues.append(TRY(NVMeQueue::try_create(qid, irq, IO_QUEUE_SIZE, move(cq_dma_region), cq_dma_pages, move(sq_dma_region), sq_dma_pages, move(doorbell_regs))));
    dbgln_if(NVME_DEBUG, "NVMe: Created IO Queue with QID{}", m_queues.size());
    return {};
}
}
|