From 6613a4cb8cfade82a29be15fc68a1feb23ff5e3f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 9 Aug 2020 21:55:32 -0400 Subject: [PATCH] disasm: Insert symbol names in disassembly stream The symbol name insertion scheme is different from objdump -d's. Compare the output on Build/Userland/id: * disasm: ... _start (08048305-0804836b): 08048305 push ebp ... 08048366 call 0x0000df56 0804836b o16 nop 0804836d o16 nop 0804836f nop (deregister_tm_clones (08048370-08048370)) 08048370 mov eax, 0x080643e0 ... _ZN2AK8Utf8ViewC1ERKNS_6StringE (0805d9b2-0805d9b7): _ZN2AK8Utf8ViewC2ERKNS_6StringE (0805d9b2-0805d9b7): 0805d9b2 jmp 0x00014ff2 0805d9b7 nop * objdump -d: 08048305 <_start>: 8048305: 55 push %ebp ... 8048366: e8 9b dc 00 00 call 8056006 804836b: 66 90 xchg %ax,%ax 804836d: 66 90 xchg %ax,%ax 804836f: 90 nop 08048370 <deregister_tm_clones>: 8048370: b8 e0 43 06 08 mov $0x80643e0,%eax ... 0805d9b2 <_ZN2AK8Utf8ViewC1ERKNS_6StringE>: 805d9b2: e9 eb f6 ff ff jmp 805d0a2 <_ZN2AK10StringViewC1ERKNS_6StringE> 805d9b7: 90 nop Differences: 1. disasm can show multiple symbols that cover the same instructions. I've only seen this happen for C1/C2 (and D1/D2) ctor/dtor pairs, but it could conceivably happen with ICF as well. 2. disasm separates instructions that do not belong to a symbol with a newline, so that nop padding isn't shown as part of a function when it technically isn't. 3. disasm shows symbols that are skipped (due to having size 0) in parentheses, separated from preceding and following instructions. 
--- AK/StringView.h | 7 +++++ AK/Vector.h | 1 + Userland/disasm.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/AK/StringView.h b/AK/StringView.h index c61a99e352..4e8a652dec 100644 --- a/AK/StringView.h +++ b/AK/StringView.h @@ -158,6 +158,13 @@ public: return !(*this == other); } + bool operator<(const StringView& other) const + { + if (int c = __builtin_memcmp(m_characters, other.m_characters, min(m_length, other.m_length))) + return c < 0; + return m_length < other.m_length; + } + const StringImpl* impl() const { return m_impl; } String to_string() const; diff --git a/AK/Vector.h b/AK/Vector.h index 393b9d8b35..3cf47f160e 100644 --- a/AK/Vector.h +++ b/AK/Vector.h @@ -73,6 +73,7 @@ public: return *this; } ALWAYS_INLINE ElementType& operator*() { return m_vector[m_index]; } + ALWAYS_INLINE ElementType* operator->() { return &m_vector[m_index]; } size_t operator-(const VectorIterator& other) { return m_index - other.m_index; } bool is_end() const { return m_index == m_vector.size(); } diff --git a/Userland/disasm.cpp b/Userland/disasm.cpp index 80300b5920..0deaa01b45 100644 --- a/Userland/disasm.cpp +++ b/Userland/disasm.cpp @@ -26,12 +26,16 @@ #include #include +#include +#include #include #include #include #include #include +//#define DISASM_DUMP + int main(int argc, char** argv) { const char* path = nullptr; @@ -46,9 +50,22 @@ int main(int argc, char** argv) return 1; } + struct Symbol { + size_t value; + size_t size; + StringView name; + + size_t address() const { return value; } + size_t address_end() const { return value + size; } + + bool contains(size_t virtual_address) { return address() <= virtual_address && virtual_address < address_end(); } + }; + Vector symbols; + const u8* asm_data = (const u8*)file.data(); size_t asm_size = file.size(); size_t file_offset = 0; + Vector::Iterator current_symbol = symbols.begin(); if (asm_size >= 4 && strncmp((const char*)asm_data, "\u007fELF", 4) 
== 0) { if (auto elf = ELF::Loader::create(asm_data, asm_size)) { elf->image().for_each_section_of_type(SHT_PROGBITS, [&](const ELF::Image::Section& section) { @@ -60,18 +77,71 @@ int main(int argc, char** argv) file_offset = section.address(); return IterationDecision::Break; }); + symbols.ensure_capacity(elf->image().symbol_count() + 1); + symbols.append({ 0, 0, StringView() }); // Sentinel. + elf->image().for_each_symbol([&](const ELF::Image::Symbol& symbol) { + symbols.append({ symbol.value(), symbol.size(), symbol.name() }); + return IterationDecision::Continue; + }); + quick_sort(symbols, [](auto& a, auto& b) { + if (a.value != b.value) + return a.value < b.value; + if (a.size != b.size) + return a.size < b.size; + return a.name < b.name; + }); +#ifdef DISASM_DUMP + for (size_t i = 0; i < symbols.size(); ++i) + dbg() << symbols[i].name << ": " << (void*)(uintptr_t)symbols[i].value << ", " << symbols[i].size; +#endif } } X86::SimpleInstructionStream stream(asm_data, asm_size); X86::Disassembler disassembler(stream); + bool is_first_symbol = true; + bool current_instruction_is_in_symbol = false; + for (;;) { auto offset = stream.offset(); auto insn = disassembler.next(); if (!insn.has_value()) break; - out() << String::format("%08x", file_offset + offset) << " " << insn.value().to_string(offset); + + // Prefix regions of instructions belonging to a symbol with the symbol's name. + // Separate regions of instructions belonging to distinct symbols with newlines, + // and separate regions of instructions not belonging to symbols from regions belonging to symbols with newlines. 
+ // Interesting cases: + // - More than 1 symbol covering a region of instructions (ICF, D1/D2) + // - Symbols of size 0 that don't cover any instructions but are at an address (want to print them, separated from instructions both before and after) + // Invariant: current_symbol is the largest symbol containing insn, or it is the largest symbol that has an address less than the instruction's address. + size_t virtual_offset = file_offset + offset; + if (current_symbol < symbols.end() && !current_symbol->contains(virtual_offset)) { + if (!is_first_symbol && current_instruction_is_in_symbol) { + // The previous instruction was part of a symbol that doesn't cover the current instruction, so separate it from the current instruction with a newline. + out(); + current_instruction_is_in_symbol = (current_symbol + 1 < symbols.end() && (current_symbol + 1)->contains(virtual_offset)); + } + + // Try to find symbol covering current instruction, if one exists. + while (current_symbol + 1 < symbols.end() && !(current_symbol + 1)->contains(virtual_offset) && (current_symbol + 1)->address() <= virtual_offset) { + ++current_symbol; + if (!is_first_symbol) + out() << "\n(" << current_symbol->name << " (" << String::format("%08x-%08x", current_symbol->address(), current_symbol->address_end()) << "))\n"; + } + while (current_symbol + 1 < symbols.end() && (current_symbol + 1)->contains(virtual_offset)) { + if (!is_first_symbol && !current_instruction_is_in_symbol) + out(); + ++current_symbol; + current_instruction_is_in_symbol = true; + out() << current_symbol->name << " (" << String::format("%08x-%08x", current_symbol->address(), current_symbol->address_end()) << "):"; + } + + is_first_symbol = false; + } + + out() << String::format("%08x", virtual_offset) << " " << insn.value().to_string(offset); } return 0;