LibIMAP: Add quoted printable decoder

This is a very common encoding for e-mail. Gmail seems to encode all HTML e-mail in it. imap qp clang
2025-09-13 23:17:59 +00:00 · 2021-07-21 01:05:19 +01:00 · 2021-07-21 01:05:19 +01:00 · c63913b633
commit c63913b633
parent cc0914ae58
3 changed files with 107 additions and 1 deletions
--- a/Userland/Libraries/LibIMAP/CMakeLists.txt
+++ b/Userland/Libraries/LibIMAP/CMakeLists.txt
@ -1,4 +1,9 @@
-set(SOURCES Objects.cpp Client.cpp  Parser.cpp)
+set(SOURCES
+    Client.cpp
+    Objects.cpp
+    Parser.cpp
+    QuotedPrintable.cpp
+)

 set(GENERATED_SOURCES)

--- a/Userland/Libraries/LibIMAP/QuotedPrintable.cpp
+++ b/Userland/Libraries/LibIMAP/QuotedPrintable.cpp
@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/CharacterTypes.h>
+#include <AK/GenericLexer.h>
+#include <AK/StringBuilder.h>
+#include <LibIMAP/QuotedPrintable.h>
+
+namespace IMAP {
+
+// Returns true for octets that RFC 2045 section 6.7 does not allow in quoted-printable data:
+// anything above 0x7E (which includes DEL) and every ASCII control character other than
+// TAB, CR and LF. The decoder below drops such octets instead of copying them.
+static constexpr bool is_illegal_character(char c)
+{
+    return (u8)c > 0x7E || (is_ascii_control(c) && c != '\t' && c != '\r' && c != '\n');
+}
+
+// RFC 2045 Section 6.7 "Quoted-Printable Content-Transfer-Encoding", https://datatracker.ietf.org/doc/html/rfc2045#section-6.7
+//
+// Decodes the quoted-printable text in `input` and returns the raw bytes it represents:
+//  - "=XY", where X and Y are hex digits of either case, becomes the single byte 0xXY.
+//  - "=\r\n" is a soft line break and produces no output.
+//  - Illegal octets (see is_illegal_character) are skipped.
+//  - Everything else is copied through unchanged.
+// Malformed escapes never abort the process: following the RFC's advice for robust decoders,
+// the '=' and the character after it are emitted literally. Mail content is untrusted input,
+// so a stray '=' must not be able to crash the caller.
+ByteBuffer decode_quoted_printable(StringView const& input)
+{
+    GenericLexer lexer(input);
+    StringBuilder output;
+
+    // NOTE: The RFC says that encoded lines must not be longer than 76 characters.
+    //       However, the RFC says implementations can ignore this and parse as is,
+    //       which is the approach we're taking.
+
+    while (!lexer.is_eof()) {
+        char potential_character = lexer.consume();
+
+        if (is_illegal_character(potential_character))
+            continue;
+
+        if (potential_character != '=') {
+            output.append(potential_character);
+            continue;
+        }
+
+        // NOTE: peek() yields '\0' past the end of the input, which is neither a hex digit nor CR,
+        //       so truncated escapes fall through to the literal handling at the bottom.
+
+        // The RFC doesn't formally allow lowercase, but says implementations can treat lowercase the same as uppercase.
+        // Thus we can use is_ascii_hex_digit.
+        if (is_ascii_hex_digit(lexer.peek()) && is_ascii_hex_digit(lexer.peek(1))) {
+            char first_escape_character = lexer.consume();
+            char second_escape_character = lexer.consume();
+            u8 actual_character = (parse_ascii_hex_digit(first_escape_character) << 4) | parse_ascii_hex_digit(second_escape_character);
+            output.append(actual_character);
+            continue;
+        }
+
+        if (lexer.peek() == '\r' && lexer.peek(1) == '\n') {
+            // This is a soft line break. Don't append anything to the output.
+            lexer.ignore(2);
+            continue;
+        }
+
+        // Invalid escape sequence. RFC 2045 says a reasonable solution is just to append '=' followed by the character.
+        // This also covers a '=' that is the last or second-to-last character of the input.
+        output.append('=');
+        if (lexer.is_eof())
+            break;
+
+        // Only this one character is taken; an illegal one is dropped like anywhere else in the input.
+        char following_character = lexer.consume();
+        if (!is_illegal_character(following_character))
+            output.append(following_character);
+    }
+
+    return output.to_byte_buffer();
+}
+
+}
--- a/Userland/Libraries/LibIMAP/QuotedPrintable.h
+++ b/Userland/Libraries/LibIMAP/QuotedPrintable.h
@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2021, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/ByteBuffer.h>
+
+namespace IMAP {
+
+ByteBuffer decode_quoted_printable(StringView const&); // Decodes RFC 2045 (section 6.7) quoted-printable text into raw bytes.
+
+}