From 99d5555134180d5f8795c31f1c3afbc012d8ec65 Mon Sep 17 00:00:00 2001
From: Max Wipfli <mail@maxwipfli.ch>
Date: Thu, 3 Jun 2021 12:36:02 +0200
Subject: [PATCH] AK: Do not trim away non-ASCII bytes when parsing URL

Because the bytes of non-ASCII code points have negative values (when
read as signed chars), the `<= 0x20` check also trimmed them away.
Trimming control characters therefore requires excluding negative byte
values.

This also adds a test case with a URL containing non-ASCII code points.
---
 AK/URLParser.cpp     | 4 ++--
 Tests/AK/TestURL.cpp | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp
index 06e3723b18..daedbc50c9 100644
--- a/AK/URLParser.cpp
+++ b/AK/URLParser.cpp
@@ -174,7 +174,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
     size_t start_index = 0;
     size_t end_index = raw_input.length();
     for (size_t i = 0; i < raw_input.length(); ++i) {
-        if (raw_input[i] <= 0x20) {
+        if (0 <= raw_input[i] && raw_input[i] <= 0x20) {
             ++start_index;
             has_validation_error = true;
         } else {
@@ -182,7 +182,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
         }
     }
     for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
-        if (raw_input[i] <= 0x20) {
+        if (0 <= raw_input[i] && raw_input[i] <= 0x20) {
             --end_index;
             has_validation_error = true;
         } else {
diff --git a/Tests/AK/TestURL.cpp b/Tests/AK/TestURL.cpp
index 9fff681525..1fc2a0679a 100644
--- a/Tests/AK/TestURL.cpp
+++ b/Tests/AK/TestURL.cpp
@@ -328,3 +328,12 @@ TEST_CASE(leading_and_trailing_whitespace)
     EXPECT(url.is_valid());
     EXPECT_EQ(url.to_string(), "https://foo.com/");
 }
+
+TEST_CASE(unicode)
+{
+    URL url { "http://example.com/_ünicöde_téxt_©" }; // Path contains non-ASCII characters and ends in one.
+    EXPECT(url.is_valid());
+    EXPECT_EQ(url.path(), "/_ünicöde_téxt_©"); // Non-ASCII bytes, including the trailing '©', must not be trimmed away.
+    EXPECT(url.query().is_null()); // The input has no '?' query part.
+    EXPECT(url.fragment().is_null()); // The input has no '#' fragment part.
+}