From 621f2b5c7a12e9ae20c78e8abd72c55ba8205e09 Mon Sep 17 00:00:00 2001
From: GTimothy <22472919+GTimothy@users.noreply.github.com>
Date: Mon, 24 Mar 2025 22:49:31 +0100
Subject: [PATCH 1/3] checksum/cksum: rewrite lineformat parsing without regex

removes dependency on the regex crate for LineFormat detection and
parsing, resulting in a faster and lighter cksum binary.
---
 src/uucore/Cargo.toml                   |   2 +-
 src/uucore/src/lib/features/checksum.rs | 254 ++++++++++++++++--------
 2 files changed, 176 insertions(+), 80 deletions(-)
diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml
index 57399bcac..c061ca75f 100644
--- a/src/uucore/Cargo.toml
+++ b/src/uucore/Cargo.toml
@@ -89,7 +89,7 @@ default = []
 # * non-default features
 backup-control = []
 colors = []
-checksum = ["data-encoding", "thiserror", "regex", "sum"]
+checksum = ["data-encoding", "thiserror", "sum"]
 encoding = ["data-encoding", "data-encoding-macro", "z85"]
 entries = ["libc"]
 fs = ["dunce", "libc", "winapi-util", "windows-sys"]
diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index e56eedd4b..25eddd0ac 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -2,11 +2,10 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
+// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
 
 use data_encoding::BASE64;
 use os_display::Quotable;
-use regex::bytes::{Match, Regex};
 use std::{
     borrow::Cow,
     ffi::OsStr,
@@ -15,7 +14,6 @@ use std::{
     io::{self, BufReader, Read, Write, stdin},
     path::Path,
     str,
-    sync::LazyLock,
 };
 
 use crate::{
@@ -466,36 +464,157 @@ pub fn detect_algo(algo: &str, length: Option<usize>) -> UResult<HashAlgorithm>
     }
 }
 
-// Regexp to handle the three input formats:
-// 1. <algo>[-<bits>] (<filename>) = <checksum>
-//    algo must be uppercase or b (for blake2b)
-// 2. <checksum> [* ]<filename>
-// 3. <checksum> [*]<filename> (only one space)
-const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[A-Za-z0-9+/]+={0,2})$";
-
-const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>(?-u:.*))$";
-
-// In this case, we ignore the *
-const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$";
-
-static R_ALGO_BASED: LazyLock<Regex> = LazyLock::new(|| Regex::new(ALGO_BASED_REGEX).unwrap());
-static R_DOUBLE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(DOUBLE_SPACE_REGEX).unwrap());
-static R_SINGLE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(SINGLE_SPACE_REGEX).unwrap());
-
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum LineFormat {
     AlgoBased,
     SingleSpace,
-    DoubleSpace,
+    Untagged,
 }
 
 impl LineFormat {
-    fn to_regex(self) -> &'static Regex {
-        match self {
-            LineFormat::AlgoBased => &R_ALGO_BASED,
-            LineFormat::SingleSpace => &R_SINGLE_SPACE,
-            LineFormat::DoubleSpace => &R_DOUBLE_SPACE,
+    /// parse [tagged output format]
+    /// Normally the format is simply space separated but openssl does not
+    /// respect the gnu definition.
+    ///
+    /// [tagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
+    fn parse_algo_based(line: &[u8]) -> Option<LineInfo> {
+        //   r"\MD5 (a\\ b) = abc123",
+        //   BLAKE2b(44)= a45a4c4883cce4b50d844fab460414cc2080ca83690e74d850a9253e757384366382625b218c8585daee80f34dc9eb2f2fde5fb959db81cd48837f9216e7b0fa
+        let trimmed = line.trim_ascii_start();
+        let algo_start = if trimmed.starts_with(b"\\") { 1 } else { 0 };
+        let rest = &trimmed[algo_start..];
+
+        // find the next parenthesis  using byte search (not next whitespace) because openssl's
+        // tagged format does not put a space before (filename)
+        let par_idx = rest.iter().position(|&b| b == b'(')?;
+        let algo_substring = &rest[..par_idx].trim_ascii();
+        let mut algo_parts = algo_substring.splitn(2, |&b| b == b'-');
+        let algo = algo_parts.next()?;
+
+        // Parse algo_bits if present
+        let algo_bits = algo_parts
+            .next()
+            .and_then(|s| std::str::from_utf8(s).ok()?.parse::<usize>().ok());
+
+        // Check algo format: uppercase ASCII or digits or "BLAKE2b"
+        let is_valid_algo = algo == b"BLAKE2b"
+            || algo
+                .iter()
+                .all(|&b| b.is_ascii_uppercase() || b.is_ascii_digit());
+        if !is_valid_algo {
+            return None;
         }
+        // SAFETY: we just validated the contents of algo, we can unsafely make a
+        // String from it
+        let algo_utf8 = unsafe { String::from_utf8_unchecked(algo.to_vec()) };
+        // stripping '(' not ' (' since we matched on ( not whitespace because of openssl.
+        let after_paren = rest.get(par_idx + 1..)?;
+        let (filename, checksum) = ByteSliceExt::split_once(after_paren, b") = ")
+            .or_else(|| ByteSliceExt::split_once(after_paren, b")= "))?;
+
+        fn is_valid_checksum(checksum: &[u8]) -> bool {
+            if checksum.is_empty() {
+                return false;
+            }
+
+            let mut parts = checksum.splitn(2, |&b| b == b'=');
+            let main = parts.next().unwrap(); // Always exists since checksum isn't empty
+            let padding = parts.next().unwrap_or(&b""[..]); // Empty if no '='
+
+            main.iter()
+                .all(|&b| b.is_ascii_alphanumeric() || b == b'+' || b == b'/')
+                && !main.is_empty()
+                && padding.len() <= 2
+                && padding.iter().all(|&b| b == b'=')
+        }
+        if !is_valid_checksum(checksum) {
+            return None;
+        }
+        // SAFETY: we just validated the contents of checksum, we can unsafely make a
+        // String from it
+        let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
+
+        Some(LineInfo {
+            algo_name: Some(algo_utf8),
+            algo_bit_len: algo_bits,
+            checksum: checksum_utf8,
+            filename: filename.to_vec(),
+            format: LineFormat::AlgoBased,
+        })
+    }
+
+    #[allow(rustdoc::invalid_html_tags)]
+    /// parse [untagged output format]
+    /// The format is simple, either "<checksum>  <filename>" or
+    /// "<checksum> *<filename>"
+    ///
+    /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
+    fn parse_untagged(line: &[u8]) -> Option<LineInfo> {
+        let space_idx = line.iter().position(|&b| b == b' ')?;
+        let checksum = &line[..space_idx];
+        if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() {
+            return None;
+        }
+        // SAFETY: we just validated the contents of checksum, we can unsafely make a
+        // String from it
+        let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
+
+        let rest = &line[space_idx..];
+        let filename = rest
+            .strip_prefix(b"  ")
+            .or_else(|| rest.strip_prefix(b" *"))?;
+
+        Some(LineInfo {
+            algo_name: None,
+            algo_bit_len: None,
+            checksum: checksum_utf8,
+            filename: filename.to_vec(),
+            format: LineFormat::Untagged,
+        })
+    }
+
+    #[allow(rustdoc::invalid_html_tags)]
+    /// parse [untagged output format]
+    /// Normally the format is simple, either "<checksum>  <filename>" or
+    /// "<checksum> *<filename>"
+    /// But the bsd tests expect special single space behavior where
+    /// checksum and filename are separated only by a space, meaning the second
+    /// space or asterisk is part of the file name.
+    /// This parser accounts for this variation
+    ///
+    /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
+    fn parse_single_space(line: &[u8]) -> Option<LineInfo> {
+        // Find first space
+        let space_idx = line.iter().position(|&b| b == b' ')?;
+        let checksum = &line[..space_idx];
+        if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() {
+            return None;
+        }
+        // SAFETY: we just validated the contents of checksum, we can unsafely make a
+        // String from it
+        let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
+
+        let filename = line.get(space_idx + 1..)?; // Skip single space
+
+        Some(LineInfo {
+            algo_name: None,
+            algo_bit_len: None,
+            checksum: checksum_utf8,
+            filename: filename.to_vec(),
+            format: LineFormat::SingleSpace,
+        })
+    }
+}
+
+// Helper trait for byte slice operations
+trait ByteSliceExt {
+    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>;
+}
+
+impl ByteSliceExt for [u8] {
+    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> {
+        let pos = self.windows(pattern.len()).position(|w| w == pattern)?;
+        Some((&self[..pos], &self[pos + pattern.len()..]))
     }
 }
 
@@ -505,62 +624,39 @@ struct LineInfo {
     algo_bit_len: Option<usize>,
     checksum: String,
     filename: Vec<u8>,
-
     format: LineFormat,
 }
 
 impl LineInfo {
     /// Returns a `LineInfo` parsed from a checksum line.
-    /// The function will run 3 regexes against the line and select the first one that matches
+    /// The function will run 3 parsers against the line and select the first one that matches
     /// to populate the fields of the struct.
-    /// However, there is a catch to handle regarding the handling of `cached_regex`.
-    /// In case of non-algo-based regex, if `cached_regex` is Some, it must take the priority
-    /// over the detected regex. Otherwise, we must set it the the detected regex.
+    /// However, there is a catch to handle regarding the handling of `cached_line_format`.
+    /// In case of non-algo-based format, if `cached_line_format` is Some, it must take the priority
+    /// over the detected format. Otherwise, we must set it the the detected format.
     /// This specific behavior is emphasized by the test
     /// `test_hashsum::test_check_md5sum_only_one_space`.
-    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<LineFormat>) -> Option<Self> {
-        let regexes: &[(&'static Regex, LineFormat)] = &[
-            (&R_ALGO_BASED, LineFormat::AlgoBased),
-            (&R_DOUBLE_SPACE, LineFormat::DoubleSpace),
-            (&R_SINGLE_SPACE, LineFormat::SingleSpace),
-        ];
+    fn parse(s: impl AsRef<OsStr>, cached_line_format: &mut Option<LineFormat>) -> Option<Self> {
+        let line_bytes = os_str_as_bytes(s.as_ref()).ok()?;
 
-        let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
-
-        for (regex, format) in regexes {
-            if !regex.is_match(line_bytes) {
-                continue;
-            }
-
-            let mut r = *regex;
-            if *format != LineFormat::AlgoBased {
-                // The cached regex ensures that when processing non-algo based regexes,
-                // it cannot be changed (can't have single and double space regexes
-                // used in the same file).
-                if cached_regex.is_some() {
-                    r = cached_regex.unwrap().to_regex();
-                } else {
-                    *cached_regex = Some(*format);
-                }
-            }
-
-            if let Some(caps) = r.captures(line_bytes) {
-                // These unwraps are safe thanks to the regex
-                let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap();
-
-                return Some(Self {
-                    algo_name: caps.name("algo").map(match_to_string),
-                    algo_bit_len: caps
-                        .name("bits")
-                        .map(|m| match_to_string(m).parse::<usize>().unwrap()),
-                    checksum: caps.name("checksum").map(match_to_string).unwrap(),
-                    filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
-                    format: *format,
-                });
-            }
+        if let Some(info) = LineFormat::parse_algo_based(line_bytes) {
+            return Some(info);
+        }
+        if let Some(cached_format) = cached_line_format {
+            match cached_format {
+                LineFormat::Untagged => LineFormat::parse_untagged(line_bytes),
+                LineFormat::SingleSpace => LineFormat::parse_single_space(line_bytes),
+                _ => unreachable!("we never catch the algo based format"),
+            }
+        } else if let Some(info) = LineFormat::parse_untagged(line_bytes) {
+            *cached_line_format = Some(LineFormat::Untagged);
+            Some(info)
+        } else if let Some(info) = LineFormat::parse_single_space(line_bytes) {
+            *cached_line_format = Some(LineFormat::SingleSpace);
+            Some(info)
+        } else {
+            None
         }
-
-        None
     }
 }
 
@@ -835,7 +931,7 @@ fn process_checksum_line(
     cli_algo_name: Option<&str>,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
-    cached_regex: &mut Option<LineFormat>,
+    cached_line_format: &mut Option<LineFormat>,
     last_algo: &mut Option<String>,
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
@@ -847,14 +943,14 @@ fn process_checksum_line(
 
     // Use `LineInfo` to extract the data of a line.
     // Then, depending on its format, apply a different pre-treatment.
-    let Some(line_info) = LineInfo::parse(line, cached_regex) else {
+    let Some(line_info) = LineInfo::parse(line, cached_line_format) else {
         return Err(LineCheckError::ImproperlyFormatted);
     };
 
     if line_info.format == LineFormat::AlgoBased {
         process_algo_based_line(&line_info, cli_algo_name, opts, last_algo)
     } else if let Some(cli_algo) = cli_algo_name {
-        // If we match a non-algo based regex, we expect a cli argument
+        // If we match a non-algo based parser, we expect a cli argument
         // to give us the algorithm to use
         process_non_algo_based_line(i, &line_info, cli_algo, cli_algo_length, opts)
     } else {
@@ -890,9 +986,9 @@ fn process_checksum_file(
     let reader = BufReader::new(file);
     let lines = read_os_string_lines(reader).collect::<Vec<_>>();
 
-    // cached_regex is used to ensure that several non algo-based checksum line
-    // will use the same regex.
-    let mut cached_regex = None;
+    // cached_line_format is used to ensure that several non algo-based checksum line
+    // will use the same parser.
+    let mut cached_line_format = None;
     // last_algo caches the algorithm used in the last line to print a warning
     // message for the current line if improperly formatted.
     // Behavior tested in gnu_cksum_c::test_warn
@@ -905,7 +1001,7 @@ fn process_checksum_file(
             cli_algo_name,
             cli_algo_length,
             opts,
-            &mut cached_regex,
+            &mut cached_line_format,
             &mut last_algo,
         );
 
@@ -1381,7 +1477,7 @@ mod tests {
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
-        assert_eq!(line_info.format, LineFormat::DoubleSpace);
+        assert_eq!(line_info.format, LineFormat::Untagged);
         assert!(cached_regex.is_some());
 
         cached_regex = None;

From 04ad55510b1ed622364ea77afaaec897fdec2c75 Mon Sep 17 00:00:00 2001
From: GTimothy <22472919+GTimothy@users.noreply.github.com>
Date: Tue, 25 Mar 2025 19:19:25 +0100
Subject: [PATCH 2/3] checksum/cksum: update tests to test new parsers not
 regex

---
 src/uucore/src/lib/features/checksum.rs | 100 ++++++++++++------------
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index 25eddd0ac..020fa43bb 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -1341,8 +1341,7 @@ mod tests {
     }
 
     #[test]
-    fn test_algo_based_regex() {
-        let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap();
+    fn test_algo_based_parser() {
         #[allow(clippy::type_complexity)]
         let test_cases: &[(&[u8], Option<(&[u8], Option<&[u8]>, &[u8], &[u8])>)] = &[
             (b"SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some((b"SHA256", None, b"example.txt", b"d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))),
@@ -1353,27 +1352,30 @@ mod tests {
         ];
 
         for (input, expected) in test_cases {
-            let captures = algo_based_regex.captures(input);
+            let line_info = LineFormat::parse_algo_based(input);
             match expected {
                 Some((algo, bits, filename, checksum)) => {
-                    assert!(captures.is_some());
-                    let captures = captures.unwrap();
-                    assert_eq!(&captures.name("algo").unwrap().as_bytes(), algo);
-                    assert_eq!(&captures.name("bits").map(|m| m.as_bytes()), bits);
-                    assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename);
-                    assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum);
+                    assert!(line_info.is_some());
+                    let line_info = line_info.unwrap();
+                    assert_eq!(&line_info.algo_name.unwrap().as_bytes(), algo);
+                    assert_eq!(
+                        line_info
+                            .algo_bit_len
+                            .map(|m| m.to_string().as_bytes().to_owned()),
+                        bits.map(|b| b.to_owned())
+                    );
+                    assert_eq!(&line_info.filename, filename);
+                    assert_eq!(&line_info.checksum.as_bytes(), checksum);
                 }
                 None => {
-                    assert!(captures.is_none());
+                    assert!(line_info.is_none());
                 }
             }
         }
     }
 
     #[test]
-    fn test_double_space_regex() {
-        let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap();
-
+    fn test_double_space_parser() {
         #[allow(clippy::type_complexity)]
         let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[
             (
@@ -1400,24 +1402,23 @@ mod tests {
         ];
 
         for (input, expected) in test_cases {
-            let captures = double_space_regex.captures(input);
+            let line_info = LineFormat::parse_untagged(input);
             match expected {
                 Some((checksum, filename)) => {
-                    assert!(captures.is_some());
-                    let captures = captures.unwrap();
-                    assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum);
-                    assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename);
+                    assert!(line_info.is_some());
+                    let line_info = line_info.unwrap();
+                    assert_eq!(&line_info.filename, filename);
+                    assert_eq!(&line_info.checksum.as_bytes(), checksum);
                 }
                 None => {
-                    assert!(captures.is_none());
+                    assert!(line_info.is_none());
                 }
             }
         }
     }
 
     #[test]
-    fn test_single_space_regex() {
-        let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap();
+    fn test_single_space_parser() {
         #[allow(clippy::type_complexity)]
         let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[
             (
@@ -1440,16 +1441,16 @@ mod tests {
         ];
 
         for (input, expected) in test_cases {
-            let captures = single_space_regex.captures(input);
+            let line_info = LineFormat::parse_single_space(input);
             match expected {
                 Some((checksum, filename)) => {
-                    assert!(captures.is_some());
-                    let captures = captures.unwrap();
-                    assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum);
-                    assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename);
+                    assert!(line_info.is_some());
+                    let line_info = line_info.unwrap();
+                    assert_eq!(&line_info.filename, filename);
+                    assert_eq!(&line_info.checksum.as_bytes(), checksum);
                 }
                 None => {
-                    assert!(captures.is_none());
+                    assert!(line_info.is_none());
                 }
             }
         }
@@ -1457,68 +1458,69 @@ mod tests {
 
     #[test]
     fn test_line_info() {
-        let mut cached_regex = None;
+        let mut cached_line_format = None;
 
-        // Test algo-based regex
+        // Test algo-based parser
         let line_algo_based =
             OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
-        let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap();
+        let line_info = LineInfo::parse(&line_algo_based, &mut cached_line_format).unwrap();
         assert_eq!(line_info.algo_name.as_deref(), Some("MD5"));
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
         assert_eq!(line_info.format, LineFormat::AlgoBased);
-        assert!(cached_regex.is_none());
+        assert!(cached_line_format.is_none());
 
-        // Test double-space regex
+        // Test double-space parser
         let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e  example.txt");
-        let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap();
+        let line_info = LineInfo::parse(&line_double_space, &mut cached_line_format).unwrap();
         assert!(line_info.algo_name.is_none());
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
         assert_eq!(line_info.format, LineFormat::Untagged);
-        assert!(cached_regex.is_some());
+        assert!(cached_line_format.is_some());
 
-        cached_regex = None;
+        cached_line_format = None;
 
-        // Test single-space regex
+        // Test single-space parser
         let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt");
-        let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap();
+        let line_info = LineInfo::parse(&line_single_space, &mut cached_line_format).unwrap();
         assert!(line_info.algo_name.is_none());
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
         assert_eq!(line_info.format, LineFormat::SingleSpace);
-        assert!(cached_regex.is_some());
+        assert!(cached_line_format.is_some());
 
-        cached_regex = None;
+        cached_line_format = None;
 
         // Test invalid checksum line
         let line_invalid = OsString::from("invalid checksum line");
-        assert!(LineInfo::parse(&line_invalid, &mut cached_regex).is_none());
-        assert!(cached_regex.is_none());
+        assert!(LineInfo::parse(&line_invalid, &mut cached_line_format).is_none());
+        assert!(cached_line_format.is_none());
 
         // Test leading space before checksum line
         let line_algo_based_leading_space =
             OsString::from("   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
-        let line_info = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex).unwrap();
+        let line_info =
+            LineInfo::parse(&line_algo_based_leading_space, &mut cached_line_format).unwrap();
         assert_eq!(line_info.format, LineFormat::AlgoBased);
-        assert!(cached_regex.is_none());
+        assert!(cached_line_format.is_none());
 
         // Test trailing space after checksum line (should fail)
         let line_algo_based_leading_space =
             OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e ");
-        let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
+        let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_line_format);
         assert!(res.is_none());
-        assert!(cached_regex.is_none());
+        assert!(cached_line_format.is_none());
     }
 
     #[test]
     fn test_get_expected_digest() {
         let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=");
-        let mut cached_regex = None;
-        let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
+        let mut cached_line_format = None;
+        let line_info = LineInfo::parse(&line, &mut cached_line_format).unwrap();
 
         let result = get_expected_digest_as_hex_string(&line_info, None);
 
@@ -1532,8 +1534,8 @@ mod tests {
     fn test_get_expected_checksum_invalid() {
         // The line misses a '=' at the end to be valid base64
         let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU");
-        let mut cached_regex = None;
-        let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
+        let mut cached_line_format = None;
+        let line_info = LineInfo::parse(&line, &mut cached_line_format).unwrap();
 
         let result = get_expected_digest_as_hex_string(&line_info, None);
 

From 09a9dc72b9398b869b0cacd80bed1ff4d84ff5f0 Mon Sep 17 00:00:00 2001
From: GTimothy <22472919+GTimothy@users.noreply.github.com>
Date: Thu, 27 Mar 2025 12:02:52 +0100
Subject: [PATCH 3/3] checksum/cksum: fix: filename that include separator
 should parse + add tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fixes this non-regex implementation's flaw with file_names containing
the separator's pattern:
 - replaces left-to-right greedy separator match with right-to-left one.
 - added bugfix tests
fixes secondary bug: positive match on hybrid posix-openssl format
adds secondary bugfix tests

Co-authored-by: Dorian Péron
<72708393+RenjiSann@users.noreply.github.com>
---
 src/uucore/src/lib/features/checksum.rs | 90 ++++++++++++++++++++-----
 1 file changed, 75 insertions(+), 15 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index 020fa43bb..e9f027e1d 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -2,7 +2,7 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
+// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit rsplit
 
 use data_encoding::BASE64;
 use os_display::Quotable;
@@ -484,10 +484,24 @@ impl LineFormat {
         let algo_start = if trimmed.starts_with(b"\\") { 1 } else { 0 };
         let rest = &trimmed[algo_start..];
 
+        enum SubCase {
+            Posix,
+            OpenSSL,
+        }
         // find the next parenthesis  using byte search (not next whitespace) because openssl's
         // tagged format does not put a space before (filename)
+
         let par_idx = rest.iter().position(|&b| b == b'(')?;
-        let algo_substring = &rest[..par_idx].trim_ascii();
+        let sub_case = if rest[par_idx - 1] == b' ' {
+            SubCase::Posix
+        } else {
+            SubCase::OpenSSL
+        };
+
+        let algo_substring = match sub_case {
+            SubCase::Posix => &rest[..par_idx - 1],
+            SubCase::OpenSSL => &rest[..par_idx],
+        };
         let mut algo_parts = algo_substring.splitn(2, |&b| b == b'-');
         let algo = algo_parts.next()?;
 
@@ -509,8 +523,10 @@ impl LineFormat {
         let algo_utf8 = unsafe { String::from_utf8_unchecked(algo.to_vec()) };
         // stripping '(' not ' (' since we matched on ( not whitespace because of openssl.
         let after_paren = rest.get(par_idx + 1..)?;
-        let (filename, checksum) = ByteSliceExt::split_once(after_paren, b") = ")
-            .or_else(|| ByteSliceExt::split_once(after_paren, b")= "))?;
+        let (filename, checksum) = match sub_case {
+            SubCase::Posix => ByteSliceExt::rsplit_once(after_paren, b") = ")?,
+            SubCase::OpenSSL => ByteSliceExt::rsplit_once(after_paren, b")= ")?,
+        };
 
         fn is_valid_checksum(checksum: &[u8]) -> bool {
             if checksum.is_empty() {
@@ -608,13 +624,20 @@ impl LineFormat {
 
 // Helper trait for byte slice operations
 trait ByteSliceExt {
-    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>;
+    /// Look for a pattern from right to left, return surrounding parts if found.
+    fn rsplit_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>;
 }
 
 impl ByteSliceExt for [u8] {
-    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> {
-        let pos = self.windows(pattern.len()).position(|w| w == pattern)?;
-        Some((&self[..pos], &self[pos + pattern.len()..]))
+    fn rsplit_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> {
+        let pos = self
+            .windows(pattern.len())
+            .rev()
+            .position(|w| w == pattern)?;
+        Some((
+            &self[..self.len() - pattern.len() - pos],
+            &self[self.len() - pos..],
+        ))
     }
 }
 
@@ -1345,30 +1368,67 @@ mod tests {
         #[allow(clippy::type_complexity)]
         let test_cases: &[(&[u8], Option<(&[u8], Option<&[u8]>, &[u8], &[u8])>)] = &[
             (b"SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some((b"SHA256", None, b"example.txt", b"d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))),
-            // cspell:disable-next-line
+            // cspell:disable
             (b"BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some((b"BLAKE2b", Some(b"512"), b"file", b"abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))),
             (b" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some((b"MD5", None, b"test", b"9e107d9d372bb6826bd81d3542a419d6"))),
             (b"SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some((b"SHA", Some(b"1"), b"anotherfile", b"a9993e364706816aba3e25717850c26c9cd0d89d"))),
+            (b" MD5 (anothertest) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"anothertest", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5(anothertest2) = fds65dsf46as5df4d6f54asds5d7f7g9", None),
+            (b" MD5(weirdfilename0)= stillfilename)= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename0)= stillfilename", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5(weirdfilename1)= )= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename1)= ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5(weirdfilename2) = )= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename2) = ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5 (weirdfilename3)= ) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename3)= ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5 (weirdfilename4) = ) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename4) = ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))),
+            (b" MD5(weirdfilename5)= ) = fds65dsf46as5df4d6f54asds5d7f7g9", None),
+            (b" MD5(weirdfilename6) = ) = fds65dsf46as5df4d6f54asds5d7f7g9", None),
+            (b" MD5 (weirdfilename7)= )= fds65dsf46as5df4d6f54asds5d7f7g9", None),
+            (b" MD5 (weirdfilename8) = )= fds65dsf46as5df4d6f54asds5d7f7g9", None),
         ];
 
+        // cspell:enable
         for (input, expected) in test_cases {
             let line_info = LineFormat::parse_algo_based(input);
             match expected {
                 Some((algo, bits, filename, checksum)) => {
-                    assert!(line_info.is_some());
+                    assert!(
+                        line_info.is_some(),
+                        "expected Some, got None for {}",
+                        String::from_utf8_lossy(filename)
+                    );
                     let line_info = line_info.unwrap();
-                    assert_eq!(&line_info.algo_name.unwrap().as_bytes(), algo);
+                    assert_eq!(
+                        &line_info.algo_name.unwrap().as_bytes(),
+                        algo,
+                        "failed for {}",
+                        String::from_utf8_lossy(filename)
+                    );
                     assert_eq!(
                         line_info
                             .algo_bit_len
                             .map(|m| m.to_string().as_bytes().to_owned()),
-                        bits.map(|b| b.to_owned())
+                        bits.map(|b| b.to_owned()),
+                        "failed for {}",
+                        String::from_utf8_lossy(filename)
+                    );
+                    assert_eq!(
+                        &line_info.filename,
+                        filename,
+                        "failed for {}",
+                        String::from_utf8_lossy(filename)
+                    );
+                    assert_eq!(
+                        &line_info.checksum.as_bytes(),
+                        checksum,
+                        "failed for {}",
+                        String::from_utf8_lossy(filename)
                     );
-                    assert_eq!(&line_info.filename, filename);
-                    assert_eq!(&line_info.checksum.as_bytes(), checksum);
                 }
                 None => {
-                    assert!(line_info.is_none());
+                    assert!(
+                        line_info.is_none(),
+                        "failed for {}",
+                        String::from_utf8_lossy(input)
+                    );
                 }
             }
         }