From 5cbe87620c380ec3d0681246b5819820f83df21c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <peron@adacore.com>
Date: Thu, 5 Dec 2024 15:57:01 +0100
Subject: [PATCH 1/8] checksum: move regex detection to the line level

---
 src/uucore/src/lib/features/checksum.rs | 109 ++++++++----------------
 1 file changed, 36 insertions(+), 73 deletions(-)
diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index f7228830b..34dc0f870 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -8,7 +8,7 @@ use data_encoding::BASE64;
 use os_display::Quotable;
 use regex::bytes::{Captures, Regex};
 use std::{
-    ffi::{OsStr, OsString},
+    ffi::OsStr,
     fmt::Display,
     fs::File,
     io::{self, stdin, BufReader, Read, Write},
@@ -130,9 +130,6 @@ enum FileCheckError {
     ImproperlyFormatted,
     /// reading of the checksum file failed
     CantOpenChecksumFile,
-    /// Algorithm detection was unsuccessful.
-    /// Either none is provided, or there is a conflict.
-    AlgoDetectionError,
 }
 
 impl From<Box<dyn UError>> for FileCheckError {
@@ -441,7 +438,7 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
 }
 
 /// Determines the appropriate regular expression to use based on the provided lines.
-fn determine_regex(lines: &[OsString]) -> Option<(Regex, bool)> {
+fn determine_regex(line: impl AsRef<OsStr>) -> Option<(Regex, bool)> {
     let regexes = [
         (Regex::new(ALGO_BASED_REGEX).unwrap(), true),
         (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
@@ -449,12 +446,10 @@ fn determine_regex(lines: &[OsString]) -> Option<(Regex, bool)> {
         (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), true),
     ];
 
-    for line in lines {
-        let line_bytes = os_str_as_bytes(line).expect("UTF-8 decoding failed");
-        for (regex, is_algo_based) in &regexes {
-            if regex.is_match(line_bytes) {
-                return Some((regex.clone(), *is_algo_based));
-            }
+    let line_bytes = os_str_as_bytes(line.as_ref()).expect("UTF-8 decoding failed");
+    for (regex, is_algo_based) in &regexes {
+        if regex.is_match(line_bytes) {
+            return Some((regex.clone(), *is_algo_based));
         }
     }
 
@@ -599,13 +594,20 @@ fn process_checksum_line(
     filename_input: &OsStr,
     line: &OsStr,
     i: usize,
-    chosen_regex: &Regex,
-    is_algo_based_format: bool,
     cli_algo_name: Option<&str>,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
+
+    // early return on empty or commented lines.
+    if line.is_empty() || line_bytes.starts_with(b"#") {
+        return Err(LineCheckError::Skipped);
+    }
+
+    let (chosen_regex, is_algo_based_format) =
+        determine_regex(line).ok_or(LineCheckError::ImproperlyFormatted)?;
+
     if let Some(caps) = chosen_regex.captures(line_bytes) {
         let mut filename_to_check = caps.name("filename").unwrap().as_bytes();
 
@@ -617,7 +619,7 @@ fn process_checksum_line(
             filename_to_check = &filename_to_check[1..];
         }
 
-        let expected_checksum = get_expected_digest_as_hex_string(&caps, chosen_regex)
+        let expected_checksum = get_expected_digest_as_hex_string(&caps, &chosen_regex)
             .ok_or(LineCheckError::ImproperlyFormatted)?;
 
         // If the algo_name is provided, we use it, otherwise we try to detect it
@@ -672,10 +674,6 @@ fn process_checksum_line(
             Err(LineCheckError::DigestMismatch)
         }
     } else {
-        if line.is_empty() || line_bytes.starts_with(b"#") {
-            // Don't show any warning for empty or commented lines.
-            return Err(LineCheckError::Skipped);
-        }
         if opts.warn {
             let algo = if let Some(algo_name_input) = cli_algo_name {
                 algo_name_input.to_uppercase()
@@ -723,19 +721,11 @@ fn process_checksum_file(
     let reader = BufReader::new(file);
     let lines = read_os_string_lines(reader).collect::<Vec<_>>();
 
-    let Some((chosen_regex, is_algo_based_format)) = determine_regex(&lines) else {
-        log_no_properly_formatted(get_filename_for_output(filename_input, input_is_stdin));
-        set_exit_code(1);
-        return Err(FileCheckError::AlgoDetectionError);
-    };
-
     for (i, line) in lines.iter().enumerate() {
         let line_result = process_checksum_line(
             filename_input,
             line,
             i,
-            &chosen_regex,
-            is_algo_based_format,
             cli_algo_name,
             cli_algo_length,
             opts,
@@ -816,8 +806,7 @@ where
         use FileCheckError::*;
         match process_checksum_file(filename_input, algo_name_input, length_input, opts) {
             Err(UError(e)) => return Err(e),
-            Err(ImproperlyFormatted) => break,
-            Err(CantOpenChecksumFile | AlgoDetectionError) | Ok(_) => continue,
+            Err(CantOpenChecksumFile | ImproperlyFormatted) | Ok(_) => continue,
         }
     }
 
@@ -926,6 +915,7 @@ pub fn escape_filename(filename: &Path) -> (String, &'static str) {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::ffi::OsString;
 
     #[test]
     fn test_unescape_filename() {
@@ -1161,66 +1151,39 @@ mod tests {
     #[test]
     fn test_determine_regex() {
         // Test algo-based regex
-        let lines_algo_based = ["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"]
-            .iter()
-            .map(|s| OsString::from(s.to_string()))
-            .collect::<Vec<_>>();
-        let (regex, algo_based) = determine_regex(&lines_algo_based).unwrap();
+        let line_algo_based =
+            OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
+        let (regex, algo_based) = determine_regex(&line_algo_based).unwrap();
         assert!(algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&lines_algo_based[0]).unwrap()));
+        assert!(regex.is_match(os_str_as_bytes(&line_algo_based).unwrap()));
 
         // Test double-space regex
-        let lines_double_space = ["d41d8cd98f00b204e9800998ecf8427e  example.txt"]
-            .iter()
-            .map(|s| OsString::from(s.to_string()))
-            .collect::<Vec<_>>();
-        let (regex, algo_based) = determine_regex(&lines_double_space).unwrap();
+        let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e  example.txt");
+        let (regex, algo_based) = determine_regex(&line_double_space).unwrap();
         assert!(!algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap()));
+        assert!(regex.is_match(os_str_as_bytes(&line_double_space).unwrap()));
 
         // Test single-space regex
-        let lines_single_space = ["d41d8cd98f00b204e9800998ecf8427e example.txt"]
-            .iter()
-            .map(|s| OsString::from(s.to_string()))
-            .collect::<Vec<_>>();
-        let (regex, algo_based) = determine_regex(&lines_single_space).unwrap();
+        let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt");
+        let (regex, algo_based) = determine_regex(&line_single_space).unwrap();
         assert!(!algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&lines_single_space[0]).unwrap()));
-
-        // Test double-space regex start with invalid
-        let lines_double_space = ["ERR", "d41d8cd98f00b204e9800998ecf8427e  example.txt"]
-            .iter()
-            .map(|s| OsString::from(s.to_string()))
-            .collect::<Vec<_>>();
-        let (regex, algo_based) = determine_regex(&lines_double_space).unwrap();
-        assert!(!algo_based);
-        assert!(!regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap()));
-        assert!(regex.is_match(os_str_as_bytes(&lines_double_space[1]).unwrap()));
+        assert!(regex.is_match(os_str_as_bytes(&line_single_space).unwrap()));
 
         // Test invalid checksum line
-        let lines_invalid = ["invalid checksum line"]
-            .iter()
-            .map(|s| OsString::from(s.to_string()))
-            .collect::<Vec<_>>();
-        assert!(determine_regex(&lines_invalid).is_none());
+        let line_invalid = OsString::from("invalid checksum line");
+        assert!(determine_regex(&line_invalid).is_none());
 
         // Test leading space before checksum line
-        let lines_algo_based_leading_space =
-            ["   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"]
-                .iter()
-                .map(|s| OsString::from(s.to_string()))
-                .collect::<Vec<_>>();
-        let res = determine_regex(&lines_algo_based_leading_space);
+        let line_algo_based_leading_space =
+            OsString::from("   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
+        let res = determine_regex(&line_algo_based_leading_space);
         assert!(res.is_some());
         assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX);
 
         // Test trailing space after checksum line (should fail)
-        let lines_algo_based_leading_space =
-            ["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "]
-                .iter()
-                .map(|s| OsString::from(s.to_string()))
-                .collect::<Vec<_>>();
-        let res = determine_regex(&lines_algo_based_leading_space);
+        let line_algo_based_leading_space =
+            OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e ");
+        let res = determine_regex(&line_algo_based_leading_space);
         assert!(res.is_none());
     }
 

From ed15ca1d264ce9b6faa5720d7215ed6707b77651 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <peron@adacore.com>
Date: Thu, 5 Dec 2024 17:59:00 +0100
Subject: [PATCH 2/8] checksum: keep a cache of the first used regex for
 non-algo-based regexes

---
 src/uucore/src/lib/features/checksum.rs | 216 ++++++++++++++++--------
 1 file changed, 145 insertions(+), 71 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index 34dc0f870..d575a4b38 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -6,8 +6,9 @@
 
 use data_encoding::BASE64;
 use os_display::Quotable;
-use regex::bytes::{Captures, Regex};
+use regex::bytes::{Match, Regex};
 use std::{
+    borrow::Cow,
     ffi::OsStr,
     fmt::Display,
     fs::File,
@@ -427,6 +428,67 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>
 // In this case, we ignore the *
 const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$";
 
+/// Hold the data extracted from a checksum line.
+struct LineInfo {
+    algo_name: Option<String>,
+    algo_bit_len: Option<usize>,
+    checksum: String,
+    filename: Vec<u8>,
+
+    regex: Regex,
+}
+
+impl LineInfo {
+    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<Regex>) -> Option<Self> {
+        let regexes = [
+            (Regex::new(ALGO_BASED_REGEX).unwrap(), true),
+            (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
+            (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false),
+            (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), false),
+        ];
+
+        let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
+
+        for (regex, algo_based) in &regexes {
+            if !regex.is_match(line_bytes) {
+                continue;
+            }
+
+            let mut r = regex.clone();
+            if !algo_based && cached_regex.is_some() {
+                r = cached_regex.clone().unwrap();
+            }
+
+            if let Some(caps) = r.captures(line_bytes) {
+                // These unwraps are safe thanks to the regex
+                let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap();
+
+                return Some(Self {
+                    algo_name: caps.name("algo").map(match_to_string),
+                    algo_bit_len: caps
+                        .name("bits")
+                        .map(|m| match_to_string(m).parse::<usize>().unwrap()),
+                    checksum: caps.name("checksum").map(match_to_string).unwrap(),
+                    filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
+                    regex: r.clone(),
+                });
+            }
+        }
+
+        None
+    }
+
+    #[inline]
+    fn is_algo_based(&self) -> bool {
+        self.algo_name.is_some()
+    }
+
+    #[inline]
+    fn regex_str(&self) -> &str {
+        self.regex.as_str()
+    }
+}
+
 fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
     if input_is_stdin {
         "standard input"
@@ -437,34 +499,18 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
     .to_string()
 }
 
-/// Determines the appropriate regular expression to use based on the provided lines.
-fn determine_regex(line: impl AsRef<OsStr>) -> Option<(Regex, bool)> {
-    let regexes = [
-        (Regex::new(ALGO_BASED_REGEX).unwrap(), true),
-        (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
-        (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false),
-        (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), true),
-    ];
-
-    let line_bytes = os_str_as_bytes(line.as_ref()).expect("UTF-8 decoding failed");
-    for (regex, is_algo_based) in &regexes {
-        if regex.is_match(line_bytes) {
-            return Some((regex.clone(), *is_algo_based));
-        }
-    }
-
-    None
-}
-
 /// Extract the expected digest from the checksum string
-fn get_expected_digest_as_hex_string(caps: &Captures, chosen_regex: &Regex) -> Option<String> {
-    // Unwraps are safe, ensured by regex.
-    let ck = caps.name("checksum").unwrap().as_bytes();
+fn get_expected_digest_as_hex_string(line_info: &LineInfo) -> Option<Cow<str>> {
+    let ck = &line_info.checksum;
 
-    if chosen_regex.as_str() == ALGO_BASED_REGEX_BASE64 {
-        BASE64.decode(ck).map(hex::encode).ok()
+    if line_info.regex_str() == ALGO_BASED_REGEX_BASE64 {
+        BASE64
+            .decode(ck.as_bytes())
+            .map(hex::encode)
+            .map(Cow::Owned)
+            .ok()
     } else if ck.len() % 2 == 0 {
-        Some(str::from_utf8(ck).unwrap().to_string())
+        Some(Cow::Borrowed(ck))
     } else {
         // If the length of the digest is not a multiple of 2, then it
         // must be improperly formatted (1 hex digit is 2 characters)
@@ -545,15 +591,14 @@ fn get_input_file(filename: &OsStr) -> UResult<Box<dyn Read>> {
 
 /// Extracts the algorithm name and length from the regex captures if the algo-based format is matched.
 fn identify_algo_name_and_length(
-    caps: &Captures,
+    line_info: &LineInfo,
     algo_name_input: Option<&str>,
 ) -> Option<(String, Option<usize>)> {
     // When the algo-based format is matched, extract details from regex captures
-    let algorithm = caps
-        .name("algo")
-        .map_or(String::new(), |m| {
-            String::from_utf8(m.as_bytes().into()).unwrap()
-        })
+    let algorithm = line_info
+        .algo_name
+        .clone()
+        .unwrap_or_default()
         .to_lowercase();
 
     // check if we are called with XXXsum (example: md5sum) but we detected a different algo parsing the file
@@ -568,13 +613,9 @@ fn identify_algo_name_and_length(
         return None;
     }
 
-    let bits = caps.name("bits").map_or(Some(None), |m| {
-        let bits_value = String::from_utf8(m.as_bytes().into())
-            .unwrap()
-            .parse::<usize>()
-            .unwrap();
-        if bits_value % 8 == 0 {
-            Some(Some(bits_value / 8))
+    let bits = line_info.algo_bitlen.map_or(Some(None), |bits| {
+        if bits % 8 == 0 {
+            Some(Some(bits / 8))
         } else {
             None // Return None to signal a divisibility issue
         }
@@ -597,6 +638,7 @@ fn process_checksum_line(
     cli_algo_name: Option<&str>,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
+    cached_regex: &mut Option<Regex>,
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
 
@@ -605,26 +647,30 @@ fn process_checksum_line(
         return Err(LineCheckError::Skipped);
     }
 
-    let (chosen_regex, is_algo_based_format) =
-        determine_regex(line).ok_or(LineCheckError::ImproperlyFormatted)?;
+    if let Some(line_info) = LineInfo::parse(line, cached_regex) {
+        // The cached regex ensures that when processing non-algo based regexes,
+        // its cannot be changed (can't have single and double space regexes
+        // used in the same file).
+        if cached_regex.is_none() && !line_info.is_algo_based() {
+            let _ = cached_regex.insert(line_info.regex.clone());
+        }
 
-    if let Some(caps) = chosen_regex.captures(line_bytes) {
-        let mut filename_to_check = caps.name("filename").unwrap().as_bytes();
+        let mut filename_to_check = line_info.filename.as_slice();
 
         if filename_to_check.starts_with(b"*")
             && i == 0
-            && chosen_regex.as_str() == SINGLE_SPACE_REGEX
+            && line_info.regex_str() == SINGLE_SPACE_REGEX
         {
             // Remove the leading asterisk if present - only for the first line
             filename_to_check = &filename_to_check[1..];
         }
 
-        let expected_checksum = get_expected_digest_as_hex_string(&caps, &chosen_regex)
+        let expected_checksum = get_expected_digest_as_hex_string(&line_info)
             .ok_or(LineCheckError::ImproperlyFormatted)?;
 
         // If the algo_name is provided, we use it, otherwise we try to detect it
-        let (algo_name, length) = if is_algo_based_format {
-            identify_algo_name_and_length(&caps, cli_algo_name)
+        let (algo_name, length) = if line_info.is_algo_based() {
+            identify_algo_name_and_length(&line_info, cli_algo_name)
                 .ok_or(LineCheckError::ImproperlyFormatted)?
         } else if let Some(a) = cli_algo_name {
             // When a specific algorithm name is input, use it and use the provided bits
@@ -721,6 +767,10 @@ fn process_checksum_file(
     let reader = BufReader::new(file);
     let lines = read_os_string_lines(reader).collect::<Vec<_>>();
 
+    // cached_regex is used to ensure that several non algo-based checksum line
+    // will use the same regex.
+    let mut cached_regex = None;
+
     for (i, line) in lines.iter().enumerate() {
         let line_result = process_checksum_line(
             filename_input,
@@ -729,6 +779,7 @@ fn process_checksum_file(
             cli_algo_name,
             cli_algo_length,
             opts,
+            &mut cached_regex,
         );
 
         // Match a first time to elude critical UErrors, and increment the total
@@ -1149,52 +1200,75 @@ mod tests {
     }
 
     #[test]
-    fn test_determine_regex() {
+    fn test_line_info() {
+        let mut cached_regex = None;
+
         // Test algo-based regex
         let line_algo_based =
             OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
-        let (regex, algo_based) = determine_regex(&line_algo_based).unwrap();
-        assert!(algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&line_algo_based).unwrap()));
+        let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap();
+        assert!(line_info.is_algo_based());
+        assert_eq!(line_info.algo_name.as_deref(), Some("MD5"));
+        assert!(line_info.algo_bit_len.is_none());
+        assert_eq!(line_info.filename, b"example.txt");
+        assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
+        assert_eq!(line_info.regex_str(), ALGO_BASED_REGEX);
+        assert!(cached_regex.is_none());
 
         // Test double-space regex
         let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e  example.txt");
-        let (regex, algo_based) = determine_regex(&line_double_space).unwrap();
-        assert!(!algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&line_double_space).unwrap()));
+        let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap();
+        assert!(!line_info.is_algo_based());
+        assert!(line_info.algo_name.is_none());
+        assert!(line_info.algo_bit_len.is_none());
+        assert_eq!(line_info.filename, b"example.txt");
+        assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
+        assert_eq!(line_info.regex_str(), DOUBLE_SPACE_REGEX);
+        assert!(cached_regex.is_some());
+
+        cached_regex = None;
 
         // Test single-space regex
         let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt");
-        let (regex, algo_based) = determine_regex(&line_single_space).unwrap();
-        assert!(!algo_based);
-        assert!(regex.is_match(os_str_as_bytes(&line_single_space).unwrap()));
+        let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap();
+        assert!(!line_info.is_algo_based());
+        assert!(line_info.algo_name.is_none());
+        assert!(line_info.algo_bit_len.is_none());
+        assert_eq!(line_info.filename, b"example.txt");
+        assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
+        assert_eq!(line_info.regex_str(), SINGLE_SPACE_REGEX);
+        assert!(cached_regex.is_some());
+
+        cached_regex = None;
 
         // Test invalid checksum line
         let line_invalid = OsString::from("invalid checksum line");
-        assert!(determine_regex(&line_invalid).is_none());
+        assert!(LineInfo::parse(&line_invalid, &mut cached_regex).is_none());
+        assert!(cached_regex.is_none());
 
         // Test leading space before checksum line
         let line_algo_based_leading_space =
             OsString::from("   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
-        let res = determine_regex(&line_algo_based_leading_space);
+        let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
         assert!(res.is_some());
-        assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX);
+        assert_eq!(res.unwrap().regex_str(), ALGO_BASED_REGEX);
+        assert!(cached_regex.is_none());
 
         // Test trailing space after checksum line (should fail)
         let line_algo_based_leading_space =
             OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e ");
-        let res = determine_regex(&line_algo_based_leading_space);
+        let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
         assert!(res.is_none());
+        assert!(cached_regex.is_none());
     }
 
     #[test]
     fn test_get_expected_digest() {
-        let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
-        let caps = re
-            .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=")
-            .unwrap();
+        let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=");
+        let mut cached_regex = None;
+        let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
 
-        let result = get_expected_digest_as_hex_string(&caps, &re);
+        let result = get_expected_digest_as_hex_string(&line_info);
 
         assert_eq!(
             result.unwrap(),
@@ -1204,12 +1278,12 @@ mod tests {
 
     #[test]
     fn test_get_expected_checksum_invalid() {
-        let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
-        let caps = re
-            .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU")
-            .unwrap();
+        // The line misses a '=' at the end to be valid base64
+        let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU");
+        let mut cached_regex = None;
+        let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
 
-        let result = get_expected_digest_as_hex_string(&caps, &re);
+        let result = get_expected_digest_as_hex_string(&line_info);
 
         assert!(result.is_none());
     }

From 65ddccbeb6a4e9bc45b0fc0184209a08387da411 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <peron@adacore.com>
Date: Thu, 5 Dec 2024 18:19:50 +0100
Subject: [PATCH 3/8] checksum: avoid to recompute Regexps

---
 Cargo.lock                              |  1 +
 src/uucore/Cargo.toml                   |  1 +
 src/uucore/src/lib/features/checksum.rs | 40 ++++++++++++++++---------
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cb09c5fd6..611cd240c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3573,6 +3573,7 @@ dependencies = [
  "glob",
  "hex",
  "itertools",
+ "lazy_static",
  "libc",
  "md-5",
  "memchr",
diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml
index b72a8ed71..a4529f3a5 100644
--- a/src/uucore/Cargo.toml
+++ b/src/uucore/Cargo.toml
@@ -25,6 +25,7 @@ dns-lookup = { workspace = true, optional = true }
 dunce = { version = "1.0.4", optional = true }
 wild = "2.2.1"
 glob = { workspace = true }
+lazy_static = "1.4.0"
 # * optional
 itertools = { workspace = true, optional = true }
 thiserror = { workspace = true, optional = true }
diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index d575a4b38..dec5fcf21 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -5,6 +5,7 @@
 // spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename
 
 use data_encoding::BASE64;
+use lazy_static::lazy_static;
 use os_display::Quotable;
 use regex::bytes::{Match, Regex};
 use std::{
@@ -428,6 +429,13 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>
 // In this case, we ignore the *
 const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$";
 
+lazy_static! {
+    static ref R_ALGO_BASED: Regex = Regex::new(ALGO_BASED_REGEX).unwrap();
+    static ref R_DOUBLE_SPACE: Regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap();
+    static ref R_SINGLE_SPACE: Regex = Regex::new(SINGLE_SPACE_REGEX).unwrap();
+    static ref R_ALGO_BASED_BASE_64: Regex = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
+}
+
 /// Hold the data extracted from a checksum line.
 struct LineInfo {
     algo_name: Option<String>,
@@ -435,28 +443,32 @@ struct LineInfo {
     checksum: String,
     filename: Vec<u8>,
 
-    regex: Regex,
+    regex: &'static Regex,
 }
 
 impl LineInfo {
-    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<Regex>) -> Option<Self> {
-        let regexes = [
-            (Regex::new(ALGO_BASED_REGEX).unwrap(), true),
-            (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false),
-            (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false),
-            (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), false),
+    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<&'static Regex>) -> Option<Self> {
+        let regexes: &[(&'static Regex, bool)] = &[
+            (&R_ALGO_BASED, true),
+            (&R_DOUBLE_SPACE, false),
+            (&R_SINGLE_SPACE, false),
+            (&R_ALGO_BASED_BASE_64, true),
         ];
 
         let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
 
-        for (regex, algo_based) in &regexes {
+        for (regex, algo_based) in regexes {
             if !regex.is_match(line_bytes) {
                 continue;
             }
 
-            let mut r = regex.clone();
-            if !algo_based && cached_regex.is_some() {
-                r = cached_regex.clone().unwrap();
+            let mut r = *regex;
+            if !algo_based {
+                if cached_regex.is_some() {
+                    r = cached_regex.unwrap();
+                } else {
+                    *cached_regex = Some(r);
+                }
             }
 
             if let Some(caps) = r.captures(line_bytes) {
@@ -470,7 +482,7 @@ impl LineInfo {
                         .map(|m| match_to_string(m).parse::<usize>().unwrap()),
                     checksum: caps.name("checksum").map(match_to_string).unwrap(),
                     filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
-                    regex: r.clone(),
+                    regex: r,
                 });
             }
         }
@@ -638,7 +650,7 @@ fn process_checksum_line(
     cli_algo_name: Option<&str>,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
-    cached_regex: &mut Option<Regex>,
+    cached_regex: &mut Option<&'static Regex>,
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
 
@@ -652,7 +664,7 @@ fn process_checksum_line(
         // its cannot be changed (can't have single and double space regexes
         // used in the same file).
         if cached_regex.is_none() && !line_info.is_algo_based() {
-            let _ = cached_regex.insert(line_info.regex.clone());
+            let _ = cached_regex.insert(line_info.regex);
         }
 
         let mut filename_to_check = line_info.filename.as_slice();

From df16c1c65560b42f0a4471876f7e3203a6c05a94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <peron@adacore.com>
Date: Thu, 5 Dec 2024 17:59:35 +0100
Subject: [PATCH 4/8] test(cksum): Add tests

---
 tests/by-util/test_cksum.rs | 81 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/tests/by-util/test_cksum.rs b/tests/by-util/test_cksum.rs
index bf74de9cc..6c1718112 100644
--- a/tests/by-util/test_cksum.rs
+++ b/tests/by-util/test_cksum.rs
@@ -1443,7 +1443,7 @@ mod check_utf8 {
         let scene = TestScenario::new(util_name!());
         let at = &scene.fixtures;
         let filename: OsString = OsStringExt::from_vec(b"funky\xffname".to_vec());
-        at.touch(&filename);
+        at.touch(filename);
 
         // Checksum match
         at.write_bytes("check",
@@ -1544,7 +1544,6 @@ fn test_check_confusing_base64() {
 
 /// This test checks that when a file contains several checksum lines
 /// with different encoding, the decoding still works.
-#[ignore = "not yet implemented"]
 #[test]
 fn test_check_mix_hex_base64() {
     let b64 = "BLAKE2b-128 (foo1.dat) = BBNuJPhdRwRlw9tm5Y7VbA==";
@@ -1769,3 +1768,81 @@ mod gnu_cksum_base64 {
         }
     }
 }
+
+/// The tests in this module check the behavior of cksum when given different
+/// checksum formats and algorithms in the same file, while specifying an
+/// algorithm on CLI or not.
+mod format_mix {
+    use super::*;
+
+    // First line is algo-based, second one is not
+    const INPUT_ALGO_NON_ALGO: &str = "\
+        BLAKE2b (bar) = 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce\n\
+        786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce  foo";
+
+    // First line is non algo-based, second one is
+    const INPUT_NON_ALGO_ALGO: &str = "\
+        786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce  foo\n\
+        BLAKE2b (bar) = 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce";
+
+    /// Make a simple scene with foo and bar empty files
+    fn make_scene() -> TestScenario {
+        let scene = TestScenario::new(util_name!());
+        let at = &scene.fixtures;
+
+        at.touch("foo");
+        at.touch("bar");
+
+        scene
+    }
+
+    #[test]
+    fn test_check_cli_algo_non_algo() {
+        let scene = make_scene();
+        scene
+            .ucmd()
+            .arg("--check")
+            .arg("--algo=blake2b")
+            .pipe_in(INPUT_ALGO_NON_ALGO)
+            .succeeds()
+            .stdout_contains("bar: OK\nfoo: OK")
+            .no_stderr();
+    }
+
+    #[test]
+    fn test_check_cli_non_algo_algo() {
+        let scene = make_scene();
+        scene
+            .ucmd()
+            .arg("--check")
+            .arg("--algo=blake2b")
+            .pipe_in(INPUT_NON_ALGO_ALGO)
+            .succeeds()
+            .stdout_contains("foo: OK\nbar: OK")
+            .no_stderr();
+    }
+
+    #[test]
+    fn test_check_algo_non_algo() {
+        let scene = make_scene();
+        scene
+            .ucmd()
+            .arg("--check")
+            .pipe_in(INPUT_ALGO_NON_ALGO)
+            .succeeds()
+            .stdout_contains("bar: OK")
+            .stderr_contains("cksum: WARNING: 1 line is improperly formatted");
+    }
+
+    #[test]
+    fn test_check_non_algo_algo() {
+        let scene = make_scene();
+        scene
+            .ucmd()
+            .arg("--check")
+            .pipe_in(INPUT_NON_ALGO_ALGO)
+            .succeeds()
+            .stdout_contains("bar: OK")
+            .stderr_contains("cksum: WARNING: 1 line is improperly formatted");
+    }
+}

From 10a9b0bfbf237ed37b298ffcd932c6b31e17a18a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <peron@adacore.com>
Date: Thu, 5 Dec 2024 18:22:52 +0100
Subject: [PATCH 5/8] checksum: split treatment of algo-based and non-algo
 based into separate functions

---
 src/uucore/src/lib/features/checksum.rs | 172 ++++++++++++++----------
 1 file changed, 102 insertions(+), 70 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index dec5fcf21..0da814a76 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -464,6 +464,9 @@ impl LineInfo {
 
             let mut r = *regex;
             if !algo_based {
+                // The cached regex ensures that when processing non-algo based regexes,
+                // its cannot be changed (can't have single and double space regexes
+                // used in the same file).
                 if cached_regex.is_some() {
                     r = cached_regex.unwrap();
                 } else {
@@ -636,13 +639,101 @@ fn identify_algo_name_and_length(
     Some((algorithm, bits))
 }
 
+/// Given a filename and an algorithm, compute the digest and compare it with
+/// the expected one.
+fn compute_and_check_digest_from_file(
+    filename: &[u8],
+    expected_checksum: &str,
+    mut algo: HashAlgorithm,
+    opts: ChecksumOptions,
+) -> Result<(), LineCheckError> {
+    let (filename_to_check_unescaped, prefix) = unescape_filename(filename);
+    let real_filename_to_check = os_str_from_bytes(&filename_to_check_unescaped)?;
+
+    // Open the input file
+    let file_to_check = get_file_to_check(&real_filename_to_check, opts)?;
+    let mut file_reader = BufReader::new(file_to_check);
+
+    // Read the file and calculate the checksum
+    let create_fn = &mut algo.create_fn;
+    let mut digest = create_fn();
+    let (calculated_checksum, _) =
+        digest_reader(&mut digest, &mut file_reader, opts.binary, algo.bits).unwrap();
+
+    // Do the checksum validation
+    let checksum_correct = expected_checksum == calculated_checksum;
+    print_file_report(
+        std::io::stdout(),
+        filename,
+        FileChecksumResult::from_bool(checksum_correct),
+        prefix,
+        opts,
+    );
+
+    if checksum_correct {
+        Ok(())
+    } else {
+        Err(LineCheckError::DigestMismatch)
+    }
+}
+
+/// Check a digest checksum with non-algo based pre-treatment.
+fn process_algo_based_line(
+    line_info: &LineInfo,
+    cli_algo_name: Option<&str>,
+    opts: ChecksumOptions,
+) -> Result<(), LineCheckError> {
+    let filename_to_check = line_info.filename.as_slice();
+    let expected_checksum =
+        get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?;
+
+    let (algo_name, algo_bitlen) = identify_algo_name_and_length(line_info, cli_algo_name)
+        .ok_or(LineCheckError::ImproperlyFormatted)?;
+
+    let algo = detect_algo(&algo_name, algo_bitlen)?;
+
+    compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts)
+}
+
+/// Check a digest checksum with non-algo based pre-treatment.
+fn process_non_algo_based_line(
+    i: usize,
+    line_info: &LineInfo,
+    cli_algo_name: &str,
+    cli_algo_length: Option<usize>,
+    opts: ChecksumOptions,
+) -> Result<(), LineCheckError> {
+    let mut filename_to_check = line_info.filename.as_slice();
+    if filename_to_check.starts_with(b"*") && i == 0 && line_info.regex_str() == SINGLE_SPACE_REGEX
+    {
+        // Remove the leading asterisk if present - only for the first line
+        filename_to_check = &filename_to_check[1..];
+    }
+    let expected_checksum =
+        get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?;
+
+    // When a specific algorithm name is input, use it and use the provided bits
+    // except when dealing with blake2b, where we will detect the length
+    let (algo_name, algo_bitlen) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B {
+        // division by 2 converts the length of the Blake2b checksum from hexadecimal
+        // characters to bytes, as each byte is represented by two hexadecimal characters.
+        let length = Some(expected_checksum.len() / 2);
+        (ALGORITHM_OPTIONS_BLAKE2B.to_string(), length)
+    } else {
+        (cli_algo_name.to_lowercase(), cli_algo_length)
+    };
+
+    let algo = detect_algo(&algo_name, algo_bitlen)?;
+
+    compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts)
+}
+
 /// Parses a checksum line, detect the algorithm to use, read the file and produce
 /// its digest, and compare it to the expected value.
 ///
 /// Returns `Ok(bool)` if the comparison happened, bool indicates if the digest
 /// matched the expected.
 /// If the comparison didn't happen, return a `LineChecksumError`.
-#[allow(clippy::too_many_arguments)]
 fn process_checksum_line(
     filename_input: &OsStr,
     line: &OsStr,
@@ -654,82 +745,23 @@ fn process_checksum_line(
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
 
-    // early return on empty or commented lines.
+    // Early return on empty or commented lines.
     if line.is_empty() || line_bytes.starts_with(b"#") {
         return Err(LineCheckError::Skipped);
     }
 
+    // Use `LineInfo` to extract the data of a line.
+    // Then, depending on its format, apply a different pre-treatment.
     if let Some(line_info) = LineInfo::parse(line, cached_regex) {
-        // The cached regex ensures that when processing non-algo based regexes,
-        // its cannot be changed (can't have single and double space regexes
-        // used in the same file).
-        if cached_regex.is_none() && !line_info.is_algo_based() {
-            let _ = cached_regex.insert(line_info.regex);
-        }
-
-        let mut filename_to_check = line_info.filename.as_slice();
-
-        if filename_to_check.starts_with(b"*")
-            && i == 0
-            && line_info.regex_str() == SINGLE_SPACE_REGEX
-        {
-            // Remove the leading asterisk if present - only for the first line
-            filename_to_check = &filename_to_check[1..];
-        }
-
-        let expected_checksum = get_expected_digest_as_hex_string(&line_info)
-            .ok_or(LineCheckError::ImproperlyFormatted)?;
-
-        // If the algo_name is provided, we use it, otherwise we try to detect it
-        let (algo_name, length) = if line_info.is_algo_based() {
-            identify_algo_name_and_length(&line_info, cli_algo_name)
-                .ok_or(LineCheckError::ImproperlyFormatted)?
-        } else if let Some(a) = cli_algo_name {
-            // When a specific algorithm name is input, use it and use the provided bits
-            // except when dealing with blake2b, where we will detect the length
-            if cli_algo_name == Some(ALGORITHM_OPTIONS_BLAKE2B) {
-                // division by 2 converts the length of the Blake2b checksum from hexadecimal
-                // characters to bytes, as each byte is represented by two hexadecimal characters.
-                let length = Some(expected_checksum.len() / 2);
-                (ALGORITHM_OPTIONS_BLAKE2B.to_string(), length)
-            } else {
-                (a.to_lowercase(), cli_algo_length)
-            }
+        if line_info.is_algo_based() {
+            process_algo_based_line(&line_info, cli_algo_name, opts)
+        } else if let Some(cli_algo) = cli_algo_name {
+            // If we match a non-algo based regex, we expect a cli argument
+            // to give us the algorithm to use
+            process_non_algo_based_line(i, &line_info, cli_algo, cli_algo_length, opts)
         } else {
-            // Default case if no algorithm is specified and non-algo based format is matched
+            // We have no clue of what algorithm to use
             return Err(LineCheckError::ImproperlyFormatted);
-        };
-
-        let mut algo = detect_algo(&algo_name, length)?;
-
-        let (filename_to_check_unescaped, prefix) = unescape_filename(filename_to_check);
-
-        let real_filename_to_check = os_str_from_bytes(&filename_to_check_unescaped)?;
-
-        // manage the input file
-        let file_to_check = get_file_to_check(&real_filename_to_check, opts)?;
-        let mut file_reader = BufReader::new(file_to_check);
-
-        // Read the file and calculate the checksum
-        let create_fn = &mut algo.create_fn;
-        let mut digest = create_fn();
-        let (calculated_checksum, _) =
-            digest_reader(&mut digest, &mut file_reader, opts.binary, algo.bits).unwrap();
-
-        // Do the checksum validation
-        let checksum_correct = expected_checksum == calculated_checksum;
-        print_file_report(
-            std::io::stdout(),
-            filename_to_check,
-            FileChecksumResult::from_bool(checksum_correct),
-            prefix,
-            opts,
-        );
-
-        if checksum_correct {
-            Ok(())
-        } else {
-            Err(LineCheckError::DigestMismatch)
         }
     } else {
         if opts.warn {

From f4e5dc2e0fa8ae5e2a4ad23fa4dd0f0a44e12253 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <dorianperon.i@gmail.com>
Date: Sat, 7 Dec 2024 02:29:54 +0100
Subject: [PATCH 6/8] checksum: use the blake2b length as an hint to check the
 correctness of the expected digest

---
 src/uucore/src/lib/features/checksum.rs | 95 +++++++++++++++++--------
 1 file changed, 64 insertions(+), 31 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index 0da814a76..8de983490 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -2,7 +2,7 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename
+// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
 
 use data_encoding::BASE64;
 use lazy_static::lazy_static;
@@ -515,22 +515,43 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
 }
 
 /// Extract the expected digest from the checksum string
-fn get_expected_digest_as_hex_string(line_info: &LineInfo) -> Option<Cow<str>> {
+fn get_expected_digest_as_hex_string(
+    line_info: &LineInfo,
+    len_hint: Option<usize>,
+) -> Option<Cow<str>> {
     let ck = &line_info.checksum;
 
-    if line_info.regex_str() == ALGO_BASED_REGEX_BASE64 {
-        BASE64
-            .decode(ck.as_bytes())
-            .map(hex::encode)
-            .map(Cow::Owned)
-            .ok()
-    } else if ck.len() % 2 == 0 {
-        Some(Cow::Borrowed(ck))
-    } else {
+    // TODO MSRV 1.82, replace `is_some_and` with `is_none_or`
+    // to improve readability. This closure returns True if a length hint provided
+    // and the argument isn't the same as the hint.
+    let against_hint = |len| len_hint.is_some_and(|l| l != len);
+
+    if ck.len() % 2 != 0 {
         // If the length of the digest is not a multiple of 2, then it
         // must be improperly formatted (1 hex digit is 2 characters)
-        None
+        return None;
     }
+
+    // If the digest can be decoded as hexadecimal AND it length match the
+    // one expected (in case it's given), just go with it.
+    if ck.as_bytes().iter().all(u8::is_ascii_hexdigit) && !against_hint(ck.len()) {
+        return Some(Cow::Borrowed(ck));
+    }
+
+    // If hexadecimal digest fails for any reason, interpret the digest as base 64.
+    BASE64
+        .decode(ck.as_bytes()) // Decode the string as encoded base64
+        .map(hex::encode) // Encode it back as hexadecimal
+        .map(Cow::<str>::Owned)
+        .ok()
+        .and_then(|s| {
+            // Check the digest length
+            if !against_hint(s.len()) {
+                Some(s)
+            } else {
+                None
+            }
+        })
 }
 
 /// Returns a reader that reads from the specified file, or from stdin if `filename_to_check` is "-".
@@ -604,12 +625,11 @@ fn get_input_file(filename: &OsStr) -> UResult<Box<dyn Read>> {
     }
 }
 
-/// Extracts the algorithm name and length from the regex captures if the algo-based format is matched.
+/// Gets the algorithm name and length from the `LineInfo` if the algo-based format is matched.
 fn identify_algo_name_and_length(
     line_info: &LineInfo,
     algo_name_input: Option<&str>,
 ) -> Option<(String, Option<usize>)> {
-    // When the algo-based format is matched, extract details from regex captures
     let algorithm = line_info
         .algo_name
         .clone()
@@ -628,15 +648,20 @@ fn identify_algo_name_and_length(
         return None;
     }
 
-    let bits = line_info.algo_bitlen.map_or(Some(None), |bits| {
-        if bits % 8 == 0 {
-            Some(Some(bits / 8))
-        } else {
-            None // Return None to signal a divisibility issue
+    let bytes = if let Some(bitlen) = line_info.algo_bit_len {
+        if bitlen % 8 != 0 {
+            // The given length is wrong
+            return None;
         }
-    })?;
+        Some(bitlen / 8)
+    } else if algorithm == ALGORITHM_OPTIONS_BLAKE2B {
+        // Default length with BLAKE2b,
+        Some(64)
+    } else {
+        None
+    };
 
-    Some((algorithm, bits))
+    Some((algorithm, bytes))
 }
 
 /// Given a filename and an algorithm, compute the digest and compare it with
@@ -684,13 +709,21 @@ fn process_algo_based_line(
     opts: ChecksumOptions,
 ) -> Result<(), LineCheckError> {
     let filename_to_check = line_info.filename.as_slice();
-    let expected_checksum =
-        get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?;
 
-    let (algo_name, algo_bitlen) = identify_algo_name_and_length(line_info, cli_algo_name)
+    let (algo_name, algo_byte_len) = identify_algo_name_and_length(line_info, cli_algo_name)
         .ok_or(LineCheckError::ImproperlyFormatted)?;
 
-    let algo = detect_algo(&algo_name, algo_bitlen)?;
+    // If the digest bitlen is known, we can check the format of the expected
+    // checksum with it.
+    let digest_char_length_hint = match (algo_name.as_str(), algo_byte_len) {
+        (ALGORITHM_OPTIONS_BLAKE2B, Some(bytelen)) => Some(bytelen * 2),
+        _ => None,
+    };
+
+    let expected_checksum = get_expected_digest_as_hex_string(line_info, digest_char_length_hint)
+        .ok_or(LineCheckError::ImproperlyFormatted)?;
+
+    let algo = detect_algo(&algo_name, algo_byte_len)?;
 
     compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts)
 }
@@ -709,12 +742,12 @@ fn process_non_algo_based_line(
         // Remove the leading asterisk if present - only for the first line
         filename_to_check = &filename_to_check[1..];
     }
-    let expected_checksum =
-        get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?;
+    let expected_checksum = get_expected_digest_as_hex_string(line_info, None)
+        .ok_or(LineCheckError::ImproperlyFormatted)?;
 
     // When a specific algorithm name is input, use it and use the provided bits
     // except when dealing with blake2b, where we will detect the length
-    let (algo_name, algo_bitlen) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B {
+    let (algo_name, algo_byte_len) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B {
         // division by 2 converts the length of the Blake2b checksum from hexadecimal
         // characters to bytes, as each byte is represented by two hexadecimal characters.
         let length = Some(expected_checksum.len() / 2);
@@ -723,7 +756,7 @@ fn process_non_algo_based_line(
         (cli_algo_name.to_lowercase(), cli_algo_length)
     };
 
-    let algo = detect_algo(&algo_name, algo_bitlen)?;
+    let algo = detect_algo(&algo_name, algo_byte_len)?;
 
     compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts)
 }
@@ -1312,7 +1345,7 @@ mod tests {
         let mut cached_regex = None;
         let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
 
-        let result = get_expected_digest_as_hex_string(&line_info);
+        let result = get_expected_digest_as_hex_string(&line_info, None);
 
         assert_eq!(
             result.unwrap(),
@@ -1327,7 +1360,7 @@ mod tests {
         let mut cached_regex = None;
         let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap();
 
-        let result = get_expected_digest_as_hex_string(&line_info);
+        let result = get_expected_digest_as_hex_string(&line_info, None);
 
         assert!(result.is_none());
     }

From 567bbc5f3c8a242607881554728a420b68613792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <dorianperon.i@gmail.com>
Date: Sat, 7 Dec 2024 02:38:00 +0100
Subject: [PATCH 7/8] checksum: remove ALGO_BASED_REGEX (non base64) as its not
 useful anymore and introduce LineFormat struct

---
 src/uucore/src/lib/features/checksum.rs | 85 ++++++++++++++-----------
 1 file changed, 48 insertions(+), 37 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index 8de983490..e19845f18 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -421,8 +421,7 @@ pub fn detect_algo(algo: &str, length: Option<usize>) -> UResult<HashAlgorithm>
 //    algo must be uppercase or b (for blake2b)
 // 2. <checksum> [* ]<filename>
 // 3. <checksum> [*]<filename> (only one space)
-const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[a-fA-F0-9]+)$";
-const ALGO_BASED_REGEX_BASE64: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[A-Za-z0-9+/]+={0,2})$";
+const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[A-Za-z0-9+/]+={0,2})$";
 
 const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>(?-u:.*))$";
 
@@ -433,7 +432,23 @@ lazy_static! {
     static ref R_ALGO_BASED: Regex = Regex::new(ALGO_BASED_REGEX).unwrap();
     static ref R_DOUBLE_SPACE: Regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap();
     static ref R_SINGLE_SPACE: Regex = Regex::new(SINGLE_SPACE_REGEX).unwrap();
-    static ref R_ALGO_BASED_BASE_64: Regex = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap();
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum LineFormat {
+    AlgoBased,
+    SingleSpace,
+    DoubleSpace,
+}
+
+impl LineFormat {
+    fn to_regex(self) -> &'static Regex {
+        match self {
+            LineFormat::AlgoBased => &R_ALGO_BASED,
+            LineFormat::SingleSpace => &R_SINGLE_SPACE,
+            LineFormat::DoubleSpace => &R_DOUBLE_SPACE,
+        }
+    }
 }
 
 /// Hold the data extracted from a checksum line.
@@ -443,34 +458,41 @@ struct LineInfo {
     checksum: String,
     filename: Vec<u8>,
 
-    regex: &'static Regex,
+    format: LineFormat,
 }
 
 impl LineInfo {
-    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<&'static Regex>) -> Option<Self> {
-        let regexes: &[(&'static Regex, bool)] = &[
-            (&R_ALGO_BASED, true),
-            (&R_DOUBLE_SPACE, false),
-            (&R_SINGLE_SPACE, false),
-            (&R_ALGO_BASED_BASE_64, true),
+    /// Returns a `LineInfo` parsed from a checksum line.
+    /// The function will run 3 regexes against the line and select the first one that matches
+    /// to populate the fields of the struct.
+    /// However, there is a catch to handle regarding the handling of `cached_regex`.
+    /// In case of non-algo-based regex, if `cached_regex` is Some, it must take the priority
+    /// over the detected regex. Otherwise, we must set it the the detected regex.
+    /// This specific behavior is emphasized by the test
+    /// `test_hashsum::test_check_md5sum_only_one_space`.
+    fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<LineFormat>) -> Option<Self> {
+        let regexes: &[(&'static Regex, LineFormat)] = &[
+            (&R_ALGO_BASED, LineFormat::AlgoBased),
+            (&R_DOUBLE_SPACE, LineFormat::DoubleSpace),
+            (&R_SINGLE_SPACE, LineFormat::SingleSpace),
         ];
 
         let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
 
-        for (regex, algo_based) in regexes {
+        for (regex, format) in regexes {
             if !regex.is_match(line_bytes) {
                 continue;
             }
 
             let mut r = *regex;
-            if !algo_based {
+            if *format != LineFormat::AlgoBased {
                 // The cached regex ensures that when processing non-algo based regexes,
-                // its cannot be changed (can't have single and double space regexes
+                // it cannot be changed (can't have single and double space regexes
                 // used in the same file).
                 if cached_regex.is_some() {
-                    r = cached_regex.unwrap();
+                    r = cached_regex.unwrap().to_regex();
                 } else {
-                    *cached_regex = Some(r);
+                    *cached_regex = Some(*format);
                 }
             }
 
@@ -485,23 +507,13 @@ impl LineInfo {
                         .map(|m| match_to_string(m).parse::<usize>().unwrap()),
                     checksum: caps.name("checksum").map(match_to_string).unwrap(),
                     filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
-                    regex: r,
+                    format: *format,
                 });
             }
         }
 
         None
     }
-
-    #[inline]
-    fn is_algo_based(&self) -> bool {
-        self.algo_name.is_some()
-    }
-
-    #[inline]
-    fn regex_str(&self) -> &str {
-        self.regex.as_str()
-    }
 }
 
 fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String {
@@ -730,14 +742,16 @@ fn process_algo_based_line(
 
 /// Check a digest checksum with non-algo based pre-treatment.
 fn process_non_algo_based_line(
-    i: usize,
+    line_number: usize,
     line_info: &LineInfo,
     cli_algo_name: &str,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
 ) -> Result<(), LineCheckError> {
     let mut filename_to_check = line_info.filename.as_slice();
-    if filename_to_check.starts_with(b"*") && i == 0 && line_info.regex_str() == SINGLE_SPACE_REGEX
+    if filename_to_check.starts_with(b"*")
+        && line_number == 0
+        && line_info.format == LineFormat::SingleSpace
     {
         // Remove the leading asterisk if present - only for the first line
         filename_to_check = &filename_to_check[1..];
@@ -774,7 +788,7 @@ fn process_checksum_line(
     cli_algo_name: Option<&str>,
     cli_algo_length: Option<usize>,
     opts: ChecksumOptions,
-    cached_regex: &mut Option<&'static Regex>,
+    cached_regex: &mut Option<LineFormat>,
 ) -> Result<(), LineCheckError> {
     let line_bytes = os_str_as_bytes(line)?;
 
@@ -786,7 +800,7 @@ fn process_checksum_line(
     // Use `LineInfo` to extract the data of a line.
     // Then, depending on its format, apply a different pre-treatment.
     if let Some(line_info) = LineInfo::parse(line, cached_regex) {
-        if line_info.is_algo_based() {
+        if line_info.format == LineFormat::AlgoBased {
             process_algo_based_line(&line_info, cli_algo_name, opts)
         } else if let Some(cli_algo) = cli_algo_name {
             // If we match a non-algo based regex, we expect a cli argument
@@ -1284,23 +1298,21 @@ mod tests {
         let line_algo_based =
             OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
         let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap();
-        assert!(line_info.is_algo_based());
         assert_eq!(line_info.algo_name.as_deref(), Some("MD5"));
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
-        assert_eq!(line_info.regex_str(), ALGO_BASED_REGEX);
+        assert_eq!(line_info.format, LineFormat::AlgoBased);
         assert!(cached_regex.is_none());
 
         // Test double-space regex
         let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e  example.txt");
         let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap();
-        assert!(!line_info.is_algo_based());
         assert!(line_info.algo_name.is_none());
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
-        assert_eq!(line_info.regex_str(), DOUBLE_SPACE_REGEX);
+        assert_eq!(line_info.format, LineFormat::DoubleSpace);
         assert!(cached_regex.is_some());
 
         cached_regex = None;
@@ -1308,12 +1320,11 @@ mod tests {
         // Test single-space regex
         let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt");
         let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap();
-        assert!(!line_info.is_algo_based());
         assert!(line_info.algo_name.is_none());
         assert!(line_info.algo_bit_len.is_none());
         assert_eq!(line_info.filename, b"example.txt");
         assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
-        assert_eq!(line_info.regex_str(), SINGLE_SPACE_REGEX);
+        assert_eq!(line_info.format, LineFormat::SingleSpace);
         assert!(cached_regex.is_some());
 
         cached_regex = None;
@@ -1328,7 +1339,7 @@ mod tests {
             OsString::from("   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
         let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
         assert!(res.is_some());
-        assert_eq!(res.unwrap().regex_str(), ALGO_BASED_REGEX);
+        assert_eq!(line_info.format, LineFormat::AlgoBased);
         assert!(cached_regex.is_none());
 
         // Test trailing space after checksum line (should fail)

From 958222a07ccc25940a8aa7304bfe22cd41fd904b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dorian=20P=C3=A9ron?= <dorianperon.i@gmail.com>
Date: Sat, 7 Dec 2024 02:30:30 +0100
Subject: [PATCH 8/8] test(cksum): un-ignore tests that are now implemented

---
 src/uucore/src/lib/features/checksum.rs | 3 +--
 tests/by-util/test_cksum.rs             | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs
index e19845f18..0b3e4e249 100644
--- a/src/uucore/src/lib/features/checksum.rs
+++ b/src/uucore/src/lib/features/checksum.rs
@@ -1337,8 +1337,7 @@ mod tests {
         // Test leading space before checksum line
         let line_algo_based_leading_space =
             OsString::from("   MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e");
-        let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex);
-        assert!(res.is_some());
+        let line_info = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex).unwrap();
         assert_eq!(line_info.format, LineFormat::AlgoBased);
         assert!(cached_regex.is_none());
 
diff --git a/tests/by-util/test_cksum.rs b/tests/by-util/test_cksum.rs
index 6c1718112..2efc78b96 100644
--- a/tests/by-util/test_cksum.rs
+++ b/tests/by-util/test_cksum.rs
@@ -1480,7 +1480,6 @@ mod check_utf8 {
     }
 }
 
-#[ignore = "not yet implemented"]
 #[test]
 fn test_check_blake_length_guess() {
     let correct_lines = [
@@ -1523,7 +1522,6 @@ fn test_check_blake_length_guess() {
         .stderr_contains("foo.sums: no properly formatted checksum lines found");
 }
 
-#[ignore = "not yet implemented"]
 #[test]
 fn test_check_confusing_base64() {
     let cksum = "BLAKE2b-48 (foo.dat) = fc1f97C4";