From 5cbe87620c380ec3d0681246b5819820f83df21c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Thu, 5 Dec 2024 15:57:01 +0100 Subject: [PATCH 1/8] checksum: move regex detection to the line level --- src/uucore/src/lib/features/checksum.rs | 109 ++++++++---------------- 1 file changed, 36 insertions(+), 73 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index f7228830b..34dc0f870 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -8,7 +8,7 @@ use data_encoding::BASE64; use os_display::Quotable; use regex::bytes::{Captures, Regex}; use std::{ - ffi::{OsStr, OsString}, + ffi::OsStr, fmt::Display, fs::File, io::{self, stdin, BufReader, Read, Write}, @@ -130,9 +130,6 @@ enum FileCheckError { ImproperlyFormatted, /// reading of the checksum file failed CantOpenChecksumFile, - /// Algorithm detection was unsuccessful. - /// Either none is provided, or there is a conflict. - AlgoDetectionError, } impl From> for FileCheckError { @@ -441,7 +438,7 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { } /// Determines the appropriate regular expression to use based on the provided lines. -fn determine_regex(lines: &[OsString]) -> Option<(Regex, bool)> { +fn determine_regex(line: impl AsRef) -> Option<(Regex, bool)> { let regexes = [ (Regex::new(ALGO_BASED_REGEX).unwrap(), true), (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false), @@ -449,12 +446,10 @@ fn determine_regex(lines: &[OsString]) -> Option<(Regex, bool)> { (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), true), ]; - for line in lines { - let line_bytes = os_str_as_bytes(line).expect("UTF-8 decoding failed"); - for (regex, is_algo_based) in ®exes { - if regex.is_match(line_bytes) { - return Some((regex.clone(), *is_algo_based)); - } + let line_bytes = os_str_as_bytes(line.as_ref()).expect("UTF-8 decoding failed"); + for (regex, is_algo_based) in ®exes { + if regex.is_match(line_bytes) { + return Some((regex.clone(), *is_algo_based)); } } @@ -599,13 +594,20 @@ fn process_checksum_line( filename_input: &OsStr, line: &OsStr, i: usize, - chosen_regex: &Regex, - is_algo_based_format: bool, cli_algo_name: Option<&str>, cli_algo_length: Option, opts: ChecksumOptions, ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; + + // early return on empty or commented lines. + if line.is_empty() || line_bytes.starts_with(b"#") { + return Err(LineCheckError::Skipped); + } + + let (chosen_regex, is_algo_based_format) = + determine_regex(line).ok_or(LineCheckError::ImproperlyFormatted)?; + if let Some(caps) = chosen_regex.captures(line_bytes) { let mut filename_to_check = caps.name("filename").unwrap().as_bytes(); @@ -617,7 +619,7 @@ fn process_checksum_line( filename_to_check = &filename_to_check[1..]; } - let expected_checksum = get_expected_digest_as_hex_string(&caps, chosen_regex) + let expected_checksum = get_expected_digest_as_hex_string(&caps, &chosen_regex) .ok_or(LineCheckError::ImproperlyFormatted)?; // If the algo_name is provided, we use it, otherwise we try to detect it @@ -672,10 +674,6 @@ fn process_checksum_line( Err(LineCheckError::DigestMismatch) } } else { - if line.is_empty() || line_bytes.starts_with(b"#") { - // Don't show any warning for empty or commented lines. - return Err(LineCheckError::Skipped); - } if opts.warn { let algo = if let Some(algo_name_input) = cli_algo_name { algo_name_input.to_uppercase() @@ -723,19 +721,11 @@ fn process_checksum_file( let reader = BufReader::new(file); let lines = read_os_string_lines(reader).collect::>(); - let Some((chosen_regex, is_algo_based_format)) = determine_regex(&lines) else { - log_no_properly_formatted(get_filename_for_output(filename_input, input_is_stdin)); - set_exit_code(1); - return Err(FileCheckError::AlgoDetectionError); - }; - for (i, line) in lines.iter().enumerate() { let line_result = process_checksum_line( filename_input, line, i, - &chosen_regex, - is_algo_based_format, cli_algo_name, cli_algo_length, opts, @@ -816,8 +806,7 @@ where use FileCheckError::*; match process_checksum_file(filename_input, algo_name_input, length_input, opts) { Err(UError(e)) => return Err(e), - Err(ImproperlyFormatted) => break, - Err(CantOpenChecksumFile | AlgoDetectionError) | Ok(_) => continue, + Err(CantOpenChecksumFile | ImproperlyFormatted) | Ok(_) => continue, } } @@ -926,6 +915,7 @@ pub fn escape_filename(filename: &Path) -> (String, &'static str) { #[cfg(test)] mod tests { use super::*; + use std::ffi::OsString; #[test] fn test_unescape_filename() { @@ -1161,66 +1151,39 @@ mod tests { #[test] fn test_determine_regex() { // Test algo-based regex - let lines_algo_based = ["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let (regex, algo_based) = determine_regex(&lines_algo_based).unwrap(); + let line_algo_based = + OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); + let (regex, algo_based) = determine_regex(&line_algo_based).unwrap(); assert!(algo_based); - assert!(regex.is_match(os_str_as_bytes(&lines_algo_based[0]).unwrap())); + assert!(regex.is_match(os_str_as_bytes(&line_algo_based).unwrap())); // Test double-space regex - let lines_double_space = ["d41d8cd98f00b204e9800998ecf8427e example.txt"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let (regex, algo_based) = determine_regex(&lines_double_space).unwrap(); + let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); + let (regex, algo_based) = determine_regex(&line_double_space).unwrap(); assert!(!algo_based); - assert!(regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap())); + assert!(regex.is_match(os_str_as_bytes(&line_double_space).unwrap())); // Test single-space regex - let lines_single_space = ["d41d8cd98f00b204e9800998ecf8427e example.txt"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let (regex, algo_based) = determine_regex(&lines_single_space).unwrap(); + let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); + let (regex, algo_based) = determine_regex(&line_single_space).unwrap(); assert!(!algo_based); - assert!(regex.is_match(os_str_as_bytes(&lines_single_space[0]).unwrap())); - - // Test double-space regex start with invalid - let lines_double_space = ["ERR", "d41d8cd98f00b204e9800998ecf8427e example.txt"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let (regex, algo_based) = determine_regex(&lines_double_space).unwrap(); - assert!(!algo_based); - assert!(!regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap())); - assert!(regex.is_match(os_str_as_bytes(&lines_double_space[1]).unwrap())); + assert!(regex.is_match(os_str_as_bytes(&line_single_space).unwrap())); // Test invalid checksum line - let lines_invalid = ["invalid checksum line"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - assert!(determine_regex(&lines_invalid).is_none()); + let line_invalid = OsString::from("invalid checksum line"); + assert!(determine_regex(&line_invalid).is_none()); // Test leading space before checksum line - let lines_algo_based_leading_space = - [" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let res = determine_regex(&lines_algo_based_leading_space); + let line_algo_based_leading_space = + OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); + let res = determine_regex(&line_algo_based_leading_space); assert!(res.is_some()); assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX); // Test trailing space after checksum line (should fail) - let lines_algo_based_leading_space = - ["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "] - .iter() - .map(|s| OsString::from(s.to_string())) - .collect::>(); - let res = determine_regex(&lines_algo_based_leading_space); + let line_algo_based_leading_space = + OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "); + let res = determine_regex(&line_algo_based_leading_space); assert!(res.is_none()); } From ed15ca1d264ce9b6faa5720d7215ed6707b77651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Thu, 5 Dec 2024 17:59:00 +0100 Subject: [PATCH 2/8] checksum: keep a cache of the first used regex for non-algo-based regexes --- src/uucore/src/lib/features/checksum.rs | 216 ++++++++++++++++-------- 1 file changed, 145 insertions(+), 71 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 34dc0f870..d575a4b38 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -6,8 +6,9 @@ use data_encoding::BASE64; use os_display::Quotable; -use regex::bytes::{Captures, Regex}; +use regex::bytes::{Match, Regex}; use std::{ + borrow::Cow, ffi::OsStr, fmt::Display, fs::File, @@ -427,6 +428,67 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P // In this case, we ignore the * const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?(?-u:.*))$"; +/// Hold the data extracted from a checksum line. +struct LineInfo { + algo_name: Option, + algo_bit_len: Option, + checksum: String, + filename: Vec, + + regex: Regex, +} + +impl LineInfo { + fn parse(s: impl AsRef, cached_regex: &mut Option) -> Option { + let regexes = [ + (Regex::new(ALGO_BASED_REGEX).unwrap(), true), + (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false), + (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false), + (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), false), + ]; + + let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed"); + + for (regex, algo_based) in ®exes { + if !regex.is_match(line_bytes) { + continue; + } + + let mut r = regex.clone(); + if !algo_based && cached_regex.is_some() { + r = cached_regex.clone().unwrap(); + } + + if let Some(caps) = r.captures(line_bytes) { + // These unwraps are safe thanks to the regex + let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap(); + + return Some(Self { + algo_name: caps.name("algo").map(match_to_string), + algo_bit_len: caps + .name("bits") + .map(|m| match_to_string(m).parse::().unwrap()), + checksum: caps.name("checksum").map(match_to_string).unwrap(), + filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(), + regex: r.clone(), + }); + } + } + + None + } + + #[inline] + fn is_algo_based(&self) -> bool { + self.algo_name.is_some() + } + + #[inline] + fn regex_str(&self) -> &str { + self.regex.as_str() + } +} + fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { if input_is_stdin { "standard input" @@ -437,34 +499,18 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { .to_string() } -/// Determines the appropriate regular expression to use based on the provided lines. -fn determine_regex(line: impl AsRef) -> Option<(Regex, bool)> { - let regexes = [ - (Regex::new(ALGO_BASED_REGEX).unwrap(), true), - (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false), - (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false), - (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), true), - ]; - - let line_bytes = os_str_as_bytes(line.as_ref()).expect("UTF-8 decoding failed"); - for (regex, is_algo_based) in ®exes { - if regex.is_match(line_bytes) { - return Some((regex.clone(), *is_algo_based)); - } - } - - None -} - /// Extract the expected digest from the checksum string -fn get_expected_digest_as_hex_string(caps: &Captures, chosen_regex: &Regex) -> Option { - // Unwraps are safe, ensured by regex. - let ck = caps.name("checksum").unwrap().as_bytes(); +fn get_expected_digest_as_hex_string(line_info: &LineInfo) -> Option> { + let ck = &line_info.checksum; - if chosen_regex.as_str() == ALGO_BASED_REGEX_BASE64 { - BASE64.decode(ck).map(hex::encode).ok() + if line_info.regex_str() == ALGO_BASED_REGEX_BASE64 { + BASE64 + .decode(ck.as_bytes()) + .map(hex::encode) + .map(Cow::Owned) + .ok() } else if ck.len() % 2 == 0 { - Some(str::from_utf8(ck).unwrap().to_string()) + Some(Cow::Borrowed(ck)) } else { // If the length of the digest is not a multiple of 2, then it // must be improperly formatted (1 hex digit is 2 characters) @@ -545,15 +591,14 @@ fn get_input_file(filename: &OsStr) -> UResult> { /// Extracts the algorithm name and length from the regex captures if the algo-based format is matched. fn identify_algo_name_and_length( - caps: &Captures, + line_info: &LineInfo, algo_name_input: Option<&str>, ) -> Option<(String, Option)> { // When the algo-based format is matched, extract details from regex captures - let algorithm = caps - .name("algo") - .map_or(String::new(), |m| { - String::from_utf8(m.as_bytes().into()).unwrap() - }) + let algorithm = line_info + .algo_name + .clone() + .unwrap_or_default() .to_lowercase(); // check if we are called with XXXsum (example: md5sum) but we detected a different algo parsing the file @@ -568,13 +613,9 @@ fn identify_algo_name_and_length( return None; } - let bits = caps.name("bits").map_or(Some(None), |m| { - let bits_value = String::from_utf8(m.as_bytes().into()) - .unwrap() - .parse::() - .unwrap(); - if bits_value % 8 == 0 { - Some(Some(bits_value / 8)) + let bits = line_info.algo_bitlen.map_or(Some(None), |bits| { + if bits % 8 == 0 { + Some(Some(bits / 8)) } else { None // Return None to signal a divisibility issue } @@ -597,6 +638,7 @@ fn process_checksum_line( cli_algo_name: Option<&str>, cli_algo_length: Option, opts: ChecksumOptions, + cached_regex: &mut Option, ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; @@ -605,26 +647,30 @@ fn process_checksum_line( return Err(LineCheckError::Skipped); } - let (chosen_regex, is_algo_based_format) = - determine_regex(line).ok_or(LineCheckError::ImproperlyFormatted)?; + if let Some(line_info) = LineInfo::parse(line, cached_regex) { + // The cached regex ensures that when processing non-algo based regexes, + // its cannot be changed (can't have single and double space regexes + // used in the same file). + if cached_regex.is_none() && !line_info.is_algo_based() { + let _ = cached_regex.insert(line_info.regex.clone()); + } - if let Some(caps) = chosen_regex.captures(line_bytes) { - let mut filename_to_check = caps.name("filename").unwrap().as_bytes(); + let mut filename_to_check = line_info.filename.as_slice(); if filename_to_check.starts_with(b"*") && i == 0 - && chosen_regex.as_str() == SINGLE_SPACE_REGEX + && line_info.regex_str() == SINGLE_SPACE_REGEX { // Remove the leading asterisk if present - only for the first line filename_to_check = &filename_to_check[1..]; } - let expected_checksum = get_expected_digest_as_hex_string(&caps, &chosen_regex) + let expected_checksum = get_expected_digest_as_hex_string(&line_info) .ok_or(LineCheckError::ImproperlyFormatted)?; // If the algo_name is provided, we use it, otherwise we try to detect it - let (algo_name, length) = if is_algo_based_format { - identify_algo_name_and_length(&caps, cli_algo_name) + let (algo_name, length) = if line_info.is_algo_based() { + identify_algo_name_and_length(&line_info, cli_algo_name) .ok_or(LineCheckError::ImproperlyFormatted)? } else if let Some(a) = cli_algo_name { // When a specific algorithm name is input, use it and use the provided bits @@ -721,6 +767,10 @@ fn process_checksum_file( let reader = BufReader::new(file); let lines = read_os_string_lines(reader).collect::>(); + // cached_regex is used to ensure that several non algo-based checksum line + // will use the same regex. + let mut cached_regex = None; + for (i, line) in lines.iter().enumerate() { let line_result = process_checksum_line( filename_input, @@ -729,6 +779,7 @@ fn process_checksum_file( cli_algo_name, cli_algo_length, opts, + &mut cached_regex, ); // Match a first time to elude critical UErrors, and increment the total @@ -1149,52 +1200,75 @@ mod tests { } #[test] - fn test_determine_regex() { + fn test_line_info() { + let mut cached_regex = None; + // Test algo-based regex let line_algo_based = OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); - let (regex, algo_based) = determine_regex(&line_algo_based).unwrap(); - assert!(algo_based); - assert!(regex.is_match(os_str_as_bytes(&line_algo_based).unwrap())); + let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap(); + assert!(line_info.is_algo_based()); + assert_eq!(line_info.algo_name.as_deref(), Some("MD5")); + assert!(line_info.algo_bit_len.is_none()); + assert_eq!(line_info.filename, b"example.txt"); + assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); + assert_eq!(line_info.regex_str(), ALGO_BASED_REGEX); + assert!(cached_regex.is_none()); // Test double-space regex let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); - let (regex, algo_based) = determine_regex(&line_double_space).unwrap(); - assert!(!algo_based); - assert!(regex.is_match(os_str_as_bytes(&line_double_space).unwrap())); + let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap(); + assert!(!line_info.is_algo_based()); + assert!(line_info.algo_name.is_none()); + assert!(line_info.algo_bit_len.is_none()); + assert_eq!(line_info.filename, b"example.txt"); + assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); + assert_eq!(line_info.regex_str(), DOUBLE_SPACE_REGEX); + assert!(cached_regex.is_some()); + + cached_regex = None; // Test single-space regex let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); - let (regex, algo_based) = determine_regex(&line_single_space).unwrap(); - assert!(!algo_based); - assert!(regex.is_match(os_str_as_bytes(&line_single_space).unwrap())); + let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap(); + assert!(!line_info.is_algo_based()); + assert!(line_info.algo_name.is_none()); + assert!(line_info.algo_bit_len.is_none()); + assert_eq!(line_info.filename, b"example.txt"); + assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); + assert_eq!(line_info.regex_str(), SINGLE_SPACE_REGEX); + assert!(cached_regex.is_some()); + + cached_regex = None; // Test invalid checksum line let line_invalid = OsString::from("invalid checksum line"); - assert!(determine_regex(&line_invalid).is_none()); + assert!(LineInfo::parse(&line_invalid, &mut cached_regex).is_none()); + assert!(cached_regex.is_none()); // Test leading space before checksum line let line_algo_based_leading_space = OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); - let res = determine_regex(&line_algo_based_leading_space); + let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex); assert!(res.is_some()); - assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX); + assert_eq!(res.unwrap().regex_str(), ALGO_BASED_REGEX); + assert!(cached_regex.is_none()); // Test trailing space after checksum line (should fail) let line_algo_based_leading_space = OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "); - let res = determine_regex(&line_algo_based_leading_space); + let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex); assert!(res.is_none()); + assert!(cached_regex.is_none()); } #[test] fn test_get_expected_digest() { - let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); - let caps = re - .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=") - .unwrap(); + let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU="); + let mut cached_regex = None; + let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); - let result = get_expected_digest_as_hex_string(&caps, &re); + let result = get_expected_digest_as_hex_string(&line_info); assert_eq!( result.unwrap(), @@ -1204,12 +1278,12 @@ mod tests { #[test] fn test_get_expected_checksum_invalid() { - let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); - let caps = re - .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU") - .unwrap(); + // The line misses a '=' at the end to be valid base64 + let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU"); + let mut cached_regex = None; + let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); - let result = get_expected_digest_as_hex_string(&caps, &re); + let result = get_expected_digest_as_hex_string(&line_info); assert!(result.is_none()); } From 65ddccbeb6a4e9bc45b0fc0184209a08387da411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Thu, 5 Dec 2024 18:19:50 +0100 Subject: [PATCH 3/8] checksum: avoid to recompute Regexps --- Cargo.lock | 1 + src/uucore/Cargo.toml | 1 + src/uucore/src/lib/features/checksum.rs | 40 ++++++++++++++++--------- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb09c5fd6..611cd240c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3573,6 +3573,7 @@ dependencies = [ "glob", "hex", "itertools", + "lazy_static", "libc", "md-5", "memchr", diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index b72a8ed71..a4529f3a5 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -25,6 +25,7 @@ dns-lookup = { workspace = true, optional = true } dunce = { version = "1.0.4", optional = true } wild = "2.2.1" glob = { workspace = true } +lazy_static = "1.4.0" # * optional itertools = { workspace = true, optional = true } thiserror = { workspace = true, optional = true } diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index d575a4b38..dec5fcf21 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -5,6 +5,7 @@ // spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename use data_encoding::BASE64; +use lazy_static::lazy_static; use os_display::Quotable; use regex::bytes::{Match, Regex}; use std::{ @@ -428,6 +429,13 @@ const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P // In this case, we ignore the * const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?(?-u:.*))$"; +lazy_static! { + static ref R_ALGO_BASED: Regex = Regex::new(ALGO_BASED_REGEX).unwrap(); + static ref R_DOUBLE_SPACE: Regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); + static ref R_SINGLE_SPACE: Regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); + static ref R_ALGO_BASED_BASE_64: Regex = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); +} + /// Hold the data extracted from a checksum line. struct LineInfo { algo_name: Option, @@ -435,28 +443,32 @@ struct LineInfo { checksum: String, filename: Vec, - regex: Regex, + regex: &'static Regex, } impl LineInfo { - fn parse(s: impl AsRef, cached_regex: &mut Option) -> Option { - let regexes = [ - (Regex::new(ALGO_BASED_REGEX).unwrap(), true), - (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false), - (Regex::new(SINGLE_SPACE_REGEX).unwrap(), false), - (Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(), false), + fn parse(s: impl AsRef, cached_regex: &mut Option<&'static Regex>) -> Option { + let regexes: &[(&'static Regex, bool)] = &[ + (&R_ALGO_BASED, true), + (&R_DOUBLE_SPACE, false), + (&R_SINGLE_SPACE, false), + (&R_ALGO_BASED_BASE_64, true), ]; let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed"); - for (regex, algo_based) in ®exes { + for (regex, algo_based) in regexes { if !regex.is_match(line_bytes) { continue; } - let mut r = regex.clone(); - if !algo_based && cached_regex.is_some() { - r = cached_regex.clone().unwrap(); + let mut r = *regex; + if !algo_based { + if cached_regex.is_some() { + r = cached_regex.unwrap(); + } else { + *cached_regex = Some(r); + } } if let Some(caps) = r.captures(line_bytes) { @@ -470,7 +482,7 @@ impl LineInfo { .map(|m| match_to_string(m).parse::().unwrap()), checksum: caps.name("checksum").map(match_to_string).unwrap(), filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(), - regex: r.clone(), + regex: r, }); } } @@ -638,7 +650,7 @@ fn process_checksum_line( cli_algo_name: Option<&str>, cli_algo_length: Option, opts: ChecksumOptions, - cached_regex: &mut Option, + cached_regex: &mut Option<&'static Regex>, ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; @@ -652,7 +664,7 @@ fn process_checksum_line( // its cannot be changed (can't have single and double space regexes // used in the same file). if cached_regex.is_none() && !line_info.is_algo_based() { - let _ = cached_regex.insert(line_info.regex.clone()); + let _ = cached_regex.insert(line_info.regex); } let mut filename_to_check = line_info.filename.as_slice(); From df16c1c65560b42f0a4471876f7e3203a6c05a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Thu, 5 Dec 2024 17:59:35 +0100 Subject: [PATCH 4/8] test(cksum): Add tests --- tests/by-util/test_cksum.rs | 81 ++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/tests/by-util/test_cksum.rs b/tests/by-util/test_cksum.rs index bf74de9cc..6c1718112 100644 --- a/tests/by-util/test_cksum.rs +++ b/tests/by-util/test_cksum.rs @@ -1443,7 +1443,7 @@ mod check_utf8 { let scene = TestScenario::new(util_name!()); let at = &scene.fixtures; let filename: OsString = OsStringExt::from_vec(b"funky\xffname".to_vec()); - at.touch(&filename); + at.touch(filename); // Checksum match at.write_bytes("check", @@ -1544,7 +1544,6 @@ fn test_check_confusing_base64() { /// This test checks that when a file contains several checksum lines /// with different encoding, the decoding still works. -#[ignore = "not yet implemented"] #[test] fn test_check_mix_hex_base64() { let b64 = "BLAKE2b-128 (foo1.dat) = BBNuJPhdRwRlw9tm5Y7VbA=="; @@ -1769,3 +1768,81 @@ mod gnu_cksum_base64 { } } } + +/// The tests in this module check the behavior of cksum when given different +/// checksum formats and algorithms in the same file, while specifying an +/// algorithm on CLI or not. +mod format_mix { + use super::*; + + // First line is algo-based, second one is not + const INPUT_ALGO_NON_ALGO: &str = "\ + BLAKE2b (bar) = 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce\n\ + 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce foo"; + + // First line is non algo-based, second one is + const INPUT_NON_ALGO_ALGO: &str = "\ + 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce foo\n\ + BLAKE2b (bar) = 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce"; + + /// Make a simple scene with foo and bar empty files + fn make_scene() -> TestScenario { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + + at.touch("foo"); + at.touch("bar"); + + scene + } + + #[test] + fn test_check_cli_algo_non_algo() { + let scene = make_scene(); + scene + .ucmd() + .arg("--check") + .arg("--algo=blake2b") + .pipe_in(INPUT_ALGO_NON_ALGO) + .succeeds() + .stdout_contains("bar: OK\nfoo: OK") + .no_stderr(); + } + + #[test] + fn test_check_cli_non_algo_algo() { + let scene = make_scene(); + scene + .ucmd() + .arg("--check") + .arg("--algo=blake2b") + .pipe_in(INPUT_NON_ALGO_ALGO) + .succeeds() + .stdout_contains("foo: OK\nbar: OK") + .no_stderr(); + } + + #[test] + fn test_check_algo_non_algo() { + let scene = make_scene(); + scene + .ucmd() + .arg("--check") + .pipe_in(INPUT_ALGO_NON_ALGO) + .succeeds() + .stdout_contains("bar: OK") + .stderr_contains("cksum: WARNING: 1 line is improperly formatted"); + } + + #[test] + fn test_check_non_algo_algo() { + let scene = make_scene(); + scene + .ucmd() + .arg("--check") + .pipe_in(INPUT_NON_ALGO_ALGO) + .succeeds() + .stdout_contains("bar: OK") + .stderr_contains("cksum: WARNING: 1 line is improperly formatted"); + } +} From 10a9b0bfbf237ed37b298ffcd932c6b31e17a18a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Thu, 5 Dec 2024 18:22:52 +0100 Subject: [PATCH 5/8] checksum: split treatment of algo-based and non-algo based into separate functions --- src/uucore/src/lib/features/checksum.rs | 172 ++++++++++++++---------- 1 file changed, 102 insertions(+), 70 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index dec5fcf21..0da814a76 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -464,6 +464,9 @@ impl LineInfo { let mut r = *regex; if !algo_based { + // The cached regex ensures that when processing non-algo based regexes, + // its cannot be changed (can't have single and double space regexes + // used in the same file). if cached_regex.is_some() { r = cached_regex.unwrap(); } else { @@ -636,13 +639,101 @@ fn identify_algo_name_and_length( Some((algorithm, bits)) } +/// Given a filename and an algorithm, compute the digest and compare it with +/// the expected one. +fn compute_and_check_digest_from_file( + filename: &[u8], + expected_checksum: &str, + mut algo: HashAlgorithm, + opts: ChecksumOptions, +) -> Result<(), LineCheckError> { + let (filename_to_check_unescaped, prefix) = unescape_filename(filename); + let real_filename_to_check = os_str_from_bytes(&filename_to_check_unescaped)?; + + // Open the input file + let file_to_check = get_file_to_check(&real_filename_to_check, opts)?; + let mut file_reader = BufReader::new(file_to_check); + + // Read the file and calculate the checksum + let create_fn = &mut algo.create_fn; + let mut digest = create_fn(); + let (calculated_checksum, _) = + digest_reader(&mut digest, &mut file_reader, opts.binary, algo.bits).unwrap(); + + // Do the checksum validation + let checksum_correct = expected_checksum == calculated_checksum; + print_file_report( + std::io::stdout(), + filename, + FileChecksumResult::from_bool(checksum_correct), + prefix, + opts, + ); + + if checksum_correct { + Ok(()) + } else { + Err(LineCheckError::DigestMismatch) + } +} + +/// Check a digest checksum with non-algo based pre-treatment. +fn process_algo_based_line( + line_info: &LineInfo, + cli_algo_name: Option<&str>, + opts: ChecksumOptions, +) -> Result<(), LineCheckError> { + let filename_to_check = line_info.filename.as_slice(); + let expected_checksum = + get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?; + + let (algo_name, algo_bitlen) = identify_algo_name_and_length(line_info, cli_algo_name) + .ok_or(LineCheckError::ImproperlyFormatted)?; + + let algo = detect_algo(&algo_name, algo_bitlen)?; + + compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts) +} + +/// Check a digest checksum with non-algo based pre-treatment. +fn process_non_algo_based_line( + i: usize, + line_info: &LineInfo, + cli_algo_name: &str, + cli_algo_length: Option, + opts: ChecksumOptions, +) -> Result<(), LineCheckError> { + let mut filename_to_check = line_info.filename.as_slice(); + if filename_to_check.starts_with(b"*") && i == 0 && line_info.regex_str() == SINGLE_SPACE_REGEX + { + // Remove the leading asterisk if present - only for the first line + filename_to_check = &filename_to_check[1..]; + } + let expected_checksum = + get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?; + + // When a specific algorithm name is input, use it and use the provided bits + // except when dealing with blake2b, where we will detect the length + let (algo_name, algo_bitlen) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B { + // division by 2 converts the length of the Blake2b checksum from hexadecimal + // characters to bytes, as each byte is represented by two hexadecimal characters. + let length = Some(expected_checksum.len() / 2); + (ALGORITHM_OPTIONS_BLAKE2B.to_string(), length) + } else { + (cli_algo_name.to_lowercase(), cli_algo_length) + }; + + let algo = detect_algo(&algo_name, algo_bitlen)?; + + compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts) +} + /// Parses a checksum line, detect the algorithm to use, read the file and produce /// its digest, and compare it to the expected value. /// /// Returns `Ok(bool)` if the comparison happened, bool indicates if the digest /// matched the expected. /// If the comparison didn't happen, return a `LineChecksumError`. -#[allow(clippy::too_many_arguments)] fn process_checksum_line( filename_input: &OsStr, line: &OsStr, @@ -654,82 +745,23 @@ fn process_checksum_line( ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; - // early return on empty or commented lines. + // Early return on empty or commented lines. if line.is_empty() || line_bytes.starts_with(b"#") { return Err(LineCheckError::Skipped); } + // Use `LineInfo` to extract the data of a line. + // Then, depending on its format, apply a different pre-treatment. if let Some(line_info) = LineInfo::parse(line, cached_regex) { - // The cached regex ensures that when processing non-algo based regexes, - // its cannot be changed (can't have single and double space regexes - // used in the same file). - if cached_regex.is_none() && !line_info.is_algo_based() { - let _ = cached_regex.insert(line_info.regex); - } - - let mut filename_to_check = line_info.filename.as_slice(); - - if filename_to_check.starts_with(b"*") - && i == 0 - && line_info.regex_str() == SINGLE_SPACE_REGEX - { - // Remove the leading asterisk if present - only for the first line - filename_to_check = &filename_to_check[1..]; - } - - let expected_checksum = get_expected_digest_as_hex_string(&line_info) - .ok_or(LineCheckError::ImproperlyFormatted)?; - - // If the algo_name is provided, we use it, otherwise we try to detect it - let (algo_name, length) = if line_info.is_algo_based() { - identify_algo_name_and_length(&line_info, cli_algo_name) - .ok_or(LineCheckError::ImproperlyFormatted)? - } else if let Some(a) = cli_algo_name { - // When a specific algorithm name is input, use it and use the provided bits - // except when dealing with blake2b, where we will detect the length - if cli_algo_name == Some(ALGORITHM_OPTIONS_BLAKE2B) { - // division by 2 converts the length of the Blake2b checksum from hexadecimal - // characters to bytes, as each byte is represented by two hexadecimal characters. - let length = Some(expected_checksum.len() / 2); - (ALGORITHM_OPTIONS_BLAKE2B.to_string(), length) - } else { - (a.to_lowercase(), cli_algo_length) - } + if line_info.is_algo_based() { + process_algo_based_line(&line_info, cli_algo_name, opts) + } else if let Some(cli_algo) = cli_algo_name { + // If we match a non-algo based regex, we expect a cli argument + // to give us the algorithm to use + process_non_algo_based_line(i, &line_info, cli_algo, cli_algo_length, opts) } else { - // Default case if no algorithm is specified and non-algo based format is matched + // We have no clue of what algorithm to use return Err(LineCheckError::ImproperlyFormatted); - }; - - let mut algo = detect_algo(&algo_name, length)?; - - let (filename_to_check_unescaped, prefix) = unescape_filename(filename_to_check); - - let real_filename_to_check = os_str_from_bytes(&filename_to_check_unescaped)?; - - // manage the input file - let file_to_check = get_file_to_check(&real_filename_to_check, opts)?; - let mut file_reader = BufReader::new(file_to_check); - - // Read the file and calculate the checksum - let create_fn = &mut algo.create_fn; - let mut digest = create_fn(); - let (calculated_checksum, _) = - digest_reader(&mut digest, &mut file_reader, opts.binary, algo.bits).unwrap(); - - // Do the checksum validation - let checksum_correct = expected_checksum == calculated_checksum; - print_file_report( - std::io::stdout(), - filename_to_check, - FileChecksumResult::from_bool(checksum_correct), - prefix, - opts, - ); - - if checksum_correct { - Ok(()) - } else { - Err(LineCheckError::DigestMismatch) } } else { if opts.warn { From f4e5dc2e0fa8ae5e2a4ad23fa4dd0f0a44e12253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Sat, 7 Dec 2024 02:29:54 +0100 Subject: [PATCH 6/8] checksum: use the blake2b length as an hint to check the correctness of the expected digest --- src/uucore/src/lib/features/checksum.rs | 95 +++++++++++++++++-------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 0da814a76..8de983490 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename +// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit use data_encoding::BASE64; use lazy_static::lazy_static; @@ -515,22 +515,43 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { } /// Extract the expected digest from the checksum string -fn get_expected_digest_as_hex_string(line_info: &LineInfo) -> Option> { +fn get_expected_digest_as_hex_string( + line_info: &LineInfo, + len_hint: Option, +) -> Option> { let ck = &line_info.checksum; - if line_info.regex_str() == ALGO_BASED_REGEX_BASE64 { - BASE64 - .decode(ck.as_bytes()) - .map(hex::encode) - .map(Cow::Owned) - .ok() - } else if ck.len() % 2 == 0 { - Some(Cow::Borrowed(ck)) - } else { + // TODO MSRV 1.82, replace `is_some_and` with `is_none_or` + // to improve readability. This closure returns True if a length hint provided + // and the argument isn't the same as the hint. + let against_hint = |len| len_hint.is_some_and(|l| l != len); + + if ck.len() % 2 != 0 { // If the length of the digest is not a multiple of 2, then it // must be improperly formatted (1 hex digit is 2 characters) - None + return None; } + + // If the digest can be decoded as hexadecimal AND it length match the + // one expected (in case it's given), just go with it. + if ck.as_bytes().iter().all(u8::is_ascii_hexdigit) && !against_hint(ck.len()) { + return Some(Cow::Borrowed(ck)); + } + + // If hexadecimal digest fails for any reason, interpret the digest as base 64. + BASE64 + .decode(ck.as_bytes()) // Decode the string as encoded base64 + .map(hex::encode) // Encode it back as hexadecimal + .map(Cow::::Owned) + .ok() + .and_then(|s| { + // Check the digest length + if !against_hint(s.len()) { + Some(s) + } else { + None + } + }) } /// Returns a reader that reads from the specified file, or from stdin if `filename_to_check` is "-". @@ -604,12 +625,11 @@ fn get_input_file(filename: &OsStr) -> UResult> { } } -/// Extracts the algorithm name and length from the regex captures if the algo-based format is matched. +/// Gets the algorithm name and length from the `LineInfo` if the algo-based format is matched. fn identify_algo_name_and_length( line_info: &LineInfo, algo_name_input: Option<&str>, ) -> Option<(String, Option)> { - // When the algo-based format is matched, extract details from regex captures let algorithm = line_info .algo_name .clone() @@ -628,15 +648,20 @@ fn identify_algo_name_and_length( return None; } - let bits = line_info.algo_bitlen.map_or(Some(None), |bits| { - if bits % 8 == 0 { - Some(Some(bits / 8)) - } else { - None // Return None to signal a divisibility issue + let bytes = if let Some(bitlen) = line_info.algo_bit_len { + if bitlen % 8 != 0 { + // The given length is wrong + return None; } - })?; + Some(bitlen / 8) + } else if algorithm == ALGORITHM_OPTIONS_BLAKE2B { + // Default length with BLAKE2b, + Some(64) + } else { + None + }; - Some((algorithm, bits)) + Some((algorithm, bytes)) } /// Given a filename and an algorithm, compute the digest and compare it with @@ -684,13 +709,21 @@ fn process_algo_based_line( opts: ChecksumOptions, ) -> Result<(), LineCheckError> { let filename_to_check = line_info.filename.as_slice(); - let expected_checksum = - get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?; - let (algo_name, algo_bitlen) = identify_algo_name_and_length(line_info, cli_algo_name) + let (algo_name, algo_byte_len) = identify_algo_name_and_length(line_info, cli_algo_name) .ok_or(LineCheckError::ImproperlyFormatted)?; - let algo = detect_algo(&algo_name, algo_bitlen)?; + // If the digest bitlen is known, we can check the format of the expected + // checksum with it. + let digest_char_length_hint = match (algo_name.as_str(), algo_byte_len) { + (ALGORITHM_OPTIONS_BLAKE2B, Some(bytelen)) => Some(bytelen * 2), + _ => None, + }; + + let expected_checksum = get_expected_digest_as_hex_string(line_info, digest_char_length_hint) + .ok_or(LineCheckError::ImproperlyFormatted)?; + + let algo = detect_algo(&algo_name, algo_byte_len)?; compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts) } @@ -709,12 +742,12 @@ fn process_non_algo_based_line( // Remove the leading asterisk if present - only for the first line filename_to_check = &filename_to_check[1..]; } - let expected_checksum = - get_expected_digest_as_hex_string(line_info).ok_or(LineCheckError::ImproperlyFormatted)?; + let expected_checksum = get_expected_digest_as_hex_string(line_info, None) + .ok_or(LineCheckError::ImproperlyFormatted)?; // When a specific algorithm name is input, use it and use the provided bits // except when dealing with blake2b, where we will detect the length - let (algo_name, algo_bitlen) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B { + let (algo_name, algo_byte_len) = if cli_algo_name == ALGORITHM_OPTIONS_BLAKE2B { // division by 2 converts the length of the Blake2b checksum from hexadecimal // characters to bytes, as each byte is represented by two hexadecimal characters. let length = Some(expected_checksum.len() / 2); @@ -723,7 +756,7 @@ fn process_non_algo_based_line( (cli_algo_name.to_lowercase(), cli_algo_length) }; - let algo = detect_algo(&algo_name, algo_bitlen)?; + let algo = detect_algo(&algo_name, algo_byte_len)?; compute_and_check_digest_from_file(filename_to_check, &expected_checksum, algo, opts) } @@ -1312,7 +1345,7 @@ mod tests { let mut cached_regex = None; let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); - let result = get_expected_digest_as_hex_string(&line_info); + let result = get_expected_digest_as_hex_string(&line_info, None); assert_eq!( result.unwrap(), @@ -1327,7 +1360,7 @@ mod tests { let mut cached_regex = None; let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); - let result = get_expected_digest_as_hex_string(&line_info); + let result = get_expected_digest_as_hex_string(&line_info, None); assert!(result.is_none()); } From 567bbc5f3c8a242607881554728a420b68613792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Sat, 7 Dec 2024 02:38:00 +0100 Subject: [PATCH 7/8] checksum: remove ALGO_BASED_REGEX (non base64) as its not useful anymore and introduce LineFormat struct --- src/uucore/src/lib/features/checksum.rs | 85 ++++++++++++++----------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 8de983490..e19845f18 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -421,8 +421,7 @@ pub fn detect_algo(algo: &str, length: Option) -> UResult // algo must be uppercase or b (for blake2b) // 2. [* ] // 3. [*] (only one space) -const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[a-fA-F0-9]+)$"; -const ALGO_BASED_REGEX_BASE64: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[A-Za-z0-9+/]+={0,2})$"; +const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[A-Za-z0-9+/]+={0,2})$"; const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P(?-u:.*))$"; @@ -433,7 +432,23 @@ lazy_static! { static ref R_ALGO_BASED: Regex = Regex::new(ALGO_BASED_REGEX).unwrap(); static ref R_DOUBLE_SPACE: Regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); static ref R_SINGLE_SPACE: Regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); - static ref R_ALGO_BASED_BASE_64: Regex = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +enum LineFormat { + AlgoBased, + SingleSpace, + DoubleSpace, +} + +impl LineFormat { + fn to_regex(self) -> &'static Regex { + match self { + LineFormat::AlgoBased => &R_ALGO_BASED, + LineFormat::SingleSpace => &R_SINGLE_SPACE, + LineFormat::DoubleSpace => &R_DOUBLE_SPACE, + } + } } /// Hold the data extracted from a checksum line. @@ -443,34 +458,41 @@ struct LineInfo { checksum: String, filename: Vec, - regex: &'static Regex, + format: LineFormat, } impl LineInfo { - fn parse(s: impl AsRef, cached_regex: &mut Option<&'static Regex>) -> Option { - let regexes: &[(&'static Regex, bool)] = &[ - (&R_ALGO_BASED, true), - (&R_DOUBLE_SPACE, false), - (&R_SINGLE_SPACE, false), - (&R_ALGO_BASED_BASE_64, true), + /// Returns a `LineInfo` parsed from a checksum line. + /// The function will run 3 regexes against the line and select the first one that matches + /// to populate the fields of the struct. + /// However, there is a catch to handle regarding the handling of `cached_regex`. + /// In case of non-algo-based regex, if `cached_regex` is Some, it must take the priority + /// over the detected regex. Otherwise, we must set it the the detected regex. + /// This specific behavior is emphasized by the test + /// `test_hashsum::test_check_md5sum_only_one_space`. + fn parse(s: impl AsRef, cached_regex: &mut Option) -> Option { + let regexes: &[(&'static Regex, LineFormat)] = &[ + (&R_ALGO_BASED, LineFormat::AlgoBased), + (&R_DOUBLE_SPACE, LineFormat::DoubleSpace), + (&R_SINGLE_SPACE, LineFormat::SingleSpace), ]; let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed"); - for (regex, algo_based) in regexes { + for (regex, format) in regexes { if !regex.is_match(line_bytes) { continue; } let mut r = *regex; - if !algo_based { + if *format != LineFormat::AlgoBased { // The cached regex ensures that when processing non-algo based regexes, - // its cannot be changed (can't have single and double space regexes + // it cannot be changed (can't have single and double space regexes // used in the same file). if cached_regex.is_some() { - r = cached_regex.unwrap(); + r = cached_regex.unwrap().to_regex(); } else { - *cached_regex = Some(r); + *cached_regex = Some(*format); } } @@ -485,23 +507,13 @@ impl LineInfo { .map(|m| match_to_string(m).parse::().unwrap()), checksum: caps.name("checksum").map(match_to_string).unwrap(), filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(), - regex: r, + format: *format, }); } } None } - - #[inline] - fn is_algo_based(&self) -> bool { - self.algo_name.is_some() - } - - #[inline] - fn regex_str(&self) -> &str { - self.regex.as_str() - } } fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { @@ -730,14 +742,16 @@ fn process_algo_based_line( /// Check a digest checksum with non-algo based pre-treatment. fn process_non_algo_based_line( - i: usize, + line_number: usize, line_info: &LineInfo, cli_algo_name: &str, cli_algo_length: Option, opts: ChecksumOptions, ) -> Result<(), LineCheckError> { let mut filename_to_check = line_info.filename.as_slice(); - if filename_to_check.starts_with(b"*") && i == 0 && line_info.regex_str() == SINGLE_SPACE_REGEX + if filename_to_check.starts_with(b"*") + && line_number == 0 + && line_info.format == LineFormat::SingleSpace { // Remove the leading asterisk if present - only for the first line filename_to_check = &filename_to_check[1..]; @@ -774,7 +788,7 @@ fn process_checksum_line( cli_algo_name: Option<&str>, cli_algo_length: Option, opts: ChecksumOptions, - cached_regex: &mut Option<&'static Regex>, + cached_regex: &mut Option, ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; @@ -786,7 +800,7 @@ fn process_checksum_line( // Use `LineInfo` to extract the data of a line. // Then, depending on its format, apply a different pre-treatment. if let Some(line_info) = LineInfo::parse(line, cached_regex) { - if line_info.is_algo_based() { + if line_info.format == LineFormat::AlgoBased { process_algo_based_line(&line_info, cli_algo_name, opts) } else if let Some(cli_algo) = cli_algo_name { // If we match a non-algo based regex, we expect a cli argument @@ -1284,23 +1298,21 @@ mod tests { let line_algo_based = OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap(); - assert!(line_info.is_algo_based()); assert_eq!(line_info.algo_name.as_deref(), Some("MD5")); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); - assert_eq!(line_info.regex_str(), ALGO_BASED_REGEX); + assert_eq!(line_info.format, LineFormat::AlgoBased); assert!(cached_regex.is_none()); // Test double-space regex let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap(); - assert!(!line_info.is_algo_based()); assert!(line_info.algo_name.is_none()); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); - assert_eq!(line_info.regex_str(), DOUBLE_SPACE_REGEX); + assert_eq!(line_info.format, LineFormat::DoubleSpace); assert!(cached_regex.is_some()); cached_regex = None; @@ -1308,12 +1320,11 @@ mod tests { // Test single-space regex let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap(); - assert!(!line_info.is_algo_based()); assert!(line_info.algo_name.is_none()); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); - assert_eq!(line_info.regex_str(), SINGLE_SPACE_REGEX); + assert_eq!(line_info.format, LineFormat::SingleSpace); assert!(cached_regex.is_some()); cached_regex = None; @@ -1328,7 +1339,7 @@ mod tests { OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex); assert!(res.is_some()); - assert_eq!(res.unwrap().regex_str(), ALGO_BASED_REGEX); + assert_eq!(line_info.format, LineFormat::AlgoBased); assert!(cached_regex.is_none()); // Test trailing space after checksum line (should fail) From 958222a07ccc25940a8aa7304bfe22cd41fd904b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Sat, 7 Dec 2024 02:30:30 +0100 Subject: [PATCH 8/8] test(cksum): un-ignore tests that are now implemented --- src/uucore/src/lib/features/checksum.rs | 3 +-- tests/by-util/test_cksum.rs | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index e19845f18..0b3e4e249 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -1337,8 +1337,7 @@ mod tests { // Test leading space before checksum line let line_algo_based_leading_space = OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); - let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex); - assert!(res.is_some()); + let line_info = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex).unwrap(); assert_eq!(line_info.format, LineFormat::AlgoBased); assert!(cached_regex.is_none()); diff --git a/tests/by-util/test_cksum.rs b/tests/by-util/test_cksum.rs index 6c1718112..2efc78b96 100644 --- a/tests/by-util/test_cksum.rs +++ b/tests/by-util/test_cksum.rs @@ -1480,7 +1480,6 @@ mod check_utf8 { } } -#[ignore = "not yet implemented"] #[test] fn test_check_blake_length_guess() { let correct_lines = [ @@ -1523,7 +1522,6 @@ fn test_check_blake_length_guess() { .stderr_contains("foo.sums: no properly formatted checksum lines found"); } -#[ignore = "not yet implemented"] #[test] fn test_check_confusing_base64() { let cksum = "BLAKE2b-48 (foo.dat) = fc1f97C4";