From 621f2b5c7a12e9ae20c78e8abd72c55ba8205e09 Mon Sep 17 00:00:00 2001 From: GTimothy <22472919+GTimothy@users.noreply.github.com> Date: Mon, 24 Mar 2025 22:49:31 +0100 Subject: [PATCH 1/3] checksum/cksum: rewrite lineformat parsing without regex removes dependency on the regex crate for LineFormat detection and parsing, resulting in a faster and lighter cksum binary. --- src/uucore/Cargo.toml | 2 +- src/uucore/src/lib/features/checksum.rs | 254 ++++++++++++++++-------- 2 files changed, 176 insertions(+), 80 deletions(-) diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index 57399bcac..c061ca75f 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -89,7 +89,7 @@ default = [] # * non-default features backup-control = [] colors = [] -checksum = ["data-encoding", "thiserror", "regex", "sum"] +checksum = ["data-encoding", "thiserror", "sum"] encoding = ["data-encoding", "data-encoding-macro", "z85"] entries = ["libc"] fs = ["dunce", "libc", "winapi-util", "windows-sys"] diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index e56eedd4b..25eddd0ac 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -2,11 +2,10 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit +// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit use data_encoding::BASE64; use os_display::Quotable; -use regex::bytes::{Match, Regex}; use std::{ borrow::Cow, ffi::OsStr, @@ -15,7 +14,6 @@ use std::{ io::{self, BufReader, Read, Write, stdin}, path::Path, str, - sync::LazyLock, }; use crate::{ @@ -466,36 +464,157 @@ pub fn detect_algo(algo: &str, length: Option) -> UResult } } -// Regexp to handle the three input formats: -// 1. [-] () = -// algo must be uppercase or b (for blake2b) -// 2. [* ] -// 3. [*] (only one space) -const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[A-Za-z0-9+/]+={0,2})$"; - -const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P(?-u:.*))$"; - -// In this case, we ignore the * -const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?(?-u:.*))$"; - -static R_ALGO_BASED: LazyLock = LazyLock::new(|| Regex::new(ALGO_BASED_REGEX).unwrap()); -static R_DOUBLE_SPACE: LazyLock = LazyLock::new(|| Regex::new(DOUBLE_SPACE_REGEX).unwrap()); -static R_SINGLE_SPACE: LazyLock = LazyLock::new(|| Regex::new(SINGLE_SPACE_REGEX).unwrap()); - #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum LineFormat { AlgoBased, SingleSpace, - DoubleSpace, + Untagged, } impl LineFormat { - fn to_regex(self) -> &'static Regex { - match self { - LineFormat::AlgoBased => &R_ALGO_BASED, - LineFormat::SingleSpace => &R_SINGLE_SPACE, - LineFormat::DoubleSpace => &R_DOUBLE_SPACE, + /// parse [tagged output format] + /// Normally the format is simply space separated but openssl does not + /// respect the gnu definition. + /// + /// [tagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1 + fn parse_algo_based(line: &[u8]) -> Option { + // r"\MD5 (a\\ b) = abc123", + // BLAKE2b(44)= a45a4c4883cce4b50d844fab460414cc2080ca83690e74d850a9253e757384366382625b218c8585daee80f34dc9eb2f2fde5fb959db81cd48837f9216e7b0fa + let trimmed = line.trim_ascii_start(); + let algo_start = if trimmed.starts_with(b"\\") { 1 } else { 0 }; + let rest = &trimmed[algo_start..]; + + // find the next parenthesis using byte search (not next whitespace) because openssl's + // tagged format does not put a space before (filename) + let par_idx = rest.iter().position(|&b| b == b'(')?; + let algo_substring = &rest[..par_idx].trim_ascii(); + let mut algo_parts = algo_substring.splitn(2, |&b| b == b'-'); + let algo = algo_parts.next()?; + + // Parse algo_bits if present + let algo_bits = algo_parts + .next() + .and_then(|s| std::str::from_utf8(s).ok()?.parse::().ok()); + + // Check algo format: uppercase ASCII or digits or "BLAKE2b" + let is_valid_algo = algo == b"BLAKE2b" + || algo + .iter() + .all(|&b| b.is_ascii_uppercase() || b.is_ascii_digit()); + if !is_valid_algo { + return None; } + // SAFETY: we just validated the contents of algo, we can unsafely make a + // String from it + let algo_utf8 = unsafe { String::from_utf8_unchecked(algo.to_vec()) }; + // stripping '(' not ' (' since we matched on ( not whitespace because of openssl. + let after_paren = rest.get(par_idx + 1..)?; + let (filename, checksum) = ByteSliceExt::split_once(after_paren, b") = ") + .or_else(|| ByteSliceExt::split_once(after_paren, b")= "))?; + + fn is_valid_checksum(checksum: &[u8]) -> bool { + if checksum.is_empty() { + return false; + } + + let mut parts = checksum.splitn(2, |&b| b == b'='); + let main = parts.next().unwrap(); // Always exists since checksum isn't empty + let padding = parts.next().unwrap_or(&b""[..]); // Empty if no '=' + + main.iter() + .all(|&b| b.is_ascii_alphanumeric() || b == b'+' || b == b'/') + && !main.is_empty() + && padding.len() <= 2 + && padding.iter().all(|&b| b == b'=') + } + if !is_valid_checksum(checksum) { + return None; + } + // SAFETY: we just validated the contents of checksum, we can unsafely make a + // String from it + let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) }; + + Some(LineInfo { + algo_name: Some(algo_utf8), + algo_bit_len: algo_bits, + checksum: checksum_utf8, + filename: filename.to_vec(), + format: LineFormat::AlgoBased, + }) + } + + #[allow(rustdoc::invalid_html_tags)] + /// parse [untagged output format] + /// The format is simple, either " " or + /// " *" + /// + /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1 + fn parse_untagged(line: &[u8]) -> Option { + let space_idx = line.iter().position(|&b| b == b' ')?; + let checksum = &line[..space_idx]; + if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() { + return None; + } + // SAFETY: we just validated the contents of checksum, we can unsafely make a + // String from it + let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) }; + + let rest = &line[space_idx..]; + let filename = rest + .strip_prefix(b" ") + .or_else(|| rest.strip_prefix(b" *"))?; + + Some(LineInfo { + algo_name: None, + algo_bit_len: None, + checksum: checksum_utf8, + filename: filename.to_vec(), + format: LineFormat::Untagged, + }) + } + + #[allow(rustdoc::invalid_html_tags)] + /// parse [untagged output format] + /// Normally the format is simple, either " " or + /// " *" + /// But the bsd tests expect special single space behavior where + /// checksum and filename are separated only by a space, meaning the second + /// space or asterisk is part of the file name. + /// This parser accounts for this variation + /// + /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1 + fn parse_single_space(line: &[u8]) -> Option { + // Find first space + let space_idx = line.iter().position(|&b| b == b' ')?; + let checksum = &line[..space_idx]; + if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() { + return None; + } + // SAFETY: we just validated the contents of checksum, we can unsafely make a + // String from it + let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) }; + + let filename = line.get(space_idx + 1..)?; // Skip single space + + Some(LineInfo { + algo_name: None, + algo_bit_len: None, + checksum: checksum_utf8, + filename: filename.to_vec(), + format: LineFormat::SingleSpace, + }) + } +} + +// Helper trait for byte slice operations +trait ByteSliceExt { + fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>; +} + +impl ByteSliceExt for [u8] { + fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> { + let pos = self.windows(pattern.len()).position(|w| w == pattern)?; + Some((&self[..pos], &self[pos + pattern.len()..])) } } @@ -505,62 +624,39 @@ struct LineInfo { algo_bit_len: Option, checksum: String, filename: Vec, - format: LineFormat, } impl LineInfo { /// Returns a `LineInfo` parsed from a checksum line. - /// The function will run 3 regexes against the line and select the first one that matches + /// The function will run 3 parsers against the line and select the first one that matches /// to populate the fields of the struct. - /// However, there is a catch to handle regarding the handling of `cached_regex`. - /// In case of non-algo-based regex, if `cached_regex` is Some, it must take the priority - /// over the detected regex. Otherwise, we must set it the the detected regex. + /// However, there is a catch to handle regarding the handling of `cached_line_format`. + /// In case of non-algo-based format, if `cached_line_format` is Some, it must take the priority + /// over the detected format. Otherwise, we must set it the the detected format. /// This specific behavior is emphasized by the test /// `test_hashsum::test_check_md5sum_only_one_space`. - fn parse(s: impl AsRef, cached_regex: &mut Option) -> Option { - let regexes: &[(&'static Regex, LineFormat)] = &[ - (&R_ALGO_BASED, LineFormat::AlgoBased), - (&R_DOUBLE_SPACE, LineFormat::DoubleSpace), - (&R_SINGLE_SPACE, LineFormat::SingleSpace), - ]; + fn parse(s: impl AsRef, cached_line_format: &mut Option) -> Option { + let line_bytes = os_str_as_bytes(s.as_ref()).ok()?; - let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed"); - - for (regex, format) in regexes { - if !regex.is_match(line_bytes) { - continue; - } - - let mut r = *regex; - if *format != LineFormat::AlgoBased { - // The cached regex ensures that when processing non-algo based regexes, - // it cannot be changed (can't have single and double space regexes - // used in the same file). - if cached_regex.is_some() { - r = cached_regex.unwrap().to_regex(); - } else { - *cached_regex = Some(*format); - } - } - - if let Some(caps) = r.captures(line_bytes) { - // These unwraps are safe thanks to the regex - let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap(); - - return Some(Self { - algo_name: caps.name("algo").map(match_to_string), - algo_bit_len: caps - .name("bits") - .map(|m| match_to_string(m).parse::().unwrap()), - checksum: caps.name("checksum").map(match_to_string).unwrap(), - filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(), - format: *format, - }); - } + if let Some(info) = LineFormat::parse_algo_based(line_bytes) { + return Some(info); + } + if let Some(cached_format) = cached_line_format { + match cached_format { + LineFormat::Untagged => LineFormat::parse_untagged(line_bytes), + LineFormat::SingleSpace => LineFormat::parse_single_space(line_bytes), + _ => unreachable!("we never catch the algo based format"), + } + } else if let Some(info) = LineFormat::parse_untagged(line_bytes) { + *cached_line_format = Some(LineFormat::Untagged); + Some(info) + } else if let Some(info) = LineFormat::parse_single_space(line_bytes) { + *cached_line_format = Some(LineFormat::SingleSpace); + Some(info) + } else { + None } - - None } } @@ -835,7 +931,7 @@ fn process_checksum_line( cli_algo_name: Option<&str>, cli_algo_length: Option, opts: ChecksumOptions, - cached_regex: &mut Option, + cached_line_format: &mut Option, last_algo: &mut Option, ) -> Result<(), LineCheckError> { let line_bytes = os_str_as_bytes(line)?; @@ -847,14 +943,14 @@ fn process_checksum_line( // Use `LineInfo` to extract the data of a line. // Then, depending on its format, apply a different pre-treatment. - let Some(line_info) = LineInfo::parse(line, cached_regex) else { + let Some(line_info) = LineInfo::parse(line, cached_line_format) else { return Err(LineCheckError::ImproperlyFormatted); }; if line_info.format == LineFormat::AlgoBased { process_algo_based_line(&line_info, cli_algo_name, opts, last_algo) } else if let Some(cli_algo) = cli_algo_name { - // If we match a non-algo based regex, we expect a cli argument + // If we match a non-algo based parser, we expect a cli argument // to give us the algorithm to use process_non_algo_based_line(i, &line_info, cli_algo, cli_algo_length, opts) } else { @@ -890,9 +986,9 @@ fn process_checksum_file( let reader = BufReader::new(file); let lines = read_os_string_lines(reader).collect::>(); - // cached_regex is used to ensure that several non algo-based checksum line - // will use the same regex. - let mut cached_regex = None; + // cached_line_format is used to ensure that several non algo-based checksum line + // will use the same parser. + let mut cached_line_format = None; // last_algo caches the algorithm used in the last line to print a warning // message for the current line if improperly formatted. // Behavior tested in gnu_cksum_c::test_warn @@ -905,7 +1001,7 @@ fn process_checksum_file( cli_algo_name, cli_algo_length, opts, - &mut cached_regex, + &mut cached_line_format, &mut last_algo, ); @@ -1381,7 +1477,7 @@ mod tests { assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); - assert_eq!(line_info.format, LineFormat::DoubleSpace); + assert_eq!(line_info.format, LineFormat::Untagged); assert!(cached_regex.is_some()); cached_regex = None; From 04ad55510b1ed622364ea77afaaec897fdec2c75 Mon Sep 17 00:00:00 2001 From: GTimothy <22472919+GTimothy@users.noreply.github.com> Date: Tue, 25 Mar 2025 19:19:25 +0100 Subject: [PATCH 2/3] checksum/cksum: update tests to test new parsers not regex --- src/uucore/src/lib/features/checksum.rs | 100 ++++++++++++------------ 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 25eddd0ac..020fa43bb 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -1341,8 +1341,7 @@ mod tests { } #[test] - fn test_algo_based_regex() { - let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap(); + fn test_algo_based_parser() { #[allow(clippy::type_complexity)] let test_cases: &[(&[u8], Option<(&[u8], Option<&[u8]>, &[u8], &[u8])>)] = &[ (b"SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some((b"SHA256", None, b"example.txt", b"d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))), @@ -1353,27 +1352,30 @@ mod tests { ]; for (input, expected) in test_cases { - let captures = algo_based_regex.captures(input); + let line_info = LineFormat::parse_algo_based(input); match expected { Some((algo, bits, filename, checksum)) => { - assert!(captures.is_some()); - let captures = captures.unwrap(); - assert_eq!(&captures.name("algo").unwrap().as_bytes(), algo); - assert_eq!(&captures.name("bits").map(|m| m.as_bytes()), bits); - assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); - assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); + assert!(line_info.is_some()); + let line_info = line_info.unwrap(); + assert_eq!(&line_info.algo_name.unwrap().as_bytes(), algo); + assert_eq!( + line_info + .algo_bit_len + .map(|m| m.to_string().as_bytes().to_owned()), + bits.map(|b| b.to_owned()) + ); + assert_eq!(&line_info.filename, filename); + assert_eq!(&line_info.checksum.as_bytes(), checksum); } None => { - assert!(captures.is_none()); + assert!(line_info.is_none()); } } } } #[test] - fn test_double_space_regex() { - let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); - + fn test_double_space_parser() { #[allow(clippy::type_complexity)] let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[ ( @@ -1400,24 +1402,23 @@ mod tests { ]; for (input, expected) in test_cases { - let captures = double_space_regex.captures(input); + let line_info = LineFormat::parse_untagged(input); match expected { Some((checksum, filename)) => { - assert!(captures.is_some()); - let captures = captures.unwrap(); - assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); - assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); + assert!(line_info.is_some()); + let line_info = line_info.unwrap(); + assert_eq!(&line_info.filename, filename); + assert_eq!(&line_info.checksum.as_bytes(), checksum); } None => { - assert!(captures.is_none()); + assert!(line_info.is_none()); } } } } #[test] - fn test_single_space_regex() { - let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); + fn test_single_space_parser() { #[allow(clippy::type_complexity)] let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[ ( @@ -1440,16 +1441,16 @@ mod tests { ]; for (input, expected) in test_cases { - let captures = single_space_regex.captures(input); + let line_info = LineFormat::parse_single_space(input); match expected { Some((checksum, filename)) => { - assert!(captures.is_some()); - let captures = captures.unwrap(); - assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); - assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); + assert!(line_info.is_some()); + let line_info = line_info.unwrap(); + assert_eq!(&line_info.filename, filename); + assert_eq!(&line_info.checksum.as_bytes(), checksum); } None => { - assert!(captures.is_none()); + assert!(line_info.is_none()); } } } @@ -1457,68 +1458,69 @@ mod tests { #[test] fn test_line_info() { - let mut cached_regex = None; + let mut cached_line_format = None; - // Test algo-based regex + // Test algo-based parser let line_algo_based = OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); - let line_info = LineInfo::parse(&line_algo_based, &mut cached_regex).unwrap(); + let line_info = LineInfo::parse(&line_algo_based, &mut cached_line_format).unwrap(); assert_eq!(line_info.algo_name.as_deref(), Some("MD5")); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); assert_eq!(line_info.format, LineFormat::AlgoBased); - assert!(cached_regex.is_none()); + assert!(cached_line_format.is_none()); - // Test double-space regex + // Test double-space parser let line_double_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); - let line_info = LineInfo::parse(&line_double_space, &mut cached_regex).unwrap(); + let line_info = LineInfo::parse(&line_double_space, &mut cached_line_format).unwrap(); assert!(line_info.algo_name.is_none()); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); assert_eq!(line_info.format, LineFormat::Untagged); - assert!(cached_regex.is_some()); + assert!(cached_line_format.is_some()); - cached_regex = None; + cached_line_format = None; - // Test single-space regex + // Test single-space parser let line_single_space = OsString::from("d41d8cd98f00b204e9800998ecf8427e example.txt"); - let line_info = LineInfo::parse(&line_single_space, &mut cached_regex).unwrap(); + let line_info = LineInfo::parse(&line_single_space, &mut cached_line_format).unwrap(); assert!(line_info.algo_name.is_none()); assert!(line_info.algo_bit_len.is_none()); assert_eq!(line_info.filename, b"example.txt"); assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e"); assert_eq!(line_info.format, LineFormat::SingleSpace); - assert!(cached_regex.is_some()); + assert!(cached_line_format.is_some()); - cached_regex = None; + cached_line_format = None; // Test invalid checksum line let line_invalid = OsString::from("invalid checksum line"); - assert!(LineInfo::parse(&line_invalid, &mut cached_regex).is_none()); - assert!(cached_regex.is_none()); + assert!(LineInfo::parse(&line_invalid, &mut cached_line_format).is_none()); + assert!(cached_line_format.is_none()); // Test leading space before checksum line let line_algo_based_leading_space = OsString::from(" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"); - let line_info = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex).unwrap(); + let line_info = + LineInfo::parse(&line_algo_based_leading_space, &mut cached_line_format).unwrap(); assert_eq!(line_info.format, LineFormat::AlgoBased); - assert!(cached_regex.is_none()); + assert!(cached_line_format.is_none()); // Test trailing space after checksum line (should fail) let line_algo_based_leading_space = OsString::from("MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "); - let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_regex); + let res = LineInfo::parse(&line_algo_based_leading_space, &mut cached_line_format); assert!(res.is_none()); - assert!(cached_regex.is_none()); + assert!(cached_line_format.is_none()); } #[test] fn test_get_expected_digest() { let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU="); - let mut cached_regex = None; - let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); + let mut cached_line_format = None; + let line_info = LineInfo::parse(&line, &mut cached_line_format).unwrap(); let result = get_expected_digest_as_hex_string(&line_info, None); @@ -1532,8 +1534,8 @@ mod tests { fn test_get_expected_checksum_invalid() { // The line misses a '=' at the end to be valid base64 let line = OsString::from("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU"); - let mut cached_regex = None; - let line_info = LineInfo::parse(&line, &mut cached_regex).unwrap(); + let mut cached_line_format = None; + let line_info = LineInfo::parse(&line, &mut cached_line_format).unwrap(); let result = get_expected_digest_as_hex_string(&line_info, None); From 09a9dc72b9398b869b0cacd80bed1ff4d84ff5f0 Mon Sep 17 00:00:00 2001 From: GTimothy <22472919+GTimothy@users.noreply.github.com> Date: Thu, 27 Mar 2025 12:02:52 +0100 Subject: [PATCH 3/3] checksum/cksum: fix: filename that include separator should parse + add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixes this non-regex implementation's flaw with file_names containing the separator's pattern: - replaces left-to-right greedy separator match with right-to-left one. - added bugfix tests fixes secondary bug: positive match on hybrid posix-openssl format adds secondary bugfix tests Co-authored-by: Dorian Péron <72708393+RenjiSann@users.noreply.github.com> --- src/uucore/src/lib/features/checksum.rs | 90 ++++++++++++++++++++----- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 020fa43bb..e9f027e1d 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit +// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit rsplit use data_encoding::BASE64; use os_display::Quotable; @@ -484,10 +484,24 @@ impl LineFormat { let algo_start = if trimmed.starts_with(b"\\") { 1 } else { 0 }; let rest = &trimmed[algo_start..]; + enum SubCase { + Posix, + OpenSSL, + } // find the next parenthesis using byte search (not next whitespace) because openssl's // tagged format does not put a space before (filename) + let par_idx = rest.iter().position(|&b| b == b'(')?; - let algo_substring = &rest[..par_idx].trim_ascii(); + let sub_case = if rest[par_idx - 1] == b' ' { + SubCase::Posix + } else { + SubCase::OpenSSL + }; + + let algo_substring = match sub_case { + SubCase::Posix => &rest[..par_idx - 1], + SubCase::OpenSSL => &rest[..par_idx], + }; let mut algo_parts = algo_substring.splitn(2, |&b| b == b'-'); let algo = algo_parts.next()?; @@ -509,8 +523,10 @@ impl LineFormat { let algo_utf8 = unsafe { String::from_utf8_unchecked(algo.to_vec()) }; // stripping '(' not ' (' since we matched on ( not whitespace because of openssl. let after_paren = rest.get(par_idx + 1..)?; - let (filename, checksum) = ByteSliceExt::split_once(after_paren, b") = ") - .or_else(|| ByteSliceExt::split_once(after_paren, b")= "))?; + let (filename, checksum) = match sub_case { + SubCase::Posix => ByteSliceExt::rsplit_once(after_paren, b") = ")?, + SubCase::OpenSSL => ByteSliceExt::rsplit_once(after_paren, b")= ")?, + }; fn is_valid_checksum(checksum: &[u8]) -> bool { if checksum.is_empty() { @@ -608,13 +624,20 @@ impl LineFormat { // Helper trait for byte slice operations trait ByteSliceExt { - fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>; + /// Look for a pattern from right to left, return surrounding parts if found. + fn rsplit_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>; } impl ByteSliceExt for [u8] { - fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> { - let pos = self.windows(pattern.len()).position(|w| w == pattern)?; - Some((&self[..pos], &self[pos + pattern.len()..])) + fn rsplit_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> { + let pos = self + .windows(pattern.len()) + .rev() + .position(|w| w == pattern)?; + Some(( + &self[..self.len() - pattern.len() - pos], + &self[self.len() - pos..], + )) } } @@ -1345,30 +1368,67 @@ mod tests { #[allow(clippy::type_complexity)] let test_cases: &[(&[u8], Option<(&[u8], Option<&[u8]>, &[u8], &[u8])>)] = &[ (b"SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some((b"SHA256", None, b"example.txt", b"d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))), - // cspell:disable-next-line + // cspell:disable (b"BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some((b"BLAKE2b", Some(b"512"), b"file", b"abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))), (b" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some((b"MD5", None, b"test", b"9e107d9d372bb6826bd81d3542a419d6"))), (b"SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some((b"SHA", Some(b"1"), b"anotherfile", b"a9993e364706816aba3e25717850c26c9cd0d89d"))), + (b" MD5 (anothertest) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"anothertest", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5(anothertest2) = fds65dsf46as5df4d6f54asds5d7f7g9", None), + (b" MD5(weirdfilename0)= stillfilename)= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename0)= stillfilename", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5(weirdfilename1)= )= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename1)= ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5(weirdfilename2) = )= fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename2) = ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5 (weirdfilename3)= ) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename3)= ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5 (weirdfilename4) = ) = fds65dsf46as5df4d6f54asds5d7f7g9", Some((b"MD5", None, b"weirdfilename4) = ", b"fds65dsf46as5df4d6f54asds5d7f7g9"))), + (b" MD5(weirdfilename5)= ) = fds65dsf46as5df4d6f54asds5d7f7g9", None), + (b" MD5(weirdfilename6) = ) = fds65dsf46as5df4d6f54asds5d7f7g9", None), + (b" MD5 (weirdfilename7)= )= fds65dsf46as5df4d6f54asds5d7f7g9", None), + (b" MD5 (weirdfilename8) = )= fds65dsf46as5df4d6f54asds5d7f7g9", None), ]; + // cspell:enable for (input, expected) in test_cases { let line_info = LineFormat::parse_algo_based(input); match expected { Some((algo, bits, filename, checksum)) => { - assert!(line_info.is_some()); + assert!( + line_info.is_some(), + "expected Some, got None for {}", + String::from_utf8_lossy(filename) + ); let line_info = line_info.unwrap(); - assert_eq!(&line_info.algo_name.unwrap().as_bytes(), algo); + assert_eq!( + &line_info.algo_name.unwrap().as_bytes(), + algo, + "failed for {}", + String::from_utf8_lossy(filename) + ); assert_eq!( line_info .algo_bit_len .map(|m| m.to_string().as_bytes().to_owned()), - bits.map(|b| b.to_owned()) + bits.map(|b| b.to_owned()), + "failed for {}", + String::from_utf8_lossy(filename) + ); + assert_eq!( + &line_info.filename, + filename, + "failed for {}", + String::from_utf8_lossy(filename) + ); + assert_eq!( + &line_info.checksum.as_bytes(), + checksum, + "failed for {}", + String::from_utf8_lossy(filename) ); - assert_eq!(&line_info.filename, filename); - assert_eq!(&line_info.checksum.as_bytes(), checksum); } None => { - assert!(line_info.is_none()); + assert!( + line_info.is_none(), + "failed for {}", + String::from_utf8_lossy(input) + ); } } }