diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 98d57e0ba..a2de28bc5 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -2,30 +2,29 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore anotherfile invalidchecksum regexes JWZG +// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename use data_encoding::BASE64; use os_display::Quotable; -use regex::Regex; +use regex::bytes::{Captures, Regex}; use std::{ - ffi::OsStr, + ffi::{OsStr, OsString}, + fmt::Display, fs::File, - io::{self, BufReader, Read}, + io::{self, stdin, BufReader, Read, Write}, path::Path, + str, }; use crate::{ error::{set_exit_code, FromIo, UError, UResult, USimpleError}, - show, show_error, show_warning_caps, + os_str_as_bytes, os_str_from_bytes, read_os_string_lines, show, show_error, show_warning_caps, sum::{ Blake2b, Blake3, Digest, DigestWriter, Md5, Sha1, Sha224, Sha256, Sha384, Sha3_224, Sha3_256, Sha3_384, Sha3_512, Sha512, Shake128, Shake256, Sm3, BSD, CRC, SYSV, }, util_name, }; -use std::fmt::Write; -use std::io::stdin; -use std::io::BufRead; use thiserror::Error; pub const ALGORITHM_OPTIONS_SYSV: &str = "sysv"; @@ -175,6 +174,36 @@ fn cksum_output(res: &ChecksumResult, status: bool) { } } +#[derive(Debug, Clone, Copy)] +enum FileChecksumResult { + Ok, + Failed, + CantOpen, +} + +impl Display for FileChecksumResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FileChecksumResult::Ok => write!(f, "OK"), + FileChecksumResult::Failed => write!(f, "FAILED"), + FileChecksumResult::CantOpen => write!(f, "FAILED open or read"), + } + } +} + +/// Print to the given buffer the checksum validation status of a file which +/// name might contain non-utf-8 characters. +fn print_file_report( + mut w: W, + filename: &[u8], + result: FileChecksumResult, + prefix: &str, +) { + let _ = write!(w, "{prefix}"); + let _ = w.write_all(filename); + let _ = writeln!(w, ": {result}"); +} + pub fn detect_algo(algo: &str, length: Option) -> UResult { match algo { ALGORITHM_OPTIONS_SYSV => Ok(HashAlgorithm { @@ -279,13 +308,13 @@ pub fn detect_algo(algo: &str, length: Option) -> UResult // algo must be uppercase or b (for blake2b) // 2. [* ] // 3. [*] (only one space) -const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P.*)\)\s*=\s*(?P[a-fA-F0-9]+)$"; -const ALGO_BASED_REGEX_BASE64: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P.*)\)\s*=\s*(?P[A-Za-z0-9+/]+={0,2})$"; +const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[a-fA-F0-9]+)$"; +const ALGO_BASED_REGEX_BASE64: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P(?-u:.*))\)\s*=\s*(?P[A-Za-z0-9+/]+={0,2})$"; -const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P.*)$"; +const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P(?-u:.*))$"; // In this case, we ignore the * -const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?.*)$"; +const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?(?-u:.*))$"; fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { if input_is_stdin { @@ -298,7 +327,7 @@ fn get_filename_for_output(filename: &OsStr, input_is_stdin: bool) -> String { } /// Determines the appropriate regular expression to use based on the provided lines. -fn determine_regex(lines: &[String]) -> Option<(Regex, bool)> { +fn determine_regex(lines: &[OsString]) -> Option<(Regex, bool)> { let regexes = [ (Regex::new(ALGO_BASED_REGEX).unwrap(), true), (Regex::new(DOUBLE_SPACE_REGEX).unwrap(), false), @@ -307,9 +336,9 @@ fn determine_regex(lines: &[String]) -> Option<(Regex, bool)> { ]; for line in lines { - let line_trim = line.trim(); + let line_bytes = os_str_as_bytes(line).expect("UTF-8 decoding failed"); for (regex, is_algo_based) in ®exes { - if regex.is_match(line_trim) { + if regex.is_match(line_bytes) { return Some((regex.clone(), *is_algo_based)); } } @@ -320,6 +349,7 @@ fn determine_regex(lines: &[String]) -> Option<(Regex, bool)> { // Converts bytes to a hexadecimal string fn bytes_to_hex(bytes: &[u8]) -> String { + use std::fmt::Write; bytes .iter() .fold(String::with_capacity(bytes.len() * 2), |mut hex, byte| { @@ -329,13 +359,14 @@ fn bytes_to_hex(bytes: &[u8]) -> String { } fn get_expected_checksum( - filename: &str, - caps: ®ex::Captures, + filename: &[u8], + caps: &Captures, chosen_regex: &Regex, ) -> UResult { if chosen_regex.as_str() == ALGO_BASED_REGEX_BASE64 { - let ck = caps.name("checksum").unwrap().as_str(); - match BASE64.decode(ck.as_bytes()) { + // Unwrap is safe, ensured by regex + let ck = caps.name("checksum").unwrap().as_bytes(); + match BASE64.decode(ck) { Ok(decoded_bytes) => { match std::str::from_utf8(&decoded_bytes) { Ok(decoded_str) => Ok(decoded_str.to_string()), @@ -344,32 +375,45 @@ fn get_expected_checksum( } Err(_) => Err(Box::new( ChecksumError::NoProperlyFormattedChecksumLinesFound { - filename: (&filename).to_string(), + filename: String::from_utf8_lossy(filename).to_string(), }, )), } } else { - Ok(caps.name("checksum").unwrap().as_str().to_string()) + // Unwraps are safe, ensured by regex. + Ok(str::from_utf8(caps.name("checksum").unwrap().as_bytes()) + .unwrap() + .to_string()) } } /// Returns a reader that reads from the specified file, or from stdin if `filename_to_check` is "-". fn get_file_to_check( - filename: &str, + filename: &OsStr, ignore_missing: bool, res: &mut ChecksumResult, ) -> Option> { + let filename_bytes = os_str_as_bytes(filename).expect("UTF-8 error"); + let filename_lossy = String::from_utf8_lossy(filename_bytes); if filename == "-" { Some(Box::new(stdin())) // Use stdin if "-" is specified in the checksum file } else { let mut failed_open = || { - println!("{filename}: FAILED open or read"); + print_file_report( + std::io::stdout(), + filename_bytes, + FileChecksumResult::CantOpen, + "", + ); res.failed_open_file += 1; }; match File::open(filename) { Ok(f) => { if f.metadata().ok()?.is_dir() { - show!(USimpleError::new(1, format!("{filename}: Is a directory"))); + show!(USimpleError::new( + 1, + format!("{filename_lossy}: Is a directory") + )); // also regarded as a failed open failed_open(); None @@ -380,7 +424,7 @@ fn get_file_to_check( Err(err) => { if !ignore_missing { // yes, we have both stderr and stdout here - show!(err.map_err_context(|| filename.to_string())); + show!(err.map_err_context(|| filename_lossy.to_string())); failed_open(); } // we could not open the file but we want to continue @@ -414,13 +458,18 @@ fn get_input_file(filename: &OsStr) -> UResult> { /// Extracts the algorithm name and length from the regex captures if the algo-based format is matched. fn identify_algo_name_and_length( - caps: ®ex::Captures, + caps: &Captures, algo_name_input: Option<&str>, res: &mut ChecksumResult, properly_formatted: &mut bool, ) -> Option<(String, Option)> { // When the algo-based format is matched, extract details from regex captures - let algorithm = caps.name("algo").map_or("", |m| m.as_str()).to_lowercase(); + let algorithm = caps + .name("algo") + .map_or(String::new(), |m| { + String::from_utf8(m.as_bytes().into()).unwrap() + }) + .to_lowercase(); // check if we are called with XXXsum (example: md5sum) but we detected a different algo parsing the file // (for example SHA1 (f) = d...) @@ -438,7 +487,10 @@ fn identify_algo_name_and_length( } let bits = caps.name("bits").map_or(Some(None), |m| { - let bits_value = m.as_str().parse::().unwrap(); + let bits_value = String::from_utf8(m.as_bytes().into()) + .unwrap() + .parse::() + .unwrap(); if bits_value % 8 == 0 { Some(Some(bits_value / 8)) } else { @@ -491,7 +543,8 @@ where }; let reader = BufReader::new(file); - let lines: Vec = reader.lines().collect::>()?; + let lines = read_os_string_lines(reader).collect::>(); + let Some((chosen_regex, is_algo_based_format)) = determine_regex(&lines) else { let e = ChecksumError::NoProperlyFormattedChecksumLinesFound { filename: get_filename_for_output(filename_input, input_is_stdin), @@ -502,11 +555,13 @@ where }; for (i, line) in lines.iter().enumerate() { - if let Some(caps) = chosen_regex.captures(line) { + let line_bytes = os_str_as_bytes(line)?; + if let Some(caps) = chosen_regex.captures(line_bytes) { properly_formatted = true; - let mut filename_to_check = caps.name("filename").unwrap().as_str(); - if filename_to_check.starts_with('*') + let mut filename_to_check = caps.name("filename").unwrap().as_bytes(); + + if filename_to_check.starts_with(b"*") && i == 0 && chosen_regex.as_str() == SINGLE_SPACE_REGEX { @@ -551,10 +606,11 @@ where let (filename_to_check_unescaped, prefix) = unescape_filename(filename_to_check); + let real_filename_to_check = os_str_from_bytes(&filename_to_check_unescaped)?; + // manage the input file let file_to_check = - match get_file_to_check(&filename_to_check_unescaped, ignore_missing, &mut res) - { + match get_file_to_check(&real_filename_to_check, ignore_missing, &mut res) { Some(file) => file, None => continue, }; @@ -568,17 +624,27 @@ where // Do the checksum validation if expected_checksum == calculated_checksum { if !quiet && !status { - println!("{prefix}{filename_to_check}: OK"); + print_file_report( + std::io::stdout(), + filename_to_check, + FileChecksumResult::Ok, + prefix, + ); } correct_format += 1; } else { if !status { - println!("{prefix}{filename_to_check}: FAILED"); + print_file_report( + std::io::stdout(), + filename_to_check, + FileChecksumResult::Failed, + prefix, + ); } res.failed_cksum += 1; } } else { - if line.is_empty() || line.starts_with("#") { + if line.is_empty() || line_bytes.starts_with(b"#") { // Don't show any warning for empty or commented lines. continue; } @@ -707,11 +773,28 @@ pub fn calculate_blake2b_length(length: usize) -> UResult> { } } -pub fn unescape_filename(filename: &str) -> (String, &'static str) { - let unescaped = filename - .replace("\\\\", "\\") - .replace("\\n", "\n") - .replace("\\r", "\r"); +pub fn unescape_filename(filename: &[u8]) -> (Vec, &'static str) { + let mut unescaped = Vec::with_capacity(filename.len()); + let mut byte_iter = filename.iter().peekable(); + loop { + let Some(byte) = byte_iter.next() else { + break; + }; + if *byte == b'\\' { + match byte_iter.next() { + Some(b'\\') => unescaped.push(b'\\'), + Some(b'n') => unescaped.push(b'\n'), + Some(b'r') => unescaped.push(b'\r'), + Some(x) => { + unescaped.push(b'\\'); + unescaped.push(*x); + } + _ => {} + } + } else { + unescaped.push(*byte); + } + } let prefix = if unescaped == filename { "" } else { "\\" }; (unescaped, prefix) } @@ -732,19 +815,19 @@ mod tests { #[test] fn test_unescape_filename() { - let (unescaped, prefix) = unescape_filename("test\\nfile.txt"); - assert_eq!(unescaped, "test\nfile.txt"); + let (unescaped, prefix) = unescape_filename(b"test\\nfile.txt"); + assert_eq!(unescaped, b"test\nfile.txt"); assert_eq!(prefix, "\\"); - let (unescaped, prefix) = unescape_filename("test\\nfile.txt"); - assert_eq!(unescaped, "test\nfile.txt"); + let (unescaped, prefix) = unescape_filename(b"test\\nfile.txt"); + assert_eq!(unescaped, b"test\nfile.txt"); assert_eq!(prefix, "\\"); - let (unescaped, prefix) = unescape_filename("test\\rfile.txt"); - assert_eq!(unescaped, "test\rfile.txt"); + let (unescaped, prefix) = unescape_filename(b"test\\rfile.txt"); + assert_eq!(unescaped, b"test\rfile.txt"); assert_eq!(prefix, "\\"); - let (unescaped, prefix) = unescape_filename("test\\\\file.txt"); - assert_eq!(unescaped, "test\\file.txt"); + let (unescaped, prefix) = unescape_filename(b"test\\\\file.txt"); + assert_eq!(unescaped, b"test\\file.txt"); assert_eq!(prefix, "\\"); } @@ -849,24 +932,25 @@ mod tests { #[test] fn test_algo_based_regex() { let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap(); - let test_cases = vec![ - ("SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some(("SHA256", None, "example.txt", "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))), + #[allow(clippy::type_complexity)] + let test_cases: &[(&[u8], Option<(&[u8], Option<&[u8]>, &[u8], &[u8])>)] = &[ + (b"SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some((b"SHA256", None, b"example.txt", b"d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))), // cspell:disable-next-line - ("BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some(("BLAKE2b", Some("512"), "file", "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))), - (" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some(("MD5", None, "test", "9e107d9d372bb6826bd81d3542a419d6"))), - ("SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some(("SHA", Some("1"), "anotherfile", "a9993e364706816aba3e25717850c26c9cd0d89d"))), + (b"BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some((b"BLAKE2b", Some(b"512"), b"file", b"abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))), + (b" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some((b"MD5", None, b"test", b"9e107d9d372bb6826bd81d3542a419d6"))), + (b"SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some((b"SHA", Some(b"1"), b"anotherfile", b"a9993e364706816aba3e25717850c26c9cd0d89d"))), ]; for (input, expected) in test_cases { - let captures = algo_based_regex.captures(input); + let captures = algo_based_regex.captures(*input); match expected { Some((algo, bits, filename, checksum)) => { assert!(captures.is_some()); let captures = captures.unwrap(); - assert_eq!(captures.name("algo").unwrap().as_str(), algo); - assert_eq!(captures.name("bits").map(|m| m.as_str()), bits); - assert_eq!(captures.name("filename").unwrap().as_str(), filename); - assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); + assert_eq!(&captures.name("algo").unwrap().as_bytes(), algo); + assert_eq!(&captures.name("bits").map(|m| m.as_bytes()), bits); + assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); + assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); } None => { assert!(captures.is_none()); @@ -879,28 +963,29 @@ mod tests { fn test_double_space_regex() { let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); - let test_cases = vec![ + #[allow(clippy::type_complexity)] + let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[ ( - "60b725f10c9c85c70d97880dfe8191b3 a", - Some(("60b725f10c9c85c70d97880dfe8191b3", "a")), + b"60b725f10c9c85c70d97880dfe8191b3 a", + Some((b"60b725f10c9c85c70d97880dfe8191b3", b"a")), ), ( - "bf35d7536c785cf06730d5a40301eba2 b", - Some(("bf35d7536c785cf06730d5a40301eba2", " b")), + b"bf35d7536c785cf06730d5a40301eba2 b", + Some((b"bf35d7536c785cf06730d5a40301eba2", b" b")), ), ( - "f5b61709718c1ecf8db1aea8547d4698 *c", - Some(("f5b61709718c1ecf8db1aea8547d4698", "*c")), + b"f5b61709718c1ecf8db1aea8547d4698 *c", + Some((b"f5b61709718c1ecf8db1aea8547d4698", b"*c")), ), ( - "b064a020db8018f18ff5ae367d01b212 dd", - Some(("b064a020db8018f18ff5ae367d01b212", "dd")), + b"b064a020db8018f18ff5ae367d01b212 dd", + Some((b"b064a020db8018f18ff5ae367d01b212", b"dd")), ), ( - "b064a020db8018f18ff5ae367d01b212 ", - Some(("b064a020db8018f18ff5ae367d01b212", " ")), + b"b064a020db8018f18ff5ae367d01b212 ", + Some((b"b064a020db8018f18ff5ae367d01b212", b" ")), ), - ("invalidchecksum test", None), + (b"invalidchecksum test", None), ]; for (input, expected) in test_cases { @@ -909,8 +994,8 @@ mod tests { Some((checksum, filename)) => { assert!(captures.is_some()); let captures = captures.unwrap(); - assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); - assert_eq!(captures.name("filename").unwrap().as_str(), filename); + assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); + assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); } None => { assert!(captures.is_none()); @@ -922,24 +1007,25 @@ mod tests { #[test] fn test_single_space_regex() { let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); - let test_cases = vec![ + #[allow(clippy::type_complexity)] + let test_cases: &[(&[u8], Option<(&[u8], &[u8])>)] = &[ ( - "60b725f10c9c85c70d97880dfe8191b3 a", - Some(("60b725f10c9c85c70d97880dfe8191b3", "a")), + b"60b725f10c9c85c70d97880dfe8191b3 a", + Some((b"60b725f10c9c85c70d97880dfe8191b3", b"a")), ), ( - "bf35d7536c785cf06730d5a40301eba2 b", - Some(("bf35d7536c785cf06730d5a40301eba2", "b")), + b"bf35d7536c785cf06730d5a40301eba2 b", + Some((b"bf35d7536c785cf06730d5a40301eba2", b"b")), ), ( - "f5b61709718c1ecf8db1aea8547d4698 *c", - Some(("f5b61709718c1ecf8db1aea8547d4698", "*c")), + b"f5b61709718c1ecf8db1aea8547d4698 *c", + Some((b"f5b61709718c1ecf8db1aea8547d4698", b"*c")), ), ( - "b064a020db8018f18ff5ae367d01b212 dd", - Some(("b064a020db8018f18ff5ae367d01b212", "dd")), + b"b064a020db8018f18ff5ae367d01b212 dd", + Some((b"b064a020db8018f18ff5ae367d01b212", b"dd")), ), - ("invalidchecksum test", None), + (b"invalidchecksum test", None), ]; for (input, expected) in test_cases { @@ -948,8 +1034,8 @@ mod tests { Some((checksum, filename)) => { assert!(captures.is_some()); let captures = captures.unwrap(); - assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); - assert_eq!(captures.name("filename").unwrap().as_str(), filename); + assert_eq!(&captures.name("checksum").unwrap().as_bytes(), checksum); + assert_eq!(&captures.name("filename").unwrap().as_bytes(), filename); } None => { assert!(captures.is_none()); @@ -961,47 +1047,77 @@ mod tests { #[test] fn test_determine_regex() { // Test algo-based regex - let lines_algo_based = - vec!["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e".to_string()]; + let lines_algo_based = ["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); let (regex, algo_based) = determine_regex(&lines_algo_based).unwrap(); assert!(algo_based); - assert!(regex.is_match(&lines_algo_based[0])); + assert!(regex.is_match(os_str_as_bytes(&lines_algo_based[0]).unwrap())); // Test double-space regex - let lines_double_space = vec!["d41d8cd98f00b204e9800998ecf8427e example.txt".to_string()]; + let lines_double_space = ["d41d8cd98f00b204e9800998ecf8427e example.txt"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); let (regex, algo_based) = determine_regex(&lines_double_space).unwrap(); assert!(!algo_based); - assert!(regex.is_match(&lines_double_space[0])); + assert!(regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap())); // Test single-space regex - let lines_single_space = vec!["d41d8cd98f00b204e9800998ecf8427e example.txt".to_string()]; + let lines_single_space = ["d41d8cd98f00b204e9800998ecf8427e example.txt"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); let (regex, algo_based) = determine_regex(&lines_single_space).unwrap(); assert!(!algo_based); - assert!(regex.is_match(&lines_single_space[0])); + assert!(regex.is_match(os_str_as_bytes(&lines_single_space[0]).unwrap())); // Test double-space regex start with invalid - let lines_double_space = vec![ - "ERR".to_string(), - "d41d8cd98f00b204e9800998ecf8427e example.txt".to_string(), - ]; + let lines_double_space = ["ERR", "d41d8cd98f00b204e9800998ecf8427e example.txt"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); let (regex, algo_based) = determine_regex(&lines_double_space).unwrap(); assert!(!algo_based); - assert!(!regex.is_match(&lines_double_space[0])); - assert!(regex.is_match(&lines_double_space[1])); + assert!(!regex.is_match(os_str_as_bytes(&lines_double_space[0]).unwrap())); + assert!(regex.is_match(os_str_as_bytes(&lines_double_space[1]).unwrap())); // Test invalid checksum line - let lines_invalid = vec!["invalid checksum line".to_string()]; + let lines_invalid = ["invalid checksum line"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); assert!(determine_regex(&lines_invalid).is_none()); + + // Test leading space before checksum line + let lines_algo_based_leading_space = + vec![" MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e"] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); + let res = determine_regex(&lines_algo_based_leading_space); + assert!(res.is_some()); + assert_eq!(res.unwrap().0.as_str(), ALGO_BASED_REGEX); + + // Test trailing space after checksum line (should fail) + let lines_algo_based_leading_space = + vec!["MD5 (example.txt) = d41d8cd98f00b204e9800998ecf8427e "] + .iter() + .map(|s| OsString::from(s.to_string())) + .collect::>(); + let res = determine_regex(&lines_algo_based_leading_space); + assert!(res.is_none()); } #[test] fn test_get_expected_checksum() { let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); let caps = re - .captures("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=") + .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=") .unwrap(); - let result = get_expected_checksum("filename", &caps, &re); + let result = get_expected_checksum(b"filename", &caps, &re); assert_eq!( result.unwrap(), @@ -1013,11 +1129,48 @@ mod tests { fn test_get_expected_checksum_invalid() { let re = Regex::new(ALGO_BASED_REGEX_BASE64).unwrap(); let caps = re - .captures("SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU") + .captures(b"SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU") .unwrap(); - let result = get_expected_checksum("filename", &caps, &re); + let result = get_expected_checksum(b"filename", &caps, &re); assert!(result.is_err()); } + + #[test] + fn test_print_file_report() { + let cases: &[(&[u8], FileChecksumResult, &str, &[u8])] = &[ + (b"filename", FileChecksumResult::Ok, "", b"filename: OK\n"), + ( + b"filename", + FileChecksumResult::Failed, + "", + b"filename: FAILED\n", + ), + ( + b"filename", + FileChecksumResult::CantOpen, + "", + b"filename: FAILED open or read\n", + ), + ( + b"filename", + FileChecksumResult::Ok, + "prefix", + b"prefixfilename: OK\n", + ), + ( + b"funky\xffname", + FileChecksumResult::Ok, + "", + b"funky\xffname: OK\n", + ), + ]; + + for (filename, result, prefix, expected) in cases { + let mut buffer: Vec = vec![]; + print_file_report(&mut buffer, filename, *result, prefix); + assert_eq!(&buffer, expected) + } + } } diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index b4b353e3e..a636fcdab 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -100,10 +100,14 @@ pub use crate::features::fsxattr; //## core functions +use std::borrow::Cow; use std::ffi::OsStr; use std::ffi::OsString; +use std::io::{BufRead, BufReader}; +use std::iter; #[cfg(unix)] -use std::os::unix::ffi::OsStrExt; +use std::os::unix::ffi::{OsStrExt, OsStringExt}; +use std::str; use std::sync::atomic::Ordering; use once_cell::sync::Lazy; @@ -240,6 +244,72 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { Ok(bytes) } +/// Helper function for converting a slice of bytes into an &OsStr +/// or OsString in non-unix targets. +/// +/// It converts `&[u8]` to `Cow` for unix targets only. +/// On non-unix (i.e. Windows), the conversion goes through the String type +/// and thus undergo UTF-8 validation, making it fail if the stream contains +/// non-UTF-8 characters. +pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { + #[cfg(unix)] + let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); + #[cfg(not(unix))] + let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { + mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") + })?)); + + Ok(os_str) +} + +/// Helper function for making an `OsString` from a byte field +/// It converts `Vec` to `OsString` for unix targets only. +/// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 +pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { + #[cfg(unix)] + let s = OsString::from_vec(vec); + #[cfg(not(unix))] + let s = OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?); + + Ok(s) +} + +/// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, +/// which avoids panicking on non UTF-8 input. +pub fn read_byte_lines( + mut buf_reader: BufReader, +) -> impl Iterator> { + iter::from_fn(move || { + let mut buf = Vec::with_capacity(256); + let size = buf_reader.read_until(b'\n', &mut buf).ok()?; + + if size == 0 { + return None; + } + + // Trim (\r)\n + if buf.ends_with(b"\n") { + buf.pop(); + if buf.ends_with(b"\r") { + buf.pop(); + } + } + + Some(buf) + }) +} + +/// Equivalent to `std::BufRead::lines` which outputs each line as an `OsString` +/// This won't panic on non UTF-8 characters on Unix, +/// but it still will on Windows. +pub fn read_os_string_lines( + buf_reader: BufReader, +) -> impl Iterator { + read_byte_lines(buf_reader).map(|byte_line| os_string_from_vec(byte_line).expect("UTF-8 error")) +} + /// Prompt the user with a formatted string and returns `true` if they reply `'y'` or `'Y'` /// /// This macro functions accepts the same syntax as `format!`. The prompt is written to diff --git a/tests/by-util/test_cksum.rs b/tests/by-util/test_cksum.rs index 07c6e7b17..98366cbec 100644 --- a/tests/by-util/test_cksum.rs +++ b/tests/by-util/test_cksum.rs @@ -1402,3 +1402,103 @@ fn test_zero_single_file() { .succeeds() .stdout_is_fixture("zero_single_file.expected"); } + +#[test] +fn test_check_trailing_space_fails() { + // If a checksum line has trailing spaces after the digest, + // it shall be considered improperly formatted. + + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + + at.write("foo", "foo-content\n"); + at.write( + "CHECKSUM", + "SHA1 (foo) = 058ab38dd3603703b3a7063cf95dc51a4286b6fe \n", + ); + + scene + .ucmd() + .arg("--check") + .arg("CHECKSUM") + .fails() + .no_stdout() + .stderr_contains("CHECKSUM: no properly formatted checksum lines found"); +} + +/// Regroup tests related to the handling of non-utf-8 content +/// in checksum files. +/// These tests are excluded from Windows because it does not provide any safe +/// conversion between `OsString` and byte sequences for non-utf-8 strings. +#[cfg(not(windows))] +mod check_utf8 { + use super::*; + + #[test] + fn test_check_non_utf8_comment() { + let hashes = + b"MD5 (empty) = 1B2M2Y8AsgTpgAmY7PhCfg==\n\ + # Comment with a non utf8 char: >>\xff<<\n\ + SHA256 (empty) = 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=\n\ + BLAKE2b (empty) = eGoC90IBWQPGxv2FJVLScpEvR0DhWEdhiobiF/cfVBnSXhAxr+5YUxOJZESTTrBLkDpoWxRIt1XVb3Aa/pvizg==\n" + ; + + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + + at.touch("empty"); + at.write_bytes("check", hashes); + + scene + .ucmd() + .arg("--check") + .arg(at.subdir.join("check")) + .succeeds() + .stdout_is("empty: OK\nempty: OK\nempty: OK\n") + .no_stderr(); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_check_non_utf8_filename() { + use std::{ffi::OsString, os::unix::ffi::OsStringExt}; + + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let filename: OsString = OsStringExt::from_vec(b"funky\xffname".to_vec()); + at.touch(&filename); + + // Checksum match + at.write_bytes("check", + b"SHA256 (funky\xffname) = e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855\n"); + scene + .ucmd() + .arg("--check") + .arg(at.subdir.join("check")) + .succeeds() + .stdout_is_bytes(b"funky\xffname: OK\n") + .no_stderr(); + + // Checksum mismatch + at.write_bytes("check", + b"SHA256 (funky\xffname) = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff\n"); + scene + .ucmd() + .arg("--check") + .arg(at.subdir.join("check")) + .fails() + .stdout_is_bytes(b"funky\xffname: FAILED\n") + .stderr_contains("1 computed checksum did NOT match"); + + // file not found + at.write_bytes("check", + b"SHA256 (flakey\xffname) = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff\n"); + scene + .ucmd() + .arg("--check") + .arg(at.subdir.join("check")) + .fails() + .stdout_is_bytes(b"flakey\xffname: FAILED open or read\n") + .stderr_contains("1 listed file could not be read"); + } +}