1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

split: avoid using collect_lossy + test for invalid UTF8 arguments

This commit is contained in:
zhitkoff 2023-08-31 14:46:56 -04:00
parent 6f37b4b4cf
commit 5bfe9b19ef
2 changed files with 134 additions and 78 deletions

View file

@ -13,6 +13,7 @@ use crate::filenames::FilenameIterator;
use crate::filenames::SuffixType; use crate::filenames::SuffixType;
use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command}; use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command};
use std::env; use std::env;
use std::ffi::OsString;
use std::fmt; use std::fmt;
use std::fs::{metadata, File}; use std::fs::{metadata, File};
use std::io; use std::io;
@ -52,9 +53,8 @@ const AFTER_HELP: &str = help_section!("after help", "split.md");
#[uucore::main] #[uucore::main]
pub fn uumain(args: impl uucore::Args) -> UResult<()> { pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let args = args.collect_lossy();
let (args, obs_lines) = handle_obsolete(args);
let (args, obs_lines) = handle_obsolete(&args[..]);
let matches = uu_app().try_get_matches_from(args)?; let matches = uu_app().try_get_matches_from(args)?;
@ -71,91 +71,102 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
/// `split -x300e file` would mean `split -x -l 300 -e file` /// `split -x300e file` would mean `split -x -l 300 -e file`
/// `split -x300e -22 file` would mean `split -x -e -l 22 file` (last obsolete lines option wins) /// `split -x300e -22 file` would mean `split -x -e -l 22 file` (last obsolete lines option wins)
/// following GNU `split` behavior /// following GNU `split` behavior
fn handle_obsolete(args: &[String]) -> (Vec<String>, Option<String>) { fn handle_obsolete(args: impl uucore::Args) -> (Vec<OsString>, Option<String>) {
let mut obs_lines = None; let mut obs_lines = None;
let mut preceding_long_opt_req_value = false; let mut preceding_long_opt_req_value = false;
let mut preceding_short_opt_req_value = false; let mut preceding_short_opt_req_value = false;
let filtered_args = args let filtered_args = args
.iter() .filter_map(|os_slice| {
.filter_map(|slice| { let filter: Option<OsString>;
let filter: Option<String>; if let Some(slice) = os_slice.to_str() {
// check if the slice is a true short option (and not hyphen prefixed value of an option) // check if the slice is a true short option (and not hyphen prefixed value of an option)
// and if so, a short option that can contain obsolete lines value // and if so, a short option that can contain obsolete lines value
if slice.starts_with('-') if slice.starts_with('-')
&& !slice.starts_with("--") && !slice.starts_with("--")
&& !preceding_long_opt_req_value && !preceding_long_opt_req_value
&& !preceding_short_opt_req_value && !preceding_short_opt_req_value
&& !slice.starts_with("-a") && !slice.starts_with("-a")
&& !slice.starts_with("-b") && !slice.starts_with("-b")
&& !slice.starts_with("-C") && !slice.starts_with("-C")
&& !slice.starts_with("-l") && !slice.starts_with("-l")
&& !slice.starts_with("-n") && !slice.starts_with("-n")
{ {
// start of the short option string // start of the short option string
// that can have obsolete lines option value in it // that can have obsolete lines option value in it
// extract numeric part and filter it out // extract numeric part and filter it out
let mut obs_lines_extracted: Vec<char> = vec![]; let mut obs_lines_extracted: Vec<char> = vec![];
let mut obs_lines_end_reached = false; let mut obs_lines_end_reached = false;
let filtered_slice: Vec<char> = slice let filtered_slice: Vec<char> = slice
.chars() .chars()
.filter(|c| { .filter(|c| {
// To correctly process scenario like '-x200a4' // To correctly process scenario like '-x200a4'
// we need to stop extracting digits once alphabetic character is encountered // we need to stop extracting digits once alphabetic character is encountered
// after we already have something in obs_lines_extracted // after we already have something in obs_lines_extracted
if c.is_ascii_digit() && !obs_lines_end_reached { if c.is_ascii_digit() && !obs_lines_end_reached {
obs_lines_extracted.push(*c); obs_lines_extracted.push(*c);
false false
} else { } else {
if !obs_lines_extracted.is_empty() { if !obs_lines_extracted.is_empty() {
obs_lines_end_reached = true; obs_lines_end_reached = true;
}
true
} }
true })
} .collect();
})
.collect();
if obs_lines_extracted.is_empty() { if obs_lines_extracted.is_empty() {
// no obsolete lines value found/extracted // no obsolete lines value found/extracted
filter = Some(slice.to_owned()); filter = Some(OsString::from(slice));
} else {
// obsolete lines value was extracted
obs_lines = Some(obs_lines_extracted.iter().collect());
if filtered_slice.get(1).is_some() {
// there were some short options in front of or after obsolete lines value
// i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value
// would look like '-xd' or '-de' or similar
filter = Some(filtered_slice.iter().collect());
} else { } else {
filter = None; // obsolete lines value was extracted
obs_lines = Some(obs_lines_extracted.iter().collect());
if filtered_slice.get(1).is_some() {
// there were some short options in front of or after obsolete lines value
// i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value
// would look like '-xd' or '-de' or similar
let filtered_slice: String = filtered_slice.iter().collect();
filter = Some(OsString::from(filtered_slice));
} else {
filter = None;
}
} }
} else {
// either not a short option
// or a short option that cannot have obsolete lines value in it
filter = Some(OsString::from(slice));
}
// capture if current slice is a preceding long option that requires value and does not use '=' to assign that value
// following slice should be treaded as value for this option
// even if it starts with '-' (which would be treated as hyphen prefixed value)
if slice.starts_with("--") {
preceding_long_opt_req_value = &slice[2..] == OPT_BYTES
|| &slice[2..] == OPT_LINE_BYTES
|| &slice[2..] == OPT_LINES
|| &slice[2..] == OPT_ADDITIONAL_SUFFIX
|| &slice[2..] == OPT_FILTER
|| &slice[2..] == OPT_NUMBER
|| &slice[2..] == OPT_SUFFIX_LENGTH;
}
// capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace)
// following slice should be treaded as value for this option
// even if it starts with '-' (which would be treated as hyphen prefixed value)
preceding_short_opt_req_value = slice == "-b"
|| slice == "-C"
|| slice == "-l"
|| slice == "-n"
|| slice == "-a";
// slice is a value
// reset preceding option flags
if !slice.starts_with('-') {
preceding_short_opt_req_value = false;
preceding_long_opt_req_value = false;
} }
} else { } else {
// either not a short option // Cannot cleanly convert os_slice to UTF-8
// or a short option that cannot have obsolete lines value in it // Do not process and return as-is
filter = Some(slice.to_owned()); // This will cause failure later on, but we should not handle it here
} // and let clap panic on invalid UTF-8 argument
// capture if current slice is a preceding long option that requires value and does not use '=' to assign that value filter = Some(os_slice);
// following slice should be treaded as value for this option
// even if it starts with '-' (which would be treated as hyphen prefixed value)
if slice.starts_with("--") {
preceding_long_opt_req_value = &slice[2..] == OPT_BYTES
|| &slice[2..] == OPT_LINE_BYTES
|| &slice[2..] == OPT_LINES
|| &slice[2..] == OPT_ADDITIONAL_SUFFIX
|| &slice[2..] == OPT_FILTER
|| &slice[2..] == OPT_NUMBER
|| &slice[2..] == OPT_SUFFIX_LENGTH;
}
// capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace)
// following slice should be treaded as value for this option
// even if it starts with '-' (which would be treated as hyphen prefixed value)
preceding_short_opt_req_value =
slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a";
// slice is a value
// reset preceding option flags
if !slice.starts_with('-') {
preceding_short_opt_req_value = false;
preceding_long_opt_req_value = false;
} }
// return filter // return filter
filter filter

View file

@ -9,6 +9,7 @@ use rand::{thread_rng, Rng, SeedableRng};
use regex::Regex; use regex::Regex;
#[cfg(not(windows))] #[cfg(not(windows))]
use std::env; use std::env;
use std::ffi::OsStr;
use std::path::Path; use std::path::Path;
use std::{ use std::{
fs::{read_dir, File}, fs::{read_dir, File},
@ -1287,3 +1288,47 @@ fn test_split_invalid_input() {
.no_stdout() .no_stdout()
.stderr_contains("split: invalid number of chunks: 0"); .stderr_contains("split: invalid number of chunks: 0");
} }
/// Test if there are invalid (non UTF-8) in the arguments - unix
/// clap is expected to fail/panic
#[cfg(unix)]
#[test]
fn test_split_non_utf8_argument_unix() {
use std::os::unix::ffi::OsStrExt;
let (at, mut ucmd) = at_and_ucmd!();
let name = "test_split_non_utf8_argument";
let opt = OsStr::from_bytes("--additional-suffix".as_bytes());
RandomFile::new(&at, name).add_lines(2000);
// Here, the values 0x66 and 0x6f correspond to 'f' and 'o'
// respectively. The value 0x80 is a lone continuation byte, invalid
// in a UTF-8 sequence.
let opt_value = [0x66, 0x6f, 0x80, 0x6f];
let opt_value = OsStr::from_bytes(&opt_value[..]);
let name = OsStr::from_bytes(name.as_bytes());
ucmd.args(&[opt, opt_value, name])
.fails()
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
}
/// Test if there are invalid (non UTF-8) in the arguments - windows
/// clap is expected to fail/panic
#[cfg(windows)]
#[test]
fn test_split_non_utf8_argument_windows() {
use std::os::windows::prelude::*;
let (at, mut ucmd) = at_and_ucmd!();
let name = "test_split_non_utf8_argument";
let opt = OsStr::from_bytes("--additional-suffix".as_bytes());
RandomFile::new(&at, name).add_lines(2000);
// Here the values 0x0066 and 0x006f correspond to 'f' and 'o'
// respectively. The value 0xD800 is a lone surrogate half, invalid
// in a UTF-16 sequence.
let opt_value = [0x0066, 0x006f, 0xD800, 0x006f];
let opt_value = OsString::from_wide(&opt_value[..]);
let name = OsStr::from_bytes(name.as_bytes());
ucmd.args(&[opt, opt_value, name])
.fails()
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
}