From 7fbd80571352f958e9a0ad5721bb17f6e74cc83a Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 30 Jan 2022 21:35:43 -0500 Subject: [PATCH 1/3] split: refactor to add SuffixType enum Refactor the code to use a `SuffixType` enumeration with two members, `Alphabetic` and `NumericDecimal`, representing the two currently supported ways of producing filename suffixes. This prepares the code to more easily support other formats, like numeric hexadecimal. --- src/uu/split/src/filenames.rs | 62 ++++++++++++++++++++++++----------- src/uu/split/src/split.rs | 20 ++++++++--- 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/src/uu/split/src/filenames.rs b/src/uu/split/src/filenames.rs index 3e2db3606..1b89190c7 100644 --- a/src/uu/split/src/filenames.rs +++ b/src/uu/split/src/filenames.rs @@ -13,12 +13,13 @@ //! //! ```rust,ignore //! use crate::filenames::FilenameIterator; +//! use crate::filenames::SuffixType; //! //! let prefix = "chunk_".to_string(); //! let suffix = ".txt".to_string(); //! let width = 2; -//! let use_numeric_suffix = false; -//! let it = FilenameIterator::new(prefix, suffix, width, use_numeric_suffix); +//! let suffix_type = SuffixType::Alphabetic; +//! let it = FilenameIterator::new(prefix, suffix, width, suffix_type); //! //! assert_eq!(it.next().unwrap(), "chunk_aa.txt"); //! assert_eq!(it.next().unwrap(), "chunk_ab.txt"); @@ -28,6 +29,26 @@ use crate::number::DynamicWidthNumber; use crate::number::FixedWidthNumber; use crate::number::Number; +/// The format to use for suffixes in the filename for each output chunk. +#[derive(Clone, Copy)] +pub enum SuffixType { + /// Lowercase ASCII alphabetic characters. + Alphabetic, + + /// Decimal numbers. + NumericDecimal, +} + +impl SuffixType { + /// The radix to use when representing the suffix string as digits. + fn radix(&self) -> u8 { + match self { + SuffixType::Alphabetic => 26, + SuffixType::NumericDecimal => 10, + } + } +} + /// Compute filenames from a given index. /// /// This iterator yields filenames for use with ``split``. @@ -42,8 +63,8 @@ use crate::number::Number; /// width in characters. In that case, after the iterator yields each /// string of that width, the iterator is exhausted. /// -/// Finally, if `use_numeric_suffix` is `true`, then numbers will be -/// used instead of lowercase ASCII alphabetic characters. +/// Finally, `suffix_type` controls which type of suffix to produce, +/// alphabetic or numeric. /// /// # Examples /// @@ -52,28 +73,30 @@ use crate::number::Number; /// /// ```rust,ignore /// use crate::filenames::FilenameIterator; +/// use crate::filenames::SuffixType; /// /// let prefix = "chunk_".to_string(); /// let suffix = ".txt".to_string(); /// let width = 2; -/// let use_numeric_suffix = false; -/// let it = FilenameIterator::new(prefix, suffix, width, use_numeric_suffix); +/// let suffix_type = SuffixType::Alphabetic; +/// let it = FilenameIterator::new(prefix, suffix, width, suffix_type); /// /// assert_eq!(it.next().unwrap(), "chunk_aa.txt"); /// assert_eq!(it.next().unwrap(), "chunk_ab.txt"); /// assert_eq!(it.next().unwrap(), "chunk_ac.txt"); /// ``` /// -/// For numeric filenames, set `use_numeric_suffix` to `true`: +/// For numeric filenames, use `SuffixType::NumericDecimal`: /// /// ```rust,ignore /// use crate::filenames::FilenameIterator; +/// use crate::filenames::SuffixType; /// /// let prefix = "chunk_".to_string(); /// let suffix = ".txt".to_string(); /// let width = 2; -/// let use_numeric_suffix = true; -/// let it = FilenameIterator::new(prefix, suffix, width, use_numeric_suffix); +/// let suffix_type = SuffixType::NumericDecimal; +/// let it = FilenameIterator::new(prefix, suffix, width, suffix_type); /// /// assert_eq!(it.next().unwrap(), "chunk_00.txt"); /// assert_eq!(it.next().unwrap(), "chunk_01.txt"); @@ -91,9 +114,9 @@ impl<'a> FilenameIterator<'a> { prefix: &'a str, additional_suffix: &'a str, suffix_length: usize, - use_numeric_suffix: bool, + suffix_type: SuffixType, ) -> FilenameIterator<'a> { - let radix = if use_numeric_suffix { 10 } else { 26 }; + let radix = suffix_type.radix(); let number = if suffix_length == 0 { Number::DynamicWidth(DynamicWidthNumber::new(radix)) } else { @@ -130,39 +153,40 @@ impl<'a> Iterator for FilenameIterator<'a> { mod tests { use crate::filenames::FilenameIterator; + use crate::filenames::SuffixType; #[test] fn test_filename_iterator_alphabetic_fixed_width() { - let mut it = FilenameIterator::new("chunk_", ".txt", 2, false); + let mut it = FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic); assert_eq!(it.next().unwrap(), "chunk_aa.txt"); assert_eq!(it.next().unwrap(), "chunk_ab.txt"); assert_eq!(it.next().unwrap(), "chunk_ac.txt"); - let mut it = FilenameIterator::new("chunk_", ".txt", 2, false); + let mut it = FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic); assert_eq!(it.nth(26 * 26 - 1).unwrap(), "chunk_zz.txt"); assert_eq!(it.next(), None); } #[test] fn test_filename_iterator_numeric_fixed_width() { - let mut it = FilenameIterator::new("chunk_", ".txt", 2, true); + let mut it = FilenameIterator::new("chunk_", ".txt", 2, SuffixType::NumericDecimal); assert_eq!(it.next().unwrap(), "chunk_00.txt"); assert_eq!(it.next().unwrap(), "chunk_01.txt"); assert_eq!(it.next().unwrap(), "chunk_02.txt"); - let mut it = FilenameIterator::new("chunk_", ".txt", 2, true); + let mut it = FilenameIterator::new("chunk_", ".txt", 2, SuffixType::NumericDecimal); assert_eq!(it.nth(10 * 10 - 1).unwrap(), "chunk_99.txt"); assert_eq!(it.next(), None); } #[test] fn test_filename_iterator_alphabetic_dynamic_width() { - let mut it = FilenameIterator::new("chunk_", ".txt", 0, false); + let mut it = FilenameIterator::new("chunk_", ".txt", 0, SuffixType::Alphabetic); assert_eq!(it.next().unwrap(), "chunk_aa.txt"); assert_eq!(it.next().unwrap(), "chunk_ab.txt"); assert_eq!(it.next().unwrap(), "chunk_ac.txt"); - let mut it = FilenameIterator::new("chunk_", ".txt", 0, false); + let mut it = FilenameIterator::new("chunk_", ".txt", 0, SuffixType::Alphabetic); assert_eq!(it.nth(26 * 25 - 1).unwrap(), "chunk_yz.txt"); assert_eq!(it.next().unwrap(), "chunk_zaaa.txt"); assert_eq!(it.next().unwrap(), "chunk_zaab.txt"); @@ -170,12 +194,12 @@ mod tests { #[test] fn test_filename_iterator_numeric_dynamic_width() { - let mut it = FilenameIterator::new("chunk_", ".txt", 0, true); + let mut it = FilenameIterator::new("chunk_", ".txt", 0, SuffixType::NumericDecimal); assert_eq!(it.next().unwrap(), "chunk_00.txt"); assert_eq!(it.next().unwrap(), "chunk_01.txt"); assert_eq!(it.next().unwrap(), "chunk_02.txt"); - let mut it = FilenameIterator::new("chunk_", ".txt", 0, true); + let mut it = FilenameIterator::new("chunk_", ".txt", 0, SuffixType::NumericDecimal); assert_eq!(it.nth(10 * 9 - 1).unwrap(), "chunk_89.txt"); assert_eq!(it.next().unwrap(), "chunk_9000.txt"); assert_eq!(it.next().unwrap(), "chunk_9001.txt"); diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 57953ae27..e9dab1725 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -12,6 +12,7 @@ mod number; mod platform; use crate::filenames::FilenameIterator; +use crate::filenames::SuffixType; use clap::{crate_version, App, AppSettings, Arg, ArgMatches}; use std::env; use std::fmt; @@ -240,13 +241,22 @@ impl Strategy { } } +/// Parse the suffix type from the command-line arguments. +fn suffix_type_from(matches: &ArgMatches) -> SuffixType { + if matches.occurrences_of(OPT_NUMERIC_SUFFIXES) > 0 { + SuffixType::NumericDecimal + } else { + SuffixType::Alphabetic + } +} + /// Parameters that control how a file gets split. /// /// You can convert an [`ArgMatches`] instance into a [`Settings`] /// instance by calling [`Settings::from`]. struct Settings { prefix: String, - numeric_suffix: bool, + suffix_type: SuffixType, suffix_length: usize, additional_suffix: String, input: String, @@ -314,7 +324,7 @@ impl Settings { suffix_length: suffix_length_str .parse() .map_err(|_| SettingsError::SuffixLength(suffix_length_str.to_string()))?, - numeric_suffix: matches.occurrences_of(OPT_NUMERIC_SUFFIXES) > 0, + suffix_type: suffix_type_from(matches), additional_suffix, verbose: matches.occurrences_of("verbose") > 0, strategy: Strategy::from(matches).map_err(SettingsError::Strategy)?, @@ -374,7 +384,7 @@ impl<'a> ByteChunkWriter<'a> { &settings.prefix, &settings.additional_suffix, settings.suffix_length, - settings.numeric_suffix, + settings.suffix_type, ); let filename = filename_iterator.next()?; if settings.verbose { @@ -502,7 +512,7 @@ impl<'a> LineChunkWriter<'a> { &settings.prefix, &settings.additional_suffix, settings.suffix_length, - settings.numeric_suffix, + settings.suffix_type, ); let filename = filename_iterator.next()?; if settings.verbose { @@ -594,7 +604,7 @@ where &settings.prefix, &settings.additional_suffix, settings.suffix_length, - settings.numeric_suffix, + settings.suffix_type, ); // Create one writer for each chunk. This will create each From 494dc7ec573dc3d5754fbb707a0c410ef8befe6a Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 16 Jan 2022 10:36:26 -0500 Subject: [PATCH 2/3] split: add SuffixType::NumericHexadecimal Add a `NumericHexadecimal` member to the `SuffixType` enum so that a future commit can add support for hexadecimal filename suffixes to the `split` program. --- src/uu/split/src/filenames.rs | 6 +- src/uu/split/src/number.rs | 119 +++++++++++++++++++++++++++------- 2 files changed, 101 insertions(+), 24 deletions(-) diff --git a/src/uu/split/src/filenames.rs b/src/uu/split/src/filenames.rs index 1b89190c7..0121ba87f 100644 --- a/src/uu/split/src/filenames.rs +++ b/src/uu/split/src/filenames.rs @@ -37,6 +37,9 @@ pub enum SuffixType { /// Decimal numbers. NumericDecimal, + + /// Hexadecimal numbers. + NumericHexadecimal, } impl SuffixType { @@ -45,6 +48,7 @@ impl SuffixType { match self { SuffixType::Alphabetic => 26, SuffixType::NumericDecimal => 10, + SuffixType::NumericHexadecimal => 16, } } } @@ -86,7 +90,7 @@ impl SuffixType { /// assert_eq!(it.next().unwrap(), "chunk_ac.txt"); /// ``` /// -/// For numeric filenames, use `SuffixType::NumericDecimal`: +/// For decimal numeric filenames, use `SuffixType::NumericDecimal`: /// /// ```rust,ignore /// use crate::filenames::FilenameIterator; diff --git a/src/uu/split/src/number.rs b/src/uu/split/src/number.rs index ef3ccbc4b..d5427e2ca 100644 --- a/src/uu/split/src/number.rs +++ b/src/uu/split/src/number.rs @@ -40,13 +40,19 @@ impl Error for Overflow {} /// specifically for the `split` program. See the /// [`DynamicWidthNumber`] documentation for more information. /// -/// Numbers of radix 10 are displayable and rendered as decimal -/// numbers (for example, "00" or "917"). Numbers of radix 26 are -/// displayable and rendered as lowercase ASCII alphabetic characters -/// (for example, "aa" or "zax"). Numbers of other radices cannot be -/// displayed. The display of a [`DynamicWidthNumber`] includes a -/// prefix whose length depends on the width of the number. See the -/// [`DynamicWidthNumber`] documentation for more information. +/// Numbers of radix +/// +/// * 10 are displayable and rendered as decimal numbers (for example, +/// "00" or "917"), +/// * 16 are displayable and rendered as hexadecimal numbers (for example, +/// "00" or "e7f"), +/// * 26 are displayable and rendered as lowercase ASCII alphabetic +/// characters (for example, "aa" or "zax"). +/// +/// Numbers of other radices cannot be displayed. The display of a +/// [`DynamicWidthNumber`] includes a prefix whose length depends on +/// the width of the number. See the [`DynamicWidthNumber`] +/// documentation for more information. /// /// The digits of a number are accessible via the [`Number::digits`] /// method. The digits are represented as a [`Vec`] with the most @@ -169,12 +175,12 @@ impl Display for Number { /// /// # Displaying /// -/// This number is only displayable if `radix` is 10 or `radix` is -/// 26. If `radix` is 10, then the digits are concatenated and -/// displayed as a fixed-width decimal number. If `radix` is 26, then -/// each digit is translated to the corresponding lowercase ASCII -/// alphabetic character (that is, 'a', 'b', 'c', etc.) and -/// concatenated. +/// This number is only displayable if `radix` is 10, 26, or 26. If +/// `radix` is 10 or 16, then the digits are concatenated and +/// displayed as a fixed-width decimal or hexadecimal number, +/// respectively. If `radix` is 26, then each digit is translated to +/// the corresponding lowercase ASCII alphabetic character (that is, +/// 'a', 'b', 'c', etc.) and concatenated. #[derive(Clone)] pub struct FixedWidthNumber { radix: u8, @@ -228,6 +234,14 @@ impl Display for FixedWidthNumber { let digits: String = self.digits.iter().map(|d| (b'0' + d) as char).collect(); write!(f, "{}", digits) } + 16 => { + let digits: String = self + .digits + .iter() + .map(|d| (if *d < 10 { b'0' + d } else { b'a' + (d - 10) }) as char) + .collect(); + write!(f, "{}", digits) + } 26 => { let digits: String = self.digits.iter().map(|d| (b'a' + d) as char).collect(); write!(f, "{}", digits) @@ -264,14 +278,15 @@ impl Display for FixedWidthNumber { /// /// # Displaying /// -/// This number is only displayable if `radix` is 10 or `radix` is -/// 26. If `radix` is 10, then the digits are concatenated and -/// displayed as a fixed-width decimal number with a prefix of `n - 2` -/// instances of the character '9', where `n` is the number of digits. -/// If `radix` is 26, then each digit is translated to the -/// corresponding lowercase ASCII alphabetic character (that is, 'a', -/// 'b', 'c', etc.) and concatenated with a prefix of `n - 2` -/// instances of the character 'z'. +/// This number is only displayable if `radix` is 10, 16, or 26. If +/// `radix` is 10 or 16, then the digits are concatenated and +/// displayed as a fixed-width decimal or hexadecimal number, +/// respectively, with a prefix of `n - 2` instances of the character +/// '9' of 'f', respectively, where `n` is the number of digits. If +/// `radix` is 26, then each digit is translated to the corresponding +/// lowercase ASCII alphabetic character (that is, 'a', 'b', 'c', +/// etc.) and concatenated with a prefix of `n - 2` instances of the +/// character 'z'. /// /// This notion of displaying the number is specific to the `split` /// program. @@ -349,6 +364,21 @@ impl Display for DynamicWidthNumber { digits = digits, ) } + 16 => { + let num_fill_chars = self.digits.len() - 2; + let digits: String = self + .digits + .iter() + .map(|d| (if *d < 10 { b'0' + d } else { b'a' + (d - 10) }) as char) + .collect(); + write!( + f, + "{empty:f { let num_fill_chars = self.digits.len() - 2; let digits: String = self.digits.iter().map(|d| (b'a' + d) as char).collect(); @@ -424,7 +454,7 @@ mod tests { } #[test] - fn test_dynamic_width_number_display_numeric() { + fn test_dynamic_width_number_display_numeric_decimal() { fn num(n: usize) -> Number { let mut number = Number::DynamicWidth(DynamicWidthNumber::new(10)); for _ in 0..n { @@ -444,6 +474,30 @@ mod tests { assert_eq!(format!("{}", num(10 * 99 + 1)), "990001"); } + #[test] + fn test_dynamic_width_number_display_numeric_hexadecimal() { + fn num(n: usize) -> Number { + let mut number = Number::DynamicWidth(DynamicWidthNumber::new(16)); + for _ in 0..n { + number.increment().unwrap() + } + number + } + + assert_eq!(format!("{}", num(0)), "00"); + assert_eq!(format!("{}", num(15)), "0f"); + assert_eq!(format!("{}", num(16)), "10"); + assert_eq!(format!("{}", num(17)), "11"); + assert_eq!(format!("{}", num(18)), "12"); + + assert_eq!(format!("{}", num(16 * 15 - 1)), "ef"); + assert_eq!(format!("{}", num(16 * 15)), "f000"); + assert_eq!(format!("{}", num(16 * 15 + 1)), "f001"); + assert_eq!(format!("{}", num(16 * 255 - 1)), "feff"); + assert_eq!(format!("{}", num(16 * 255)), "ff0000"); + assert_eq!(format!("{}", num(16 * 255 + 1)), "ff0001"); + } + #[test] fn test_fixed_width_number_increment() { let mut n = Number::FixedWidth(FixedWidthNumber::new(3, 2)); @@ -493,7 +547,7 @@ mod tests { } #[test] - fn test_fixed_width_number_display_numeric() { + fn test_fixed_width_number_display_numeric_decimal() { fn num(n: usize) -> Result { let mut number = Number::FixedWidth(FixedWidthNumber::new(10, 2)); for _ in 0..n { @@ -510,4 +564,23 @@ mod tests { assert_eq!(format!("{}", num(10 * 10 - 1).unwrap()), "99"); assert!(num(10 * 10).is_err()); } + + #[test] + fn test_fixed_width_number_display_numeric_hexadecimal() { + fn num(n: usize) -> Result { + let mut number = Number::FixedWidth(FixedWidthNumber::new(16, 2)); + for _ in 0..n { + number.increment()?; + } + Ok(number) + } + + assert_eq!(format!("{}", num(0).unwrap()), "00"); + assert_eq!(format!("{}", num(15).unwrap()), "0f"); + assert_eq!(format!("{}", num(17).unwrap()), "11"); + assert_eq!(format!("{}", num(16 * 15 - 1).unwrap()), "ef"); + assert_eq!(format!("{}", num(16 * 15).unwrap()), "f0"); + assert_eq!(format!("{}", num(16 * 16 - 1).unwrap()), "ff"); + assert!(num(16 * 16).is_err()); + } } From a4955b4e06c04445d74490517c119107ae2d30b7 Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 16 Jan 2022 10:41:52 -0500 Subject: [PATCH 3/3] split: add support for -x option (hex suffixes) Add support for the `-x` command-line option to `split`. This option causes `split` to produce filenames with hexadecimal suffixes instead of the default alphabetic suffixes. --- src/uu/split/src/split.rs | 11 +++++++++ tests/by-util/test_split.rs | 24 ++++++++++++++++++- .../split/twohundredfortyonebytes.txt | 1 + 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/split/twohundredfortyonebytes.txt diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index e9dab1725..2c344f4d3 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -32,6 +32,7 @@ static OPT_ADDITIONAL_SUFFIX: &str = "additional-suffix"; static OPT_FILTER: &str = "filter"; static OPT_NUMBER: &str = "number"; static OPT_NUMERIC_SUFFIXES: &str = "numeric-suffixes"; +static OPT_HEX_SUFFIXES: &str = "hex-suffixes"; static OPT_SUFFIX_LENGTH: &str = "suffix-length"; static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0"; static OPT_VERBOSE: &str = "verbose"; @@ -140,6 +141,14 @@ pub fn uu_app<'a>() -> App<'a> { .default_value(OPT_DEFAULT_SUFFIX_LENGTH) .help("use suffixes of length N (default 2)"), ) + .arg( + Arg::new(OPT_HEX_SUFFIXES) + .short('x') + .long(OPT_HEX_SUFFIXES) + .takes_value(true) + .default_missing_value("0") + .help("use hex suffixes starting at 0, not alphabetic"), + ) .arg( Arg::new(OPT_VERBOSE) .long(OPT_VERBOSE) @@ -245,6 +254,8 @@ impl Strategy { fn suffix_type_from(matches: &ArgMatches) -> SuffixType { if matches.occurrences_of(OPT_NUMERIC_SUFFIXES) > 0 { SuffixType::NumericDecimal + } else if matches.occurrences_of(OPT_HEX_SUFFIXES) > 0 { + SuffixType::NumericHexadecimal } else { SuffixType::Alphabetic } diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 0291d1f4a..e30e29acc 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -2,7 +2,7 @@ // * // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. -// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz fivelines +// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes extern crate rand; extern crate regex; @@ -409,6 +409,28 @@ fn test_numeric_dynamic_suffix_length() { assert_eq!(file_read(&at, "x9000"), "a"); } +#[test] +fn test_hex_dynamic_suffix_length() { + let (at, mut ucmd) = at_and_ucmd!(); + // Split into chunks of one byte each, use hexadecimal digits + // instead of letters as file suffixes. + // + // The input file has (16^2) - 16 + 1 = 241 bytes. This is just + // enough to force `split` to dynamically increase the length of + // the filename for the very last chunk. + // + // x00, x01, x02, ..., xed, xee, xef, xf000 + // + ucmd.args(&["-x", "-b", "1", "twohundredfortyonebytes.txt"]) + .succeeds(); + for i in 0..240 { + let filename = format!("x{:02x}", i); + let contents = file_read(&at, &filename); + assert_eq!(contents, "a"); + } + assert_eq!(file_read(&at, "xf000"), "a"); +} + #[test] fn test_suffixes_exhausted() { new_ucmd!() diff --git a/tests/fixtures/split/twohundredfortyonebytes.txt b/tests/fixtures/split/twohundredfortyonebytes.txt new file mode 100644 index 000000000..10a53a61c --- /dev/null +++ b/tests/fixtures/split/twohundredfortyonebytes.txt @@ -0,0 +1 @@ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa \ No newline at end of file