From ee36dea1a93a67a9b93af510e0d2d75dd5f84aa1 Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Wed, 5 Jan 2022 21:06:04 -0500 Subject: [PATCH] split: implement outputting kth chunk of file Implement `-n l/k/N` option, where the `k`th chunk of the input file is written to stdout. For example, $ seq -w 0 99 > f; split -n l/3/10 f 20 21 22 23 24 25 26 27 28 29 --- src/uu/split/src/split.rs | 72 ++++++++++++++++ tests/by-util/test_split.rs | 8 ++ tests/fixtures/split/onehundredlines.txt | 100 +++++++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 tests/fixtures/split/onehundredlines.txt diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index e2504f305..090d89d4e 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -859,6 +859,11 @@ where /// /// This function returns an error if there is a problem reading from /// `reader` or writing to one of the output files. +/// +/// # See also +/// +/// * [`kth_chunk_by_line`], which splits its input in the same way, +/// but writes only one specified chunk to stdout. fn split_into_n_chunks_by_line( settings: &Settings, reader: &mut R, @@ -915,6 +920,67 @@ where Ok(()) } +/// Print the k-th chunk of a file, splitting by line. +/// +/// This function is like [`split_into_n_chunks_by_line`], but instead +/// of writing each chunk to its own file, it only writes to stdout +/// the contents of the chunk identified by `chunk_number`. +/// +/// # Errors +/// +/// This function returns an error if there is a problem reading from +/// `reader` or writing to stdout. +/// +/// # See also +/// +/// * [`split_into_n_chunks_by_line`], which splits its input in the +/// same way, but writes each chunk to its own file. +fn kth_chunk_by_line( settings: &Settings, reader: &mut R, chunk_number: u64, num_chunks: u64, +) -> UResult<()> +where + R: BufRead, +{ + // Get the size of the input file in bytes and compute the number + // of bytes per chunk. 
+ let metadata = metadata(&settings.input).unwrap(); + let num_bytes = metadata.len(); + let chunk_size = (num_bytes / (num_chunks as u64)) as usize; + + // Write to stdout instead of to a file. + let stdout = std::io::stdout(); + let mut writer = stdout.lock(); + + let mut num_bytes_remaining_in_current_chunk = chunk_size; + let mut i = 0; + for line_result in reader.lines() { + let line = line_result?; + let bytes = line.as_bytes(); + if i == chunk_number { + writer.write_all(bytes)?; + writer.write_all(b"\n")?; + } + + // Add one byte for the newline character. + let num_bytes = bytes.len() + 1; + if num_bytes >= num_bytes_remaining_in_current_chunk { + num_bytes_remaining_in_current_chunk = chunk_size; + i += 1; + } else { + num_bytes_remaining_in_current_chunk -= num_bytes; + } + + if i > chunk_number { + break; + } + } + + Ok(()) +} + fn split(settings: &Settings) -> UResult<()> { let mut reader = BufReader::new(if settings.input == "-" { Box::new(stdin()) as Box @@ -935,6 +1001,12 @@ fn split(settings: &Settings) -> UResult<()> { Strategy::Number(NumberType::Lines(num_chunks)) => { split_into_n_chunks_by_line(settings, &mut reader, num_chunks) } + Strategy::Number(NumberType::KthLines(chunk_number, num_chunks)) => { + // The chunk number is given as a 1-indexed number, but it + // is a little easier to deal with a 0-indexed number. 
+ let chunk_number = chunk_number - 1; + kth_chunk_by_line(settings, &mut reader, chunk_number, num_chunks) + } Strategy::Number(_) => Err(USimpleError::new(1, "-n mode not yet fully implemented")), Strategy::Lines(chunk_size) => { let mut writer = LineChunkWriter::new(chunk_size, settings) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index ab59a573a..06aa9ea61 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -587,3 +587,11 @@ fn test_lines() { assert_eq!(file_read("xaa"), "1\n2\n3\n"); assert_eq!(file_read("xab"), "4\n5\n"); } + +#[test] +fn test_lines_kth() { + new_ucmd!() + .args(&["-n", "l/3/10", "onehundredlines.txt"]) + .succeeds() + .stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n"); +} diff --git a/tests/fixtures/split/onehundredlines.txt b/tests/fixtures/split/onehundredlines.txt new file mode 100644 index 000000000..f2abdb403 --- /dev/null +++ b/tests/fixtures/split/onehundredlines.txt @@ -0,0 +1,100 @@ +00 +01 +02 +03 +04 +05 +06 +07 +08 +09 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99