From 9957d0a64ee6178113a7ca0f9f4f120bd676b649 Mon Sep 17 00:00:00 2001 From: Karl McDowall Date: Tue, 18 Mar 2025 09:05:34 -0600 Subject: [PATCH] head: fix bug with non-terminated files. Fixes issue #7472. Update to head app when printing all-but-last-n-lines of a file. Code now checks if the last line of the input file is missing a terminating newline character, and if so prints an extra line. This aligns with GNU-head behavior. Also improved performance of this usecase by using an optimized iterator (memchr-iter) for searching through the input file. --- src/uu/head/src/head.rs | 64 +++++++++++++++++++++++++++++++------- tests/by-util/test_head.rs | 37 +++++++++++++++++++++- 2 files changed, 88 insertions(+), 13 deletions(-) diff --git a/src/uu/head/src/head.rs b/src/uu/head/src/head.rs index fb0a9e771..eec2442ac 100644 --- a/src/uu/head/src/head.rs +++ b/src/uu/head/src/head.rs @@ -3,9 +3,10 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (vars) seekable +// spell-checker:ignore (vars) seekable memrchr use clap::{Arg, ArgAction, ArgMatches, Command}; +use memchr::memrchr_iter; use std::ffi::OsString; #[cfg(unix)] use std::fs::File; @@ -378,30 +379,50 @@ where let mut buffer = [0u8; BUF_SIZE]; - let mut i = 0u64; let mut lines = 0u64; + let mut check_last_byte_first_loop = true; + let mut bytes_remaining_to_search = file_size; loop { // the casts here are ok, `buffer.len()` should never be above a few k - let bytes_remaining_to_search = file_size - i; - let bytes_to_read_this_loop = bytes_remaining_to_search.min(BUF_SIZE.try_into().unwrap()); + let bytes_to_read_this_loop = + bytes_remaining_to_search.min(buffer.len().try_into().unwrap()); let read_start_offset = bytes_remaining_to_search - bytes_to_read_this_loop; let buffer = &mut buffer[..bytes_to_read_this_loop.try_into().unwrap()]; + bytes_remaining_to_search -= bytes_to_read_this_loop; input.seek(SeekFrom::Start(read_start_offset))?; input.read_exact(buffer)?; - for byte in buffer.iter().rev() { - if byte == &separator { - lines += 1; - } - // if it were just `n`, + + // Unfortunately need special handling for the case that the input file doesn't have + // a terminating `separator` character. + // If the input file doesn't end with a `separator` character, add an extra line to our + // `line` counter. In the case that `n` is 0 we need to return here since we've + // obviously found our 0th-line-from-the-end offset. + if check_last_byte_first_loop { + check_last_byte_first_loop = false; + if let Some(last_byte_of_file) = buffer.last() { + if last_byte_of_file != &separator { + if n == 0 { + input.rewind()?; + return Ok(file_size); + } + assert_eq!(lines, 0); + lines = 1; + } + }; + } + + for separator_offset in memrchr_iter(separator, &buffer[..]) { + lines += 1; if lines == n + 1 { input.rewind()?; - return Ok(file_size - i); + return Ok(read_start_offset + + TryInto::::try_into(separator_offset).unwrap() + + 1); } - i += 1; } - if file_size - i == 0 { + if read_start_offset == 0 { input.rewind()?; return Ok(0); } @@ -753,4 +774,23 @@ mod tests { 0 ); } + + #[test] + fn test_find_nth_line_from_end_non_terminated() { + // Validate the find_nth_line_from_end for files that are not terminated with a final + // newline character. + let input_file = "a\nb"; + let mut input = Cursor::new(input_file); + assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 3); + assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 2); + } + + #[test] + fn test_find_nth_line_from_end_empty() { + // Validate the find_nth_line_from_end for files that are empty. + let input_file = ""; + let mut input = Cursor::new(input_file); + assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 0); + assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 0); + } } diff --git a/tests/by-util/test_head.rs b/tests/by-util/test_head.rs index 04b68b9c7..24f534191 100644 --- a/tests/by-util/test_head.rs +++ b/tests/by-util/test_head.rs @@ -148,6 +148,15 @@ fn test_zero_terminated_syntax_2() { .stdout_is("x\0y"); } +#[test] +fn test_non_terminated_input() { + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in("x\ny") + .succeeds() + .stdout_is("x\n"); +} + #[test] fn test_zero_terminated_negative_lines() { new_ucmd!() @@ -448,12 +457,19 @@ fn test_all_but_last_lines_large_file() { let scene = TestScenario::new(util_name!()); let fixtures = &scene.fixtures; let seq_20000_file_name = "seq_20000"; + let seq_20000_truncated_file_name = "seq_20000_truncated"; let seq_1000_file_name = "seq_1000"; scene .cmd("seq") .arg("20000") .set_stdout(fixtures.make_file(seq_20000_file_name)) .succeeds(); + // Create a file the same as seq_20000 except for the final terminating endline. + scene + .ucmd() + .args(&["-c", "-1", seq_20000_file_name]) + .set_stdout(fixtures.make_file(seq_20000_truncated_file_name)) + .succeeds(); scene .cmd("seq") .arg("1000") @@ -465,7 +481,7 @@ fn test_all_but_last_lines_large_file() { .ucmd() .args(&["-n", "-19000", seq_20000_file_name]) .succeeds() - .stdout_only_fixture("seq_1000"); + .stdout_only_fixture(seq_1000_file_name); scene .ucmd() @@ -478,6 +494,25 @@ fn test_all_but_last_lines_large_file() { .args(&["-n", "-20001", seq_20000_file_name]) .succeeds() .stdout_only_fixture("emptyfile.txt"); + + // Confirm correct behavior when the input file doesn't end with a newline. + scene + .ucmd() + .args(&["-n", "-19000", seq_20000_truncated_file_name]) + .succeeds() + .stdout_only_fixture(seq_1000_file_name); + + scene + .ucmd() + .args(&["-n", "-20000", seq_20000_truncated_file_name]) + .succeeds() + .stdout_only_fixture("emptyfile.txt"); + + scene + .ucmd() + .args(&["-n", "-20001", seq_20000_truncated_file_name]) + .succeeds() + .stdout_only_fixture("emptyfile.txt"); } #[cfg(all(