1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

head: fix bug with non-terminated files.

Fixes issue #7472.
Update to head app when printing all-but-last-n-lines of a file.
Code now checks if the last line of the input file is missing a terminating
newline character, and if so prints an extra line.
This aligns with GNU-head behavior.
Also improved performance of this usecase by using an optimized iterator
(memchr-iter) for searching through the input file.
This commit is contained in:
Karl McDowall 2025-03-18 09:05:34 -06:00
parent 5a0988c28d
commit 9957d0a64e
2 changed files with 88 additions and 13 deletions

View file

@ -3,9 +3,10 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (vars) seekable
// spell-checker:ignore (vars) seekable memrchr
use clap::{Arg, ArgAction, ArgMatches, Command};
use memchr::memrchr_iter;
use std::ffi::OsString;
#[cfg(unix)]
use std::fs::File;
@ -378,30 +379,50 @@ where
let mut buffer = [0u8; BUF_SIZE];
let mut i = 0u64;
let mut lines = 0u64;
let mut check_last_byte_first_loop = true;
let mut bytes_remaining_to_search = file_size;
loop {
// the casts here are ok, `buffer.len()` should never be above a few k
let bytes_remaining_to_search = file_size - i;
let bytes_to_read_this_loop = bytes_remaining_to_search.min(BUF_SIZE.try_into().unwrap());
let bytes_to_read_this_loop =
bytes_remaining_to_search.min(buffer.len().try_into().unwrap());
let read_start_offset = bytes_remaining_to_search - bytes_to_read_this_loop;
let buffer = &mut buffer[..bytes_to_read_this_loop.try_into().unwrap()];
bytes_remaining_to_search -= bytes_to_read_this_loop;
input.seek(SeekFrom::Start(read_start_offset))?;
input.read_exact(buffer)?;
for byte in buffer.iter().rev() {
if byte == &separator {
lines += 1;
// Unfortunately need special handling for the case that the input file doesn't have
// a terminating `separator` character.
// If the input file doesn't end with a `separator` character, add an extra line to our
// `line` counter. In the case that `n` is 0 we need to return here since we've
// obviously found our 0th-line-from-the-end offset.
if check_last_byte_first_loop {
check_last_byte_first_loop = false;
if let Some(last_byte_of_file) = buffer.last() {
if last_byte_of_file != &separator {
if n == 0 {
input.rewind()?;
return Ok(file_size);
}
// if it were just `n`,
assert_eq!(lines, 0);
lines = 1;
}
};
}
for separator_offset in memrchr_iter(separator, &buffer[..]) {
lines += 1;
if lines == n + 1 {
input.rewind()?;
return Ok(file_size - i);
return Ok(read_start_offset
+ TryInto::<u64>::try_into(separator_offset).unwrap()
+ 1);
}
i += 1;
}
if file_size - i == 0 {
if read_start_offset == 0 {
input.rewind()?;
return Ok(0);
}
@ -753,4 +774,23 @@ mod tests {
0
);
}
#[test]
fn test_find_nth_line_from_end_non_terminated() {
// Validate the find_nth_line_from_end for files that are not terminated with a final
// newline character.
let input_file = "a\nb";
let mut input = Cursor::new(input_file);
assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 3);
assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 2);
}
#[test]
fn test_find_nth_line_from_end_empty() {
// Validate the find_nth_line_from_end for files that are empty.
let input_file = "";
let mut input = Cursor::new(input_file);
assert_eq!(find_nth_line_from_end(&mut input, 0, b'\n').unwrap(), 0);
assert_eq!(find_nth_line_from_end(&mut input, 1, b'\n').unwrap(), 0);
}
}

View file

@ -148,6 +148,15 @@ fn test_zero_terminated_syntax_2() {
.stdout_is("x\0y");
}
#[test]
fn test_non_terminated_input() {
new_ucmd!()
.args(&["-n", "-1"])
.pipe_in("x\ny")
.succeeds()
.stdout_is("x\n");
}
#[test]
fn test_zero_terminated_negative_lines() {
new_ucmd!()
@ -448,12 +457,19 @@ fn test_all_but_last_lines_large_file() {
let scene = TestScenario::new(util_name!());
let fixtures = &scene.fixtures;
let seq_20000_file_name = "seq_20000";
let seq_20000_truncated_file_name = "seq_20000_truncated";
let seq_1000_file_name = "seq_1000";
scene
.cmd("seq")
.arg("20000")
.set_stdout(fixtures.make_file(seq_20000_file_name))
.succeeds();
// Create a file the same as seq_20000 except for the final terminating endline.
scene
.ucmd()
.args(&["-c", "-1", seq_20000_file_name])
.set_stdout(fixtures.make_file(seq_20000_truncated_file_name))
.succeeds();
scene
.cmd("seq")
.arg("1000")
@ -465,7 +481,7 @@ fn test_all_but_last_lines_large_file() {
.ucmd()
.args(&["-n", "-19000", seq_20000_file_name])
.succeeds()
.stdout_only_fixture("seq_1000");
.stdout_only_fixture(seq_1000_file_name);
scene
.ucmd()
@ -478,6 +494,25 @@ fn test_all_but_last_lines_large_file() {
.args(&["-n", "-20001", seq_20000_file_name])
.succeeds()
.stdout_only_fixture("emptyfile.txt");
// Confirm correct behavior when the input file doesn't end with a newline.
scene
.ucmd()
.args(&["-n", "-19000", seq_20000_truncated_file_name])
.succeeds()
.stdout_only_fixture(seq_1000_file_name);
scene
.ucmd()
.args(&["-n", "-20000", seq_20000_truncated_file_name])
.succeeds()
.stdout_only_fixture("emptyfile.txt");
scene
.ucmd()
.args(&["-n", "-20001", seq_20000_truncated_file_name])
.succeeds()
.stdout_only_fixture("emptyfile.txt");
}
#[cfg(all(