From 977742f20960ef6f41dd23d0a2b949a23e3250fe Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Sat, 19 Mar 2016 16:03:42 -0700 Subject: [PATCH 1/4] tail: Take ownership of the provided BufReader The `BufReader` argument passed to the `fn tail(&mut BufReader, settings: &settings)` function is never reused, so the `tail` function should just take ownership of it. --- src/tail/tail.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) mode change 100644 => 100755 src/tail/tail.rs diff --git a/src/tail/tail.rs b/src/tail/tail.rs old mode 100644 new mode 100755 index e6bab3a5d..cbcd7fddc --- a/src/tail/tail.rs +++ b/src/tail/tail.rs @@ -134,8 +134,8 @@ pub fn uumain(args: Vec) -> i32 { let files = given_options.free; if files.is_empty() { - let mut buffer = BufReader::new(stdin()); - tail(&mut buffer, &settings); + let buffer = BufReader::new(stdin()); + tail(buffer, &settings); } else { let mut multiple = false; let mut firstime = true; @@ -153,8 +153,8 @@ pub fn uumain(args: Vec) -> i32 { let path = Path::new(file); let reader = File::open(&path).unwrap(); - let mut buffer = BufReader::new(reader); - tail(&mut buffer, &settings); + let buffer = BufReader::new(reader); + tail(buffer, &settings); } } @@ -169,7 +169,7 @@ fn parse_size(mut size_slice: &str) -> Option { } else { 1024usize }; - let exponent = + let exponent = if size_slice.len() > 0 { let mut has_suffix = true; let exp = match size_slice.chars().last().unwrap_or('_') { @@ -248,7 +248,7 @@ fn obsolete(options: &[String]) -> (Vec, Option) { (options, None) } -fn tail(reader: &mut BufReader, settings: &Settings) { +fn tail(mut reader: BufReader, settings: &Settings) { // Read through each line/char and store them in a ringbuffer that always // contains count lines/chars. When reaching the end of file, output the // data in the ringbuf. From 56d16ca7e7daddf3a2e3389874ddafd08ed6d5cf Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Sat, 19 Mar 2016 22:15:25 -0700 Subject: [PATCH 2/4] tail: Optimize tail for bounded searches in files When tail'ing a file, we do not need to read the whole file from start to finish just to find the last n lines or bytes. Instead, we can seek to the end of the file, and then read the file "backwards" in chunks until we find the location of the first line/byte we wish to print. This ends up being a nice performance win for very large files. Fixes #764 --- src/tail/tail.rs | 145 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 16 deletions(-) diff --git a/src/tail/tail.rs b/src/tail/tail.rs index cbcd7fddc..d89c95a60 100755 --- a/src/tail/tail.rs +++ b/src/tail/tail.rs @@ -17,7 +17,7 @@ extern crate uucore; use std::collections::VecDeque; use std::fs::File; -use std::io::{BufRead, BufReader, Read, stdin, stdout, Write}; +use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, stdin, stdout, Write}; use std::path::Path; use std::str::from_utf8; use std::thread::sleep; @@ -135,7 +135,7 @@ pub fn uumain(args: Vec) -> i32 { if files.is_empty() { let buffer = BufReader::new(stdin()); - tail(buffer, &settings); + unbounded_tail(buffer, &settings); } else { let mut multiple = false; let mut firstime = true; @@ -153,8 +153,7 @@ pub fn uumain(args: Vec) -> i32 { let path = Path::new(file); let reader = File::open(&path).unwrap(); - let buffer = BufReader::new(reader); - tail(buffer, &settings); + bounded_tail(reader, &settings); } } @@ -248,7 +247,130 @@ fn obsolete(options: &[String]) -> (Vec, Option) { (options, None) } -fn tail(mut reader: BufReader, settings: &Settings) { +/// When reading files in reverse in `bounded_tail`, this is the size of each +/// block read at a time. +const BLOCK_SIZE: u64 = 1 << 16; + +fn follow(mut reader: BufReader, settings: &Settings) { + assert!(settings.follow); + loop { + sleep(Duration::new(0, settings.sleep_msec*1000)); + loop { + let mut datum = String::new(); + match reader.read_line(&mut datum) { + Ok(0) => break, + Ok(_) => print!("{}", datum), + Err(err) => panic!(err) + } + } + } +} + +/// Iterate over bytes in the file, in reverse, until `should_stop` returns +/// true. The `file` is left seek'd to the position just after the byte that +/// `should_stop` returned true for. +fn backwards_thru_file(file: &mut File, size: u64, buf: &mut Vec, should_stop: &mut F) + where F: FnMut(u8) -> bool +{ + let max_blocks_to_read = (size as f64 / BLOCK_SIZE as f64).ceil() as usize; + + for block_idx in 0..max_blocks_to_read { + let block_size = if block_idx == max_blocks_to_read - 1 { + size % BLOCK_SIZE + } else { + BLOCK_SIZE + }; + + // Ensure that the buffer is filled and zeroed, if needed. + if buf.len() < (block_size as usize) { + for _ in buf.len()..(block_size as usize) { + buf.push(0); + } + } + + // Seek backwards by the next block, read the full block into + // `buf`, and then seek back to the start of the block again. + let pos = file.seek(SeekFrom::Current(-(block_size as i64))).unwrap(); + file.read_exact(&mut buf[0..(block_size as usize)]).unwrap(); + let pos2 = file.seek(SeekFrom::Current(-(block_size as i64))).unwrap(); + assert_eq!(pos, pos2); + + // Iterate backwards through the bytes, calling `should_stop` on each + // one. + let slice = &buf[0..(block_size as usize)]; + for (i, ch) in slice.iter().enumerate().rev() { + // Ignore one trailing newline. + if block_idx == 0 && i as u64 == block_size - 1 && *ch == ('\n' as u8) { + continue; + } + + if should_stop(*ch) { + file.seek(SeekFrom::Current((i + 1) as i64)).unwrap(); + return; + } + } + } +} + +/// When tail'ing a file, we do not need to read the whole file from start to +/// finish just to find the last n lines or bytes. Instead, we can seek to the +/// end of the file, and then read the file "backwards" in blocks of size +/// `BLOCK_SIZE` until we find the location of the first line/byte. This ends up +/// being a nice performance win for very large files. +fn bounded_tail(mut file: File, settings: &Settings) { + let size = file.seek(SeekFrom::End(0)).unwrap(); + if size == 0 { + if settings.follow { + let reader = BufReader::new(file); + follow(reader, settings); + } + return; + } + + let mut buf = Vec::with_capacity(BLOCK_SIZE as usize); + + // Find the position in the file to start printing from. + match settings.mode { + FilterMode::Lines(mut count) => { + backwards_thru_file(&mut file, size, &mut buf, &mut |byte| { + if byte == ('\n' as u8) { + count -= 1; + count == 0 + } else { + false + } + }); + }, + FilterMode::Bytes(mut count) => { + backwards_thru_file(&mut file, size, &mut buf, &mut |_| { + count -= 1; + count == 0 + }); + }, + } + + // Print the target section of the file. + loop { + let bytes_read = file.read(&mut buf).unwrap(); + + let mut stdout = stdout(); + for b in &buf[0..bytes_read] { + print_byte(&mut stdout, b); + } + + if bytes_read == 0 { + break; + } + } + + // Continue following changes, if requested. + if settings.follow { + let reader = BufReader::new(file); + follow(reader, settings); + } +} + +fn unbounded_tail(mut reader: BufReader, settings: &Settings) { // Read through each line/char and store them in a ringbuffer that always // contains count lines/chars. When reaching the end of file, output the // data in the ringbuf. @@ -317,17 +439,8 @@ fn tail(mut reader: BufReader, settings: &Settings) { } } - // if we follow the file, sleep a bit and print the rest if the file has grown. - while settings.follow { - sleep(Duration::new(0, settings.sleep_msec*1000)); - loop { - let mut datum = String::new(); - match reader.read_line(&mut datum) { - Ok(0) => break, - Ok(_) => print!("{}", datum), - Err(err) => panic!(err) - } - } + if settings.follow { + follow(reader, settings); } } From 161f96dc8c901c2a0fecc67ac5b4f97fff8b8fa2 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Sat, 19 Mar 2016 22:17:40 -0700 Subject: [PATCH 3/4] tests/tail: Rename tail test fixture contents to be easier to read The repetition of "foo" and "bar" made for difficult-to-read assertion failures when hacking on `tail`. I think that having each line have unique contents makes it a bit easier to parse. --- tests/fixtures/tail/foobar.txt | 22 +++++++++---------- .../tail/foobar_single_default.expected | 20 ++++++++--------- .../tail/foobar_stdin_default.expected | 20 ++++++++--------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/fixtures/tail/foobar.txt b/tests/fixtures/tail/foobar.txt index 32233d4a7..6e1f36ddc 100644 --- a/tests/fixtures/tail/foobar.txt +++ b/tests/fixtures/tail/foobar.txt @@ -1,11 +1,11 @@ -baz -foo -bar -foo -bar -foo -bar -foo -bar -foo -bar +uno +dos +tres +quattro +cinco +seis +siette +ocho +nueve +diez +once diff --git a/tests/fixtures/tail/foobar_single_default.expected b/tests/fixtures/tail/foobar_single_default.expected index af4f1f055..8eacac3b5 100644 --- a/tests/fixtures/tail/foobar_single_default.expected +++ b/tests/fixtures/tail/foobar_single_default.expected @@ -1,10 +1,10 @@ -foo -bar -foo -bar -foo -bar -foo -bar -foo -bar +dos +tres +quattro +cinco +seis +siette +ocho +nueve +diez +once diff --git a/tests/fixtures/tail/foobar_stdin_default.expected b/tests/fixtures/tail/foobar_stdin_default.expected index af4f1f055..8eacac3b5 100644 --- a/tests/fixtures/tail/foobar_stdin_default.expected +++ b/tests/fixtures/tail/foobar_stdin_default.expected @@ -1,10 +1,10 @@ -foo -bar -foo -bar -foo -bar -foo -bar -foo -bar +dos +tres +quattro +cinco +seis +siette +ocho +nueve +diez +once From d7974c56a05542ba7b18c9ad6db170f2ebf6dff7 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Sat, 19 Mar 2016 22:25:53 -0700 Subject: [PATCH 4/4] tests/tail: Add a test for tail'ing large files This tests both large files and iterating backwards through the file when we need to search backwards further than our BUFFER_SIZE. --- tests/tail.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/tail.rs b/tests/tail.rs index 8b9dc94a3..b83f8848e 100644 --- a/tests/tail.rs +++ b/tests/tail.rs @@ -1,3 +1,5 @@ +use std::io::Write; + #[macro_use] mod common; @@ -7,6 +9,9 @@ static UTIL_NAME: &'static str = "tail"; static INPUT: &'static str = "foobar.txt"; +static BIG: &'static str = "big.txt"; + +static BIG_EXPECTED: &'static str = "big_single_big_args.expected"; #[test] fn test_stdin_default() { @@ -21,3 +26,34 @@ fn test_single_default() { let result = ucmd.arg(INPUT).run(); assert_eq!(result.stdout, at.read("foobar_single_default.expected")); } + +const BIG_LINES: usize = 1_000_000; +const BIG_N_ARG: usize = 100_000; + +fn generate_big_test_files(at: &AtPath) { + let mut big_input = at.make_file(BIG); + for i in 0..BIG_LINES { + write!(&mut big_input, "Line {}\n", i).expect("Could not write to BIG file"); + } + big_input.flush().expect("Could not flush BIG file"); + + let mut big_expected = at.make_file(BIG_EXPECTED); + for i in (BIG_LINES - BIG_N_ARG)..BIG_LINES { + write!(&mut big_expected, "Line {}\n", i).expect("Could not write to BIG_EXPECTED file"); + } + big_expected.flush().expect("Could not flush BIG_EXPECTED file"); +} + +fn cleanup_big_test_files(at: &AtPath) { + at.cleanup(BIG); + at.cleanup(BIG_EXPECTED); +} + +#[test] +fn test_single_big_args() { + let (at, mut ucmd) = testing(UTIL_NAME); + generate_big_test_files(&at); + let result = ucmd.arg(BIG).arg("-n").arg(format!("{}", BIG_N_ARG)).run(); + assert_eq!(result.stdout, at.read(BIG_EXPECTED)); + cleanup_big_test_files(&at); +}