mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 20:17:45 +00:00
Merge pull request #2705 from adamreichold/tac-mmap
Minor improvements to tac
This commit is contained in:
commit
40a895f79d
4 changed files with 113 additions and 30 deletions
10
Cargo.lock
generated
10
Cargo.lock
generated
|
@ -1064,6 +1064,15 @@ version = "2.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
|
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memmap2"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4647a11b578fead29cdbb34d4adef8dd3dc35b876c9c6d5240d83f205abfe96e"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memoffset"
|
name = "memoffset"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
|
@ -3025,6 +3034,7 @@ version = "0.0.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"memchr 2.4.0",
|
"memchr 2.4.0",
|
||||||
|
"memmap2",
|
||||||
"regex",
|
"regex",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
|
|
25
src/uu/tac/BENCHMARKING.md
Normal file
25
src/uu/tac/BENCHMARKING.md
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
## Benchmarking `tac`
|
||||||
|
|
||||||
|
<!-- spell-checker:ignore wikidatawiki -->
|
||||||
|
|
||||||
|
`tac` is often used to process log files in reverse chronological order, i.e. from newer towards older entries. In this case, the performance target is to yield results as fast as possible, i.e. without reading in the whole file that is to be reversed line-by-line. Therefore, a sensible benchmark is to read a large log file containing N lines and measure how long it takes to produce the last K lines from that file.
|
||||||
|
|
||||||
|
Large text files can for example be found in the [Wikipedia database dumps](https://dumps.wikimedia.org/wikidatawiki/latest/), usually sized at multiple gigabytes and comprising more than 100M lines.
|
||||||
|
|
||||||
|
After you have obtained and uncompressed such a file, you need to build `tac` in release mode
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ cargo build --release --package uu_tac
|
||||||
|
```
|
||||||
|
|
||||||
|
and then you can time how long it takes to extract the last 10M lines by running
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ /usr/bin/time ./target/release/tac wikidatawiki-20211001-pages-logging.xml | head -n10000000 >/dev/null
|
||||||
|
```
|
||||||
|
|
||||||
|
For more systematic measurements that include warm-ups, repetitions and comparisons, [Hyperfine](https://github.com/sharkdp/hyperfine) can be helpful. For example, to compare this implementation to the one provided by your distribution, run
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ hyperfine "./target/release/tac wikidatawiki-20211001-pages-logging.xml | head -n10000000 >/dev/null" "/usr/bin/tac wikidatawiki-20211001-pages-logging.xml | head -n10000000 >/dev/null"
|
||||||
|
```
|
|
@ -1,3 +1,5 @@
|
||||||
|
# spell-checker:ignore memmap
|
||||||
|
|
||||||
[package]
|
[package]
|
||||||
name = "uu_tac"
|
name = "uu_tac"
|
||||||
version = "0.0.7"
|
version = "0.0.7"
|
||||||
|
@ -16,6 +18,7 @@ path = "src/tac.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
memchr = "2"
|
memchr = "2"
|
||||||
|
memmap2 = "0.5"
|
||||||
regex = "1"
|
regex = "1"
|
||||||
clap = { version = "2.33", features = ["wrap_help"] }
|
clap = { version = "2.33", features = ["wrap_help"] }
|
||||||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
||||||
|
|
|
@ -5,15 +5,19 @@
|
||||||
// * For the full copyright and license information, please view the LICENSE
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
// * file that was distributed with this source code.
|
// * file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (ToDO) sbytes slen dlen memmem
|
// spell-checker:ignore (ToDO) sbytes slen dlen memmem memmap Mmap mmap SIGBUS
|
||||||
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate uucore;
|
extern crate uucore;
|
||||||
|
|
||||||
use clap::{crate_version, App, Arg};
|
use clap::{crate_version, App, Arg};
|
||||||
use memchr::memmem;
|
use memchr::memmem;
|
||||||
use std::io::{stdin, stdout, BufReader, Read, Write};
|
use memmap2::Mmap;
|
||||||
use std::{fs::File, path::Path};
|
use std::io::{stdin, stdout, BufWriter, Read, Write};
|
||||||
|
use std::{
|
||||||
|
fs::{read, File},
|
||||||
|
path::Path,
|
||||||
|
};
|
||||||
use uucore::display::Quotable;
|
use uucore::display::Quotable;
|
||||||
use uucore::InvalidEncodingHandling;
|
use uucore::InvalidEncodingHandling;
|
||||||
|
|
||||||
|
@ -44,9 +48,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
raw_separator
|
raw_separator
|
||||||
};
|
};
|
||||||
|
|
||||||
let files: Vec<String> = match matches.values_of(options::FILE) {
|
let files: Vec<&str> = match matches.values_of(options::FILE) {
|
||||||
Some(v) => v.map(|v| v.to_owned()).collect(),
|
Some(v) => v.collect(),
|
||||||
None => vec!["-".to_owned()],
|
None => vec!["-"],
|
||||||
};
|
};
|
||||||
|
|
||||||
tac(files, before, regex, separator)
|
tac(files, before, regex, separator)
|
||||||
|
@ -102,10 +106,11 @@ pub fn uu_app() -> App<'static, 'static> {
|
||||||
/// returns [`std::io::Error`].
|
/// returns [`std::io::Error`].
|
||||||
fn buffer_tac_regex(
|
fn buffer_tac_regex(
|
||||||
data: &[u8],
|
data: &[u8],
|
||||||
pattern: regex::bytes::Regex,
|
pattern: &regex::bytes::Regex,
|
||||||
before: bool,
|
before: bool,
|
||||||
) -> std::io::Result<()> {
|
) -> std::io::Result<()> {
|
||||||
let mut out = stdout();
|
let out = stdout();
|
||||||
|
let mut out = BufWriter::new(out.lock());
|
||||||
|
|
||||||
// The index of the line separator for the current line.
|
// The index of the line separator for the current line.
|
||||||
//
|
//
|
||||||
|
@ -171,7 +176,8 @@ fn buffer_tac_regex(
|
||||||
/// `separator` appears at the beginning of each line, as in
|
/// `separator` appears at the beginning of each line, as in
|
||||||
/// `"/abc/def"`.
|
/// `"/abc/def"`.
|
||||||
fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> {
|
fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> {
|
||||||
let mut out = stdout();
|
let out = stdout();
|
||||||
|
let mut out = BufWriter::new(out.lock());
|
||||||
|
|
||||||
// The number of bytes in the line separator.
|
// The number of bytes in the line separator.
|
||||||
let slen = separator.as_bytes().len();
|
let slen = separator.as_bytes().len();
|
||||||
|
@ -208,12 +214,33 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tac(filenames: Vec<String>, before: bool, regex: bool, separator: &str) -> i32 {
|
fn tac(filenames: Vec<&str>, before: bool, regex: bool, separator: &str) -> i32 {
|
||||||
let mut exit_code = 0;
|
let mut exit_code = 0;
|
||||||
|
|
||||||
for filename in &filenames {
|
let pattern = if regex {
|
||||||
let mut file = BufReader::new(if filename == "-" {
|
Some(crash_if_err!(1, regex::bytes::Regex::new(separator)))
|
||||||
Box::new(stdin()) as Box<dyn Read>
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
for &filename in &filenames {
|
||||||
|
let mmap;
|
||||||
|
let buf;
|
||||||
|
|
||||||
|
let data: &[u8] = if filename == "-" {
|
||||||
|
if let Some(mmap1) = try_mmap_stdin() {
|
||||||
|
mmap = mmap1;
|
||||||
|
&mmap
|
||||||
|
} else {
|
||||||
|
let mut buf1 = Vec::new();
|
||||||
|
if let Err(e) = stdin().read_to_end(&mut buf1) {
|
||||||
|
show_error!("failed to read from stdin: {}", e);
|
||||||
|
exit_code = 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
buf = buf1;
|
||||||
|
&buf
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
let path = Path::new(filename);
|
let path = Path::new(filename);
|
||||||
if path.is_dir() || path.metadata().is_err() {
|
if path.is_dir() || path.metadata().is_err() {
|
||||||
|
@ -228,29 +255,47 @@ fn tac(filenames: Vec<String>, before: bool, regex: bool, separator: &str) -> i3
|
||||||
exit_code = 1;
|
exit_code = 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
match File::open(path) {
|
|
||||||
Ok(f) => Box::new(f) as Box<dyn Read>,
|
if let Some(mmap1) = try_mmap_path(path) {
|
||||||
Err(e) => {
|
mmap = mmap1;
|
||||||
show_error!("failed to open {} for reading: {}", filename.quote(), e);
|
&mmap
|
||||||
exit_code = 1;
|
} else {
|
||||||
continue;
|
match read(path) {
|
||||||
|
Ok(buf1) => {
|
||||||
|
buf = buf1;
|
||||||
|
&buf
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
show_error!("failed to read {}: {}", filename.quote(), e);
|
||||||
|
exit_code = 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
|
||||||
|
|
||||||
let mut data = Vec::new();
|
|
||||||
if let Err(e) = file.read_to_end(&mut data) {
|
|
||||||
show_error!("failed to read {}: {}", filename.quote(), e);
|
|
||||||
exit_code = 1;
|
|
||||||
continue;
|
|
||||||
};
|
};
|
||||||
if regex {
|
|
||||||
let pattern = crash_if_err!(1, regex::bytes::Regex::new(separator));
|
if let Some(pattern) = &pattern {
|
||||||
buffer_tac_regex(&data, pattern, before)
|
buffer_tac_regex(data, pattern, before)
|
||||||
} else {
|
} else {
|
||||||
buffer_tac(&data, before, separator)
|
buffer_tac(data, before, separator)
|
||||||
}
|
}
|
||||||
.unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
|
.unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
|
||||||
}
|
}
|
||||||
exit_code
|
exit_code
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn try_mmap_stdin() -> Option<Mmap> {
|
||||||
|
// SAFETY: If the file is truncated while we map it, SIGBUS will be raised
|
||||||
|
// and our process will be terminated, thus preventing access of invalid memory.
|
||||||
|
unsafe { Mmap::map(&stdin()).ok() }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_mmap_path(path: &Path) -> Option<Mmap> {
|
||||||
|
let file = File::open(path).ok()?;
|
||||||
|
|
||||||
|
// SAFETY: If the file is truncated while we map it, SIGBUS will be raised
|
||||||
|
// and our process will be terminated, thus preventing access of invalid memory.
|
||||||
|
let mmap = unsafe { Mmap::map(&file).ok()? };
|
||||||
|
|
||||||
|
Some(mmap)
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue