1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

join: guess the number of fields in each line

This lets us use fewer reallocations when parsing each line.
The current guess is set to the maximum number of fields seen in a line so
far. This is a free performance win in the common case where each line has
the same number of fields, but comes with some memory overhead in the case
where there is a line with lots of fields at the beginning of the file, and
fewer later. However, each of these lines is typically not kept for very
long anyway.
This commit is contained in:
Justin Tracey 2022-02-06 02:22:54 -05:00
parent f33e058a5a
commit ac9d006886

View file

@ -283,8 +283,8 @@ struct Line {
}
impl Line {
fn new(string: Vec<u8>, separator: Sep) -> Self {
let mut field_ranges = Vec::new();
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
let mut field_ranges = Vec::with_capacity(len_guess);
let mut last_end = 0;
if separator == Sep::Whitespaces {
// GNU join uses Bourne shell field splitters by default
@ -325,6 +325,7 @@ struct State<'a> {
file_num: FileNum,
print_unpaired: bool,
lines: Split<Box<dyn BufRead + 'a>>,
max_len: usize,
seq: Vec<Line>,
line_num: usize,
has_failed: bool,
@ -355,6 +356,7 @@ impl<'a> State<'a> {
file_num,
print_unpaired,
lines: f.split(line_ending as u8),
max_len: 1,
seq: Vec::new(),
line_num: 0,
has_failed: false,
@ -517,7 +519,11 @@ impl<'a> State<'a> {
match self.lines.next() {
Some(value) => {
self.line_num += 1;
Ok(Some(Line::new(value?, sep)))
let line = Line::new(value?, sep, self.max_len);
if line.field_ranges.len() > self.max_len {
self.max_len = line.field_ranges.len();
}
Ok(Some(line))
}
None => Ok(None),
}