mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
join: guess the number of fields in each line
This lets us use fewer reallocations when parsing each line. The current guess is set to the maximum number of fields seen in any line so far. This is a free performance win in the common case where every line has the same number of fields. It comes with some memory overhead when a line with many fields appears near the beginning of the file and later lines have fewer, but each of these lines is typically not kept for very long anyway.
This commit is contained in:
parent
f33e058a5a
commit
ac9d006886
1 changed file with 9 additions and 3 deletions
|
@ -283,8 +283,8 @@ struct Line {
|
|||
}
|
||||
|
||||
impl Line {
|
||||
fn new(string: Vec<u8>, separator: Sep) -> Self {
|
||||
let mut field_ranges = Vec::new();
|
||||
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
|
||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||
let mut last_end = 0;
|
||||
if separator == Sep::Whitespaces {
|
||||
// GNU join uses Bourne shell field splitters by default
|
||||
|
@ -325,6 +325,7 @@ struct State<'a> {
|
|||
file_num: FileNum,
|
||||
print_unpaired: bool,
|
||||
lines: Split<Box<dyn BufRead + 'a>>,
|
||||
max_len: usize,
|
||||
seq: Vec<Line>,
|
||||
line_num: usize,
|
||||
has_failed: bool,
|
||||
|
@ -355,6 +356,7 @@ impl<'a> State<'a> {
|
|||
file_num,
|
||||
print_unpaired,
|
||||
lines: f.split(line_ending as u8),
|
||||
max_len: 1,
|
||||
seq: Vec::new(),
|
||||
line_num: 0,
|
||||
has_failed: false,
|
||||
|
@ -517,7 +519,11 @@ impl<'a> State<'a> {
|
|||
match self.lines.next() {
|
||||
Some(value) => {
|
||||
self.line_num += 1;
|
||||
Ok(Some(Line::new(value?, sep)))
|
||||
let line = Line::new(value?, sep, self.max_len);
|
||||
if line.field_ranges.len() > self.max_len {
|
||||
self.max_len = line.field_ranges.len();
|
||||
}
|
||||
Ok(Some(line))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue