From ac9d0068861e97bc892bb8fe5608b3f4a2012d5f Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Sun, 6 Feb 2022 02:22:54 -0500 Subject: [PATCH] join: guess the number of fields in each line This lets us use fewer reallocations when parsing each line. The current guess is set to the maximum number of fields seen in a line so far. This is a free performance win in the common case where each line has the same number of fields, but comes with some memory overhead in the case where there is a line with lots of fields at the beginning of the file and fewer in later lines. However, each of these lines is typically not kept for very long anyway. --- src/uu/join/src/join.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/uu/join/src/join.rs b/src/uu/join/src/join.rs index dcd699438..861d1684a 100644 --- a/src/uu/join/src/join.rs +++ b/src/uu/join/src/join.rs @@ -283,8 +283,8 @@ struct Line { } impl Line { - fn new(string: Vec, separator: Sep) -> Self { - let mut field_ranges = Vec::new(); + fn new(string: Vec, separator: Sep, len_guess: usize) -> Self { + let mut field_ranges = Vec::with_capacity(len_guess); let mut last_end = 0; if separator == Sep::Whitespaces { // GNU join uses Bourne shell field splitters by default @@ -325,6 +325,7 @@ struct State<'a> { file_num: FileNum, print_unpaired: bool, lines: Split>, + max_len: usize, seq: Vec, line_num: usize, has_failed: bool, @@ -355,6 +356,7 @@ impl<'a> State<'a> { file_num, print_unpaired, lines: f.split(line_ending as u8), + max_len: 1, seq: Vec::new(), line_num: 0, has_failed: false, @@ -517,7 +519,11 @@ impl<'a> State<'a> { match self.lines.next() { Some(value) => { self.line_num += 1; - Ok(Some(Line::new(value?, sep))) + let line = Line::new(value?, sep, self.max_len); + if line.field_ranges.len() > self.max_len { + self.max_len = line.field_ranges.len(); + } + Ok(Some(line)) } None => Ok(None), }