From f33e058a5a3af79ee5ca7654aa2a0f372b75ae0a Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Sun, 6 Feb 2022 02:17:25 -0500 Subject: [PATCH] join: faster field parsing and representation Using indexes into the line instead of Vecs means we don't have to copy the line to store the fields (indexes instead of slices because it avoids self-referential structs). Using memchr also empirically saves a lot of intermediate allocations. --- Cargo.lock | 1 + src/uu/join/Cargo.toml | 1 + src/uu/join/src/join.rs | 58 +++++++++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cc7c3967b..b1c374651 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2444,6 +2444,7 @@ name = "uu_join" version = "0.0.12" dependencies = [ "clap 3.0.10", + "memchr 2.4.1", "uucore", ] diff --git a/src/uu/join/Cargo.toml b/src/uu/join/Cargo.toml index b4150b4be..2a03fe002 100644 --- a/src/uu/join/Cargo.toml +++ b/src/uu/join/Cargo.toml @@ -17,6 +17,7 @@ path = "src/join.rs" [dependencies] clap = { version = "3.0", features = ["wrap_help", "cargo"] } uucore = { version=">=0.0.11", package="uucore", path="../../uucore" } +memchr = "2" [[bin]] name = "join" diff --git a/src/uu/join/src/join.rs b/src/uu/join/src/join.rs index cb953c133..dcd699438 100644 --- a/src/uu/join/src/join.rs +++ b/src/uu/join/src/join.rs @@ -11,6 +11,7 @@ extern crate uucore; use clap::{crate_version, App, AppSettings, Arg}; +use memchr::{memchr3_iter, memchr_iter}; use std::cmp::Ordering; use std::convert::From; use std::error::Error; @@ -66,7 +67,7 @@ enum LineEnding { Newline = b'\n', } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, PartialEq)] enum Sep { Char(u8), Line, @@ -147,7 +148,7 @@ impl<'a> Repr<'a> { fn print_field( &self, writer: &mut impl Write, - field: Option<&Vec>, + field: Option<&[u8]>, ) -> Result<(), std::io::Error> { let value = match field { Some(field) => field, @@ -164,10 +165,10 @@ impl<'a> Repr<'a> { line: &Line, index: usize, ) -> Result<(), std::io::Error> { - for i in 0..line.fields.len() { + for i in 0..line.field_ranges.len() { if i != index { writer.write_all(&[self.separator])?; - writer.write_all(&line.fields[i])?; + writer.write_all(line.get_field(i).unwrap())?; } } Ok(()) @@ -176,7 +177,7 @@ impl<'a> Repr<'a> { /// Print each field or the empty filler if the field is not set. fn print_format(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error> where - F: Fn(&Spec) -> Option<&'a Vec>, + F: Fn(&Spec) -> Option<&'a [u8]>, { for i in 0..self.format.len() { if i > 0 { @@ -214,7 +215,7 @@ impl Input { } } - fn compare(&self, field1: Option<&Vec>, field2: Option<&Vec>) -> Ordering { + fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering { if let (Some(field1), Some(field2)) = (field1, field2) { if self.ignore_case { field1 @@ -277,30 +278,41 @@ impl Spec { } struct Line { - fields: Vec>, + field_ranges: Vec<(usize, usize)>, string: Vec, } impl Line { fn new(string: Vec, separator: Sep) -> Self { - let fields = match separator { - Sep::Whitespaces => string - // GNU join uses Bourne shell field splitters by default - .split(|c| matches!(*c, b' ' | b'\t' | b'\n')) - .filter(|f| !f.is_empty()) - .map(Vec::from) - .collect(), - Sep::Char(sep) => string.split(|c| *c == sep).map(Vec::from).collect(), - Sep::Line => vec![string.clone()], - }; + let mut field_ranges = Vec::new(); + let mut last_end = 0; + if separator == Sep::Whitespaces { + // GNU join uses Bourne shell field splitters by default + for i in memchr3_iter(b' ', b'\t', b'\n', &string) { + if i > last_end { + field_ranges.push((last_end, i)); + } + last_end = i + 1; + } + } else if let Sep::Char(sep) = separator { + for i in memchr_iter(sep, &string) { + field_ranges.push((last_end, i)); + last_end = i + 1; + } + } + field_ranges.push((last_end, string.len())); - Self { fields, string } + Self { + field_ranges, + string, + } } /// Get field at index. - fn get_field(&self, index: usize) -> Option<&Vec> { - if index < self.fields.len() { - Some(&self.fields[index]) + fn get_field(&self, index: usize) -> Option<&[u8]> { + if index < self.field_ranges.len() { + let (low, high) = self.field_ranges[index]; + Some(&self.string[low..high]) } else { None } @@ -470,7 +482,7 @@ impl<'a> State<'a> { self.seq.push(line); if autoformat { - return self.seq[0].fields.len(); + return self.seq[0].field_ranges.len(); } } 0 @@ -547,7 +559,7 @@ impl<'a> State<'a> { } /// Gets the key value of the lines stored in seq. - fn get_current_key(&self) -> Option<&Vec> { + fn get_current_key(&self) -> Option<&[u8]> { self.seq[0].get_field(self.key) }