1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-29 12:07:46 +00:00

join: faster field parsing and representation

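Each field is now stored as a (start, end) index range into the line instead
of as its own Vec<u8>, so the line no longer has to be copied to store its
fields. Index ranges are used rather than slices because a slice borrowed from
the line would make Line a self-referential struct. Locating separators with
memchr also empirically saves a lot of intermediate allocations.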
This commit is contained in:
Justin Tracey 2022-02-06 02:17:25 -05:00
parent e6f59b12f7
commit f33e058a5a
3 changed files with 37 additions and 23 deletions

1
Cargo.lock generated
View file

@ -2444,6 +2444,7 @@ name = "uu_join"
version = "0.0.12" version = "0.0.12"
dependencies = [ dependencies = [
"clap 3.0.10", "clap 3.0.10",
"memchr 2.4.1",
"uucore", "uucore",
] ]

View file

@ -17,6 +17,7 @@ path = "src/join.rs"
[dependencies] [dependencies]
clap = { version = "3.0", features = ["wrap_help", "cargo"] } clap = { version = "3.0", features = ["wrap_help", "cargo"] }
uucore = { version=">=0.0.11", package="uucore", path="../../uucore" } uucore = { version=">=0.0.11", package="uucore", path="../../uucore" }
memchr = "2"
[[bin]] [[bin]]
name = "join" name = "join"

View file

@ -11,6 +11,7 @@
extern crate uucore; extern crate uucore;
use clap::{crate_version, App, AppSettings, Arg}; use clap::{crate_version, App, AppSettings, Arg};
use memchr::{memchr3_iter, memchr_iter};
use std::cmp::Ordering; use std::cmp::Ordering;
use std::convert::From; use std::convert::From;
use std::error::Error; use std::error::Error;
@ -66,7 +67,7 @@ enum LineEnding {
Newline = b'\n', Newline = b'\n',
} }
#[derive(Copy, Clone)] #[derive(Copy, Clone, PartialEq)]
enum Sep { enum Sep {
Char(u8), Char(u8),
Line, Line,
@ -147,7 +148,7 @@ impl<'a> Repr<'a> {
fn print_field( fn print_field(
&self, &self,
writer: &mut impl Write, writer: &mut impl Write,
field: Option<&Vec<u8>>, field: Option<&[u8]>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
let value = match field { let value = match field {
Some(field) => field, Some(field) => field,
@ -164,10 +165,10 @@ impl<'a> Repr<'a> {
line: &Line, line: &Line,
index: usize, index: usize,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
for i in 0..line.fields.len() { for i in 0..line.field_ranges.len() {
if i != index { if i != index {
writer.write_all(&[self.separator])?; writer.write_all(&[self.separator])?;
writer.write_all(&line.fields[i])?; writer.write_all(line.get_field(i).unwrap())?;
} }
} }
Ok(()) Ok(())
@ -176,7 +177,7 @@ impl<'a> Repr<'a> {
/// Print each field or the empty filler if the field is not set. /// Print each field or the empty filler if the field is not set.
fn print_format<F>(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error> fn print_format<F>(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error>
where where
F: Fn(&Spec) -> Option<&'a Vec<u8>>, F: Fn(&Spec) -> Option<&'a [u8]>,
{ {
for i in 0..self.format.len() { for i in 0..self.format.len() {
if i > 0 { if i > 0 {
@ -214,7 +215,7 @@ impl Input {
} }
} }
fn compare(&self, field1: Option<&Vec<u8>>, field2: Option<&Vec<u8>>) -> Ordering { fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering {
if let (Some(field1), Some(field2)) = (field1, field2) { if let (Some(field1), Some(field2)) = (field1, field2) {
if self.ignore_case { if self.ignore_case {
field1 field1
@ -277,30 +278,41 @@ impl Spec {
} }
struct Line { struct Line {
fields: Vec<Vec<u8>>, field_ranges: Vec<(usize, usize)>,
string: Vec<u8>, string: Vec<u8>,
} }
impl Line { impl Line {
fn new(string: Vec<u8>, separator: Sep) -> Self { fn new(string: Vec<u8>, separator: Sep) -> Self {
let fields = match separator { let mut field_ranges = Vec::new();
Sep::Whitespaces => string let mut last_end = 0;
// GNU join uses Bourne shell field splitters by default if separator == Sep::Whitespaces {
.split(|c| matches!(*c, b' ' | b'\t' | b'\n')) // GNU join uses Bourne shell field splitters by default
.filter(|f| !f.is_empty()) for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
.map(Vec::from) if i > last_end {
.collect(), field_ranges.push((last_end, i));
Sep::Char(sep) => string.split(|c| *c == sep).map(Vec::from).collect(), }
Sep::Line => vec![string.clone()], last_end = i + 1;
}; }
} else if let Sep::Char(sep) = separator {
for i in memchr_iter(sep, &string) {
field_ranges.push((last_end, i));
last_end = i + 1;
}
}
field_ranges.push((last_end, string.len()));
Self { fields, string } Self {
field_ranges,
string,
}
} }
/// Get field at index. /// Get field at index.
fn get_field(&self, index: usize) -> Option<&Vec<u8>> { fn get_field(&self, index: usize) -> Option<&[u8]> {
if index < self.fields.len() { if index < self.field_ranges.len() {
Some(&self.fields[index]) let (low, high) = self.field_ranges[index];
Some(&self.string[low..high])
} else { } else {
None None
} }
@ -470,7 +482,7 @@ impl<'a> State<'a> {
self.seq.push(line); self.seq.push(line);
if autoformat { if autoformat {
return self.seq[0].fields.len(); return self.seq[0].field_ranges.len();
} }
} }
0 0
@ -547,7 +559,7 @@ impl<'a> State<'a> {
} }
/// Gets the key value of the lines stored in seq. /// Gets the key value of the lines stored in seq.
fn get_current_key(&self) -> Option<&Vec<u8>> { fn get_current_key(&self) -> Option<&[u8]> {
self.seq[0].get_field(self.key) self.seq[0].get_field(self.key)
} }