mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 12:07:46 +00:00
join: faster field parsing and representation
Storing each field as a pair of indexes into the line, rather than as its own Vec<u8>, means we no longer have to copy the line's bytes to store the fields. (Indexes are used instead of slices because slices borrowed from the line would make the struct self-referential.) Using memchr to locate the separators also empirically saves a lot of intermediate allocations.
This commit is contained in:
parent
e6f59b12f7
commit
f33e058a5a
3 changed files with 37 additions and 23 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2444,6 +2444,7 @@ name = "uu_join"
|
||||||
version = "0.0.12"
|
version = "0.0.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap 3.0.10",
|
"clap 3.0.10",
|
||||||
|
"memchr 2.4.1",
|
||||||
"uucore",
|
"uucore",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ path = "src/join.rs"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "3.0", features = ["wrap_help", "cargo"] }
|
clap = { version = "3.0", features = ["wrap_help", "cargo"] }
|
||||||
uucore = { version=">=0.0.11", package="uucore", path="../../uucore" }
|
uucore = { version=">=0.0.11", package="uucore", path="../../uucore" }
|
||||||
|
memchr = "2"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "join"
|
name = "join"
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
extern crate uucore;
|
extern crate uucore;
|
||||||
|
|
||||||
use clap::{crate_version, App, AppSettings, Arg};
|
use clap::{crate_version, App, AppSettings, Arg};
|
||||||
|
use memchr::{memchr3_iter, memchr_iter};
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::convert::From;
|
use std::convert::From;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
@ -66,7 +67,7 @@ enum LineEnding {
|
||||||
Newline = b'\n',
|
Newline = b'\n',
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone, PartialEq)]
|
||||||
enum Sep {
|
enum Sep {
|
||||||
Char(u8),
|
Char(u8),
|
||||||
Line,
|
Line,
|
||||||
|
@ -147,7 +148,7 @@ impl<'a> Repr<'a> {
|
||||||
fn print_field(
|
fn print_field(
|
||||||
&self,
|
&self,
|
||||||
writer: &mut impl Write,
|
writer: &mut impl Write,
|
||||||
field: Option<&Vec<u8>>,
|
field: Option<&[u8]>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let value = match field {
|
let value = match field {
|
||||||
Some(field) => field,
|
Some(field) => field,
|
||||||
|
@ -164,10 +165,10 @@ impl<'a> Repr<'a> {
|
||||||
line: &Line,
|
line: &Line,
|
||||||
index: usize,
|
index: usize,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
for i in 0..line.fields.len() {
|
for i in 0..line.field_ranges.len() {
|
||||||
if i != index {
|
if i != index {
|
||||||
writer.write_all(&[self.separator])?;
|
writer.write_all(&[self.separator])?;
|
||||||
writer.write_all(&line.fields[i])?;
|
writer.write_all(line.get_field(i).unwrap())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -176,7 +177,7 @@ impl<'a> Repr<'a> {
|
||||||
/// Print each field or the empty filler if the field is not set.
|
/// Print each field or the empty filler if the field is not set.
|
||||||
fn print_format<F>(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error>
|
fn print_format<F>(&self, writer: &mut impl Write, f: F) -> Result<(), std::io::Error>
|
||||||
where
|
where
|
||||||
F: Fn(&Spec) -> Option<&'a Vec<u8>>,
|
F: Fn(&Spec) -> Option<&'a [u8]>,
|
||||||
{
|
{
|
||||||
for i in 0..self.format.len() {
|
for i in 0..self.format.len() {
|
||||||
if i > 0 {
|
if i > 0 {
|
||||||
|
@ -214,7 +215,7 @@ impl Input {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compare(&self, field1: Option<&Vec<u8>>, field2: Option<&Vec<u8>>) -> Ordering {
|
fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering {
|
||||||
if let (Some(field1), Some(field2)) = (field1, field2) {
|
if let (Some(field1), Some(field2)) = (field1, field2) {
|
||||||
if self.ignore_case {
|
if self.ignore_case {
|
||||||
field1
|
field1
|
||||||
|
@ -277,30 +278,41 @@ impl Spec {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Line {
|
struct Line {
|
||||||
fields: Vec<Vec<u8>>,
|
field_ranges: Vec<(usize, usize)>,
|
||||||
string: Vec<u8>,
|
string: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Line {
|
impl Line {
|
||||||
fn new(string: Vec<u8>, separator: Sep) -> Self {
|
fn new(string: Vec<u8>, separator: Sep) -> Self {
|
||||||
let fields = match separator {
|
let mut field_ranges = Vec::new();
|
||||||
Sep::Whitespaces => string
|
let mut last_end = 0;
|
||||||
// GNU join uses Bourne shell field splitters by default
|
if separator == Sep::Whitespaces {
|
||||||
.split(|c| matches!(*c, b' ' | b'\t' | b'\n'))
|
// GNU join uses Bourne shell field splitters by default
|
||||||
.filter(|f| !f.is_empty())
|
for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
|
||||||
.map(Vec::from)
|
if i > last_end {
|
||||||
.collect(),
|
field_ranges.push((last_end, i));
|
||||||
Sep::Char(sep) => string.split(|c| *c == sep).map(Vec::from).collect(),
|
}
|
||||||
Sep::Line => vec![string.clone()],
|
last_end = i + 1;
|
||||||
};
|
}
|
||||||
|
} else if let Sep::Char(sep) = separator {
|
||||||
|
for i in memchr_iter(sep, &string) {
|
||||||
|
field_ranges.push((last_end, i));
|
||||||
|
last_end = i + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
field_ranges.push((last_end, string.len()));
|
||||||
|
|
||||||
Self { fields, string }
|
Self {
|
||||||
|
field_ranges,
|
||||||
|
string,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get field at index.
|
/// Get field at index.
|
||||||
fn get_field(&self, index: usize) -> Option<&Vec<u8>> {
|
fn get_field(&self, index: usize) -> Option<&[u8]> {
|
||||||
if index < self.fields.len() {
|
if index < self.field_ranges.len() {
|
||||||
Some(&self.fields[index])
|
let (low, high) = self.field_ranges[index];
|
||||||
|
Some(&self.string[low..high])
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
@ -470,7 +482,7 @@ impl<'a> State<'a> {
|
||||||
self.seq.push(line);
|
self.seq.push(line);
|
||||||
|
|
||||||
if autoformat {
|
if autoformat {
|
||||||
return self.seq[0].fields.len();
|
return self.seq[0].field_ranges.len();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
0
|
0
|
||||||
|
@ -547,7 +559,7 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets the key value of the lines stored in seq.
|
/// Gets the key value of the lines stored in seq.
|
||||||
fn get_current_key(&self) -> Option<&Vec<u8>> {
|
fn get_current_key(&self) -> Option<&[u8]> {
|
||||||
self.seq[0].get_field(self.key)
|
self.seq[0].get_field(self.key)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue