1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 11:07:44 +00:00

Merge pull request #2851 from jtracey/join-strless

join: operate on bytes instead of Strings
This commit is contained in:
Sylvestre Ledru 2022-01-16 16:24:38 +01:00 committed by GitHub
commit 00c11b184f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 105 additions and 60 deletions

View file

@ -13,7 +13,7 @@ extern crate uucore;
use clap::{crate_version, App, Arg}; use clap::{crate_version, App, Arg};
use std::cmp::Ordering; use std::cmp::Ordering;
use std::fs::File; use std::fs::File;
use std::io::{stdin, BufRead, BufReader, Lines, Stdin}; use std::io::{stdin, stdout, BufRead, BufReader, Split, Stdin, Write};
use uucore::display::Quotable; use uucore::display::Quotable;
use uucore::error::{set_exit_code, UResult, USimpleError}; use uucore::error::{set_exit_code, UResult, USimpleError};
@ -27,7 +27,7 @@ enum FileNum {
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
enum Sep { enum Sep {
Char(char), Char(u8),
Line, Line,
Whitespaces, Whitespaces,
} }
@ -49,7 +49,7 @@ struct Settings {
separator: Sep, separator: Sep,
autoformat: bool, autoformat: bool,
format: Vec<Spec>, format: Vec<Spec>,
empty: String, empty: Vec<u8>,
check_order: CheckOrder, check_order: CheckOrder,
headers: bool, headers: bool,
} }
@ -66,7 +66,7 @@ impl Default for Settings {
separator: Sep::Whitespaces, separator: Sep::Whitespaces,
autoformat: false, autoformat: false,
format: vec![], format: vec![],
empty: String::new(), empty: vec![],
check_order: CheckOrder::Default, check_order: CheckOrder::Default,
headers: false, headers: false,
} }
@ -75,13 +75,13 @@ impl Default for Settings {
/// Output representation. /// Output representation.
struct Repr<'a> { struct Repr<'a> {
separator: char, separator: u8,
format: &'a [Spec], format: &'a [Spec],
empty: &'a str, empty: &'a [u8],
} }
impl<'a> Repr<'a> { impl<'a> Repr<'a> {
fn new(separator: char, format: &'a [Spec], empty: &'a str) -> Repr<'a> { fn new(separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Repr<'a> {
Repr { Repr {
separator, separator,
format, format,
@ -94,32 +94,34 @@ impl<'a> Repr<'a> {
} }
/// Print the field or empty filler if the field is not set. /// Print the field or empty filler if the field is not set.
fn print_field(&self, field: Option<&str>) { fn print_field(&self, field: Option<&Vec<u8>>) -> Result<(), std::io::Error> {
let value = match field { let value = match field {
Some(field) => field, Some(field) => field,
None => self.empty, None => self.empty,
}; };
print!("{}", value); stdout().write_all(value)
} }
/// Print each field except the one at the index. /// Print each field except the one at the index.
fn print_fields(&self, line: &Line, index: usize) { fn print_fields(&self, line: &Line, index: usize) -> Result<(), std::io::Error> {
for i in 0..line.fields.len() { for i in 0..line.fields.len() {
if i != index { if i != index {
print!("{}{}", self.separator, line.fields[i]); stdout().write_all(&[self.separator])?;
stdout().write_all(&line.fields[i])?;
} }
} }
Ok(())
} }
/// Print each field or the empty filler if the field is not set. /// Print each field or the empty filler if the field is not set.
fn print_format<F>(&self, f: F) fn print_format<F>(&self, f: F) -> Result<(), std::io::Error>
where where
F: Fn(&Spec) -> Option<&'a str>, F: Fn(&Spec) -> Option<&'a Vec<u8>>,
{ {
for i in 0..self.format.len() { for i in 0..self.format.len() {
if i > 0 { if i > 0 {
print!("{}", self.separator); stdout().write_all(&[self.separator])?;
} }
let field = match f(&self.format[i]) { let field = match f(&self.format[i]) {
@ -127,8 +129,9 @@ impl<'a> Repr<'a> {
None => self.empty, None => self.empty,
}; };
print!("{}", field); stdout().write_all(field)?;
} }
Ok(())
} }
} }
@ -148,10 +151,12 @@ impl Input {
} }
} }
fn compare(&self, field1: Option<&str>, field2: Option<&str>) -> Ordering { fn compare(&self, field1: Option<&Vec<u8>>, field2: Option<&Vec<u8>>) -> Ordering {
if let (Some(field1), Some(field2)) = (field1, field2) { if let (Some(field1), Some(field2)) = (field1, field2) {
if self.ignore_case { if self.ignore_case {
field1.to_lowercase().cmp(&field2.to_lowercase()) field1
.to_ascii_lowercase()
.cmp(&field2.to_ascii_lowercase())
} else { } else {
field1.cmp(field2) field1.cmp(field2)
} }
@ -209,14 +214,19 @@ impl Spec {
} }
struct Line { struct Line {
fields: Vec<String>, fields: Vec<Vec<u8>>,
} }
impl Line { impl Line {
fn new(string: String, separator: Sep) -> Line { fn new(string: Vec<u8>, separator: Sep) -> Line {
let fields = match separator { let fields = match separator {
Sep::Whitespaces => string.split_whitespace().map(String::from).collect(), Sep::Whitespaces => string
Sep::Char(sep) => string.split(sep).map(String::from).collect(), // GNU join uses Bourne shell field splitters by default
.split(|c| matches!(*c, b' ' | b'\t' | b'\n'))
.filter(|f| !f.is_empty())
.map(Vec::from)
.collect(),
Sep::Char(sep) => string.split(|c| *c == sep).map(Vec::from).collect(),
Sep::Line => vec![string], Sep::Line => vec![string],
}; };
@ -224,7 +234,7 @@ impl Line {
} }
/// Get field at index. /// Get field at index.
fn get_field(&self, index: usize) -> Option<&str> { fn get_field(&self, index: usize) -> Option<&Vec<u8>> {
if index < self.fields.len() { if index < self.fields.len() {
Some(&self.fields[index]) Some(&self.fields[index])
} else { } else {
@ -238,7 +248,7 @@ struct State<'a> {
file_name: &'a str, file_name: &'a str,
file_num: FileNum, file_num: FileNum,
print_unpaired: bool, print_unpaired: bool,
lines: Lines<Box<dyn BufRead + 'a>>, lines: Split<Box<dyn BufRead + 'a>>,
seq: Vec<Line>, seq: Vec<Line>,
line_num: usize, line_num: usize,
has_failed: bool, has_failed: bool,
@ -266,7 +276,7 @@ impl<'a> State<'a> {
file_name: name, file_name: name,
file_num, file_num,
print_unpaired, print_unpaired,
lines: f.lines(), lines: f.split(b'\n'),
seq: Vec::new(), seq: Vec::new(),
line_num: 0, line_num: 0,
has_failed: false, has_failed: false,
@ -274,12 +284,13 @@ impl<'a> State<'a> {
} }
/// Skip the current unpaired line. /// Skip the current unpaired line.
fn skip_line(&mut self, input: &Input, repr: &Repr) { fn skip_line(&mut self, input: &Input, repr: &Repr) -> Result<(), std::io::Error> {
if self.print_unpaired { if self.print_unpaired {
self.print_first_line(repr); self.print_first_line(repr)?;
} }
self.reset_next_line(input); self.reset_next_line(input);
Ok(())
} }
/// Keep reading line sequence until the key does not change, return /// Keep reading line sequence until the key does not change, return
@ -299,20 +310,22 @@ impl<'a> State<'a> {
} }
/// Print lines in the buffers as headers. /// Print lines in the buffers as headers.
fn print_headers(&self, other: &State, repr: &Repr) { fn print_headers(&self, other: &State, repr: &Repr) -> Result<(), std::io::Error> {
if self.has_line() { if self.has_line() {
if other.has_line() { if other.has_line() {
self.combine(other, repr); self.combine(other, repr)?;
} else { } else {
self.print_first_line(repr); self.print_first_line(repr)?;
} }
} else if other.has_line() { } else if other.has_line() {
other.print_first_line(repr); other.print_first_line(repr)?;
} }
Ok(())
} }
/// Combine two line sequences. /// Combine two line sequences.
fn combine(&self, other: &State, repr: &Repr) { fn combine(&self, other: &State, repr: &Repr) -> Result<(), std::io::Error> {
let key = self.get_current_key(); let key = self.get_current_key();
for line1 in &self.seq { for line1 in &self.seq {
@ -331,16 +344,18 @@ impl<'a> State<'a> {
None None
} }
}); })?;
} else { } else {
repr.print_field(key); repr.print_field(key)?;
repr.print_fields(line1, self.key); repr.print_fields(line1, self.key)?;
repr.print_fields(line2, other.key); repr.print_fields(line2, other.key)?;
} }
println!(); stdout().write_all(&[b'\n'])?;
} }
} }
Ok(())
} }
/// Reset with the next line. /// Reset with the next line.
@ -377,14 +392,16 @@ impl<'a> State<'a> {
0 0
} }
fn finalize(&mut self, input: &Input, repr: &Repr) { fn finalize(&mut self, input: &Input, repr: &Repr) -> Result<(), std::io::Error> {
if self.has_line() && self.print_unpaired { if self.has_line() && self.print_unpaired {
self.print_first_line(repr); self.print_first_line(repr)?;
while let Some(line) = self.next_line(input) { while let Some(line) = self.next_line(input) {
self.print_line(&line, repr); self.print_line(&line, repr)?;
} }
} }
Ok(())
} }
/// Get the next line without the order check. /// Get the next line without the order check.
@ -423,11 +440,11 @@ impl<'a> State<'a> {
} }
/// Gets the key value of the lines stored in seq. /// Gets the key value of the lines stored in seq.
fn get_current_key(&self) -> Option<&str> { fn get_current_key(&self) -> Option<&Vec<u8>> {
self.seq[0].get_field(self.key) self.seq[0].get_field(self.key)
} }
fn print_line(&self, line: &Line, repr: &Repr) { fn print_line(&self, line: &Line, repr: &Repr) -> Result<(), std::io::Error> {
if repr.uses_format() { if repr.uses_format() {
repr.print_format(|spec| match *spec { repr.print_format(|spec| match *spec {
Spec::Key => line.get_field(self.key), Spec::Key => line.get_field(self.key),
@ -438,17 +455,17 @@ impl<'a> State<'a> {
None None
} }
} }
}); })?;
} else { } else {
repr.print_field(line.get_field(self.key)); repr.print_field(line.get_field(self.key))?;
repr.print_fields(line, self.key); repr.print_fields(line, self.key)?;
} }
println!(); stdout().write_all(&[b'\n'])
} }
fn print_first_line(&self, repr: &Repr) { fn print_first_line(&self, repr: &Repr) -> Result<(), std::io::Error> {
self.print_line(&self.seq[0], repr); self.print_line(&self.seq[0], repr)
} }
} }
@ -481,14 +498,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
settings.key1 = get_field_number(keys, key1)?; settings.key1 = get_field_number(keys, key1)?;
settings.key2 = get_field_number(keys, key2)?; settings.key2 = get_field_number(keys, key2)?;
if let Some(value) = matches.value_of("t") { if let Some(value_str) = matches.value_of("t") {
let value = value_str.as_bytes();
settings.separator = match value.len() { settings.separator = match value.len() {
0 => Sep::Line, 0 => Sep::Line,
1 => Sep::Char(value.chars().next().unwrap()), 1 => Sep::Char(value[0]),
_ => { _ => {
return Err(USimpleError::new( return Err(USimpleError::new(
1, 1,
format!("multi-character tab {}", value), format!("multi-character tab {}", value_str),
)) ))
} }
}; };
@ -507,7 +525,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
} }
if let Some(empty) = matches.value_of("e") { if let Some(empty) = matches.value_of("e") {
settings.empty = empty.to_string(); settings.empty = empty.as_bytes().to_vec();
} }
if matches.is_present("nocheck-order") { if matches.is_present("nocheck-order") {
@ -529,7 +547,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
return Err(USimpleError::new(1, "both files cannot be standard input")); return Err(USimpleError::new(1, "both files cannot be standard input"));
} }
exec(file1, file2, settings) match exec(file1, file2, settings) {
Ok(_) => Ok(()),
Err(e) => Err(USimpleError::new(1, format!("{}", e))),
}
} }
pub fn uu_app() -> App<'static, 'static> { pub fn uu_app() -> App<'static, 'static> {
@ -639,7 +660,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
) )
} }
fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { fn exec(file1: &str, file2: &str, settings: Settings) -> Result<(), std::io::Error> {
let stdin = stdin(); let stdin = stdin();
let mut state1 = State::new( let mut state1 = State::new(
@ -686,14 +707,14 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
let repr = Repr::new( let repr = Repr::new(
match settings.separator { match settings.separator {
Sep::Char(sep) => sep, Sep::Char(sep) => sep,
_ => ' ', _ => b' ',
}, },
&format, &format,
&settings.empty, &settings.empty,
); );
if settings.headers { if settings.headers {
state1.print_headers(&state2, &repr); state1.print_headers(&state2, &repr)?;
state1.reset_read_line(&input); state1.reset_read_line(&input);
state2.reset_read_line(&input); state2.reset_read_line(&input);
} }
@ -703,17 +724,17 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
match diff { match diff {
Ordering::Less => { Ordering::Less => {
state1.skip_line(&input, &repr); state1.skip_line(&input, &repr)?;
} }
Ordering::Greater => { Ordering::Greater => {
state2.skip_line(&input, &repr); state2.skip_line(&input, &repr)?;
} }
Ordering::Equal => { Ordering::Equal => {
let next_line1 = state1.extend(&input); let next_line1 = state1.extend(&input);
let next_line2 = state2.extend(&input); let next_line2 = state2.extend(&input);
if settings.print_joined { if settings.print_joined {
state1.combine(&state2, &repr); state1.combine(&state2, &repr)?;
} }
state1.reset(next_line1); state1.reset(next_line1);
@ -722,8 +743,8 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
} }
} }
state1.finalize(&input, &repr); state1.finalize(&input, &repr)?;
state2.finalize(&input, &repr); state2.finalize(&input, &repr)?;
if state1.has_failed || state2.has_failed { if state1.has_failed || state2.has_failed {
set_exit_code(1); set_exit_code(1);

View file

@ -328,3 +328,21 @@ fn single_file_with_header() {
.succeeds() .succeeds()
.stdout_is("A 1\n"); .stdout_is("A 1\n");
} }
#[test]
fn non_line_feeds() {
new_ucmd!()
.arg("non-line_feeds_1.txt")
.arg("non-line_feeds_2.txt")
.succeeds()
.stdout_only_fixture("non-line_feeds.expected");
}
#[test]
fn non_unicode() {
new_ucmd!()
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.succeeds()
.stdout_only_fixture("non-unicode.expected");
}

View file

@ -0,0 +1,2 @@
b d
a c f

View file

@ -0,0 +1,2 @@
b
a c

View file

@ -0,0 +1,2 @@
d
a f

BIN
tests/fixtures/join/non-unicode.expected vendored Normal file

Binary file not shown.

BIN
tests/fixtures/join/non-unicode_1.bin vendored Normal file

Binary file not shown.

BIN
tests/fixtures/join/non-unicode_2.bin vendored Normal file

Binary file not shown.