1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

join: add support for multibyte separators (#6736)

* join: add test for multibyte separators

* join: implement support for multibyte separators

* join: use a trait instead of an enum for separator

* join: test whitespace merging
This commit is contained in:
Justin Tracey 2024-10-06 05:48:08 -04:00 committed by GitHub
parent d8eb4e2214
commit a51a731704
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 251 additions and 89 deletions

View file

@ -3,11 +3,11 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck // spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
use clap::builder::ValueParser; use clap::builder::ValueParser;
use clap::{crate_version, Arg, ArgAction, Command}; use clap::{crate_version, Arg, ArgAction, Command};
use memchr::{memchr3_iter, memchr_iter}; use memchr::{memchr_iter, memmem::Finder, Memchr3};
use std::cmp::Ordering; use std::cmp::Ordering;
use std::error::Error; use std::error::Error;
use std::ffi::OsString; use std::ffi::OsString;
@ -60,13 +60,114 @@ enum FileNum {
File2, File2,
} }
#[derive(Copy, Clone, PartialEq)] #[derive(Clone)]
enum Sep { enum SepSetting {
Char(u8), /// Any single-byte separator.
Byte(u8),
/// A single character more than one byte long.
Char(Vec<u8>),
/// No separators, join on the entire line.
Line, Line,
/// Whitespace separators.
Whitespaces, Whitespaces,
} }
/// Strategy for splitting an input line into fields and for delimiting fields on output.
///
/// `Clone` is a supertrait, so every implementor must be cloneable.
trait Separator: Clone {
/// Using this separator, return the start and end index of all fields in the haystack.
///
/// Each entry is a `(start, end)` pair of byte offsets into `haystack`. In the
/// implementations in this file `end` is exclusive: it is either the offset where a
/// separator begins or `haystack.len()`.
///
/// `len_guess` is a hint for the expected number of fields. The implementations in this
/// file either use it as the initial capacity of the returned vector or ignore it.
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;
/// The bytes written between fields when a line is printed.
fn output_separator(&self) -> &[u8];
}
/// Separator consisting of exactly one byte.
///
/// The byte is kept in a one-element array so that it can be handed out as a
/// slice by `output_separator` without any extra storage.
#[derive(Copy, Clone)]
struct OneByteSep {
    byte: [u8; 1],
}

impl Separator for OneByteSep {
    /// Splits `haystack` at every occurrence of the separator byte.
    ///
    /// Empty fields are preserved: adjacent separators, or a separator at either
    /// end of the line, each produce a zero-length range.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        let [sep] = self.byte;
        let mut ranges = Vec::with_capacity(len_guess);

        // Every separator position closes a field; the end of the haystack closes the last one.
        let field_ends = memchr_iter(sep, haystack).chain(std::iter::once(haystack.len()));

        let mut start = 0;
        for end in field_ends {
            ranges.push((start, end));
            // The next field begins right after the one-byte separator.
            start = end + 1;
        }
        ranges
    }

    /// The single separator byte, as a slice.
    fn output_separator(&self) -> &[u8] {
        &self.byte[..]
    }
}
/// Separator that is one character long but occupies more than one byte.
///
/// The needle held by `finder` is both the pattern searched for in input lines
/// and the byte sequence emitted between fields on output.
#[derive(Clone)]
struct MultiByteSep<'a> {
    finder: Finder<'a>,
}

impl Separator for MultiByteSep<'_> {
    /// Splits `haystack` at every match of the needle reported by the finder.
    ///
    /// Empty fields are preserved, exactly as for one-byte separators.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        // The needle never changes, so read its length once instead of per match.
        let sep_len = self.finder.needle().len();
        let mut ranges = Vec::with_capacity(len_guess);

        let mut start = 0;
        ranges.extend(self.finder.find_iter(haystack).map(|hit| {
            let field = (start, hit);
            // Skip over the whole separator to reach the next field.
            start = hit + sep_len;
            field
        }));

        // Whatever follows the last separator (possibly nothing) is the final field.
        ranges.push((start, haystack.len()));
        ranges
    }

    /// The separator's bytes, borrowed from the finder.
    fn output_separator(&self) -> &[u8] {
        self.finder.needle()
    }
}
/// Separator that never splits: the entire line is treated as one field.
#[derive(Copy, Clone)]
struct LineSep {}

impl Separator for LineSep {
    /// Returns a single range covering all of `haystack`; the length hint is unused.
    fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
        let whole_line = (0, haystack.len());
        vec![whole_line]
    }

    /// Nothing is written between fields in whole-line mode.
    fn output_separator(&self) -> &[u8] {
        b""
    }
}
/// Separator used when none is given: fields are delimited by whitespace.
#[derive(Copy, Clone)]
struct WhitespaceSep {}

impl Separator for WhitespaceSep {
    /// Splits `haystack` on spaces, tabs and newlines.
    ///
    /// Leading whitespace is dropped and consecutive whitespace bytes count as a
    /// single separator. The remainder after the last whitespace byte is always
    /// reported as a field, even when it is empty.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        let mut ranges = Vec::with_capacity(len_guess);

        // GNU join used Bourne shell field splitters by default
        // FIXME: but now uses locale-dependent whitespace
        let blanks = Memchr3::new(b' ', b'\t', b'\n', haystack);

        let mut start = 0;
        for pos in blanks {
            // A blank sitting exactly at `start` means nothing lies between it and the
            // previous blank (or the start of the line), so no field is emitted.
            let has_content = pos > start;
            if has_content {
                ranges.push((start, pos));
            }
            start = pos + 1;
        }

        ranges.push((start, haystack.len()));
        ranges
    }

    /// Output fields are separated by a single space.
    fn output_separator(&self) -> &[u8] {
        &b" "[..]
    }
}
#[derive(Copy, Clone, PartialEq)] #[derive(Copy, Clone, PartialEq)]
enum CheckOrder { enum CheckOrder {
Default, Default,
@ -82,7 +183,7 @@ struct Settings {
print_joined: bool, print_joined: bool,
ignore_case: bool, ignore_case: bool,
line_ending: LineEnding, line_ending: LineEnding,
separator: Sep, separator: SepSetting,
autoformat: bool, autoformat: bool,
format: Vec<Spec>, format: Vec<Spec>,
empty: Vec<u8>, empty: Vec<u8>,
@ -100,7 +201,7 @@ impl Default for Settings {
print_joined: true, print_joined: true,
ignore_case: false, ignore_case: false,
line_ending: LineEnding::Newline, line_ending: LineEnding::Newline,
separator: Sep::Whitespaces, separator: SepSetting::Whitespaces,
autoformat: false, autoformat: false,
format: vec![], format: vec![],
empty: vec![], empty: vec![],
@ -111,15 +212,15 @@ impl Default for Settings {
} }
/// Output representation. /// Output representation.
struct Repr<'a> { struct Repr<'a, Sep: Separator> {
line_ending: LineEnding, line_ending: LineEnding,
separator: u8, separator: Sep,
format: &'a [Spec], format: Vec<Spec>,
empty: &'a [u8], empty: &'a [u8],
} }
impl<'a> Repr<'a> { impl<'a, Sep: Separator> Repr<'a, Sep> {
fn new(line_ending: LineEnding, separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Self { fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
Repr { Repr {
line_ending, line_ending,
separator, separator,
@ -155,7 +256,7 @@ impl<'a> Repr<'a> {
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
for i in 0..line.field_ranges.len() { for i in 0..line.field_ranges.len() {
if i != index { if i != index {
writer.write_all(&[self.separator])?; writer.write_all(self.separator.output_separator())?;
writer.write_all(line.get_field(i).unwrap())?; writer.write_all(line.get_field(i).unwrap())?;
} }
} }
@ -169,7 +270,7 @@ impl<'a> Repr<'a> {
{ {
for i in 0..self.format.len() { for i in 0..self.format.len() {
if i > 0 { if i > 0 {
writer.write_all(&[self.separator])?; writer.write_all(self.separator.output_separator())?;
} }
let field = match f(&self.format[i]) { let field = match f(&self.format[i]) {
@ -188,13 +289,13 @@ impl<'a> Repr<'a> {
} }
/// Input processing parameters. /// Input processing parameters.
struct Input { struct Input<Sep: Separator> {
separator: Sep, separator: Sep,
ignore_case: bool, ignore_case: bool,
check_order: CheckOrder, check_order: CheckOrder,
} }
impl Input { impl<Sep: Separator> Input<Sep> {
fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self { fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
Self { Self {
separator, separator,
@ -271,24 +372,8 @@ struct Line {
} }
impl Line { impl Line {
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self { fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
let mut field_ranges = Vec::with_capacity(len_guess); let field_ranges = separator.field_ranges(&string, len_guess);
let mut last_end = 0;
if separator == Sep::Whitespaces {
// GNU join uses Bourne shell field splitters by default
for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
if i > last_end {
field_ranges.push((last_end, i));
}
last_end = i + 1;
}
} else if let Sep::Char(sep) = separator {
for i in memchr_iter(sep, &string) {
field_ranges.push((last_end, i));
last_end = i + 1;
}
}
field_ranges.push((last_end, string.len()));
Self { Self {
field_ranges, field_ranges,
@ -351,7 +436,12 @@ impl<'a> State<'a> {
} }
/// Skip the current unpaired line. /// Skip the current unpaired line.
fn skip_line(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> { fn skip_line<Sep: Separator>(
&mut self,
writer: &mut impl Write,
input: &Input<Sep>,
repr: &Repr<'a, Sep>,
) -> UResult<()> {
if self.print_unpaired { if self.print_unpaired {
self.print_first_line(writer, repr)?; self.print_first_line(writer, repr)?;
} }
@ -362,7 +452,7 @@ impl<'a> State<'a> {
/// Keep reading line sequence until the key does not change, return /// Keep reading line sequence until the key does not change, return
/// the first line whose key differs. /// the first line whose key differs.
fn extend(&mut self, input: &Input) -> UResult<Option<Line>> { fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
while let Some(line) = self.next_line(input)? { while let Some(line) = self.next_line(input)? {
let diff = input.compare(self.get_current_key(), line.get_field(self.key)); let diff = input.compare(self.get_current_key(), line.get_field(self.key));
@ -377,11 +467,11 @@ impl<'a> State<'a> {
} }
/// Print lines in the buffers as headers. /// Print lines in the buffers as headers.
fn print_headers( fn print_headers<Sep: Separator>(
&self, &self,
writer: &mut impl Write, writer: &mut impl Write,
other: &State, other: &State,
repr: &Repr, repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
if self.has_line() { if self.has_line() {
if other.has_line() { if other.has_line() {
@ -397,11 +487,11 @@ impl<'a> State<'a> {
} }
/// Combine two line sequences. /// Combine two line sequences.
fn combine( fn combine<Sep: Separator>(
&self, &self,
writer: &mut impl Write, writer: &mut impl Write,
other: &State, other: &State,
repr: &Repr, repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
let key = self.get_current_key(); let key = self.get_current_key();
@ -444,13 +534,16 @@ impl<'a> State<'a> {
} }
} }
fn reset_read_line(&mut self, input: &Input) -> Result<(), std::io::Error> { fn reset_read_line<Sep: Separator>(
let line = self.read_line(input.separator)?; &mut self,
input: &Input<Sep>,
) -> Result<(), std::io::Error> {
let line = self.read_line(&input.separator)?;
self.reset(line); self.reset(line);
Ok(()) Ok(())
} }
fn reset_next_line(&mut self, input: &Input) -> Result<(), JoinError> { fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
let line = self.next_line(input)?; let line = self.next_line(input)?;
self.reset(line); self.reset(line);
Ok(()) Ok(())
@ -460,7 +553,7 @@ impl<'a> State<'a> {
!self.seq.is_empty() !self.seq.is_empty()
} }
fn initialize(&mut self, read_sep: Sep, autoformat: bool) -> usize { fn initialize<Sep: Separator>(&mut self, read_sep: &Sep, autoformat: bool) -> usize {
if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) { if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
self.seq.push(line); self.seq.push(line);
@ -471,7 +564,12 @@ impl<'a> State<'a> {
0 0
} }
fn finalize(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> { fn finalize<Sep: Separator>(
&mut self,
writer: &mut impl Write,
input: &Input<Sep>,
repr: &Repr<'a, Sep>,
) -> UResult<()> {
if self.has_line() { if self.has_line() {
if self.print_unpaired { if self.print_unpaired {
self.print_first_line(writer, repr)?; self.print_first_line(writer, repr)?;
@ -491,7 +589,7 @@ impl<'a> State<'a> {
} }
/// Get the next line without the order check. /// Get the next line without the order check.
fn read_line(&mut self, sep: Sep) -> Result<Option<Line>, std::io::Error> { fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
match self.lines.next() { match self.lines.next() {
Some(value) => { Some(value) => {
self.line_num += 1; self.line_num += 1;
@ -506,8 +604,8 @@ impl<'a> State<'a> {
} }
/// Get the next line with the order check. /// Get the next line with the order check.
fn next_line(&mut self, input: &Input) -> Result<Option<Line>, JoinError> { fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
if let Some(line) = self.read_line(input.separator)? { if let Some(line) = self.read_line(&input.separator)? {
if input.check_order == CheckOrder::Disabled { if input.check_order == CheckOrder::Disabled {
return Ok(Some(line)); return Ok(Some(line));
} }
@ -543,11 +641,11 @@ impl<'a> State<'a> {
self.seq[0].get_field(self.key) self.seq[0].get_field(self.key)
} }
fn print_line( fn print_line<Sep: Separator>(
&self, &self,
writer: &mut impl Write, writer: &mut impl Write,
line: &Line, line: &Line,
repr: &Repr, repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
if repr.uses_format() { if repr.uses_format() {
repr.print_format(writer, |spec| match *spec { repr.print_format(writer, |spec| match *spec {
@ -568,31 +666,53 @@ impl<'a> State<'a> {
repr.print_line_ending(writer) repr.print_line_ending(writer)
} }
fn print_first_line(&self, writer: &mut impl Write, repr: &Repr) -> Result<(), std::io::Error> { fn print_first_line<Sep: Separator>(
&self,
writer: &mut impl Write,
repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> {
self.print_line(writer, &self.seq[0], repr) self.print_line(writer, &self.seq[0], repr)
} }
} }
fn parse_separator(value_os: &OsString) -> UResult<Sep> { fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
// Five possible separator values:
// No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
// An empty string arg, whole line separation
// On unix-likes only, a single arbitrary byte
// The two-character "\0" string, interpreted as a single 0 byte
// A single scalar valid in the locale encoding (currently only UTF-8)
if value_os.is_empty() {
return Ok(SepSetting::Line);
}
#[cfg(unix)] #[cfg(unix)]
let value = value_os.as_bytes(); {
#[cfg(not(unix))] let value = value_os.as_bytes();
let value = match value_os.to_str() { if value.len() == 1 {
Some(value) => value.as_bytes(), return Ok(SepSetting::Byte(value[0]));
None => {
return Err(USimpleError::new(
1,
"unprintable field separators are only supported on unix-like platforms",
));
} }
}
let Some(value) = value_os.to_str() else {
#[cfg(unix)]
return Err(USimpleError::new(1, "non-UTF-8 multi-byte tab"));
#[cfg(not(unix))]
return Err(USimpleError::new(
1,
"unprintable field separators are only supported on unix-like platforms",
));
}; };
match value.len() {
0 => Ok(Sep::Line), let mut chars = value.chars();
1 => Ok(Sep::Char(value[0])), let c = chars.next().expect("valid string with at least one byte");
2 if value[0] == b'\\' && value[1] == b'0' => Ok(Sep::Char(0)), match chars.next() {
None => Ok(SepSetting::Char(value.into())),
Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
_ => Err(USimpleError::new( _ => Err(USimpleError::new(
1, 1,
format!("multi-character tab {}", value_os.to_string_lossy()), format!("multi-character tab {}", value),
)), )),
} }
} }
@ -695,7 +815,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
return Err(USimpleError::new(1, "both files cannot be standard input")); return Err(USimpleError::new(1, "both files cannot be standard input"));
} }
exec(file1, file2, settings) let sep = settings.separator.clone();
match sep {
SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
SepSetting::Char(c) => exec(
file1,
file2,
settings,
MultiByteSep {
finder: Finder::new(&c),
},
),
SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
SepSetting::Line => exec(file1, file2, settings, LineSep {}),
}
} }
pub fn uu_app() -> Command { pub fn uu_app() -> Command {
@ -816,7 +949,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
) )
} }
fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { fn exec<Sep: Separator>(file1: &str, file2: &str, settings: Settings, sep: Sep) -> UResult<()> {
let stdin = stdin(); let stdin = stdin();
let mut state1 = State::new( let mut state1 = State::new(
@ -837,16 +970,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
settings.print_unpaired2, settings.print_unpaired2,
)?; )?;
let input = Input::new( let input = Input::new(sep.clone(), settings.ignore_case, settings.check_order);
settings.separator,
settings.ignore_case,
settings.check_order,
);
let format = if settings.autoformat { let format = if settings.autoformat {
let mut format = vec![Spec::Key]; let mut format = vec![Spec::Key];
let mut initialize = |state: &mut State| { let mut initialize = |state: &mut State| {
let max_fields = state.initialize(settings.separator, settings.autoformat); let max_fields = state.initialize(&sep, settings.autoformat);
for i in 0..max_fields { for i in 0..max_fields {
if i != state.key { if i != state.key {
format.push(Spec::Field(state.file_num, i)); format.push(Spec::Field(state.file_num, i));
@ -857,20 +986,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
initialize(&mut state2); initialize(&mut state2);
format format
} else { } else {
state1.initialize(settings.separator, settings.autoformat); state1.initialize(&sep, settings.autoformat);
state2.initialize(settings.separator, settings.autoformat); state2.initialize(&sep, settings.autoformat);
settings.format settings.format
}; };
let repr = Repr::new( let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
settings.line_ending,
match settings.separator {
Sep::Char(sep) => sep,
_ => b' ',
},
&format,
&settings.empty,
);
let stdout = stdout(); let stdout = stdout();
let mut writer = BufWriter::new(stdout.lock()); let mut writer = BufWriter::new(stdout.lock());

View file

@ -58,6 +58,25 @@ fn default_arguments() {
.stdout_only_fixture("default.expected"); .stdout_only_fixture("default.expected");
} }
// Runs `join` twice on the same fixture and piped stdin, once with the default
// separator and once with `-t ,`, and compares the exact stdout of each run.
// As the test name states, merging of contiguous separators is expected only for
// the default whitespace separator.
#[test]
fn only_whitespace_separators_merge() {
// Default separator: the expected output has fields joined by single spaces and
// no leading whitespace.
new_ucmd!()
.arg("contiguous_separators.txt")
.arg("-")
.pipe_in(" a ,c ")
.succeeds()
.stdout_only("a ,,,b ,c \n");
// Explicit `,` separator: the consecutive commas and the surrounding spaces are
// expected to appear unchanged in the output.
new_ucmd!()
.arg("contiguous_separators.txt")
.arg("-t")
.arg(",")
.arg("-")
.pipe_in(" a ,c ")
.succeeds()
.stdout_only(" a ,,,b,c \n");
}
#[test] #[test]
fn different_fields() { fn different_fields() {
new_ucmd!() new_ucmd!()
@ -208,9 +227,9 @@ fn tab_multi_character() {
.arg("semicolon_fields_1.txt") .arg("semicolon_fields_1.txt")
.arg("semicolon_fields_2.txt") .arg("semicolon_fields_2.txt")
.arg("-t") .arg("-t")
.arg("э") .arg("ab")
.fails() .fails()
.stderr_is("join: multi-character tab э\n"); .stderr_is("join: multi-character tab ab\n");
} }
#[test] #[test]
@ -437,14 +456,22 @@ fn non_unicode() {
#[cfg(unix)] #[cfg(unix)]
{ {
let invalid_utf8: u8 = 167; let non_utf8_byte: u8 = 167;
new_ucmd!() new_ucmd!()
.arg("-t") .arg("-t")
.arg(OsStr::from_bytes(&[invalid_utf8])) .arg(OsStr::from_bytes(&[non_utf8_byte]))
.arg("non-unicode_1.bin") .arg("non-unicode_1.bin")
.arg("non-unicode_2.bin") .arg("non-unicode_2.bin")
.succeeds() .succeeds()
.stdout_only_fixture("non-unicode_sep.expected"); .stdout_only_fixture("non-unicode_sep.expected");
new_ucmd!()
.arg("-t")
.arg(OsStr::from_bytes(&[non_utf8_byte, non_utf8_byte]))
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.fails()
.stderr_is("join: non-UTF-8 multi-byte tab\n");
} }
#[cfg(windows)] #[cfg(windows)]
@ -462,6 +489,16 @@ fn non_unicode() {
} }
} }
// Joins two fixtures using `§` as the field separator and compares stdout with
// the expected fixture. `§` is one character but two bytes in UTF-8, so this
// covers a separator longer than one byte. The separator is passed attached to
// the option (`-t§`) rather than as a separate argument.
#[test]
fn multibyte_sep() {
new_ucmd!()
.arg("-t§")
.arg("multibyte_sep_1.txt")
.arg("multibyte_sep_2.txt")
.succeeds()
.stdout_only_fixture("multibyte_sep.expected");
}
#[test] #[test]
fn null_field_separators() { fn null_field_separators() {
new_ucmd!() new_ucmd!()

View file

@ -0,0 +1 @@
a ,,,b

View file

@ -0,0 +1 @@
a§b§c

View file

@ -0,0 +1 @@
a§b

View file

@ -0,0 +1 @@
a§c