mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
join: add support for multibyte separators (#6736)
* join: add test for multibyte separators
* join: implement support for multibyte separators
* join: use a trait instead of an enum for separator
* join: test whitespace merging
This commit is contained in:
parent
d8eb4e2214
commit
a51a731704
6 changed files with 251 additions and 89 deletions
|
@ -3,11 +3,11 @@
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck
|
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
|
||||||
|
|
||||||
use clap::builder::ValueParser;
|
use clap::builder::ValueParser;
|
||||||
use clap::{crate_version, Arg, ArgAction, Command};
|
use clap::{crate_version, Arg, ArgAction, Command};
|
||||||
use memchr::{memchr3_iter, memchr_iter};
|
use memchr::{memchr_iter, memmem::Finder, Memchr3};
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::ffi::OsString;
|
use std::ffi::OsString;
|
||||||
|
@ -60,13 +60,114 @@ enum FileNum {
|
||||||
File2,
|
File2,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The field-separator choice stored in `Settings::separator`.
///
/// `Debug`, `PartialEq` and `Eq` are derived so that parsed settings can be
/// compared and printed (e.g. with `assert_eq!`).
#[derive(Clone, Debug, PartialEq, Eq)]
enum SepSetting {
    /// Any single-byte separator.
    Byte(u8),
    /// A single character more than one byte long, stored as its raw bytes.
    Char(Vec<u8>),
    /// No separators, join on the entire line.
    Line,
    /// Whitespace separators.
    Whitespaces,
}
|
||||||
|
|
||||||
|
/// Strategy for cutting an input line into fields and for separating fields
/// again when they are written out.
trait Separator: Clone {
    /// Split `haystack` into fields with this separator.
    ///
    /// Each returned pair is the start index of a field and the index just
    /// past its last byte. `len_guess` is a hint for how many fields to
    /// expect.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;

    /// The bytes that represent this separator in the output.
    fn output_separator(&self) -> &[u8];
}
|
||||||
|
|
||||||
|
/// Simple separators one byte in length.
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
struct OneByteSep {
|
||||||
|
byte: [u8; 1],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Separator for OneByteSep {
|
||||||
|
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||||
|
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||||
|
let mut last_end = 0;
|
||||||
|
|
||||||
|
for i in memchr_iter(self.byte[0], haystack) {
|
||||||
|
field_ranges.push((last_end, i));
|
||||||
|
last_end = i + 1;
|
||||||
|
}
|
||||||
|
field_ranges.push((last_end, haystack.len()));
|
||||||
|
field_ranges
|
||||||
|
}
|
||||||
|
|
||||||
|
fn output_separator(&self) -> &[u8] {
|
||||||
|
&self.byte
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Multi-byte (but still single character) separators.
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct MultiByteSep<'a> {
|
||||||
|
finder: Finder<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Separator for MultiByteSep<'a> {
|
||||||
|
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||||
|
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||||
|
let mut last_end = 0;
|
||||||
|
|
||||||
|
for i in self.finder.find_iter(haystack) {
|
||||||
|
field_ranges.push((last_end, i));
|
||||||
|
last_end = i + self.finder.needle().len();
|
||||||
|
}
|
||||||
|
field_ranges.push((last_end, haystack.len()));
|
||||||
|
field_ranges
|
||||||
|
}
|
||||||
|
|
||||||
|
fn output_separator(&self) -> &[u8] {
|
||||||
|
self.finder.needle()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whole-line separator.
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
struct LineSep {}
|
||||||
|
|
||||||
|
impl Separator for LineSep {
|
||||||
|
fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
|
||||||
|
vec![(0, haystack.len())]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn output_separator(&self) -> &[u8] {
|
||||||
|
&[]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Default whitespace separator.
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
struct WhitespaceSep {}
|
||||||
|
|
||||||
|
impl Separator for WhitespaceSep {
|
||||||
|
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||||
|
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||||
|
let mut last_end = 0;
|
||||||
|
|
||||||
|
// GNU join used Bourne shell field splitters by default
|
||||||
|
// FIXME: but now uses locale-dependent whitespace
|
||||||
|
for i in Memchr3::new(b' ', b'\t', b'\n', haystack) {
|
||||||
|
// leading whitespace should be dropped, contiguous whitespace merged
|
||||||
|
if i > last_end {
|
||||||
|
field_ranges.push((last_end, i));
|
||||||
|
}
|
||||||
|
last_end = i + 1;
|
||||||
|
}
|
||||||
|
field_ranges.push((last_end, haystack.len()));
|
||||||
|
field_ranges
|
||||||
|
}
|
||||||
|
|
||||||
|
fn output_separator(&self) -> &[u8] {
|
||||||
|
b" "
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq)]
|
#[derive(Copy, Clone, PartialEq)]
|
||||||
enum CheckOrder {
|
enum CheckOrder {
|
||||||
Default,
|
Default,
|
||||||
|
@ -82,7 +183,7 @@ struct Settings {
|
||||||
print_joined: bool,
|
print_joined: bool,
|
||||||
ignore_case: bool,
|
ignore_case: bool,
|
||||||
line_ending: LineEnding,
|
line_ending: LineEnding,
|
||||||
separator: Sep,
|
separator: SepSetting,
|
||||||
autoformat: bool,
|
autoformat: bool,
|
||||||
format: Vec<Spec>,
|
format: Vec<Spec>,
|
||||||
empty: Vec<u8>,
|
empty: Vec<u8>,
|
||||||
|
@ -100,7 +201,7 @@ impl Default for Settings {
|
||||||
print_joined: true,
|
print_joined: true,
|
||||||
ignore_case: false,
|
ignore_case: false,
|
||||||
line_ending: LineEnding::Newline,
|
line_ending: LineEnding::Newline,
|
||||||
separator: Sep::Whitespaces,
|
separator: SepSetting::Whitespaces,
|
||||||
autoformat: false,
|
autoformat: false,
|
||||||
format: vec![],
|
format: vec![],
|
||||||
empty: vec![],
|
empty: vec![],
|
||||||
|
@ -111,15 +212,15 @@ impl Default for Settings {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Output representation.
|
/// Output representation.
|
||||||
struct Repr<'a> {
|
struct Repr<'a, Sep: Separator> {
|
||||||
line_ending: LineEnding,
|
line_ending: LineEnding,
|
||||||
separator: u8,
|
separator: Sep,
|
||||||
format: &'a [Spec],
|
format: Vec<Spec>,
|
||||||
empty: &'a [u8],
|
empty: &'a [u8],
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Repr<'a> {
|
impl<'a, Sep: Separator> Repr<'a, Sep> {
|
||||||
fn new(line_ending: LineEnding, separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Self {
|
fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
|
||||||
Repr {
|
Repr {
|
||||||
line_ending,
|
line_ending,
|
||||||
separator,
|
separator,
|
||||||
|
@ -155,7 +256,7 @@ impl<'a> Repr<'a> {
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
for i in 0..line.field_ranges.len() {
|
for i in 0..line.field_ranges.len() {
|
||||||
if i != index {
|
if i != index {
|
||||||
writer.write_all(&[self.separator])?;
|
writer.write_all(self.separator.output_separator())?;
|
||||||
writer.write_all(line.get_field(i).unwrap())?;
|
writer.write_all(line.get_field(i).unwrap())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -169,7 +270,7 @@ impl<'a> Repr<'a> {
|
||||||
{
|
{
|
||||||
for i in 0..self.format.len() {
|
for i in 0..self.format.len() {
|
||||||
if i > 0 {
|
if i > 0 {
|
||||||
writer.write_all(&[self.separator])?;
|
writer.write_all(self.separator.output_separator())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let field = match f(&self.format[i]) {
|
let field = match f(&self.format[i]) {
|
||||||
|
@ -188,13 +289,13 @@ impl<'a> Repr<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Input processing parameters.
|
/// Input processing parameters.
|
||||||
struct Input {
|
struct Input<Sep: Separator> {
|
||||||
separator: Sep,
|
separator: Sep,
|
||||||
ignore_case: bool,
|
ignore_case: bool,
|
||||||
check_order: CheckOrder,
|
check_order: CheckOrder,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Input {
|
impl<Sep: Separator> Input<Sep> {
|
||||||
fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
|
fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
|
||||||
Self {
|
Self {
|
||||||
separator,
|
separator,
|
||||||
|
@ -271,24 +372,8 @@ struct Line {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Line {
|
impl Line {
|
||||||
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
|
fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
|
||||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
let field_ranges = separator.field_ranges(&string, len_guess);
|
||||||
let mut last_end = 0;
|
|
||||||
if separator == Sep::Whitespaces {
|
|
||||||
// GNU join uses Bourne shell field splitters by default
|
|
||||||
for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
|
|
||||||
if i > last_end {
|
|
||||||
field_ranges.push((last_end, i));
|
|
||||||
}
|
|
||||||
last_end = i + 1;
|
|
||||||
}
|
|
||||||
} else if let Sep::Char(sep) = separator {
|
|
||||||
for i in memchr_iter(sep, &string) {
|
|
||||||
field_ranges.push((last_end, i));
|
|
||||||
last_end = i + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
field_ranges.push((last_end, string.len()));
|
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
field_ranges,
|
field_ranges,
|
||||||
|
@ -351,7 +436,12 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Skip the current unpaired line.
|
/// Skip the current unpaired line.
|
||||||
fn skip_line(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
|
fn skip_line<Sep: Separator>(
|
||||||
|
&mut self,
|
||||||
|
writer: &mut impl Write,
|
||||||
|
input: &Input<Sep>,
|
||||||
|
repr: &Repr<'a, Sep>,
|
||||||
|
) -> UResult<()> {
|
||||||
if self.print_unpaired {
|
if self.print_unpaired {
|
||||||
self.print_first_line(writer, repr)?;
|
self.print_first_line(writer, repr)?;
|
||||||
}
|
}
|
||||||
|
@ -362,7 +452,7 @@ impl<'a> State<'a> {
|
||||||
|
|
||||||
/// Keep reading line sequence until the key does not change, return
|
/// Keep reading line sequence until the key does not change, return
|
||||||
/// the first line whose key differs.
|
/// the first line whose key differs.
|
||||||
fn extend(&mut self, input: &Input) -> UResult<Option<Line>> {
|
fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
|
||||||
while let Some(line) = self.next_line(input)? {
|
while let Some(line) = self.next_line(input)? {
|
||||||
let diff = input.compare(self.get_current_key(), line.get_field(self.key));
|
let diff = input.compare(self.get_current_key(), line.get_field(self.key));
|
||||||
|
|
||||||
|
@ -377,11 +467,11 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Print lines in the buffers as headers.
|
/// Print lines in the buffers as headers.
|
||||||
fn print_headers(
|
fn print_headers<Sep: Separator>(
|
||||||
&self,
|
&self,
|
||||||
writer: &mut impl Write,
|
writer: &mut impl Write,
|
||||||
other: &State,
|
other: &State,
|
||||||
repr: &Repr,
|
repr: &Repr<'a, Sep>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
if self.has_line() {
|
if self.has_line() {
|
||||||
if other.has_line() {
|
if other.has_line() {
|
||||||
|
@ -397,11 +487,11 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Combine two line sequences.
|
/// Combine two line sequences.
|
||||||
fn combine(
|
fn combine<Sep: Separator>(
|
||||||
&self,
|
&self,
|
||||||
writer: &mut impl Write,
|
writer: &mut impl Write,
|
||||||
other: &State,
|
other: &State,
|
||||||
repr: &Repr,
|
repr: &Repr<'a, Sep>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let key = self.get_current_key();
|
let key = self.get_current_key();
|
||||||
|
|
||||||
|
@ -444,13 +534,16 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reset_read_line(&mut self, input: &Input) -> Result<(), std::io::Error> {
|
fn reset_read_line<Sep: Separator>(
|
||||||
let line = self.read_line(input.separator)?;
|
&mut self,
|
||||||
|
input: &Input<Sep>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let line = self.read_line(&input.separator)?;
|
||||||
self.reset(line);
|
self.reset(line);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reset_next_line(&mut self, input: &Input) -> Result<(), JoinError> {
|
fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
|
||||||
let line = self.next_line(input)?;
|
let line = self.next_line(input)?;
|
||||||
self.reset(line);
|
self.reset(line);
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -460,7 +553,7 @@ impl<'a> State<'a> {
|
||||||
!self.seq.is_empty()
|
!self.seq.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn initialize(&mut self, read_sep: Sep, autoformat: bool) -> usize {
|
fn initialize<Sep: Separator>(&mut self, read_sep: &Sep, autoformat: bool) -> usize {
|
||||||
if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
|
if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
|
||||||
self.seq.push(line);
|
self.seq.push(line);
|
||||||
|
|
||||||
|
@ -471,7 +564,12 @@ impl<'a> State<'a> {
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn finalize(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
|
fn finalize<Sep: Separator>(
|
||||||
|
&mut self,
|
||||||
|
writer: &mut impl Write,
|
||||||
|
input: &Input<Sep>,
|
||||||
|
repr: &Repr<'a, Sep>,
|
||||||
|
) -> UResult<()> {
|
||||||
if self.has_line() {
|
if self.has_line() {
|
||||||
if self.print_unpaired {
|
if self.print_unpaired {
|
||||||
self.print_first_line(writer, repr)?;
|
self.print_first_line(writer, repr)?;
|
||||||
|
@ -491,7 +589,7 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next line without the order check.
|
/// Get the next line without the order check.
|
||||||
fn read_line(&mut self, sep: Sep) -> Result<Option<Line>, std::io::Error> {
|
fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
|
||||||
match self.lines.next() {
|
match self.lines.next() {
|
||||||
Some(value) => {
|
Some(value) => {
|
||||||
self.line_num += 1;
|
self.line_num += 1;
|
||||||
|
@ -506,8 +604,8 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next line with the order check.
|
/// Get the next line with the order check.
|
||||||
fn next_line(&mut self, input: &Input) -> Result<Option<Line>, JoinError> {
|
fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
|
||||||
if let Some(line) = self.read_line(input.separator)? {
|
if let Some(line) = self.read_line(&input.separator)? {
|
||||||
if input.check_order == CheckOrder::Disabled {
|
if input.check_order == CheckOrder::Disabled {
|
||||||
return Ok(Some(line));
|
return Ok(Some(line));
|
||||||
}
|
}
|
||||||
|
@ -543,11 +641,11 @@ impl<'a> State<'a> {
|
||||||
self.seq[0].get_field(self.key)
|
self.seq[0].get_field(self.key)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_line(
|
fn print_line<Sep: Separator>(
|
||||||
&self,
|
&self,
|
||||||
writer: &mut impl Write,
|
writer: &mut impl Write,
|
||||||
line: &Line,
|
line: &Line,
|
||||||
repr: &Repr,
|
repr: &Repr<'a, Sep>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
if repr.uses_format() {
|
if repr.uses_format() {
|
||||||
repr.print_format(writer, |spec| match *spec {
|
repr.print_format(writer, |spec| match *spec {
|
||||||
|
@ -568,31 +666,53 @@ impl<'a> State<'a> {
|
||||||
repr.print_line_ending(writer)
|
repr.print_line_ending(writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_first_line(&self, writer: &mut impl Write, repr: &Repr) -> Result<(), std::io::Error> {
|
fn print_first_line<Sep: Separator>(
|
||||||
|
&self,
|
||||||
|
writer: &mut impl Write,
|
||||||
|
repr: &Repr<'a, Sep>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
self.print_line(writer, &self.seq[0], repr)
|
self.print_line(writer, &self.seq[0], repr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_separator(value_os: &OsString) -> UResult<Sep> {
|
fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
|
||||||
|
// Five possible separator values:
|
||||||
|
// No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
|
||||||
|
// An empty string arg, whole line separation
|
||||||
|
// On unix-likes only, a single arbitrary byte
|
||||||
|
// The two-character "\0" string, interpreted as a single 0 byte
|
||||||
|
// A single scalar valid in the locale encoding (currently only UTF-8)
|
||||||
|
|
||||||
|
if value_os.is_empty() {
|
||||||
|
return Ok(SepSetting::Line);
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
let value = value_os.as_bytes();
|
{
|
||||||
#[cfg(not(unix))]
|
let value = value_os.as_bytes();
|
||||||
let value = match value_os.to_str() {
|
if value.len() == 1 {
|
||||||
Some(value) => value.as_bytes(),
|
return Ok(SepSetting::Byte(value[0]));
|
||||||
None => {
|
|
||||||
return Err(USimpleError::new(
|
|
||||||
1,
|
|
||||||
"unprintable field separators are only supported on unix-like platforms",
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(value) = value_os.to_str() else {
|
||||||
|
#[cfg(unix)]
|
||||||
|
return Err(USimpleError::new(1, "non-UTF-8 multi-byte tab"));
|
||||||
|
#[cfg(not(unix))]
|
||||||
|
return Err(USimpleError::new(
|
||||||
|
1,
|
||||||
|
"unprintable field separators are only supported on unix-like platforms",
|
||||||
|
));
|
||||||
};
|
};
|
||||||
match value.len() {
|
|
||||||
0 => Ok(Sep::Line),
|
let mut chars = value.chars();
|
||||||
1 => Ok(Sep::Char(value[0])),
|
let c = chars.next().expect("valid string with at least one byte");
|
||||||
2 if value[0] == b'\\' && value[1] == b'0' => Ok(Sep::Char(0)),
|
match chars.next() {
|
||||||
|
None => Ok(SepSetting::Char(value.into())),
|
||||||
|
Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
|
||||||
_ => Err(USimpleError::new(
|
_ => Err(USimpleError::new(
|
||||||
1,
|
1,
|
||||||
format!("multi-character tab {}", value_os.to_string_lossy()),
|
format!("multi-character tab {}", value),
|
||||||
)),
|
)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -695,7 +815,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
return Err(USimpleError::new(1, "both files cannot be standard input"));
|
return Err(USimpleError::new(1, "both files cannot be standard input"));
|
||||||
}
|
}
|
||||||
|
|
||||||
exec(file1, file2, settings)
|
let sep = settings.separator.clone();
|
||||||
|
match sep {
|
||||||
|
SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
|
||||||
|
SepSetting::Char(c) => exec(
|
||||||
|
file1,
|
||||||
|
file2,
|
||||||
|
settings,
|
||||||
|
MultiByteSep {
|
||||||
|
finder: Finder::new(&c),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
|
||||||
|
SepSetting::Line => exec(file1, file2, settings, LineSep {}),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn uu_app() -> Command {
|
pub fn uu_app() -> Command {
|
||||||
|
@ -816,7 +949,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
fn exec<Sep: Separator>(file1: &str, file2: &str, settings: Settings, sep: Sep) -> UResult<()> {
|
||||||
let stdin = stdin();
|
let stdin = stdin();
|
||||||
|
|
||||||
let mut state1 = State::new(
|
let mut state1 = State::new(
|
||||||
|
@ -837,16 +970,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
||||||
settings.print_unpaired2,
|
settings.print_unpaired2,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let input = Input::new(
|
let input = Input::new(sep.clone(), settings.ignore_case, settings.check_order);
|
||||||
settings.separator,
|
|
||||||
settings.ignore_case,
|
|
||||||
settings.check_order,
|
|
||||||
);
|
|
||||||
|
|
||||||
let format = if settings.autoformat {
|
let format = if settings.autoformat {
|
||||||
let mut format = vec![Spec::Key];
|
let mut format = vec![Spec::Key];
|
||||||
let mut initialize = |state: &mut State| {
|
let mut initialize = |state: &mut State| {
|
||||||
let max_fields = state.initialize(settings.separator, settings.autoformat);
|
let max_fields = state.initialize(&sep, settings.autoformat);
|
||||||
for i in 0..max_fields {
|
for i in 0..max_fields {
|
||||||
if i != state.key {
|
if i != state.key {
|
||||||
format.push(Spec::Field(state.file_num, i));
|
format.push(Spec::Field(state.file_num, i));
|
||||||
|
@ -857,20 +986,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
||||||
initialize(&mut state2);
|
initialize(&mut state2);
|
||||||
format
|
format
|
||||||
} else {
|
} else {
|
||||||
state1.initialize(settings.separator, settings.autoformat);
|
state1.initialize(&sep, settings.autoformat);
|
||||||
state2.initialize(settings.separator, settings.autoformat);
|
state2.initialize(&sep, settings.autoformat);
|
||||||
settings.format
|
settings.format
|
||||||
};
|
};
|
||||||
|
|
||||||
let repr = Repr::new(
|
let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
|
||||||
settings.line_ending,
|
|
||||||
match settings.separator {
|
|
||||||
Sep::Char(sep) => sep,
|
|
||||||
_ => b' ',
|
|
||||||
},
|
|
||||||
&format,
|
|
||||||
&settings.empty,
|
|
||||||
);
|
|
||||||
|
|
||||||
let stdout = stdout();
|
let stdout = stdout();
|
||||||
let mut writer = BufWriter::new(stdout.lock());
|
let mut writer = BufWriter::new(stdout.lock());
|
||||||
|
|
|
@ -58,6 +58,25 @@ fn default_arguments() {
|
||||||
.stdout_only_fixture("default.expected");
|
.stdout_only_fixture("default.expected");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Contiguous separators are merged only for the default whitespace
// separator; with an explicit `-t ,` every comma delimits a field.
// NOTE(review): the string literals below are copied as they appear in this
// view, where runs of spaces may have been collapsed — confirm against the
// real test file and fixture before relying on them.
#[test]
fn only_whitespace_separators_merge() {
    let stdin_line = " a ,c ";

    // Default (whitespace) separator.
    new_ucmd!()
        .args(&["contiguous_separators.txt", "-"])
        .pipe_in(stdin_line)
        .succeeds()
        .stdout_only("a ,,,b ,c \n");

    // Explicit comma separator.
    new_ucmd!()
        .args(&["contiguous_separators.txt", "-t", ",", "-"])
        .pipe_in(stdin_line)
        .succeeds()
        .stdout_only(" a ,,,b,c \n");
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn different_fields() {
|
fn different_fields() {
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
|
@ -208,9 +227,9 @@ fn tab_multi_character() {
|
||||||
.arg("semicolon_fields_1.txt")
|
.arg("semicolon_fields_1.txt")
|
||||||
.arg("semicolon_fields_2.txt")
|
.arg("semicolon_fields_2.txt")
|
||||||
.arg("-t")
|
.arg("-t")
|
||||||
.arg("э")
|
.arg("ab")
|
||||||
.fails()
|
.fails()
|
||||||
.stderr_is("join: multi-character tab э\n");
|
.stderr_is("join: multi-character tab ab\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -437,14 +456,22 @@ fn non_unicode() {
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
{
|
{
|
||||||
let invalid_utf8: u8 = 167;
|
let non_utf8_byte: u8 = 167;
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
.arg("-t")
|
.arg("-t")
|
||||||
.arg(OsStr::from_bytes(&[invalid_utf8]))
|
.arg(OsStr::from_bytes(&[non_utf8_byte]))
|
||||||
.arg("non-unicode_1.bin")
|
.arg("non-unicode_1.bin")
|
||||||
.arg("non-unicode_2.bin")
|
.arg("non-unicode_2.bin")
|
||||||
.succeeds()
|
.succeeds()
|
||||||
.stdout_only_fixture("non-unicode_sep.expected");
|
.stdout_only_fixture("non-unicode_sep.expected");
|
||||||
|
|
||||||
|
new_ucmd!()
|
||||||
|
.arg("-t")
|
||||||
|
.arg(OsStr::from_bytes(&[non_utf8_byte, non_utf8_byte]))
|
||||||
|
.arg("non-unicode_1.bin")
|
||||||
|
.arg("non-unicode_2.bin")
|
||||||
|
.fails()
|
||||||
|
.stderr_is("join: non-UTF-8 multi-byte tab\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(windows)]
|
#[cfg(windows)]
|
||||||
|
@ -462,6 +489,16 @@ fn non_unicode() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A separator that is one character but several bytes long (`§`) is accepted
// by `-t`; the output is compared with the expected fixture.
#[test]
fn multibyte_sep() {
    new_ucmd!()
        .args(&["-t§", "multibyte_sep_1.txt", "multibyte_sep_2.txt"])
        .succeeds()
        .stdout_only_fixture("multibyte_sep.expected");
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn null_field_separators() {
|
fn null_field_separators() {
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
|
|
1
tests/fixtures/join/contiguous_separators.txt
vendored
Normal file
1
tests/fixtures/join/contiguous_separators.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
a ,,,b
|
1
tests/fixtures/join/multibyte_sep.expected
vendored
Normal file
1
tests/fixtures/join/multibyte_sep.expected
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
a§b§c
|
1
tests/fixtures/join/multibyte_sep_1.txt
vendored
Normal file
1
tests/fixtures/join/multibyte_sep_1.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
a§b
|
1
tests/fixtures/join/multibyte_sep_2.txt
vendored
Normal file
1
tests/fixtures/join/multibyte_sep_2.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
a§c
|
Loading…
Add table
Add a link
Reference in a new issue