join: add support for multibyte separators (#6736)

* join: add test for multibyte separators * join: implement support for multibyte separators * join: use a trait instead of an enum for separator * join: test whitespace merging
2025-07-27 19:17:43 +00:00 · 2024-10-06 05:48:08 -04:00 · 2024-10-06 05:48:08 -04:00 · a51a731704
commit a51a731704
parent d8eb4e2214
6 changed files with 251 additions and 89 deletions
--- a/src/uu/join/src/join.rs
+++ b/src/uu/join/src/join.rs
@ -3,11 +3,11 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck
+// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
 use clap::builder::ValueParser;
 use clap::{crate_version, Arg, ArgAction, Command};
-use memchr::{memchr3_iter, memchr_iter};
+use memchr::{memchr_iter, memmem::Finder, Memchr3};
 use std::cmp::Ordering;
 use std::error::Error;
 use std::ffi::OsString;
@ -60,13 +60,114 @@ enum FileNum {
    File2,
 }
-#[derive(Copy, Clone, PartialEq)]
+#[derive(Clone)]
-enum Sep {
+enum SepSetting {
-    Char(u8),
+    /// Any single-byte separator.
    Byte(u8),
    /// A single character more than one byte long.
    Char(Vec<u8>),
    /// No separators, join on the entire line.
    Line,
    /// Whitespace separators.
    Whitespaces,
 }
 trait Separator: Clone {
    /// Using this separator, return the start and end index of all fields in the haystack.
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;
    /// The separator as it appears when in the output.
    fn output_separator(&self) -> &[u8];
 }
 /// Simple separators one byte in length.
 #[derive(Copy, Clone)]
 struct OneByteSep {
    byte: [u8; 1],
 }
 impl Separator for OneByteSep {
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        let mut field_ranges = Vec::with_capacity(len_guess);
        let mut last_end = 0;
        for i in memchr_iter(self.byte[0], haystack) {
            field_ranges.push((last_end, i));
            last_end = i + 1;
        }
        field_ranges.push((last_end, haystack.len()));
        field_ranges
    }
    fn output_separator(&self) -> &[u8] {
        &self.byte
    }
 }
 /// Multi-byte (but still single character) separators.
 #[derive(Clone)]
 struct MultiByteSep<'a> {
    finder: Finder<'a>,
 }
 impl<'a> Separator for MultiByteSep<'a> {
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        let mut field_ranges = Vec::with_capacity(len_guess);
        let mut last_end = 0;
        for i in self.finder.find_iter(haystack) {
            field_ranges.push((last_end, i));
            last_end = i + self.finder.needle().len();
        }
        field_ranges.push((last_end, haystack.len()));
        field_ranges
    }
    fn output_separator(&self) -> &[u8] {
        self.finder.needle()
    }
 }
 /// Whole-line separator.
 #[derive(Copy, Clone)]
 struct LineSep {}
 impl Separator for LineSep {
    fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
        vec![(0, haystack.len())]
    }
    fn output_separator(&self) -> &[u8] {
        &[]
    }
 }
 /// Default whitespace separator.
 #[derive(Copy, Clone)]
 struct WhitespaceSep {}
 impl Separator for WhitespaceSep {
    fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
        let mut field_ranges = Vec::with_capacity(len_guess);
        let mut last_end = 0;
        // GNU join used Bourne shell field splitters by default
        // FIXME: but now uses locale-dependent whitespace
        for i in Memchr3::new(b' ', b'\t', b'\n', haystack) {
            // leading whitespace should be dropped, contiguous whitespace merged
            if i > last_end {
                field_ranges.push((last_end, i));
            }
            last_end = i + 1;
        }
        field_ranges.push((last_end, haystack.len()));
        field_ranges
    }
    fn output_separator(&self) -> &[u8] {
        b" "
    }
 }
 #[derive(Copy, Clone, PartialEq)]
 enum CheckOrder {
    Default,
@ -82,7 +183,7 @@ struct Settings {
    print_joined: bool,
    ignore_case: bool,
    line_ending: LineEnding,
-    separator: Sep,
+    separator: SepSetting,
    autoformat: bool,
    format: Vec<Spec>,
    empty: Vec<u8>,
@ -100,7 +201,7 @@ impl Default for Settings {
            print_joined: true,
            ignore_case: false,
            line_ending: LineEnding::Newline,
-            separator: Sep::Whitespaces,
+            separator: SepSetting::Whitespaces,
            autoformat: false,
            format: vec![],
            empty: vec![],
@ -111,15 +212,15 @@ impl Default for Settings {
 }
 /// Output representation.
-struct Repr<'a> {
+struct Repr<'a, Sep: Separator> {
    line_ending: LineEnding,
-    separator: u8,
+    separator: Sep,
-    format: &'a [Spec],
+    format: Vec<Spec>,
    empty: &'a [u8],
 }
-impl<'a> Repr<'a> {
+impl<'a, Sep: Separator> Repr<'a, Sep> {
-    fn new(line_ending: LineEnding, separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Self {
+    fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
        Repr {
            line_ending,
            separator,
@ -155,7 +256,7 @@ impl<'a> Repr<'a> {
    ) -> Result<(), std::io::Error> {
        for i in 0..line.field_ranges.len() {
            if i != index {
-                writer.write_all(&[self.separator])?;
+                writer.write_all(self.separator.output_separator())?;
                writer.write_all(line.get_field(i).unwrap())?;
            }
        }
@ -169,7 +270,7 @@ impl<'a> Repr<'a> {
    {
        for i in 0..self.format.len() {
            if i > 0 {
-                writer.write_all(&[self.separator])?;
+                writer.write_all(self.separator.output_separator())?;
            }
            let field = match f(&self.format[i]) {
@ -188,13 +289,13 @@ impl<'a> Repr<'a> {
 }
 /// Input processing parameters.
-struct Input {
+struct Input<Sep: Separator> {
    separator: Sep,
    ignore_case: bool,
    check_order: CheckOrder,
 }
-impl Input {
+impl<Sep: Separator> Input<Sep> {
    fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
        Self {
            separator,
@ -271,24 +372,8 @@ struct Line {
 }
 impl Line {
-    fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
+    fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
-        let mut field_ranges = Vec::with_capacity(len_guess);
+        let field_ranges = separator.field_ranges(&string, len_guess);
        let mut last_end = 0;
        if separator == Sep::Whitespaces {
            // GNU join uses Bourne shell field splitters by default
            for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
                if i > last_end {
                    field_ranges.push((last_end, i));
                }
                last_end = i + 1;
            }
        } else if let Sep::Char(sep) = separator {
            for i in memchr_iter(sep, &string) {
                field_ranges.push((last_end, i));
                last_end = i + 1;
            }
        }
        field_ranges.push((last_end, string.len()));
        Self {
            field_ranges,
@ -351,7 +436,12 @@ impl<'a> State<'a> {
    }
    /// Skip the current unpaired line.
-    fn skip_line(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
+    fn skip_line<Sep: Separator>(
        &mut self,
        writer: &mut impl Write,
        input: &Input<Sep>,
        repr: &Repr<'a, Sep>,
    ) -> UResult<()> {
        if self.print_unpaired {
            self.print_first_line(writer, repr)?;
        }
@ -362,7 +452,7 @@ impl<'a> State<'a> {
    /// Keep reading line sequence until the key does not change, return
    /// the first line whose key differs.
-    fn extend(&mut self, input: &Input) -> UResult<Option<Line>> {
+    fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
        while let Some(line) = self.next_line(input)? {
            let diff = input.compare(self.get_current_key(), line.get_field(self.key));
@ -377,11 +467,11 @@ impl<'a> State<'a> {
    }
    /// Print lines in the buffers as headers.
-    fn print_headers(
+    fn print_headers<Sep: Separator>(
        &self,
        writer: &mut impl Write,
        other: &State,
-        repr: &Repr,
+        repr: &Repr<'a, Sep>,
    ) -> Result<(), std::io::Error> {
        if self.has_line() {
            if other.has_line() {
@ -397,11 +487,11 @@ impl<'a> State<'a> {
    }
    /// Combine two line sequences.
-    fn combine(
+    fn combine<Sep: Separator>(
        &self,
        writer: &mut impl Write,
        other: &State,
-        repr: &Repr,
+        repr: &Repr<'a, Sep>,
    ) -> Result<(), std::io::Error> {
        let key = self.get_current_key();
@ -444,13 +534,16 @@ impl<'a> State<'a> {
        }
    }
-    fn reset_read_line(&mut self, input: &Input) -> Result<(), std::io::Error> {
+    fn reset_read_line<Sep: Separator>(
-        let line = self.read_line(input.separator)?;
+        &mut self,
        input: &Input<Sep>,
    ) -> Result<(), std::io::Error> {
        let line = self.read_line(&input.separator)?;
        self.reset(line);
        Ok(())
    }
-    fn reset_next_line(&mut self, input: &Input) -> Result<(), JoinError> {
+    fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
        let line = self.next_line(input)?;
        self.reset(line);
        Ok(())
@ -460,7 +553,7 @@ impl<'a> State<'a> {
        !self.seq.is_empty()
    }
-    fn initialize(&mut self, read_sep: Sep, autoformat: bool) -> usize {
+    fn initialize<Sep: Separator>(&mut self, read_sep: &Sep, autoformat: bool) -> usize {
        if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
            self.seq.push(line);
@ -471,7 +564,12 @@ impl<'a> State<'a> {
        0
    }
-    fn finalize(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
+    fn finalize<Sep: Separator>(
        &mut self,
        writer: &mut impl Write,
        input: &Input<Sep>,
        repr: &Repr<'a, Sep>,
    ) -> UResult<()> {
        if self.has_line() {
            if self.print_unpaired {
                self.print_first_line(writer, repr)?;
@ -491,7 +589,7 @@ impl<'a> State<'a> {
    }
    /// Get the next line without the order check.
-    fn read_line(&mut self, sep: Sep) -> Result<Option<Line>, std::io::Error> {
+    fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
        match self.lines.next() {
            Some(value) => {
                self.line_num += 1;
@ -506,8 +604,8 @@ impl<'a> State<'a> {
    }
    /// Get the next line with the order check.
-    fn next_line(&mut self, input: &Input) -> Result<Option<Line>, JoinError> {
+    fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
-        if let Some(line) = self.read_line(input.separator)? {
+        if let Some(line) = self.read_line(&input.separator)? {
            if input.check_order == CheckOrder::Disabled {
                return Ok(Some(line));
            }
@ -543,11 +641,11 @@ impl<'a> State<'a> {
        self.seq[0].get_field(self.key)
    }
-    fn print_line(
+    fn print_line<Sep: Separator>(
        &self,
        writer: &mut impl Write,
        line: &Line,
-        repr: &Repr,
+        repr: &Repr<'a, Sep>,
    ) -> Result<(), std::io::Error> {
        if repr.uses_format() {
            repr.print_format(writer, |spec| match *spec {
@ -568,31 +666,53 @@ impl<'a> State<'a> {
        repr.print_line_ending(writer)
    }
-    fn print_first_line(&self, writer: &mut impl Write, repr: &Repr) -> Result<(), std::io::Error> {
+    fn print_first_line<Sep: Separator>(
        &self,
        writer: &mut impl Write,
        repr: &Repr<'a, Sep>,
    ) -> Result<(), std::io::Error> {
        self.print_line(writer, &self.seq[0], repr)
    }
 }
-fn parse_separator(value_os: &OsString) -> UResult<Sep> {
+fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
    // Five possible separator values:
    // No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
    // An empty string arg, whole line separation
    // On unix-likes only, a single arbitrary byte
    // The two-character "\0" string, interpreted as a single 0 byte
    // A single scalar valid in the locale encoding (currently only UTF-8)
    if value_os.is_empty() {
        return Ok(SepSetting::Line);
    }
    #[cfg(unix)]
-    let value = value_os.as_bytes();
+    {
-    #[cfg(not(unix))]
+        let value = value_os.as_bytes();
-    let value = match value_os.to_str() {
+        if value.len() == 1 {
-        Some(value) => value.as_bytes(),
+            return Ok(SepSetting::Byte(value[0]));
        None => {
            return Err(USimpleError::new(
                1,
                "unprintable field separators are only supported on unix-like platforms",
            ));
        }
    }
    let Some(value) = value_os.to_str() else {
        #[cfg(unix)]
        return Err(USimpleError::new(1, "non-UTF-8 multi-byte tab"));
        #[cfg(not(unix))]
        return Err(USimpleError::new(
            1,
            "unprintable field separators are only supported on unix-like platforms",
        ));
    };
-    match value.len() {
+
-        0 => Ok(Sep::Line),
+    let mut chars = value.chars();
-        1 => Ok(Sep::Char(value[0])),
+    let c = chars.next().expect("valid string with at least one byte");
-        2 if value[0] == b'\\' && value[1] == b'0' => Ok(Sep::Char(0)),
+    match chars.next() {
        None => Ok(SepSetting::Char(value.into())),
        Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
        _ => Err(USimpleError::new(
            1,
-            format!("multi-character tab {}", value_os.to_string_lossy()),
+            format!("multi-character tab {}", value),
        )),
    }
 }
@ -695,7 +815,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
        return Err(USimpleError::new(1, "both files cannot be standard input"));
    }
-    exec(file1, file2, settings)
+    let sep = settings.separator.clone();
    match sep {
        SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
        SepSetting::Char(c) => exec(
            file1,
            file2,
            settings,
            MultiByteSep {
                finder: Finder::new(&c),
            },
        ),
        SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
        SepSetting::Line => exec(file1, file2, settings, LineSep {}),
    }
 }
 pub fn uu_app() -> Command {
@ -816,7 +949,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
        )
 }
-fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
+fn exec<Sep: Separator>(file1: &str, file2: &str, settings: Settings, sep: Sep) -> UResult<()> {
    let stdin = stdin();
    let mut state1 = State::new(
@ -837,16 +970,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
        settings.print_unpaired2,
    )?;
-    let input = Input::new(
+    let input = Input::new(sep.clone(), settings.ignore_case, settings.check_order);
        settings.separator,
        settings.ignore_case,
        settings.check_order,
    );
    let format = if settings.autoformat {
        let mut format = vec![Spec::Key];
        let mut initialize = |state: &mut State| {
-            let max_fields = state.initialize(settings.separator, settings.autoformat);
+            let max_fields = state.initialize(&sep, settings.autoformat);
            for i in 0..max_fields {
                if i != state.key {
                    format.push(Spec::Field(state.file_num, i));
@ -857,20 +986,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
        initialize(&mut state2);
        format
    } else {
-        state1.initialize(settings.separator, settings.autoformat);
+        state1.initialize(&sep, settings.autoformat);
-        state2.initialize(settings.separator, settings.autoformat);
+        state2.initialize(&sep, settings.autoformat);
        settings.format
    };
-    let repr = Repr::new(
+    let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
        settings.line_ending,
        match settings.separator {
            Sep::Char(sep) => sep,
            _ => b' ',
        },
        &format,
        &settings.empty,
    );
    let stdout = stdout();
    let mut writer = BufWriter::new(stdout.lock());
--- a/tests/by-util/test_join.rs
+++ b/tests/by-util/test_join.rs
@ -58,6 +58,25 @@ fn default_arguments() {
        .stdout_only_fixture("default.expected");
 }
 #[test]
 fn only_whitespace_separators_merge() {
    new_ucmd!()
        .arg("contiguous_separators.txt")
        .arg("-")
        .pipe_in(" a  ,c ")
        .succeeds()
        .stdout_only("a ,,,b ,c \n");
    new_ucmd!()
        .arg("contiguous_separators.txt")
        .arg("-t")
        .arg(",")
        .arg("-")
        .pipe_in(" a  ,c ")
        .succeeds()
        .stdout_only(" a  ,,,b,c \n");
 }
 #[test]
 fn different_fields() {
    new_ucmd!()
@ -208,9 +227,9 @@ fn tab_multi_character() {
        .arg("semicolon_fields_1.txt")
        .arg("semicolon_fields_2.txt")
        .arg("-t")
-        .arg("э")
+        .arg("ab")
        .fails()
-        .stderr_is("join: multi-character tab э\n");
+        .stderr_is("join: multi-character tab ab\n");
 }
 #[test]
@ -437,14 +456,22 @@ fn non_unicode() {
    #[cfg(unix)]
    {
-        let invalid_utf8: u8 = 167;
+        let non_utf8_byte: u8 = 167;
        new_ucmd!()
            .arg("-t")
-            .arg(OsStr::from_bytes(&[invalid_utf8]))
+            .arg(OsStr::from_bytes(&[non_utf8_byte]))
            .arg("non-unicode_1.bin")
            .arg("non-unicode_2.bin")
            .succeeds()
            .stdout_only_fixture("non-unicode_sep.expected");
        new_ucmd!()
            .arg("-t")
            .arg(OsStr::from_bytes(&[non_utf8_byte, non_utf8_byte]))
            .arg("non-unicode_1.bin")
            .arg("non-unicode_2.bin")
            .fails()
            .stderr_is("join: non-UTF-8 multi-byte tab\n");
    }
    #[cfg(windows)]
@ -462,6 +489,16 @@ fn non_unicode() {
    }
 }
 #[test]
 fn multibyte_sep() {
    new_ucmd!()
        .arg("-t§")
        .arg("multibyte_sep_1.txt")
        .arg("multibyte_sep_2.txt")
        .succeeds()
        .stdout_only_fixture("multibyte_sep.expected");
 }
 #[test]
 fn null_field_separators() {
    new_ucmd!()
--- a/tests/fixtures/join/contiguous_separators.txt
+++ b/tests/fixtures/join/contiguous_separators.txt
@ -0,0 +1 @@
 a  ,,,b
--- a/tests/fixtures/join/multibyte_sep.expected
+++ b/tests/fixtures/join/multibyte_sep.expected
@ -0,0 +1 @@
 a§b§c
--- a/tests/fixtures/join/multibyte_sep_1.txt
+++ b/tests/fixtures/join/multibyte_sep_1.txt
@ -0,0 +1 @@
 a§b
--- a/tests/fixtures/join/multibyte_sep_2.txt
+++ b/tests/fixtures/join/multibyte_sep_2.txt
@ -0,0 +1 @@
 a§c