From 9ad8a03646de504170e42fe65477bb10ed003214 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Tue, 4 Jan 2022 15:44:20 -0500 Subject: [PATCH 1/3] join: operate on bytes instead of Strings --- src/uu/join/src/join.rs | 141 +++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 60 deletions(-) diff --git a/src/uu/join/src/join.rs b/src/uu/join/src/join.rs index e396d4294..0c881f20d 100644 --- a/src/uu/join/src/join.rs +++ b/src/uu/join/src/join.rs @@ -13,7 +13,7 @@ extern crate uucore; use clap::{crate_version, App, Arg}; use std::cmp::Ordering; use std::fs::File; -use std::io::{stdin, BufRead, BufReader, Lines, Stdin}; +use std::io::{stdin, stdout, BufRead, BufReader, Split, Stdin, Write}; use uucore::display::Quotable; use uucore::error::{set_exit_code, UResult, USimpleError}; @@ -27,7 +27,7 @@ enum FileNum { #[derive(Copy, Clone)] enum Sep { - Char(char), + Char(u8), Line, Whitespaces, } @@ -49,7 +49,7 @@ struct Settings { separator: Sep, autoformat: bool, format: Vec, - empty: String, + empty: Vec, check_order: CheckOrder, headers: bool, } @@ -66,7 +66,7 @@ impl Default for Settings { separator: Sep::Whitespaces, autoformat: false, format: vec![], - empty: String::new(), + empty: vec![], check_order: CheckOrder::Default, headers: false, } @@ -75,13 +75,13 @@ impl Default for Settings { /// Output representation. struct Repr<'a> { - separator: char, + separator: u8, format: &'a [Spec], - empty: &'a str, + empty: &'a [u8], } impl<'a> Repr<'a> { - fn new(separator: char, format: &'a [Spec], empty: &'a str) -> Repr<'a> { + fn new(separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Repr<'a> { Repr { separator, format, @@ -94,32 +94,34 @@ impl<'a> Repr<'a> { } /// Print the field or empty filler if the field is not set. - fn print_field(&self, field: Option<&str>) { + fn print_field(&self, field: Option<&Vec>) -> Result<(), std::io::Error> { let value = match field { Some(field) => field, None => self.empty, }; - print!("{}", value); + stdout().write_all(value) } /// Print each field except the one at the index. - fn print_fields(&self, line: &Line, index: usize) { + fn print_fields(&self, line: &Line, index: usize) -> Result<(), std::io::Error> { for i in 0..line.fields.len() { if i != index { - print!("{}{}", self.separator, line.fields[i]); + stdout().write_all(&[self.separator])?; + stdout().write_all(&line.fields[i])?; } } + Ok(()) } /// Print each field or the empty filler if the field is not set. - fn print_format(&self, f: F) + fn print_format(&self, f: F) -> Result<(), std::io::Error> where - F: Fn(&Spec) -> Option<&'a str>, + F: Fn(&Spec) -> Option<&'a Vec>, { for i in 0..self.format.len() { if i > 0 { - print!("{}", self.separator); + stdout().write_all(&[self.separator])?; } let field = match f(&self.format[i]) { @@ -127,8 +129,9 @@ impl<'a> Repr<'a> { None => self.empty, }; - print!("{}", field); + stdout().write_all(field)?; } + Ok(()) } } @@ -148,10 +151,12 @@ impl Input { } } - fn compare(&self, field1: Option<&str>, field2: Option<&str>) -> Ordering { + fn compare(&self, field1: Option<&Vec>, field2: Option<&Vec>) -> Ordering { if let (Some(field1), Some(field2)) = (field1, field2) { if self.ignore_case { - field1.to_lowercase().cmp(&field2.to_lowercase()) + field1 + .to_ascii_lowercase() + .cmp(&field2.to_ascii_lowercase()) } else { field1.cmp(field2) } @@ -209,14 +214,19 @@ impl Spec { } struct Line { - fields: Vec, + fields: Vec>, } impl Line { - fn new(string: String, separator: Sep) -> Line { + fn new(string: Vec, separator: Sep) -> Line { let fields = match separator { - Sep::Whitespaces => string.split_whitespace().map(String::from).collect(), - Sep::Char(sep) => string.split(sep).map(String::from).collect(), + Sep::Whitespaces => string + // GNU join uses Bourne shell field splitters by default + .split(|c| matches!(*c, b' ' | b'\t' | b'\n')) + .filter(|f| !f.is_empty()) + .map(Vec::from) + .collect(), + Sep::Char(sep) => string.split(|c| *c == sep).map(Vec::from).collect(), Sep::Line => vec![string], }; @@ -224,7 +234,7 @@ impl Line { } /// Get field at index. - fn get_field(&self, index: usize) -> Option<&str> { + fn get_field(&self, index: usize) -> Option<&Vec> { if index < self.fields.len() { Some(&self.fields[index]) } else { @@ -238,7 +248,7 @@ struct State<'a> { file_name: &'a str, file_num: FileNum, print_unpaired: bool, - lines: Lines>, + lines: Split>, seq: Vec, line_num: usize, has_failed: bool, @@ -266,7 +276,7 @@ impl<'a> State<'a> { file_name: name, file_num, print_unpaired, - lines: f.lines(), + lines: f.split(b'\n'), seq: Vec::new(), line_num: 0, has_failed: false, @@ -274,12 +284,13 @@ impl<'a> State<'a> { } /// Skip the current unpaired line. - fn skip_line(&mut self, input: &Input, repr: &Repr) { + fn skip_line(&mut self, input: &Input, repr: &Repr) -> Result<(), std::io::Error> { if self.print_unpaired { - self.print_first_line(repr); + self.print_first_line(repr)?; } self.reset_next_line(input); + Ok(()) } /// Keep reading line sequence until the key does not change, return @@ -299,20 +310,22 @@ impl<'a> State<'a> { } /// Print lines in the buffers as headers. - fn print_headers(&self, other: &State, repr: &Repr) { + fn print_headers(&self, other: &State, repr: &Repr) -> Result<(), std::io::Error> { if self.has_line() { if other.has_line() { - self.combine(other, repr); + self.combine(other, repr)?; } else { - self.print_first_line(repr); + self.print_first_line(repr)?; } } else if other.has_line() { - other.print_first_line(repr); + other.print_first_line(repr)?; } + + Ok(()) } /// Combine two line sequences. - fn combine(&self, other: &State, repr: &Repr) { + fn combine(&self, other: &State, repr: &Repr) -> Result<(), std::io::Error> { let key = self.get_current_key(); for line1 in &self.seq { @@ -331,16 +344,18 @@ impl<'a> State<'a> { None } - }); + })?; } else { - repr.print_field(key); - repr.print_fields(line1, self.key); - repr.print_fields(line2, other.key); + repr.print_field(key)?; + repr.print_fields(line1, self.key)?; + repr.print_fields(line2, other.key)?; } - println!(); + stdout().write_all(&[b'\n'])?; } } + + Ok(()) } /// Reset with the next line. @@ -377,14 +392,16 @@ impl<'a> State<'a> { 0 } - fn finalize(&mut self, input: &Input, repr: &Repr) { + fn finalize(&mut self, input: &Input, repr: &Repr) -> Result<(), std::io::Error> { if self.has_line() && self.print_unpaired { - self.print_first_line(repr); + self.print_first_line(repr)?; while let Some(line) = self.next_line(input) { - self.print_line(&line, repr); + self.print_line(&line, repr)?; } } + + Ok(()) } /// Get the next line without the order check. @@ -423,11 +440,11 @@ impl<'a> State<'a> { } /// Gets the key value of the lines stored in seq. - fn get_current_key(&self) -> Option<&str> { + fn get_current_key(&self) -> Option<&Vec> { self.seq[0].get_field(self.key) } - fn print_line(&self, line: &Line, repr: &Repr) { + fn print_line(&self, line: &Line, repr: &Repr) -> Result<(), std::io::Error> { if repr.uses_format() { repr.print_format(|spec| match *spec { Spec::Key => line.get_field(self.key), @@ -438,17 +455,17 @@ impl<'a> State<'a> { None } } - }); + })?; } else { - repr.print_field(line.get_field(self.key)); - repr.print_fields(line, self.key); + repr.print_field(line.get_field(self.key))?; + repr.print_fields(line, self.key)?; } - println!(); + stdout().write_all(&[b'\n']) } - fn print_first_line(&self, repr: &Repr) { - self.print_line(&self.seq[0], repr); + fn print_first_line(&self, repr: &Repr) -> Result<(), std::io::Error> { + self.print_line(&self.seq[0], repr) } } @@ -481,14 +498,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { settings.key1 = get_field_number(keys, key1)?; settings.key2 = get_field_number(keys, key2)?; - if let Some(value) = matches.value_of("t") { + if let Some(value_str) = matches.value_of("t") { + let value = value_str.as_bytes(); settings.separator = match value.len() { 0 => Sep::Line, - 1 => Sep::Char(value.chars().next().unwrap()), + 1 => Sep::Char(value[0]), _ => { return Err(USimpleError::new( 1, - format!("multi-character tab {}", value), + format!("multi-character tab {}", value_str), )) } }; @@ -507,7 +525,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } if let Some(empty) = matches.value_of("e") { - settings.empty = empty.to_string(); + settings.empty = empty.as_bytes().to_vec(); } if matches.is_present("nocheck-order") { @@ -529,7 +547,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { return Err(USimpleError::new(1, "both files cannot be standard input")); } - exec(file1, file2, settings) + match exec(file1, file2, settings) { + Ok(_) => Ok(()), + Err(e) => Err(USimpleError::new(1, format!("{}", e))), + } } pub fn uu_app() -> App<'static, 'static> { @@ -639,7 +660,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2", ) } -fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { +fn exec(file1: &str, file2: &str, settings: Settings) -> Result<(), std::io::Error> { let stdin = stdin(); let mut state1 = State::new( @@ -686,14 +707,14 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { let repr = Repr::new( match settings.separator { Sep::Char(sep) => sep, - _ => ' ', + _ => b' ', }, &format, &settings.empty, ); if settings.headers { - state1.print_headers(&state2, &repr); + state1.print_headers(&state2, &repr)?; state1.reset_read_line(&input); state2.reset_read_line(&input); } @@ -703,17 +724,17 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { match diff { Ordering::Less => { - state1.skip_line(&input, &repr); + state1.skip_line(&input, &repr)?; } Ordering::Greater => { - state2.skip_line(&input, &repr); + state2.skip_line(&input, &repr)?; } Ordering::Equal => { let next_line1 = state1.extend(&input); let next_line2 = state2.extend(&input); if settings.print_joined { - state1.combine(&state2, &repr); + state1.combine(&state2, &repr)?; } state1.reset(next_line1); @@ -722,8 +743,8 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> { } } - state1.finalize(&input, &repr); - state2.finalize(&input, &repr); + state1.finalize(&input, &repr)?; + state2.finalize(&input, &repr)?; if state1.has_failed || state2.has_failed { set_exit_code(1); From cdfe64369dc336cca683bf376f2141ba77f71e9c Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Sat, 8 Jan 2022 19:23:02 -0500 Subject: [PATCH 2/3] join: add test for non-linefeed newline characters --- tests/by-util/test_join.rs | 9 +++++++++ tests/fixtures/join/non-line_feeds.expected | 2 ++ tests/fixtures/join/non-line_feeds_1.txt | 2 ++ tests/fixtures/join/non-line_feeds_2.txt | 2 ++ 4 files changed, 15 insertions(+) create mode 100644 tests/fixtures/join/non-line_feeds.expected create mode 100644 tests/fixtures/join/non-line_feeds_1.txt create mode 100644 tests/fixtures/join/non-line_feeds_2.txt diff --git a/tests/by-util/test_join.rs b/tests/by-util/test_join.rs index 1d92bf8e7..2bbf637f9 100644 --- a/tests/by-util/test_join.rs +++ b/tests/by-util/test_join.rs @@ -328,3 +328,12 @@ fn single_file_with_header() { .succeeds() .stdout_is("A 1\n"); } + +#[test] +fn non_line_feeds() { + new_ucmd!() + .arg("non-line_feeds_1.txt") + .arg("non-line_feeds_2.txt") + .succeeds() + .stdout_only_fixture("non-line_feeds.expected"); +} diff --git a/tests/fixtures/join/non-line_feeds.expected b/tests/fixtures/join/non-line_feeds.expected new file mode 100644 index 000000000..c4cb5b448 --- /dev/null +++ b/tests/fixtures/join/non-line_feeds.expected @@ -0,0 +1,2 @@ + b d +a c f diff --git a/tests/fixtures/join/non-line_feeds_1.txt b/tests/fixtures/join/non-line_feeds_1.txt new file mode 100644 index 000000000..f3e0c0732 --- /dev/null +++ b/tests/fixtures/join/non-line_feeds_1.txt @@ -0,0 +1,2 @@ + b +a c diff --git a/tests/fixtures/join/non-line_feeds_2.txt b/tests/fixtures/join/non-line_feeds_2.txt new file mode 100644 index 000000000..0a5301e13 --- /dev/null +++ b/tests/fixtures/join/non-line_feeds_2.txt @@ -0,0 +1,2 @@ + d +a f From 4df2f3c148777188f80d85630a28f01f90bfd0d8 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Sat, 8 Jan 2022 21:28:29 -0500 Subject: [PATCH 3/3] join: add test for non-Unicode files --- tests/by-util/test_join.rs | 9 +++++++++ tests/fixtures/join/non-unicode.expected | Bin 0 -> 14 bytes tests/fixtures/join/non-unicode_1.bin | Bin 0 -> 7 bytes tests/fixtures/join/non-unicode_2.bin | Bin 0 -> 9 bytes 4 files changed, 9 insertions(+) create mode 100644 tests/fixtures/join/non-unicode.expected create mode 100644 tests/fixtures/join/non-unicode_1.bin create mode 100644 tests/fixtures/join/non-unicode_2.bin diff --git a/tests/by-util/test_join.rs b/tests/by-util/test_join.rs index 2bbf637f9..be25b9390 100644 --- a/tests/by-util/test_join.rs +++ b/tests/by-util/test_join.rs @@ -337,3 +337,12 @@ fn non_line_feeds() { .succeeds() .stdout_only_fixture("non-line_feeds.expected"); } + +#[test] +fn non_unicode() { + new_ucmd!() + .arg("non-unicode_1.bin") + .arg("non-unicode_2.bin") + .succeeds() + .stdout_only_fixture("non-unicode.expected"); +} diff --git a/tests/fixtures/join/non-unicode.expected b/tests/fixtures/join/non-unicode.expected new file mode 100644 index 0000000000000000000000000000000000000000..7c2e0d9aff2d13eba928934bd49c42bfb6577dd0 GIT binary patch literal 14 VcmYdPSk925u$&>8Aw?mL3jiJ{1FHZ4 literal 0 HcmV?d00001 diff --git a/tests/fixtures/join/non-unicode_1.bin b/tests/fixtures/join/non-unicode_1.bin new file mode 100644 index 0000000000000000000000000000000000000000..e264bd505fba5da868c832d55d77dad4a3e866b4 GIT binary patch literal 7 OcmYdPSk92bl?ng|Rss9~ literal 0 HcmV?d00001 diff --git a/tests/fixtures/join/non-unicode_2.bin b/tests/fixtures/join/non-unicode_2.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b336c66950e1e30804609258503761a29e3f1c1 GIT binary patch literal 9 QcmYdPSk92lkfM+V01U(eb^rhX literal 0 HcmV?d00001