diff --git a/Cargo.lock b/Cargo.lock index 112322477..33e33d31a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2350,12 +2350,6 @@ dependencies = [ "log", ] -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "utf8parse" version = "0.2.1" @@ -3330,7 +3324,6 @@ dependencies = [ "libc", "nix", "unicode-width", - "utf-8", "uucore", ] diff --git a/src/uu/wc/Cargo.toml b/src/uu/wc/Cargo.toml index ca0eec9bc..1b1bd41d3 100644 --- a/src/uu/wc/Cargo.toml +++ b/src/uu/wc/Cargo.toml @@ -18,7 +18,6 @@ path = "src/wc.rs" clap = { workspace=true } uucore = { workspace=true, features=["pipes"] } bytecount = { workspace=true } -utf-8 = { workspace=true } unicode-width = { workspace=true } [target.'cfg(unix)'.dependencies] diff --git a/src/uu/wc/src/utf8/LICENSE b/src/uu/wc/src/utf8/LICENSE new file mode 100644 index 000000000..6f3c83e68 --- /dev/null +++ b/src/uu/wc/src/utf8/LICENSE @@ -0,0 +1,26 @@ +// spell-checker:ignore Sapin +Copyright (c) Simon Sapin and many others + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs new file mode 100644 index 000000000..31638e758 --- /dev/null +++ b/src/uu/wc/src/utf8/mod.rs @@ -0,0 +1,93 @@ +// spell-checker:ignore Sapin +mod read; + +pub use read::{BufReadDecoder, BufReadDecoderError}; + +use std::cmp; +use std::str; + +/// +/// Incremental, zero-copy UTF-8 decoding with error handling +/// +/// The original implementation was written by Simon Sapin in the utf-8 crate . +/// uu_wc used to depend on that crate. +/// The author archived the repository . +/// They suggested incorporating the source directly into uu_wc . +/// + +#[derive(Debug, Copy, Clone)] +pub struct Incomplete { + pub buffer: [u8; 4], + pub buffer_len: u8, +} + +impl Incomplete { + pub fn empty() -> Self { + Self { + buffer: [0, 0, 0, 0], + buffer_len: 0, + } + } + + pub fn is_empty(&self) -> bool { + self.buffer_len == 0 + } + + pub fn new(bytes: &[u8]) -> Self { + let mut buffer = [0, 0, 0, 0]; + let len = bytes.len(); + buffer[..len].copy_from_slice(bytes); + Self { + buffer, + buffer_len: len as u8, + } + } + + fn take_buffer(&mut self) -> &[u8] { + let len = self.buffer_len as usize; + self.buffer_len = 0; + &self.buffer[..len] + } + + /// (consumed_from_input, None): not enough input + /// (consumed_from_input, Some(Err(()))): error bytes in buffer + /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer + fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option>) { + let initial_buffer_len = self.buffer_len as usize; + let copied_from_input; + { + let unwritten = &mut self.buffer[initial_buffer_len..]; + copied_from_input = cmp::min(unwritten.len(), input.len()); + unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); + } + let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; + match str::from_utf8(spliced) { + Ok(_) => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, Some(Ok(()))) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = valid_up_to as u8; + (consumed, Some(Ok(()))) + } else { + match error.error_len() { + Some(invalid_sequence_length) => { + let consumed = invalid_sequence_length + .checked_sub(initial_buffer_len) + .unwrap(); + self.buffer_len = invalid_sequence_length as u8; + (consumed, Some(Err(()))) + } + None => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, None) + } + } + } + } + } + } +} diff --git a/src/uu/wc/src/utf8/read.rs b/src/uu/wc/src/utf8/read.rs new file mode 100644 index 000000000..79b8ad8ae --- /dev/null +++ b/src/uu/wc/src/utf8/read.rs @@ -0,0 +1,137 @@ +// spell-checker:ignore bytestream +use super::*; +use std::error::Error; +use std::fmt; +use std::io::{self, BufRead}; +use std::str; + +/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. +pub struct BufReadDecoder { + buf_read: B, + bytes_consumed: usize, + incomplete: Incomplete, +} + +#[derive(Debug)] +pub enum BufReadDecoderError<'a> { + /// Represents one UTF-8 error in the byte stream. + /// + /// In lossy decoding, each such error should be replaced with U+FFFD. + /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) + InvalidByteSequence(&'a [u8]), + + /// An I/O error from the underlying byte stream + Io(io::Error), +} + +impl<'a> fmt::Display for BufReadDecoderError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + BufReadDecoderError::InvalidByteSequence(bytes) => { + write!(f, "invalid byte sequence: {:02x?}", bytes) + } + BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), + } + } +} + +impl<'a> Error for BufReadDecoderError<'a> { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match *self { + BufReadDecoderError::InvalidByteSequence(_) => None, + BufReadDecoderError::Io(ref err) => Some(err), + } + } +} + +impl BufReadDecoder { + pub fn new(buf_read: B) -> Self { + Self { + buf_read, + bytes_consumed: 0, + incomplete: Incomplete::empty(), + } + } + + /// Decode and consume the next chunk of UTF-8 input. + /// + /// This method is intended to be called repeatedly until it returns `None`, + /// which represents EOF from the underlying byte stream. + /// This is similar to `Iterator::next`, + /// except that decoded chunks borrow the decoder (~iterator) + /// so they need to be handled or copied before the next chunk can start decoding. + pub fn next_strict(&mut self) -> Option> { + enum BytesSource { + BufRead(usize), + Incomplete, + } + macro_rules! try_io { + ($io_result: expr) => { + match $io_result { + Ok(value) => value, + Err(error) => return Some(Err(BufReadDecoderError::Io(error))), + } + }; + } + let (source, result) = loop { + if self.bytes_consumed > 0 { + self.buf_read.consume(self.bytes_consumed); + self.bytes_consumed = 0; + } + let buf = try_io!(self.buf_read.fill_buf()); + + // Force loop iteration to go through an explicit `continue` + enum Unreachable {} + let _: Unreachable = if self.incomplete.is_empty() { + if buf.is_empty() { + return None; // EOF + } + match str::from_utf8(buf) { + Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())), + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + break (BytesSource::BufRead(valid_up_to), Ok(())); + } + match error.error_len() { + Some(invalid_sequence_length) => { + break (BytesSource::BufRead(invalid_sequence_length), Err(())) + } + None => { + self.bytes_consumed = buf.len(); + self.incomplete = Incomplete::new(buf); + // need more input bytes + continue; + } + } + } + } + } else { + if buf.is_empty() { + break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point + } + let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); + self.bytes_consumed = consumed; + match opt_result { + None => { + // need more input bytes + continue; + } + Some(result) => break (BytesSource::Incomplete, result), + } + }; + }; + let bytes = match source { + BytesSource::BufRead(byte_count) => { + self.bytes_consumed = byte_count; + let buf = try_io!(self.buf_read.fill_buf()); + &buf[..byte_count] + } + BytesSource::Incomplete => self.incomplete.take_buffer(), + }; + match result { + Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), + Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), + } + } +} diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 984722dd1..949f05d65 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -9,6 +9,7 @@ mod count_fast; mod countable; +mod utf8; mod word_count; use clap::builder::ValueParser; use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};