From 3031874d22901d6554f72f6db0c8071b0504c71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20P=C3=A9ron?= Date: Fri, 18 Oct 2024 15:46:51 +0200 Subject: [PATCH] uucore(lib): add u8/OsString conversion utils --- src/uucore/src/lib/lib.rs | 72 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index b4b353e3e..a636fcdab 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -100,10 +100,14 @@ pub use crate::features::fsxattr; //## core functions +use std::borrow::Cow; use std::ffi::OsStr; use std::ffi::OsString; +use std::io::{BufRead, BufReader}; +use std::iter; #[cfg(unix)] -use std::os::unix::ffi::OsStrExt; +use std::os::unix::ffi::{OsStrExt, OsStringExt}; +use std::str; use std::sync::atomic::Ordering; use once_cell::sync::Lazy; @@ -240,6 +244,72 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { Ok(bytes) } +/// Helper function for converting a slice of bytes into an &OsStr +/// or OsString in non-unix targets. +/// +/// It converts `&[u8]` to `Cow` for unix targets only. +/// On non-unix (i.e. Windows), the conversion goes through the String type +/// and thus undergo UTF-8 validation, making it fail if the stream contains +/// non-UTF-8 characters. +pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { + #[cfg(unix)] + let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); + #[cfg(not(unix))] + let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { + mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") + })?)); + + Ok(os_str) +} + +/// Helper function for making an `OsString` from a byte field +/// It converts `Vec` to `OsString` for unix targets only. +/// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 +pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { + #[cfg(unix)] + let s = OsString::from_vec(vec); + #[cfg(not(unix))] + let s = OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?); + + Ok(s) +} + +/// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, +/// which avoids panicking on non UTF-8 input. +pub fn read_byte_lines( + mut buf_reader: BufReader, +) -> impl Iterator> { + iter::from_fn(move || { + let mut buf = Vec::with_capacity(256); + let size = buf_reader.read_until(b'\n', &mut buf).ok()?; + + if size == 0 { + return None; + } + + // Trim (\r)\n + if buf.ends_with(b"\n") { + buf.pop(); + if buf.ends_with(b"\r") { + buf.pop(); + } + } + + Some(buf) + }) +} + +/// Equivalent to `std::BufRead::lines` which outputs each line as an `OsString` +/// This won't panic on non UTF-8 characters on Unix, +/// but it still will on Windows. +pub fn read_os_string_lines( + buf_reader: BufReader, +) -> impl Iterator { + read_byte_lines(buf_reader).map(|byte_line| os_string_from_vec(byte_line).expect("UTF-8 error")) +} + /// Prompt the user with a formatted string and returns `true` if they reply `'y'` or `'Y'` /// /// This macro functions accepts the same syntax as `format!`. The prompt is written to