diff --git a/src/uu/od/src/prn_char.rs b/src/uu/od/src/prn_char.rs index 36a00a67b..eae9fc437 100644 --- a/src/uu/od/src/prn_char.rs +++ b/src/uu/od/src/prn_char.rs @@ -2,7 +2,6 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use std::str::from_utf8; use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo}; @@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String { let b = bytes[0]; if b & 0x80 == 0x00 { + // ASCII byte (0xxxxxxx) match C_CHARS.get(b as usize) { Some(s) => format!("{s:>4}"), None => format!("{b:>4}"), } - } else if (b & 0xc0) == 0x80 { - // second or subsequent octet of an utf-8 sequence - String::from(" **") - } else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) { - // start of a 2 octet utf-8 sequence - match from_utf8(&bytes[0..2]) { - Ok(s) => format!("{s:>4}"), - Err(_) => format!(" {b:03o}"), - } - } else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) { - // start of a 3 octet utf-8 sequence - match from_utf8(&bytes[0..3]) { - Ok(s) => format!("{s:>4}"), - Err(_) => format!(" {b:03o}"), - } - } else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) { - // start of a 4 octet utf-8 sequence - match from_utf8(&bytes[0..4]) { - Ok(s) => format!("{s:>4}"), - Err(_) => format!(" {b:03o}"), - } } else { - // invalid utf-8 + // Continuation or leading byte of a multibyte UTF-8 sequence — treat as raw byte format!(" {b:03o}") } } @@ -125,27 +104,22 @@ fn test_format_item_c() { assert_eq!(" 177", format_item_c(&[0x7f])); assert_eq!(" A", format_item_c(&[0x41, 0x21])); - assert_eq!(" **", format_item_c(&[0x80])); - assert_eq!(" **", format_item_c(&[0x9f])); + assert_eq!(" 200", format_item_c(&[0x80])); + assert_eq!(" 237", format_item_c(&[0x9f])); - assert_eq!(" ß", format_item_c(&[0xc3, 0x9f])); - assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21])); + assert_eq!(" 303", format_item_c(&[0xc3, 0x9f])); + assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21])); - assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80])); - assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21])); + assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80])); - assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96])); - assert_eq!( - " \u{1f496}", - format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21]) - ); + assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96])); assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null) assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8 assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8 assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong) assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet) - assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD + assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8 assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8 assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8 diff --git a/tests/by-util/test_od.rs b/tests/by-util/test_od.rs index d8c22dc82..ec1e48918 100644 --- a/tests/by-util/test_od.rs +++ b/tests/by-util/test_od.rs @@ -279,18 +279,19 @@ fn test_f64() { #[test] fn test_multibyte() { + let input = "’‐ˆ‘˜語🙂✅🐶𝛑Universität Tübingen \u{1B000}"; // spell-checker:disable-line new_ucmd!() - .arg("-c") - .arg("-w12") - .run_piped_stdin("Universität Tübingen \u{1B000}".as_bytes()) // spell-checker:disable-line + .args(&["-t", "c"]) + .run_piped_stdin(input.as_bytes()) .success() .no_stderr() .stdout_is(unindent( - " - 0000000 U n i v e r s i t ä ** t - 0000014 T ü ** b i n g e n \u{1B000} - 0000030 ** ** ** - 0000033 + r" + 0000000 342 200 231 342 200 220 313 206 342 200 230 313 234 350 252 236 + 0000020 360 237 231 202 342 234 205 360 237 220 266 360 235 233 221 U + 0000040 n i v e r s i t 303 244 t T 303 274 b + 0000060 i n g e n 360 233 200 200 + 0000072 ", )); } @@ -714,10 +715,10 @@ fn test_ascii_dump() { r" 0000000 00 01 0a 0d 10 1f 20 61 62 63 7d 7e 7f 80 90 a0 >...... abc}~....< nul soh nl cr dle us sp a b c } ~ del nul dle sp - \0 001 \n \r 020 037 a b c } ~ 177 ** ** ** >...... abc}~....< + \0 001 \n \r 020 037 a b c } ~ 177 200 220 240 >...... abc}~....< 0000020 b0 c0 d0 e0 f0 ff >......< 0 @ P ` p del - ** 300 320 340 360 377 >......< + 260 300 320 340 360 377 >......< 0000026 ", ));