mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-09-16 19:56:17 +00:00
wc: Do a chunked read with proper UTF-8 handling
This brings the results mostly in line with GNU wc and solves nasty behavior with long lines.
This commit is contained in:
parent
48437fc49d
commit
6f7d740592
8 changed files with 105 additions and 138 deletions
|
@ -53,11 +53,16 @@ fn test_utf8() {
|
|||
.args(&["-lwmcL"])
|
||||
.pipe_in_fixture("UTF_8_test.txt")
|
||||
.run()
|
||||
.stdout_is(" 300 4969 22781 22213 79\n");
|
||||
// GNU returns " 300 2086 22219 22781 79"
|
||||
//
|
||||
// TODO: we should fix the word, character, and byte count to
|
||||
// match the behavior of GNU wc
|
||||
.stdout_is(" 303 2119 23025 22457 79\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_utf8_extra() {
|
||||
new_ucmd!()
|
||||
.arg("-lwmcL")
|
||||
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||
.run()
|
||||
.stdout_is(" 25 87 513 442 48\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
BIN
tests/fixtures/wc/UTF_8_test.txt
vendored
BIN
tests/fixtures/wc/UTF_8_test.txt
vendored
Binary file not shown.
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
|
@ -0,0 +1,25 @@
|
|||
zero-width space inbetween these: xx
|
||||
and inbetween two spaces: [ ]
|
||||
and at the end of the line:
|
||||
|
||||
non-breaking space: x x [ ]
|
||||
|
||||
simple unicode: xµx [ µ ] µ
|
||||
|
||||
wide: xwx [ w ] w
|
||||
|
||||
simple emoji: x👩x [ 👩 ] 👩
|
||||
|
||||
complex emoji: x👩🔬x [ 👩🔬 ] 👩🔬
|
||||
|
||||
Hello, world!
|
||||
|
||||
line feed: xx [ ]
|
||||
|
||||
vertical tab: xx [ ]
|
||||
|
||||
horizontal tab: x x [ ]
|
||||
this should be the longest line:
|
||||
1234567 12345678 123456781234567812345678
|
||||
|
||||
Control character: xx [ ]
|
Loading…
Add table
Add a link
Reference in a new issue