mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
Fix split's handling of non-UTF-8 files
This commit is contained in:
parent
a9ac7af9e1
commit
7c1395366e
2 changed files with 176 additions and 113 deletions
|
@ -4,11 +4,15 @@ extern crate regex;
|
|||
use self::rand::{thread_rng, Rng};
|
||||
use self::regex::Regex;
|
||||
use crate::common::util::*;
|
||||
use rand::SeedableRng;
|
||||
#[cfg(not(windows))]
|
||||
use std::env;
|
||||
use std::fs::{read_dir, File};
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::{
|
||||
fs::{read_dir, File},
|
||||
io::BufWriter,
|
||||
};
|
||||
|
||||
fn random_chars(n: usize) -> String {
|
||||
thread_rng()
|
||||
|
@ -58,7 +62,7 @@ impl Glob {
|
|||
files.sort();
|
||||
let mut data: Vec<u8> = vec![];
|
||||
for name in &files {
|
||||
data.extend(self.directory.read(name).into_bytes());
|
||||
data.extend(self.directory.read_bytes(name));
|
||||
}
|
||||
data
|
||||
}
|
||||
|
@ -81,20 +85,30 @@ impl RandomFile {
|
|||
}
|
||||
|
||||
fn add_bytes(&mut self, bytes: usize) {
|
||||
let chunk_size: usize = if bytes >= 1024 { 1024 } else { bytes };
|
||||
let mut n = bytes;
|
||||
while n > chunk_size {
|
||||
let _ = write!(self.inner, "{}", random_chars(chunk_size));
|
||||
n -= chunk_size;
|
||||
// Note that just writing random characters isn't enough to cover all
|
||||
// cases. We need truly random bytes.
|
||||
let mut writer = BufWriter::new(&self.inner);
|
||||
|
||||
// Seed the rng so as to avoid spurious test failures.
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(123);
|
||||
let mut buffer = [0; 1024];
|
||||
let mut remaining_size = bytes;
|
||||
|
||||
while remaining_size > 0 {
|
||||
let to_write = std::cmp::min(remaining_size, buffer.len());
|
||||
let buf = &mut buffer[..to_write];
|
||||
rng.fill(buf);
|
||||
writer.write(buf).unwrap();
|
||||
|
||||
remaining_size -= to_write;
|
||||
}
|
||||
let _ = write!(self.inner, "{}", random_chars(n));
|
||||
}
|
||||
|
||||
/// Add n lines each of size `RandomFile::LINESIZE`
|
||||
fn add_lines(&mut self, lines: usize) {
|
||||
let mut n = lines;
|
||||
while n > 0 {
|
||||
let _ = writeln!(self.inner, "{}", random_chars(RandomFile::LINESIZE));
|
||||
writeln!(self.inner, "{}", random_chars(RandomFile::LINESIZE)).unwrap();
|
||||
n -= 1;
|
||||
}
|
||||
}
|
||||
|
@ -104,18 +118,18 @@ impl RandomFile {
|
|||
fn test_split_default() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_default";
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
RandomFile::new(&at, name).add_lines(2000);
|
||||
ucmd.args(&[name]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
assert_eq!(glob.count(), 2);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_numeric_prefixed_chunks_by_bytes() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_num_prefixed_chunks_by_bytes";
|
||||
let glob = Glob::new(&at, ".", r"a\d\d$");
|
||||
RandomFile::new(&at, name).add_bytes(10000);
|
||||
ucmd.args(&[
|
||||
"-d", // --numeric-suffixes
|
||||
|
@ -123,52 +137,86 @@ fn test_split_numeric_prefixed_chunks_by_bytes() {
|
|||
"1000", name, "a",
|
||||
])
|
||||
.succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"a\d\d$");
|
||||
assert_eq!(glob.count(), 10);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
for filename in glob.collect() {
|
||||
assert_eq!(glob.directory.metadata(&filename).len(), 1000);
|
||||
}
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_str_prefixed_chunks_by_bytes() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_str_prefixed_chunks_by_bytes";
|
||||
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
|
||||
RandomFile::new(&at, name).add_bytes(10000);
|
||||
// Important that this is less than 1024 since that's our internal buffer
|
||||
// size. Good to test that we don't overshoot.
|
||||
ucmd.args(&["-b", "1000", name, "b"]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
|
||||
assert_eq!(glob.count(), 10);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
for filename in glob.collect() {
|
||||
assert_eq!(glob.directory.metadata(&filename).len(), 1000);
|
||||
}
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
// This is designed to test what happens when the desired part size is not a
|
||||
// multiple of the buffer size and we hopefully don't overshoot the desired part
|
||||
// size.
|
||||
#[test]
|
||||
fn test_split_bytes_prime_part_size() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "test_split_bytes_prime_part_size";
|
||||
RandomFile::new(&at, name).add_bytes(10000);
|
||||
// 1753 is prime and greater than the buffer size, 1024.
|
||||
ucmd.args(&["-b", "1753", name, "b"]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
|
||||
assert_eq!(glob.count(), 6);
|
||||
for i in 0..5 {
|
||||
assert_eq!(glob.directory.metadata(&glob.collect()[i]).len(), 1753);
|
||||
}
|
||||
assert_eq!(glob.directory.metadata(&glob.collect()[5]).len(), 1235);
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_num_prefixed_chunks_by_lines() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_num_prefixed_chunks_by_lines";
|
||||
let glob = Glob::new(&at, ".", r"c\d\d$");
|
||||
RandomFile::new(&at, name).add_lines(10000);
|
||||
ucmd.args(&["-d", "-l", "1000", name, "c"]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"c\d\d$");
|
||||
assert_eq!(glob.count(), 10);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_str_prefixed_chunks_by_lines() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_str_prefixed_chunks_by_lines";
|
||||
let glob = Glob::new(&at, ".", r"d[[:alpha:]][[:alpha:]]$");
|
||||
RandomFile::new(&at, name).add_lines(10000);
|
||||
ucmd.args(&["-l", "1000", name, "d"]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"d[[:alpha:]][[:alpha:]]$");
|
||||
assert_eq!(glob.count(), 10);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_additional_suffix() {
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "split_additional_suffix";
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]].txt$");
|
||||
RandomFile::new(&at, name).add_lines(2000);
|
||||
ucmd.args(&["--additional-suffix", ".txt", name]).succeeds();
|
||||
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]].txt$");
|
||||
assert_eq!(glob.count(), 2);
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
}
|
||||
|
||||
// note: the test_filter* tests below are unix-only
|
||||
|
@ -182,15 +230,16 @@ fn test_filter() {
|
|||
// like `test_split_default()` but run a command before writing
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "filtered";
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
let n_lines = 3;
|
||||
RandomFile::new(&at, name).add_lines(n_lines);
|
||||
|
||||
// change all characters to 'i'
|
||||
ucmd.args(&["--filter=sed s/./i/g > $FILE", name])
|
||||
.succeeds();
|
||||
|
||||
// assert all characters are 'i' / no character is not 'i'
|
||||
// (assert that command succeeded)
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
assert!(
|
||||
glob.collate().iter().find(|&&c| {
|
||||
// is not i
|
||||
|
@ -209,7 +258,6 @@ fn test_filter_with_env_var_set() {
|
|||
// implemented like `test_split_default()` but run a command before writing
|
||||
let (at, mut ucmd) = at_and_ucmd!();
|
||||
let name = "filtered";
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
let n_lines = 3;
|
||||
RandomFile::new(&at, name).add_lines(n_lines);
|
||||
|
||||
|
@ -217,7 +265,9 @@ fn test_filter_with_env_var_set() {
|
|||
env::set_var("FILE", &env_var_value);
|
||||
ucmd.args(&[format!("--filter={}", "cat > $FILE").as_str(), name])
|
||||
.succeeds();
|
||||
assert_eq!(glob.collate(), at.read(name).into_bytes());
|
||||
|
||||
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
|
||||
assert_eq!(glob.collate(), at.read_bytes(name));
|
||||
assert!(env::var("FILE").unwrap_or("var was unset".to_owned()) == env_var_value);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue