mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
Incorporate overhead of Line struct
This commit is contained in:
parent
b8d667c383
commit
25021f31eb
5 changed files with 40033 additions and 26 deletions
|
@ -41,6 +41,8 @@ pub struct ExternalSorter {
|
||||||
impl ExternalSorter {
|
impl ExternalSorter {
|
||||||
pub fn new() -> ExternalSorter {
|
pub fn new() -> ExternalSorter {
|
||||||
ExternalSorter {
|
ExternalSorter {
|
||||||
|
// Default is 16G - But we never use it,
|
||||||
|
// because we always set or ignore
|
||||||
segment_size: 16000000000,
|
segment_size: 16000000000,
|
||||||
sort_dir: None,
|
sort_dir: None,
|
||||||
parallel: false,
|
parallel: false,
|
||||||
|
@ -88,13 +90,14 @@ impl ExternalSorter {
|
||||||
|
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut segments_file: Vec<File> = Vec::new();
|
let mut segments_file: Vec<File> = Vec::new();
|
||||||
// FYI, the initialization size of struct Line is 96 bytes, but below works for all <T>
|
|
||||||
let size_of_items = std::mem::size_of::<T>();
|
let size_of_items = std::mem::size_of::<T>();
|
||||||
let initial_capacity =
|
// Get size of iterator
|
||||||
if self.segment_size / size_of_items >= 2 {
|
let (_, upper_bound) = iterator.size_hint();
|
||||||
self.segment_size / size_of_items
|
// Buffer size specified + minimum overhead of struct / size of items
|
||||||
} else { 2 };
|
let initial_capacity = (self.segment_size + (upper_bound.unwrap() * size_of_items)) / size_of_items;
|
||||||
let mut buffer: Vec<T> = Vec::with_capacity(initial_capacity);
|
let mut buffer: Vec<T> = Vec::with_capacity(initial_capacity);
|
||||||
|
|
||||||
for next_item in iterator {
|
for next_item in iterator {
|
||||||
count += 1;
|
count += 1;
|
||||||
buffer.push(next_item);
|
buffer.push(next_item);
|
||||||
|
@ -102,8 +105,8 @@ impl ExternalSorter {
|
||||||
if buffer.len() > initial_capacity {
|
if buffer.len() > initial_capacity {
|
||||||
let sort_dir = self.lazy_create_dir(&mut tempdir, &mut sort_dir)?;
|
let sort_dir = self.lazy_create_dir(&mut tempdir, &mut sort_dir)?;
|
||||||
self.sort_and_write_segment(sort_dir, &mut segments_file, &mut buffer, &cmp)?;
|
self.sort_and_write_segment(sort_dir, &mut segments_file, &mut buffer, &cmp)?;
|
||||||
// Resize buffer after write out
|
// Truncate buffer back to initial capacity
|
||||||
// buffer.shrink_to_fit();
|
buffer.truncate(initial_capacity);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -293,10 +293,11 @@ impl Sortable for Line {
|
||||||
let buf_reader = BufReader::new(read);
|
let buf_reader = BufReader::new(read);
|
||||||
let result = {
|
let result = {
|
||||||
let mut line_joined = String::new();
|
let mut line_joined = String::new();
|
||||||
let mut selections_joined = SmallVec::new();
|
// Return an empty vec for selections
|
||||||
|
let selections_joined = SmallVec::new();
|
||||||
let mut p_iter = buf_reader.lines().peekable();
|
let mut p_iter = buf_reader.lines().peekable();
|
||||||
while let Some(line) = p_iter.next() {
|
while let Some(line) = p_iter.next() {
|
||||||
let mut deserialized_line: Line =
|
let deserialized_line: Line =
|
||||||
serde_json::from_str(&line.as_ref().unwrap()).unwrap();
|
serde_json::from_str(&line.as_ref().unwrap()).unwrap();
|
||||||
if let Some(_next_line) = p_iter.peek() {
|
if let Some(_next_line) = p_iter.peek() {
|
||||||
line_joined = format!("{}\n{}\n", line_joined, deserialized_line.line)
|
line_joined = format!("{}\n{}\n", line_joined, deserialized_line.line)
|
||||||
|
@ -305,7 +306,7 @@ impl Sortable for Line {
|
||||||
}
|
}
|
||||||
// I think we've done our sorting already and these selections are irrelevant?
|
// I think we've done our sorting already and these selections are irrelevant?
|
||||||
// @miDeb what's your sense? Could we just return an empty vec?
|
// @miDeb what's your sense? Could we just return an empty vec?
|
||||||
selections_joined.append(&mut deserialized_line.selections);
|
//selections_joined.append(&mut deserialized_line.selections);
|
||||||
}
|
}
|
||||||
Some(Line {
|
Some(Line {
|
||||||
line: line_joined,
|
line: line_joined,
|
||||||
|
@ -909,13 +910,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
|
|
||||||
if matches.is_present(OPT_BUF_SIZE) {
|
if matches.is_present(OPT_BUF_SIZE) {
|
||||||
// 16G is the default in memory buffer.
|
// 16G is the default in memory buffer.
|
||||||
// Although the "default" is never used unless extsort options are given
|
// Although the "default" is never used
|
||||||
settings.buffer_size = {
|
settings.buffer_size = {
|
||||||
let input = matches
|
let input = matches
|
||||||
.value_of(OPT_BUF_SIZE)
|
.value_of(OPT_BUF_SIZE)
|
||||||
.map(String::from)
|
.map(String::from)
|
||||||
.unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
|
.unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
|
||||||
|
|
||||||
GlobalSettings::human_numeric_convert(&input)
|
GlobalSettings::human_numeric_convert(&input)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,14 +8,28 @@ fn test_helper(file_name: &str, args: &str) {
|
||||||
.stdout_is_fixture(format!("{}.expected", file_name));
|
.stdout_is_fixture(format!("{}.expected", file_name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FYI, the initialization size of our Line struct is 96 bytes.
|
||||||
|
//
|
||||||
|
// At very small buffer sizes, with that overhead we are certainly going
|
||||||
|
// to overrun our buffer way, way, way too quickly because of these excess
|
||||||
|
// bytes for the struct.
|
||||||
|
//
|
||||||
|
// For instance, seq 0..20000 > ...text = 108894 bytes
|
||||||
|
// But overhead is 1920000 + 108894 = 2028894 bytes
|
||||||
|
//
|
||||||
|
// Or kjvbible-random.txt = 4332506 bytes, but minimum size of its
|
||||||
|
// 99817 lines in memory * 96 bytes = 9582432 bytes
|
||||||
|
//
|
||||||
|
// Here, we test 108894 bytes with a 50K buffer
|
||||||
|
//
|
||||||
#[test]
|
#[test]
|
||||||
fn test_larger_than_specified_segment() {
|
fn test_larger_than_specified_segment() {
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
.arg("-n")
|
.arg("-n")
|
||||||
.arg("-S 100")
|
.arg("-S 50K")
|
||||||
.arg("numeric_unsorted_ints.txt")
|
.arg("ext_sort.txt")
|
||||||
.succeeds()
|
.succeeds()
|
||||||
.stdout_is_fixture(format!("{}", "numeric_unsorted_ints.expected"));
|
.stdout_is_fixture(format!("{}", "ext_sort.expected"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -202,17 +216,6 @@ fn test_non_printing_chars() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_exponents_positive_general_fixed() {
|
|
||||||
for exponents_positive_general_param in vec!["-g"] {
|
|
||||||
new_ucmd!()
|
|
||||||
.pipe_in("100E6\n\n50e10\n+100000\n\n10000K78\n10E\n\n\n1000EDKLD\n\n\n100E6\n\n50e10\n+100000\n\n")
|
|
||||||
.arg(exponents_positive_general_param)
|
|
||||||
.succeeds()
|
|
||||||
.stdout_only("\n\n\n\n\n\n\n\n10000K78\n1000EDKLD\n10E\n+100000\n+100000\n100E6\n100E6\n50e10\n50e10\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_exponents_positive_numeric() {
|
fn test_exponents_positive_numeric() {
|
||||||
test_helper("exponents-positive-numeric", "-n");
|
test_helper("exponents-positive-numeric", "-n");
|
||||||
|
|
20000
tests/fixtures/sort/ext_sort.expected
vendored
Normal file
20000
tests/fixtures/sort/ext_sort.expected
vendored
Normal file
File diff suppressed because it is too large
Load diff
20000
tests/fixtures/sort/ext_sort.txt
vendored
Normal file
20000
tests/fixtures/sort/ext_sort.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue