1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

Incorporate overhead of Line struct

This commit is contained in:
electricboogie 2021-04-19 21:24:52 -05:00
parent b8d667c383
commit 25021f31eb
5 changed files with 40033 additions and 26 deletions

View file

@ -41,6 +41,8 @@ pub struct ExternalSorter {
impl ExternalSorter {
pub fn new() -> ExternalSorter {
ExternalSorter {
// Default is 16G, but it is never actually used,
// because we always either set it explicitly or ignore it
segment_size: 16000000000,
sort_dir: None,
parallel: false,
@ -88,13 +90,14 @@ impl ExternalSorter {
let mut count = 0;
let mut segments_file: Vec<File> = Vec::new();
// FYI, the initialization size of struct Line is 96 bytes, but below works for all <T>
let size_of_items = std::mem::size_of::<T>();
let initial_capacity =
if self.segment_size / size_of_items >= 2 {
self.segment_size / size_of_items
} else { 2 };
// Get size of iterator
let (_, upper_bound) = iterator.size_hint();
// Buffer size specified + minimum overhead of struct / size of items
let initial_capacity = (self.segment_size + (upper_bound.unwrap() * size_of_items)) / size_of_items;
let mut buffer: Vec<T> = Vec::with_capacity(initial_capacity);
for next_item in iterator {
count += 1;
buffer.push(next_item);
@ -102,8 +105,8 @@ impl ExternalSorter {
if buffer.len() > initial_capacity {
let sort_dir = self.lazy_create_dir(&mut tempdir, &mut sort_dir)?;
self.sort_and_write_segment(sort_dir, &mut segments_file, &mut buffer, &cmp)?;
// Resize buffer after write out
// buffer.shrink_to_fit();
// Truncate buffer back to initial capacity
buffer.truncate(initial_capacity);
}
}

View file

@ -293,10 +293,11 @@ impl Sortable for Line {
let buf_reader = BufReader::new(read);
let result = {
let mut line_joined = String::new();
let mut selections_joined = SmallVec::new();
// Return an empty vec for selections
let selections_joined = SmallVec::new();
let mut p_iter = buf_reader.lines().peekable();
while let Some(line) = p_iter.next() {
let mut deserialized_line: Line =
let deserialized_line: Line =
serde_json::from_str(&line.as_ref().unwrap()).unwrap();
if let Some(_next_line) = p_iter.peek() {
line_joined = format!("{}\n{}\n", line_joined, deserialized_line.line)
@ -305,7 +306,7 @@ impl Sortable for Line {
}
// I think we've done our sorting already and these selections are irrelevant?
// @miDeb what's your sense? Could we just return an empty vec?
selections_joined.append(&mut deserialized_line.selections);
//selections_joined.append(&mut deserialized_line.selections);
}
Some(Line {
line: line_joined,
@ -909,13 +910,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if matches.is_present(OPT_BUF_SIZE) {
// 16G is the default in memory buffer.
// Although the "default" is never used unless extsort options are given
// Although the "default" is never used
settings.buffer_size = {
let input = matches
.value_of(OPT_BUF_SIZE)
.map(String::from)
.unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
GlobalSettings::human_numeric_convert(&input)
}
}

View file

@ -8,14 +8,28 @@ fn test_helper(file_name: &str, args: &str) {
.stdout_is_fixture(format!("{}.expected", file_name));
}
// FYI, the initialization size of our Line struct is 96 bytes.
//
// At very small buffer sizes, with that overhead we are certainly going
// to overrun our buffer way, way, way too quickly because of these excess
// bytes for the struct.
//
// For instance, seq 0..20000 > ...text = 108894 bytes
// But the struct overhead is 20000 lines * 96 bytes = 1920000 bytes,
// so the total is 1920000 + 108894 = 2028894 bytes
//
// Or kjvbible-random.txt = 4332506 bytes, but minimum size of its
// 99817 lines in memory * 96 bytes = 9582432 bytes
//
// Here, we test 108894 bytes with a 50K buffer
//
#[test]
fn test_larger_than_specified_segment() {
new_ucmd!()
.arg("-n")
.arg("-S 100")
.arg("numeric_unsorted_ints.txt")
.arg("-S 50K")
.arg("ext_sort.txt")
.succeeds()
.stdout_is_fixture(format!("{}", "numeric_unsorted_ints.expected"));
.stdout_is_fixture(format!("{}", "ext_sort.expected"));
}
#[test]
@ -202,17 +216,6 @@ fn test_non_printing_chars() {
}
}
#[test]
fn test_exponents_positive_general_fixed() {
for exponents_positive_general_param in vec!["-g"] {
new_ucmd!()
.pipe_in("100E6\n\n50e10\n+100000\n\n10000K78\n10E\n\n\n1000EDKLD\n\n\n100E6\n\n50e10\n+100000\n\n")
.arg(exponents_positive_general_param)
.succeeds()
.stdout_only("\n\n\n\n\n\n\n\n10000K78\n1000EDKLD\n10E\n+100000\n+100000\n100E6\n100E6\n50e10\n50e10\n");
}
}
#[test]
fn test_exponents_positive_numeric() {
test_helper("exponents-positive-numeric", "-n");

20000
tests/fixtures/sort/ext_sort.expected vendored Normal file

File diff suppressed because it is too large Load diff

20000
tests/fixtures/sort/ext_sort.txt vendored Normal file

File diff suppressed because it is too large Load diff