mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
Incorporate overhead of Line struct
This commit is contained in:
parent
b8d667c383
commit
25021f31eb
5 changed files with 40033 additions and 26 deletions
|
@ -41,6 +41,8 @@ pub struct ExternalSorter {
|
|||
impl ExternalSorter {
|
||||
pub fn new() -> ExternalSorter {
|
||||
ExternalSorter {
|
||||
// Default is 16G - But we never use it,
|
||||
// because we always set or ignore
|
||||
segment_size: 16000000000,
|
||||
sort_dir: None,
|
||||
parallel: false,
|
||||
|
@ -88,13 +90,14 @@ impl ExternalSorter {
|
|||
|
||||
let mut count = 0;
|
||||
let mut segments_file: Vec<File> = Vec::new();
|
||||
// FYI, the initialization size of struct Line is 96 bytes, but below works for all <T>
|
||||
|
||||
let size_of_items = std::mem::size_of::<T>();
|
||||
let initial_capacity =
|
||||
if self.segment_size / size_of_items >= 2 {
|
||||
self.segment_size / size_of_items
|
||||
} else { 2 };
|
||||
// Get size of iterator
|
||||
let (_, upper_bound) = iterator.size_hint();
|
||||
// Buffer size specified + minimum overhead of struct / size of items
|
||||
let initial_capacity = (self.segment_size + (upper_bound.unwrap() * size_of_items)) / size_of_items;
|
||||
let mut buffer: Vec<T> = Vec::with_capacity(initial_capacity);
|
||||
|
||||
for next_item in iterator {
|
||||
count += 1;
|
||||
buffer.push(next_item);
|
||||
|
@ -102,8 +105,8 @@ impl ExternalSorter {
|
|||
if buffer.len() > initial_capacity {
|
||||
let sort_dir = self.lazy_create_dir(&mut tempdir, &mut sort_dir)?;
|
||||
self.sort_and_write_segment(sort_dir, &mut segments_file, &mut buffer, &cmp)?;
|
||||
// Resize buffer after write out
|
||||
// buffer.shrink_to_fit();
|
||||
// Truncate buffer back to initial capacity
|
||||
buffer.truncate(initial_capacity);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -293,10 +293,11 @@ impl Sortable for Line {
|
|||
let buf_reader = BufReader::new(read);
|
||||
let result = {
|
||||
let mut line_joined = String::new();
|
||||
let mut selections_joined = SmallVec::new();
|
||||
// Return an empty vec for selections
|
||||
let selections_joined = SmallVec::new();
|
||||
let mut p_iter = buf_reader.lines().peekable();
|
||||
while let Some(line) = p_iter.next() {
|
||||
let mut deserialized_line: Line =
|
||||
let deserialized_line: Line =
|
||||
serde_json::from_str(&line.as_ref().unwrap()).unwrap();
|
||||
if let Some(_next_line) = p_iter.peek() {
|
||||
line_joined = format!("{}\n{}\n", line_joined, deserialized_line.line)
|
||||
|
@ -305,7 +306,7 @@ impl Sortable for Line {
|
|||
}
|
||||
// I think we've done our sorting already and these selctions are irrelevant?
|
||||
// @miDeb what's your sense? Could we just return an empty vec?
|
||||
selections_joined.append(&mut deserialized_line.selections);
|
||||
//selections_joined.append(&mut deserialized_line.selections);
|
||||
}
|
||||
Some(Line {
|
||||
line: line_joined,
|
||||
|
@ -909,13 +910,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
|
||||
if matches.is_present(OPT_BUF_SIZE) {
|
||||
// 16G is the default in memory buffer.
|
||||
// Although the "default" is never used unless extsort options are given
|
||||
// Although the "default" is never used
|
||||
settings.buffer_size = {
|
||||
let input = matches
|
||||
.value_of(OPT_BUF_SIZE)
|
||||
.map(String::from)
|
||||
.unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
|
||||
|
||||
|
||||
GlobalSettings::human_numeric_convert(&input)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,14 +8,28 @@ fn test_helper(file_name: &str, args: &str) {
|
|||
.stdout_is_fixture(format!("{}.expected", file_name));
|
||||
}
|
||||
|
||||
// FYI, the initialization size of our Line struct is 96 bytes.
|
||||
//
|
||||
// At very small buffer sizes, with that overhead we are certainly going
|
||||
// to overrun our buffer way, way, way too quickly because of these excess
|
||||
// bytes for the struct.
|
||||
//
|
||||
// For instance, seq 0..20000 > ...text = 108894 bytes
|
||||
// But overhead is 1920000 + 108894 = 2028894 bytes
|
||||
//
|
||||
// Or kjvbible-random.txt = 4332506 bytes, but minimum size of its
|
||||
// 99817 lines in memory * 96 bytes = 9582432 bytes
|
||||
//
|
||||
// Here, we test 108894 bytes with a 50K buffer
|
||||
//
|
||||
#[test]
|
||||
fn test_larger_than_specified_segment() {
|
||||
new_ucmd!()
|
||||
.arg("-n")
|
||||
.arg("-S 100")
|
||||
.arg("numeric_unsorted_ints.txt")
|
||||
.arg("-S 50K")
|
||||
.arg("ext_sort.txt")
|
||||
.succeeds()
|
||||
.stdout_is_fixture(format!("{}", "numeric_unsorted_ints.expected"));
|
||||
.stdout_is_fixture(format!("{}", "ext_sort.expected"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -202,17 +216,6 @@ fn test_non_printing_chars() {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponents_positive_general_fixed() {
|
||||
for exponents_positive_general_param in vec!["-g"] {
|
||||
new_ucmd!()
|
||||
.pipe_in("100E6\n\n50e10\n+100000\n\n10000K78\n10E\n\n\n1000EDKLD\n\n\n100E6\n\n50e10\n+100000\n\n")
|
||||
.arg(exponents_positive_general_param)
|
||||
.succeeds()
|
||||
.stdout_only("\n\n\n\n\n\n\n\n10000K78\n1000EDKLD\n10E\n+100000\n+100000\n100E6\n100E6\n50e10\n50e10\n");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponents_positive_numeric() {
|
||||
test_helper("exponents-positive-numeric", "-n");
|
||||
|
|
20000
tests/fixtures/sort/ext_sort.expected
vendored
Normal file
20000
tests/fixtures/sort/ext_sort.expected
vendored
Normal file
File diff suppressed because it is too large
Load diff
20000
tests/fixtures/sort/ext_sort.txt
vendored
Normal file
20000
tests/fixtures/sort/ext_sort.txt
vendored
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue