Incorporate overhead of Line struct

2025-09-14 02:57:57 +00:00 · 2021-04-19 21:24:52 -05:00 · 2021-04-19 21:24:52 -05:00 · 25021f31eb
commit 25021f31eb
parent b8d667c383
5 changed files with 40033 additions and 26 deletions
--- a/src/uu/sort/src/ext_sorter/mod.rs
+++ b/src/uu/sort/src/ext_sorter/mod.rs
@ -41,6 +41,8 @@ pub struct ExternalSorter {
 impl ExternalSorter {
    pub fn new() -> ExternalSorter {
        ExternalSorter {
+            // Default is 16G - But we never use it, 
+            // because we always set or ignore
            segment_size: 16000000000,
            sort_dir: None,
            parallel: false,
@ -88,13 +90,14 @@ impl ExternalSorter {

        let mut count = 0;
        let mut segments_file: Vec<File> = Vec::new();
-        // FYI, the initialization size of struct Line is 96 bytes, but below works for all <T>
+        
        let size_of_items = std::mem::size_of::<T>();
-        let initial_capacity = 
-        if self.segment_size / size_of_items >= 2 {
-            self.segment_size / size_of_items
-        } else { 2 };
+        // Get size of iterator
+        let (_, upper_bound) = iterator.size_hint();
+        // Buffer size specified + minimum overhead of struct / size of items
+        let initial_capacity =  (self.segment_size + (upper_bound.unwrap() * size_of_items)) / size_of_items;
        let mut buffer: Vec<T> = Vec::with_capacity(initial_capacity);
+
        for next_item in iterator {
            count += 1;
            buffer.push(next_item);
@ -102,8 +105,8 @@ impl ExternalSorter {
            if buffer.len() > initial_capacity {
                let sort_dir = self.lazy_create_dir(&mut tempdir, &mut sort_dir)?;
                self.sort_and_write_segment(sort_dir, &mut segments_file, &mut buffer, &cmp)?;
-                // Resize buffer after write out
-                // buffer.shrink_to_fit();
+                // Truncate buffer back to initial capacity
+                buffer.truncate(initial_capacity);
            }
        }

--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@ -293,10 +293,11 @@ impl Sortable for Line {
        let buf_reader = BufReader::new(read);
        let result = {
            let mut line_joined = String::new();
-            let mut selections_joined = SmallVec::new();
+            // Return an empty vec for selections
+            let selections_joined = SmallVec::new();
            let mut p_iter = buf_reader.lines().peekable();
            while let Some(line) = p_iter.next() {
-                let mut deserialized_line: Line =
+                let deserialized_line: Line =
                    serde_json::from_str(&line.as_ref().unwrap()).unwrap();
                if let Some(_next_line) = p_iter.peek() {
                    line_joined = format!("{}\n{}\n", line_joined, deserialized_line.line)
@ -305,7 +306,7 @@ impl Sortable for Line {
                }
                // I think we've done our sorting already and these selctions are irrelevant?
                // @miDeb what's your sense? Could we just return an empty vec?
-                selections_joined.append(&mut deserialized_line.selections);
+                //selections_joined.append(&mut deserialized_line.selections);
            }
            Some(Line {
                line: line_joined,
@ -909,13 +910,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 {

    if matches.is_present(OPT_BUF_SIZE) {
        // 16G is the default in memory buffer.
-        // Although the "default" is never used unless extsort options are given
+        // Although the "default" is never used
        settings.buffer_size = {
            let input = matches
                .value_of(OPT_BUF_SIZE)
                .map(String::from)
                .unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
-
+        
            GlobalSettings::human_numeric_convert(&input)
        }
    }
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@ -8,14 +8,28 @@ fn test_helper(file_name: &str, args: &str) {
        .stdout_is_fixture(format!("{}.expected", file_name));
 }

+// FYI, the initialization size of our Line struct is 96 bytes.
+//  
+// At very small buffer sizes, with that overhead we are certainly going 
+// to overrun our buffer way, way, way too quickly because of these excess 
+// bytes for the struct.
+//
+// For instance, seq 0..20000 > ...text = 108894 bytes
+// But overhead is 1920000 + 108894 = 2028894 bytes
+//
+// Or kjvbible-random.txt = 4332506 bytes, but minimum size of its 
+// 99817 lines in memory * 96 bytes = 9582432 bytes
+//
+// Here, we test 108894 bytes with a 50K buffer
+//
 #[test]
 fn test_larger_than_specified_segment() {
    new_ucmd!()
        .arg("-n")
-        .arg("-S 100")
-        .arg("numeric_unsorted_ints.txt")
+        .arg("-S 50K")
+        .arg("ext_sort.txt")
        .succeeds()
-        .stdout_is_fixture(format!("{}", "numeric_unsorted_ints.expected"));
+        .stdout_is_fixture(format!("{}", "ext_sort.expected"));
 }

 #[test]
@ -202,17 +216,6 @@ fn test_non_printing_chars() {
    }
 }

-#[test]
-fn test_exponents_positive_general_fixed() {
-    for exponents_positive_general_param in vec!["-g"] {
-        new_ucmd!()
-            .pipe_in("100E6\n\n50e10\n+100000\n\n10000K78\n10E\n\n\n1000EDKLD\n\n\n100E6\n\n50e10\n+100000\n\n")
-            .arg(exponents_positive_general_param)
-            .succeeds()
-            .stdout_only("\n\n\n\n\n\n\n\n10000K78\n1000EDKLD\n10E\n+100000\n+100000\n100E6\n100E6\n50e10\n50e10\n");
-    }
-}
-
 #[test]
 fn test_exponents_positive_numeric() {
    test_helper("exponents-positive-numeric", "-n");
--- a/tests/fixtures/sort/ext_sort.expected
+++ b/tests/fixtures/sort/ext_sort.expected
--- a/tests/fixtures/sort/ext_sort.txt
+++ b/tests/fixtures/sort/ext_sort.txt