Mirror of https://github.com/RGBCube/uutils-coreutils, synced 2025-07-28 11:37:44 +00:00
sort: use FileMerger for extsort merge step
FileMerger is much more efficient than the previous algorithm, which looped over all elements every time to determine the next element. FileMerger uses a BinaryHeap, which should bring the complexity for the merge step down from O(n²) to O(n log n).
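For context, a rough sketch of the heap-based k-way merge idea the message describes: keep the smallest not-yet-emitted element of every sorted chunk in a BinaryHeap, pop the global minimum, and refill from the chunk it came from, so each output element costs O(log k) for k chunks instead of a scan over all of them. This is only an illustration under assumed names (merge_sorted, plain Strings instead of the crate's Line type); it is not the actual FileMerger code.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Merge already-sorted sources into one sorted Vec. Each heap entry holds
// (next value, source index); popping the minimum is O(log k) per element.
fn merge_sorted<I>(mut sources: Vec<I>) -> Vec<String>
where
    I: Iterator<Item = String>,
{
    let mut heap = BinaryHeap::new();

    // Prime the heap with the first element of every source.
    // Reverse turns Rust's max-heap into a min-heap.
    for (idx, source) in sources.iter_mut().enumerate() {
        if let Some(first) = source.next() {
            heap.push(Reverse((first, idx)));
        }
    }

    let mut merged = Vec::new();
    // Repeatedly take the globally smallest element, then refill the heap
    // from the source it came from.
    while let Some(Reverse((value, idx))) = heap.pop() {
        merged.push(value);
        if let Some(next) = sources[idx].next() {
            heap.push(Reverse((next, idx)));
        }
    }
    merged
}

fn main() {
    // Two pre-sorted "chunks", standing in for sorted temp files on disk.
    let a = vec!["apple".to_string(), "melon".to_string()];
    let b = vec!["banana".to_string(), "zucchini".to_string()];
    let merged = merge_sorted(vec![a.into_iter(), b.into_iter()]);
    assert_eq!(merged, ["apple", "banana", "melon", "zucchini"]);
}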
This commit is contained in:
parent
64c1f16421
commit
38effc93b3
1 changed file with 30 additions and 148 deletions
@@ -1,91 +1,33 @@
-use std::cmp::Ordering;
-use std::collections::VecDeque;
-use std::fs::{File, OpenOptions};
-use std::io::SeekFrom;
-use std::io::{BufRead, BufReader, BufWriter, Seek, Write};
+use std::fs::OpenOptions;
+use std::io::{BufWriter, Write};
 use std::path::Path;
 
 use tempdir::TempDir;
 
+use crate::{file_to_lines_iter, FileMerger};
 use super::{GlobalSettings, Line};
 
 /// Iterator that provides sorted `T`s
-pub struct ExtSortedIterator {
-    buffers: Vec<VecDeque<Line>>,
-    chunk_offsets: Vec<u64>,
-    max_per_chunk: usize,
-    chunks: usize,
-    tmp_dir: TempDir,
-    settings: GlobalSettings,
-    failed: bool,
+pub struct ExtSortedIterator<'a> {
+    file_merger: FileMerger<'a>,
+    // Keep tmp_dir around, it is deleted when dropped.
+    _tmp_dir: TempDir,
 }
 
-impl Iterator for ExtSortedIterator {
+impl<'a> Iterator for ExtSortedIterator<'a> {
     type Item = Line;
 
-    /// # Errors
-    ///
-    /// This method can fail due to issues reading intermediate sorted chunks
-    /// from disk
     fn next(&mut self) -> Option<Self::Item> {
-        if self.failed {
-            return None;
-        }
-        // fill up any empty buffers
-        let mut empty = true;
-        for chunk_num in 0..self.chunks {
-            if self.buffers[chunk_num as usize].is_empty() {
-                let mut f = crash_if_err!(
-                    1,
-                    File::open(self.tmp_dir.path().join(chunk_num.to_string()))
-                );
-                crash_if_err!(1, f.seek(SeekFrom::Start(self.chunk_offsets[chunk_num])));
-                let bytes_read = fill_buff(
-                    &mut self.buffers[chunk_num as usize],
-                    f,
-                    self.max_per_chunk,
-                    &self.settings,
-                );
-                self.chunk_offsets[chunk_num as usize] += bytes_read as u64;
-                if !self.buffers[chunk_num as usize].is_empty() {
-                    empty = false;
-                }
-            } else {
-                empty = false;
-            }
-        }
-        if empty {
-            return None;
-        }
-
-        // find the next record to write
-        // check is_empty() before unwrap()ing
-        let mut idx = 0;
-        for chunk_num in 0..self.chunks as usize {
-            if !self.buffers[chunk_num].is_empty()
-                && (self.buffers[idx].is_empty()
-                    || super::compare_by(
-                        self.buffers[chunk_num].front().unwrap(),
-                        self.buffers[idx].front().unwrap(),
-                        &self.settings,
-                    ) == Ordering::Less)
-            {
-                idx = chunk_num;
-            }
-        }
-
-        // unwrap due to checks above
-        let r = self.buffers[idx].pop_front().unwrap();
-        Some(r)
+        self.file_merger.next()
     }
 }
 
 /// Sort (based on `compare`) the `T`s provided by `unsorted` and return an
 /// iterator
 ///
-/// # Errors
+/// # Panics
 ///
-/// This method can fail due to issues writing intermediate sorted chunks
+/// This method can panic due to issues writing intermediate sorted chunks
 /// to disk.
 pub fn ext_sort(
     unsorted: impl Iterator<Item = Line>,
@@ -93,19 +35,12 @@ pub fn ext_sort(
 ) -> ExtSortedIterator {
     let tmp_dir = crash_if_err!(1, TempDir::new_in(&settings.tmp_dir, "uutils_sort"));
 
-    let mut iter = ExtSortedIterator {
-        buffers: Vec::new(),
-        chunk_offsets: Vec::new(),
-        max_per_chunk: 0,
-        chunks: 0,
-        tmp_dir,
-        settings: settings.clone(),
-        failed: false,
-    };
-
     let mut total_read = 0;
     let mut chunk = Vec::new();
 
+    let mut chunks_read = 0;
+    let mut file_merger = FileMerger::new(settings);
+
     // make the initial chunks on disk
     for seq in unsorted {
         let seq_size = seq.estimate_size();
@@ -113,62 +48,35 @@ pub fn ext_sort(
 
         chunk.push(seq);
 
-        if total_read >= settings.buffer_size {
+        if total_read >= settings.buffer_size && chunk.len() >= 2 {
             super::sort_by(&mut chunk, &settings);
-            write_chunk(
-                settings,
-                &iter.tmp_dir.path().join(iter.chunks.to_string()),
-                &mut chunk,
-            );
+
+            let file_path = tmp_dir.path().join(chunks_read.to_string());
+            write_chunk(settings, &file_path, &mut chunk);
             chunk.clear();
             total_read = 0;
-            iter.chunks += 1;
+            chunks_read += 1;
+
+            file_merger.push_file(Box::new(file_to_lines_iter(file_path, settings).unwrap()))
        }
     }
     // write the last chunk
     if !chunk.is_empty() {
         super::sort_by(&mut chunk, &settings);
+
+        let file_path = tmp_dir.path().join(chunks_read.to_string());
         write_chunk(
             settings,
-            &iter.tmp_dir.path().join(iter.chunks.to_string()),
+            &tmp_dir.path().join(chunks_read.to_string()),
             &mut chunk,
         );
-        iter.chunks += 1;
+
+        file_merger.push_file(Box::new(file_to_lines_iter(file_path, settings).unwrap()));
    }
 
-    // initialize buffers for each chunk
-    //
-    // Having a right sized buffer for each chunk for smallish values seems silly to me?
-    //
-    // We will have to have the entire iter in memory sometime right?
-    // Set minimum to the size of the writer buffer, ~8K
-
-    const MINIMUM_READBACK_BUFFER: usize = 8200;
-    let right_sized_buffer = settings
-        .buffer_size
-        .checked_div(iter.chunks)
-        .unwrap_or(settings.buffer_size);
-    iter.max_per_chunk = if right_sized_buffer > MINIMUM_READBACK_BUFFER {
-        right_sized_buffer
-    } else {
-        MINIMUM_READBACK_BUFFER
-    };
-    iter.buffers = vec![VecDeque::new(); iter.chunks];
-    iter.chunk_offsets = vec![0; iter.chunks];
-    for chunk_num in 0..iter.chunks {
-        let offset = fill_buff(
-            &mut iter.buffers[chunk_num],
-            crash_if_err!(
-                1,
-                File::open(iter.tmp_dir.path().join(chunk_num.to_string()))
-            ),
-            iter.max_per_chunk,
-            &settings,
-        );
-        iter.chunk_offsets[chunk_num] = offset as u64;
+    ExtSortedIterator {
+        file_merger,
+        _tmp_dir: tmp_dir,
    }
-
-    iter
 }
 
 fn write_chunk(settings: &GlobalSettings, file: &Path, chunk: &mut Vec<Line>) {
@@ -183,29 +91,3 @@ fn write_chunk(settings: &GlobalSettings, file: &Path, chunk: &mut Vec<Line>) {
     }
     crash_if_err!(1, buf_write.flush());
 }
-
-fn fill_buff(
-    vec: &mut VecDeque<Line>,
-    file: File,
-    max_bytes: usize,
-    settings: &GlobalSettings,
-) -> usize {
-    let mut total_read = 0;
-    let mut bytes_read = 0;
-    for line in BufReader::new(file).split(if settings.zero_terminated {
-        b'\0'
-    } else {
-        b'\n'
-    }) {
-        let line_s = String::from_utf8(crash_if_err!(1, line)).unwrap();
-        bytes_read += line_s.len() + 1;
-        let deserialized = Line::new(line_s, settings);
-        total_read += deserialized.estimate_size();
-        vec.push_back(deserialized);
-        if total_read > max_bytes {
-            break;
-        }
-    }
-
-    bytes_read
-}