diff --git a/src/uu/sort/src/check.rs b/src/uu/sort/src/check.rs index f53e4edb4..f1cd22686 100644 --- a/src/uu/sort/src/check.rs +++ b/src/uu/sort/src/check.rs @@ -8,7 +8,7 @@ //! Check if a file is ordered use crate::{ - chunks::{self, Chunk}, + chunks::{self, Chunk, RecycledChunk}, compare_by, open, GlobalSettings, }; use itertools::Itertools; @@ -34,7 +34,7 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { move || reader(file, recycled_receiver, loaded_sender, &settings) }); for _ in 0..2 { - let _ = recycled_sender.send(Chunk::new(vec![0; 100 * 1024], |_| Vec::new())); + let _ = recycled_sender.send(RecycledChunk::new(100 * 1024)); } let mut prev_chunk: Option = None; @@ -44,21 +44,29 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { if let Some(prev_chunk) = prev_chunk.take() { // Check if the first element of the new chunk is greater than the last // element from the previous chunk - let prev_last = prev_chunk.borrow_lines().last().unwrap(); - let new_first = chunk.borrow_lines().first().unwrap(); + let prev_last = prev_chunk.lines().last().unwrap(); + let new_first = chunk.lines().first().unwrap(); - if compare_by(prev_last, new_first, settings) == Ordering::Greater { + if compare_by( + prev_last, + new_first, + settings, + prev_chunk.line_data(), + chunk.line_data(), + ) == Ordering::Greater + { if !settings.check_silent { println!("sort: {}:{}: disorder: {}", path, line_idx, new_first.line); } return 1; } - let _ = recycled_sender.send(prev_chunk); + let _ = recycled_sender.send(prev_chunk.recycle()); } - for (a, b) in chunk.borrow_lines().iter().tuple_windows() { + for (a, b) in chunk.lines().iter().tuple_windows() { line_idx += 1; - if compare_by(a, b, settings) == Ordering::Greater { + if compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) == Ordering::Greater + { if !settings.check_silent { println!("sort: {}:{}: disorder: {}", path, line_idx, b.line); } @@ -74,16 +82,15 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { /// The function running on the reader thread. fn reader( mut file: Box, - receiver: Receiver, + receiver: Receiver, sender: SyncSender, settings: &GlobalSettings, ) { let mut carry_over = vec![]; - for chunk in receiver.iter() { - let (recycled_lines, recycled_buffer) = chunk.recycle(); + for recycled_chunk in receiver.iter() { let should_continue = chunks::read( &sender, - recycled_buffer, + recycled_chunk, None, &mut carry_over, &mut file, @@ -93,7 +100,6 @@ fn reader( } else { b'\n' }, - recycled_lines, settings, ); if !should_continue { diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index d452401df..9e9d212c2 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -15,7 +15,7 @@ use std::{ use memchr::memchr_iter; use ouroboros::self_referencing; -use crate::{GlobalSettings, Line}; +use crate::{numeric_str_cmp::NumInfo, GeneralF64ParseResult, GlobalSettings, Line}; /// The chunk that is passed around between threads. /// `lines` consist of slices into `buffer`. @@ -25,28 +25,87 @@ pub struct Chunk { pub buffer: Vec, #[borrows(buffer)] #[covariant] - pub lines: Vec>, + pub contents: ChunkContents<'this>, +} + +#[derive(Debug)] +pub struct ChunkContents<'a> { + pub lines: Vec>, + pub line_data: LineData<'a>, +} + +#[derive(Debug)] +pub struct LineData<'a> { + pub selections: Vec<&'a str>, + pub num_infos: Vec, + pub parsed_floats: Vec, } impl Chunk { /// Destroy this chunk and return its components to be reused. - /// - /// # Returns - /// - /// * The `lines` vector, emptied - /// * The `buffer` vector, **not** emptied - pub fn recycle(mut self) -> (Vec>, Vec) { - let recycled_lines = self.with_lines_mut(|lines| { - lines.clear(); - unsafe { + pub fn recycle(mut self) -> RecycledChunk { + let recycled_contents = self.with_contents_mut(|contents| { + contents.lines.clear(); + contents.line_data.selections.clear(); + contents.line_data.num_infos.clear(); + contents.line_data.parsed_floats.clear(); + let lines = unsafe { // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime, // because the vector is empty. // Transmuting is necessary to make recycling possible. See https://github.com/rust-lang/rfcs/pull/2802 // for a rfc to make this unnecessary. Its example is similar to the code here. - std::mem::transmute::>, Vec>>(std::mem::take(lines)) - } + std::mem::transmute::>, Vec>>(std::mem::take( + &mut contents.lines, + )) + }; + let selections = unsafe { + // SAFETY: (same as above) It is safe to (temporarily) transmute to a vector of &str with a longer lifetime, + // because the vector is empty. + std::mem::transmute::, Vec<&'static str>>(std::mem::take( + &mut contents.line_data.selections, + )) + }; + ( + lines, + selections, + std::mem::take(&mut contents.line_data.num_infos), + std::mem::take(&mut contents.line_data.parsed_floats), + ) }); - (recycled_lines, self.into_heads().buffer) + RecycledChunk { + lines: recycled_contents.0, + selections: recycled_contents.1, + num_infos: recycled_contents.2, + parsed_floats: recycled_contents.3, + buffer: self.into_heads().buffer, + } + } + + pub fn lines(&self) -> &Vec { + &self.borrow_contents().lines + } + pub fn line_data(&self) -> &LineData { + &self.borrow_contents().line_data + } +} + +pub struct RecycledChunk { + lines: Vec>, + selections: Vec<&'static str>, + num_infos: Vec, + parsed_floats: Vec, + buffer: Vec, +} + +impl RecycledChunk { + pub fn new(capacity: usize) -> Self { + RecycledChunk { + lines: Vec::new(), + selections: Vec::new(), + num_infos: Vec::new(), + parsed_floats: Vec::new(), + buffer: vec![0; capacity], + } } } @@ -63,28 +122,32 @@ impl Chunk { /// (see also `read_to_chunk` for a more detailed documentation) /// /// * `sender`: The sender to send the lines to the sorter. -/// * `buffer`: The recycled buffer. All contents will be overwritten, but it must already be filled. +/// * `recycled_chunk`: The recycled chunk, as returned by `Chunk::recycle`. /// (i.e. `buffer.len()` should be equal to `buffer.capacity()`) /// * `max_buffer_size`: How big `buffer` can be. /// * `carry_over`: The bytes that must be carried over in between invocations. /// * `file`: The current file. /// * `next_files`: What `file` should be updated to next. /// * `separator`: The line separator. -/// * `lines`: The recycled vector to fill with lines. Must be empty. /// * `settings`: The global settings. #[allow(clippy::too_many_arguments)] pub fn read( sender: &SyncSender, - mut buffer: Vec, + recycled_chunk: RecycledChunk, max_buffer_size: Option, carry_over: &mut Vec, file: &mut T, next_files: &mut impl Iterator, separator: u8, - lines: Vec>, settings: &GlobalSettings, ) -> bool { - assert!(lines.is_empty()); + let RecycledChunk { + lines, + selections, + num_infos, + parsed_floats, + mut buffer, + } = recycled_chunk; if buffer.len() < carry_over.len() { buffer.resize(carry_over.len() + 10 * 1024, 0); } @@ -101,15 +164,25 @@ pub fn read( carry_over.extend_from_slice(&buffer[read..]); if read != 0 { - let payload = Chunk::new(buffer, |buf| { + let payload = Chunk::new(buffer, |buffer| { + let selections = unsafe { + // SAFETY: It is safe to transmute to an empty vector of selections with shorter lifetime. + // It was only temporarily transmuted to a Vec> to make recycling possible. + std::mem::transmute::, Vec<&'_ str>>(selections) + }; let mut lines = unsafe { - // SAFETY: It is safe to transmute to a vector of lines with shorter lifetime, + // SAFETY: (same as above) It is safe to transmute to a vector of lines with shorter lifetime, // because it was only temporarily transmuted to a Vec> to make recycling possible. std::mem::transmute::>, Vec>>(lines) }; - let read = crash_if_err!(1, std::str::from_utf8(&buf[..read])); - parse_lines(read, &mut lines, separator, settings); - lines + let read = crash_if_err!(1, std::str::from_utf8(&buffer[..read])); + let mut line_data = LineData { + selections, + num_infos, + parsed_floats, + }; + parse_lines(read, &mut lines, &mut line_data, separator, settings); + ChunkContents { lines, line_data } }); sender.send(payload).unwrap(); } @@ -120,6 +193,7 @@ pub fn read( fn parse_lines<'a>( mut read: &'a str, lines: &mut Vec>, + line_data: &mut LineData<'a>, separator: u8, settings: &GlobalSettings, ) { @@ -128,9 +202,15 @@ fn parse_lines<'a>( read = &read[..read.len() - 1]; } + assert!(lines.is_empty()); + assert!(line_data.selections.is_empty()); + assert!(line_data.num_infos.is_empty()); + assert!(line_data.parsed_floats.is_empty()); + let mut token_buffer = vec![]; lines.extend( read.split(separator as char) - .map(|line| Line::create(line, settings)), + .enumerate() + .map(|(index, line)| Line::create(line, index, line_data, &mut token_buffer, settings)), ); } diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort.rs index 44ff6014a..e0814b7a2 100644 --- a/src/uu/sort/src/ext_sort.rs +++ b/src/uu/sort/src/ext_sort.rs @@ -23,15 +23,16 @@ use std::{ use itertools::Itertools; +use crate::chunks::RecycledChunk; use crate::merge::ClosedTmpFile; use crate::merge::WriteableCompressedTmpFile; use crate::merge::WriteablePlainTmpFile; use crate::merge::WriteableTmpFile; -use crate::Line; use crate::{ chunks::{self, Chunk}, - compare_by, merge, output_sorted_lines, sort_by, GlobalSettings, + compare_by, merge, sort_by, GlobalSettings, }; +use crate::{print_sorted, Line}; use tempfile::TempDir; const START_BUFFER_SIZE: usize = 8_000; @@ -98,16 +99,39 @@ fn reader_writer>, Tmp: WriteableTmpFile merger.write_all(settings); } ReadResult::SortedSingleChunk(chunk) => { - output_sorted_lines(chunk.borrow_lines().iter(), settings); + if settings.unique { + print_sorted( + chunk.lines().iter().dedup_by(|a, b| { + compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) + == Ordering::Equal + }), + settings, + ); + } else { + print_sorted(chunk.lines().iter(), settings); + } } ReadResult::SortedTwoChunks([a, b]) => { - let merged_iter = a - .borrow_lines() - .iter() - .merge_by(b.borrow_lines().iter(), |line_a, line_b| { - compare_by(line_a, line_b, settings) != Ordering::Greater - }); - output_sorted_lines(merged_iter, settings); + let merged_iter = a.lines().iter().map(|line| (line, &a)).merge_by( + b.lines().iter().map(|line| (line, &b)), + |(line_a, a), (line_b, b)| { + compare_by(line_a, line_b, settings, a.line_data(), b.line_data()) + != Ordering::Greater + }, + ); + if settings.unique { + print_sorted( + merged_iter + .dedup_by(|(line_a, a), (line_b, b)| { + compare_by(line_a, line_b, settings, a.line_data(), b.line_data()) + == Ordering::Equal + }) + .map(|(line, _)| line), + settings, + ); + } else { + print_sorted(merged_iter.map(|(line, _)| line), settings); + } } ReadResult::EmptyInput => { // don't output anything @@ -118,7 +142,9 @@ fn reader_writer>, Tmp: WriteableTmpFile /// The function that is executed on the sorter thread. fn sorter(receiver: Receiver, sender: SyncSender, settings: GlobalSettings) { while let Ok(mut payload) = receiver.recv() { - payload.with_lines_mut(|lines| sort_by(lines, &settings)); + payload.with_contents_mut(|contents| { + sort_by(&mut contents.lines, &settings, &contents.line_data) + }); sender.send(payload).unwrap(); } } @@ -154,20 +180,16 @@ fn read_write_loop( for _ in 0..2 { let should_continue = chunks::read( &sender, - vec![ - 0; - if START_BUFFER_SIZE < buffer_size { - START_BUFFER_SIZE - } else { - buffer_size - } - ], + RecycledChunk::new(if START_BUFFER_SIZE < buffer_size { + START_BUFFER_SIZE + } else { + buffer_size + }), Some(buffer_size), &mut carry_over, &mut file, &mut files, separator, - Vec::new(), settings, ); @@ -216,18 +238,17 @@ fn read_write_loop( file_number += 1; - let (recycled_lines, recycled_buffer) = chunk.recycle(); + let recycled_chunk = chunk.recycle(); if let Some(sender) = &sender_option { let should_continue = chunks::read( sender, - recycled_buffer, + recycled_chunk, None, &mut carry_over, &mut file, &mut files, separator, - recycled_lines, settings, ); if !should_continue { @@ -245,12 +266,9 @@ fn write( compress_prog: Option<&str>, separator: u8, ) -> I::Closed { - chunk.with_lines_mut(|lines| { - // Write the lines to the file - let mut tmp_file = I::create(file, compress_prog); - write_lines(lines, tmp_file.as_write(), separator); - tmp_file.finished_writing() - }) + let mut tmp_file = I::create(file, compress_prog); + write_lines(chunk.lines(), tmp_file.as_write(), separator); + tmp_file.finished_writing() } fn write_lines<'a, T: Write>(lines: &[Line<'a>], writer: &mut T, separator: u8) { diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 173faaffc..12d7a9b9b 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -24,7 +24,7 @@ use itertools::Itertools; use tempfile::TempDir; use crate::{ - chunks::{self, Chunk}, + chunks::{self, Chunk, RecycledChunk}, compare_by, GlobalSettings, }; @@ -125,14 +125,14 @@ fn merge_without_limit>( })); // Send the initial chunk to trigger a read for each file request_sender - .send((file_number, Chunk::new(vec![0; 8 * 1024], |_| Vec::new()))) + .send((file_number, RecycledChunk::new(8 * 1024))) .unwrap(); } // Send the second chunk for each file for file_number in 0..reader_files.len() { request_sender - .send((file_number, Chunk::new(vec![0; 8 * 1024], |_| Vec::new()))) + .send((file_number, RecycledChunk::new(8 * 1024))) .unwrap(); } @@ -181,13 +181,12 @@ struct ReaderFile { /// The function running on the reader thread. fn reader( - recycled_receiver: Receiver<(usize, Chunk)>, + recycled_receiver: Receiver<(usize, RecycledChunk)>, files: &mut [Option>], settings: &GlobalSettings, separator: u8, ) { - for (file_idx, chunk) in recycled_receiver.iter() { - let (recycled_lines, recycled_buffer) = chunk.recycle(); + for (file_idx, recycled_chunk) in recycled_receiver.iter() { if let Some(ReaderFile { file, sender, @@ -196,13 +195,12 @@ fn reader( { let should_continue = chunks::read( sender, - recycled_buffer, + recycled_chunk, None, carry_over, file.as_read(), &mut iter::empty(), separator, - recycled_lines, settings, ); if !should_continue { @@ -234,7 +232,7 @@ struct PreviousLine { /// Merges files together. This is **not** an iterator because of lifetime problems. pub struct FileMerger<'a> { heap: binary_heap_plus::BinaryHeap>, - request_sender: Sender<(usize, Chunk)>, + request_sender: Sender<(usize, RecycledChunk)>, prev: Option, } @@ -257,14 +255,16 @@ impl<'a> FileMerger<'a> { file_number: file.file_number, }); - file.current_chunk.with_lines(|lines| { - let current_line = &lines[file.line_idx]; + file.current_chunk.with_contents(|contents| { + let current_line = &contents.lines[file.line_idx]; if settings.unique { if let Some(prev) = &prev { let cmp = compare_by( - &prev.chunk.borrow_lines()[prev.line_idx], + &prev.chunk.lines()[prev.line_idx], current_line, settings, + prev.chunk.line_data(), + file.current_chunk.line_data(), ); if cmp == Ordering::Equal { return; @@ -274,8 +274,7 @@ impl<'a> FileMerger<'a> { current_line.print(out, settings); }); - let was_last_line_for_file = - file.current_chunk.borrow_lines().len() == file.line_idx + 1; + let was_last_line_for_file = file.current_chunk.lines().len() == file.line_idx + 1; if was_last_line_for_file { if let Ok(next_chunk) = file.receiver.recv() { @@ -295,7 +294,7 @@ impl<'a> FileMerger<'a> { // If nothing is referencing the previous chunk anymore, this means that the previous line // was the last line of the chunk. We can recycle the chunk. self.request_sender - .send((prev.file_number, prev_chunk)) + .send((prev.file_number, prev_chunk.recycle())) .ok(); } } @@ -312,9 +311,11 @@ struct FileComparator<'a> { impl<'a> Compare for FileComparator<'a> { fn compare(&self, a: &MergeableFile, b: &MergeableFile) -> Ordering { let mut cmp = compare_by( - &a.current_chunk.borrow_lines()[a.line_idx], - &b.current_chunk.borrow_lines()[b.line_idx], + &a.current_chunk.lines()[a.line_idx], + &b.current_chunk.lines()[b.line_idx], self.settings, + a.current_chunk.line_data(), + b.current_chunk.line_data(), ); if cmp == Ordering::Equal { // To make sorting stable, we need to consider the file number as well, diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 7f3d2872e..202ab5d1a 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -23,11 +23,11 @@ mod ext_sort; mod merge; mod numeric_str_cmp; +use chunks::LineData; use clap::{crate_version, App, Arg}; use custom_str_cmp::custom_str_cmp; use ext_sort::ext_sort; use fnv::FnvHasher; -use itertools::Itertools; use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings}; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; @@ -170,6 +170,17 @@ pub struct GlobalSettings { tmp_dir: PathBuf, compress_prog: Option, merge_batch_size: usize, + precomputed: Precomputed, +} + +/// Data needed for sorting. Should be computed once before starting to sort +/// by calling `GlobalSettings::init_precomputed`. +#[derive(Clone, Debug)] +struct Precomputed { + needs_tokens: bool, + num_infos_per_line: usize, + floats_per_line: usize, + selections_per_line: usize, } impl GlobalSettings { @@ -210,6 +221,25 @@ impl GlobalSettings { None => BufWriter::new(Box::new(stdout()) as Box), } } + + /// Precompute some data needed for sorting. + /// This function **must** be called before starting to sort, and `GlobalSettings` may not be altered + /// afterwards. + fn init_precomputed(&mut self) { + self.precomputed.needs_tokens = self.selectors.iter().any(|s| s.needs_tokens); + self.precomputed.selections_per_line = + self.selectors.iter().filter(|s| s.needs_selection).count(); + self.precomputed.num_infos_per_line = self + .selectors + .iter() + .filter(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric)) + .count(); + self.precomputed.floats_per_line = self + .selectors + .iter() + .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric)) + .count(); + } } impl Default for GlobalSettings { @@ -237,9 +267,16 @@ impl Default for GlobalSettings { tmp_dir: PathBuf::new(), compress_prog: None, merge_batch_size: 32, + precomputed: Precomputed { + num_infos_per_line: 0, + floats_per_line: 0, + selections_per_line: 0, + needs_tokens: false, + }, } } } + #[derive(Clone, PartialEq, Debug)] struct KeySettings { mode: SortMode, @@ -322,32 +359,10 @@ impl Default for KeySettings { Self::from(&GlobalSettings::default()) } } - -#[derive(Clone, Debug)] -enum NumCache { +enum Selection<'a> { AsF64(GeneralF64ParseResult), - WithInfo(NumInfo), -} - -impl NumCache { - fn as_f64(&self) -> GeneralF64ParseResult { - match self { - NumCache::AsF64(n) => *n, - _ => unreachable!(), - } - } - fn as_num_info(&self) -> &NumInfo { - match self { - NumCache::WithInfo(n) => n, - _ => unreachable!(), - } - } -} - -#[derive(Clone, Debug)] -struct Selection<'a> { - slice: &'a str, - num_cache: Option>, + WithNumInfo(&'a str, NumInfo), + Str(&'a str), } type Field = Range; @@ -355,31 +370,44 @@ type Field = Range; #[derive(Clone, Debug)] pub struct Line<'a> { line: &'a str, - selections: Box<[Selection<'a>]>, + index: usize, } impl<'a> Line<'a> { - fn create(string: &'a str, settings: &GlobalSettings) -> Self { - let fields = if settings + /// Creates a new `Line`. + /// + /// If additional data is needed for sorting it is added to `line_data`. + /// `token_buffer` allows to reuse the allocation for tokens. + fn create( + line: &'a str, + index: usize, + line_data: &mut LineData<'a>, + token_buffer: &mut Vec, + settings: &GlobalSettings, + ) -> Self { + token_buffer.clear(); + if settings.precomputed.needs_tokens { + tokenize(line, settings.separator, token_buffer); + } + for (selector, selection) in settings .selectors .iter() - .any(|selector| selector.needs_tokens) + .map(|selector| (selector, selector.get_selection(line, token_buffer))) { - // Only tokenize if we will need tokens. - Some(tokenize(string, settings.separator)) - } else { - None - }; - - Line { - line: string, - selections: settings - .selectors - .iter() - .filter(|selector| !selector.is_default_selection) - .map(|selector| selector.get_selection(string, fields.as_deref())) - .collect(), + match selection { + Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float), + Selection::WithNumInfo(str, num_info) => { + line_data.num_infos.push(num_info); + line_data.selections.push(str); + } + Selection::Str(str) => { + if selector.needs_selection { + line_data.selections.push(str) + } + } + } } + Self { line, index } } fn print(&self, writer: &mut impl Write, settings: &GlobalSettings) { @@ -408,7 +436,8 @@ impl<'a> Line<'a> { let line = self.line.replace('\t', ">"); writeln!(writer, "{}", line)?; - let fields = tokenize(self.line, settings.separator); + let mut fields = vec![]; + tokenize(self.line, settings.separator, &mut fields); for selector in settings.selectors.iter() { let mut selection = selector.get_range(self.line, Some(&fields)); match selector.settings.mode { @@ -539,51 +568,51 @@ impl<'a> Line<'a> { } } -/// Tokenize a line into fields. -fn tokenize(line: &str, separator: Option) -> Vec { +/// Tokenize a line into fields. The result is stored into `token_buffer`. +fn tokenize(line: &str, separator: Option, token_buffer: &mut Vec) { + assert!(token_buffer.is_empty()); if let Some(separator) = separator { - tokenize_with_separator(line, separator) + tokenize_with_separator(line, separator, token_buffer) } else { - tokenize_default(line) + tokenize_default(line, token_buffer) } } /// By default fields are separated by the first whitespace after non-whitespace. /// Whitespace is included in fields at the start. -fn tokenize_default(line: &str) -> Vec { - let mut tokens = vec![0..0]; +/// The result is stored into `token_buffer`. +fn tokenize_default(line: &str, token_buffer: &mut Vec) { + token_buffer.push(0..0); // pretend that there was whitespace in front of the line let mut previous_was_whitespace = true; for (idx, char) in line.char_indices() { if char.is_whitespace() { if !previous_was_whitespace { - tokens.last_mut().unwrap().end = idx; - tokens.push(idx..0); + token_buffer.last_mut().unwrap().end = idx; + token_buffer.push(idx..0); } previous_was_whitespace = true; } else { previous_was_whitespace = false; } } - tokens.last_mut().unwrap().end = line.len(); - tokens + token_buffer.last_mut().unwrap().end = line.len(); } /// Split between separators. These separators are not included in fields. -fn tokenize_with_separator(line: &str, separator: char) -> Vec { - let mut tokens = vec![]; +/// The result is stored into `token_buffer`. +fn tokenize_with_separator(line: &str, separator: char, token_buffer: &mut Vec) { let separator_indices = line.char_indices() .filter_map(|(i, c)| if c == separator { Some(i) } else { None }); let mut start = 0; for sep_idx in separator_indices { - tokens.push(start..sep_idx); + token_buffer.push(start..sep_idx); start = sep_idx + 1; } if start < line.len() { - tokens.push(start..line.len()); + token_buffer.push(start..line.len()); } - tokens } #[derive(Clone, PartialEq, Debug)] @@ -640,8 +669,10 @@ struct FieldSelector { to: Option, settings: KeySettings, needs_tokens: bool, - // Whether the selection for each line is going to be the whole line with no NumCache - is_default_selection: bool, + // Whether this selector operates on a sub-slice of a line. + // Selections are therefore not needed when this selector matches the whole line + // or the sort mode is general-numeric. + needs_selection: bool, } impl Default for FieldSelector { @@ -651,7 +682,7 @@ impl Default for FieldSelector { to: None, settings: Default::default(), needs_tokens: false, - is_default_selection: true, + needs_selection: false, } } } @@ -747,14 +778,12 @@ impl FieldSelector { Err("invalid character index 0 for the start position of a field".to_string()) } else { Ok(Self { - is_default_selection: from.field == 1 - && from.char == 1 - && to.is_none() - && !matches!( - settings.mode, - SortMode::Numeric | SortMode::GeneralNumeric | SortMode::HumanNumeric - ) - && !from.ignore_blanks, + needs_selection: (from.field != 1 + || from.char != 1 + || to.is_some() + || matches!(settings.mode, SortMode::Numeric | SortMode::HumanNumeric) + || from.ignore_blanks) + && !matches!(settings.mode, SortMode::GeneralNumeric), needs_tokens: from.field != 1 || from.char == 0 || to.is_some(), from, to, @@ -764,12 +793,16 @@ impl FieldSelector { } /// Get the selection that corresponds to this selector for the line. - /// If needs_fields returned false, tokens may be None. - fn get_selection<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Selection<'a> { + /// If needs_fields returned false, tokens may be empty. + fn get_selection<'a>(&self, line: &'a str, tokens: &[Field]) -> Selection<'a> { + // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector. + let tokens = if self.needs_tokens { + Some(tokens) + } else { + None + }; let mut range = &line[self.get_range(line, tokens)]; - let num_cache = if self.settings.mode == SortMode::Numeric - || self.settings.mode == SortMode::HumanNumeric - { + if self.settings.mode == SortMode::Numeric || self.settings.mode == SortMode::HumanNumeric { // Parse NumInfo for this number. let (info, num_range) = NumInfo::parse( range, @@ -780,24 +813,18 @@ impl FieldSelector { ); // Shorten the range to what we need to pass to numeric_str_cmp later. range = &range[num_range]; - Some(Box::new(NumCache::WithInfo(info))) + Selection::WithNumInfo(range, info) } else if self.settings.mode == SortMode::GeneralNumeric { // Parse this number as f64, as this is the requirement for general numeric sorting. - Some(Box::new(NumCache::AsF64(general_f64_parse( - &range[get_leading_gen(range)], - )))) + Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)])) } else { // This is not a numeric sort, so we don't need a NumCache. - None - }; - Selection { - slice: range, - num_cache, + Selection::Str(range) } } /// Look up the range in the line that corresponds to this selector. - /// If needs_fields returned false, tokens may be None. + /// If needs_fields returned false, tokens must be None. fn get_range<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range { enum Resolution { // The start index of the resolved character, inclusive @@ -1297,18 +1324,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 { ); } - exec(&files, &settings) -} + settings.init_precomputed(); -fn output_sorted_lines<'a>(iter: impl Iterator>, settings: &GlobalSettings) { - if settings.unique { - print_sorted( - iter.dedup_by(|a, b| compare_by(a, b, settings) == Ordering::Equal), - settings, - ); - } else { - print_sorted(iter, settings); - } + exec(&files, &settings) } fn exec(files: &[String], settings: &GlobalSettings) -> i32 { @@ -1328,55 +1346,59 @@ fn exec(files: &[String], settings: &GlobalSettings) -> i32 { 0 } -fn sort_by<'a>(unsorted: &mut Vec>, settings: &GlobalSettings) { +fn sort_by<'a>(unsorted: &mut Vec>, settings: &GlobalSettings, line_data: &LineData<'a>) { if settings.stable || settings.unique { - unsorted.par_sort_by(|a, b| compare_by(a, b, settings)) + unsorted.par_sort_by(|a, b| compare_by(a, b, settings, line_data, line_data)) } else { - unsorted.par_sort_unstable_by(|a, b| compare_by(a, b, settings)) + unsorted.par_sort_unstable_by(|a, b| compare_by(a, b, settings, line_data, line_data)) } } -fn compare_by<'a>(a: &Line<'a>, b: &Line<'a>, global_settings: &GlobalSettings) -> Ordering { - let mut idx = 0; +fn compare_by<'a>( + a: &Line<'a>, + b: &Line<'a>, + global_settings: &GlobalSettings, + a_line_data: &LineData<'a>, + b_line_data: &LineData<'a>, +) -> Ordering { + let mut selection_index = 0; + let mut num_info_index = 0; + let mut parsed_float_index = 0; for selector in &global_settings.selectors { - let mut _selections = None; - let (a_selection, b_selection) = if selector.is_default_selection { + let (a_str, b_str) = if !selector.needs_selection { // We can select the whole line. - // We have to store the selections outside of the if-block so that they live long enough. - _selections = Some(( - Selection { - slice: a.line, - num_cache: None, - }, - Selection { - slice: b.line, - num_cache: None, - }, - )); - // Unwrap the selections again, and return references to them. - ( - &_selections.as_ref().unwrap().0, - &_selections.as_ref().unwrap().1, - ) + (a.line, b.line) } else { - let selections = (&a.selections[idx], &b.selections[idx]); - idx += 1; + let selections = ( + a_line_data.selections + [a.index * global_settings.precomputed.selections_per_line + selection_index], + b_line_data.selections + [b.index * global_settings.precomputed.selections_per_line + selection_index], + ); + selection_index += 1; selections }; - let a_str = a_selection.slice; - let b_str = b_selection.slice; + let settings = &selector.settings; let cmp: Ordering = match settings.mode { SortMode::Random => random_shuffle(a_str, b_str, &global_settings.salt), - SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp( - (a_str, a_selection.num_cache.as_ref().unwrap().as_num_info()), - (b_str, b_selection.num_cache.as_ref().unwrap().as_num_info()), - ), - SortMode::GeneralNumeric => general_numeric_compare( - a_selection.num_cache.as_ref().unwrap().as_f64(), - b_selection.num_cache.as_ref().unwrap().as_f64(), - ), + SortMode::Numeric | SortMode::HumanNumeric => { + let a_num_info = &a_line_data.num_infos + [a.index * global_settings.precomputed.num_infos_per_line + num_info_index]; + let b_num_info = &b_line_data.num_infos + [b.index * global_settings.precomputed.num_infos_per_line + num_info_index]; + num_info_index += 1; + numeric_str_cmp((a_str, a_num_info), (b_str, b_num_info)) + } + SortMode::GeneralNumeric => { + let a_float = &a_line_data.parsed_floats + [a.index * global_settings.precomputed.floats_per_line + parsed_float_index]; + let b_float = &b_line_data.parsed_floats + [b.index * global_settings.precomputed.floats_per_line + parsed_float_index]; + parsed_float_index += 1; + general_numeric_compare(a_float, b_float) + } SortMode::Month => month_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str), SortMode::Default => custom_str_cmp( @@ -1470,7 +1492,7 @@ fn get_leading_gen(input: &str) -> Range { } #[derive(Copy, Clone, PartialEq, PartialOrd, Debug)] -enum GeneralF64ParseResult { +pub enum GeneralF64ParseResult { Invalid, NaN, NegInfinity, @@ -1497,8 +1519,8 @@ fn general_f64_parse(a: &str) -> GeneralF64ParseResult { /// Compares two floats, with errors and non-numerics assumed to be -inf. /// Stops coercing at the first non-numeric char. /// We explicitly need to convert to f64 in this case. -fn general_numeric_compare(a: GeneralF64ParseResult, b: GeneralF64ParseResult) -> Ordering { - a.partial_cmp(&b).unwrap() +fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering { + a.partial_cmp(b).unwrap() } fn get_rand_string() -> String { @@ -1646,6 +1668,12 @@ mod tests { use super::*; + fn tokenize_helper(line: &str, separator: Option) -> Vec { + let mut buffer = vec![]; + tokenize(line, separator, &mut buffer); + buffer + } + #[test] fn test_get_hash() { let a = "Ted".to_string(); @@ -1689,20 +1717,23 @@ mod tests { #[test] fn test_tokenize_fields() { let line = "foo bar b x"; - assert_eq!(tokenize(line, None), vec![0..3, 3..7, 7..9, 9..14,],); + assert_eq!(tokenize_helper(line, None), vec![0..3, 3..7, 7..9, 9..14,],); } #[test] fn test_tokenize_fields_leading_whitespace() { let line = " foo bar b x"; - assert_eq!(tokenize(line, None), vec![0..7, 7..11, 11..13, 13..18,]); + assert_eq!( + tokenize_helper(line, None), + vec![0..7, 7..11, 11..13, 13..18,] + ); } #[test] fn test_tokenize_fields_custom_separator() { let line = "aaa foo bar b x"; assert_eq!( - tokenize(line, Some('a')), + tokenize_helper(line, Some('a')), vec![0..0, 1..1, 2..2, 3..9, 10..18,] ); } @@ -1710,11 +1741,11 @@ mod tests { #[test] fn test_tokenize_fields_trailing_custom_separator() { let line = "a"; - assert_eq!(tokenize(line, Some('a')), vec![0..0]); + assert_eq!(tokenize_helper(line, Some('a')), vec![0..0]); let line = "aa"; - assert_eq!(tokenize(line, Some('a')), vec![0..0, 1..1]); + assert_eq!(tokenize_helper(line, Some('a')), vec![0..0, 1..1]); let line = "..a..a"; - assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]); + assert_eq!(tokenize_helper(line, Some('a')), vec![0..2, 3..5]); } #[test] @@ -1722,13 +1753,7 @@ mod tests { fn test_line_size() { // We should make sure to not regress the size of the Line struct because // it is unconditional overhead for every line we sort. - assert_eq!(std::mem::size_of::(), 32); - // These are the fields of Line: - assert_eq!(std::mem::size_of::<&str>(), 16); - assert_eq!(std::mem::size_of::>(), 16); - - // How big is a selection? Constant cost all lines pay when we need selections. - assert_eq!(std::mem::size_of::(), 24); + assert_eq!(std::mem::size_of::(), 24); } #[test]