From 76abc7e51db4a37adabc14994b34a6391d841c56 Mon Sep 17 00:00:00 2001 From: David Laban Date: Wed, 3 Aug 2016 20:43:36 +0100 Subject: [PATCH 1/6] sort: refactor compare_fns into Settings Also split out a compare_by(a, b, settings) helper function, which may be used by --merge, later. --- src/sort/sort.rs | 58 +++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/sort/sort.rs b/src/sort/sort.rs index 39c3bff9b..9c0da029e 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -46,6 +46,7 @@ struct Settings { stable: bool, unique: bool, check: bool, + compare_fns: Vec Ordering>, } impl Default for Settings { @@ -57,6 +58,7 @@ impl Default for Settings { stable: false, unique: false, check: false, + compare_fns: Vec::new(), } } } @@ -125,6 +127,21 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); files.push("-".to_owned()); } + settings.compare_fns.push(match settings.mode { + SortMode::Numeric => numeric_compare, + SortMode::HumanNumeric => human_numeric_size_compare, + SortMode::Month => month_compare, + SortMode::Version => version_compare, + SortMode::Default => String::cmp + }); + + if !settings.stable { + match settings.mode { + SortMode::Default => {} + _ => settings.compare_fns.push(String::cmp) + } + } + exec(files, &settings) } @@ -150,24 +167,7 @@ fn exec(files: Vec, settings: &Settings) -> i32 { let original_lines = lines.to_vec(); - let mut compare_fns = Vec::new(); - - compare_fns.push(match settings.mode { - SortMode::Numeric => numeric_compare, - SortMode::HumanNumeric => human_numeric_size_compare, - SortMode::Month => month_compare, - SortMode::Version => version_compare, - SortMode::Default => String::cmp - }); - - if !settings.stable { - match settings.mode { - SortMode::Default => {} - _ => compare_fns.push(String::cmp) - } - } - - sort_by(&mut lines, compare_fns); + sort_by(&mut lines, &settings); if settings.unique { lines.dedup() @@ -193,20 +193,22 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } -fn sort_by(lines: &mut Vec, compare_fns: Vec) - where F: Fn( &String, &String ) -> Ordering -{ +fn sort_by(lines: &mut Vec, settings: &Settings) { lines.sort_by(|a, b| { - for compare_fn in &compare_fns { - let cmp = compare_fn(a, b); - if cmp != Ordering::Equal { - return cmp; - } - } - return Ordering::Equal; + compare_by(a, b, &settings) }) } +fn compare_by(a: &String, b: &String, settings: &Settings) -> Ordering { + for compare_fn in &settings.compare_fns { + let cmp = compare_fn(a, b); + if cmp != Ordering::Equal { + return cmp; + } + } + return Ordering::Equal; +} + /// Parse the beginning string into an f64, returning -inf instead of NaN on errors. fn permissive_f64_parse(a: &str) -> f64 { // Maybe should be split on non-digit, but then 10e100 won't parse properly. From 8a8319a337c14b88af11d33610316cf96adcf923 Mon Sep 17 00:00:00 2001 From: David Laban Date: Wed, 3 Aug 2016 22:26:58 +0100 Subject: [PATCH 2/6] sort --merge works, but ignores --unique and --reverse FileMerger receives Lines Iterables of the pre-sorted input files via push_file() It implements Iterator, which yields lines from the input files in (merged) sorted order. If the input files are not sorted, then the behavior is undefined. Internally, FileMerger uses a std::collections::BinaryHeap. MergeableFile is an internal helper that implements Ord in a way that BinaryHeap can use (note that we want smallest-first, but BinaryHeap returns largest first, so MergeableFile::cmp() calls reverse() on whatever compare_by() returns. --- src/sort/sort.rs | 109 ++++++++++++++++-- .../sort/merge_ints_interleaved.expected | 9 ++ .../sort/merge_ints_interleaved_1.txt | 3 + .../sort/merge_ints_interleaved_2.txt | 3 + .../sort/merge_ints_interleaved_3.txt | 3 + tests/test_sort.rs | 10 ++ 6 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 tests/fixtures/sort/merge_ints_interleaved.expected create mode 100644 tests/fixtures/sort/merge_ints_interleaved_1.txt create mode 100644 tests/fixtures/sort/merge_ints_interleaved_2.txt create mode 100644 tests/fixtures/sort/merge_ints_interleaved_3.txt diff --git a/src/sort/sort.rs b/src/sort/sort.rs index 9c0da029e..af2c61559 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -19,8 +19,10 @@ extern crate semver; extern crate uucore; use std::cmp::Ordering; +use std::collections::BinaryHeap; use std::fs::File; -use std::io::{BufRead, BufReader, BufWriter, Read, stdin, stdout, Write}; +use std::io::{BufRead, BufReader, BufWriter, Lines, Read, stdin, stdout, Write}; +use std::mem::replace; use std::path::Path; use uucore::fs::is_stdin_interactive; use semver::Version; @@ -41,6 +43,7 @@ enum SortMode { struct Settings { mode: SortMode, + merge: bool, reverse: bool, outfile: Option, stable: bool, @@ -53,6 +56,7 @@ impl Default for Settings { fn default() -> Settings { Settings { mode: SortMode::Default, + merge: false, reverse: false, outfile: None, stable: false, @@ -63,6 +67,85 @@ impl Default for Settings { } } +struct MergeableFile<'a> { + lines: Lines>>, + current_line: String, + settings: &'a Settings, +} + +// BinaryHeap depends on `Ord`. Note that we want to pop smallest items +// from the heap first, and BinaryHeap.pop() returns the largest, so we +// trick it into the right order by calling reverse() here. +impl<'a> Ord for MergeableFile<'a> { + fn cmp(&self, other: &MergeableFile) -> Ordering { + compare_by(&self.current_line, &other.current_line, &self.settings).reverse() + } +} + +impl<'a> PartialOrd for MergeableFile<'a> { + fn partial_cmp(&self, other: &MergeableFile) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> PartialEq for MergeableFile<'a> { + fn eq(&self, other: &MergeableFile) -> bool { + Ordering::Equal == compare_by(&self.current_line, &other.current_line, &self.settings) + } +} + +impl<'a> Eq for MergeableFile<'a> {} + +struct FileMerger<'a> { + heap: BinaryHeap>, + settings: &'a Settings, +} + +impl<'a> FileMerger<'a> { + fn new(settings: &'a Settings) -> FileMerger<'a> { + FileMerger { + heap: BinaryHeap::new(), + settings: settings, + } + } + fn push_file(&mut self, mut lines: Lines>>){ + match lines.next() { + Some(Ok(next_line)) => { + let mergeable_file = MergeableFile { + lines: lines, + current_line: next_line, + settings: &self.settings, + }; + self.heap.push(mergeable_file); + } + _ => {} + } + } +} + +impl<'a> Iterator for FileMerger<'a> { + type Item = String; + fn next(&mut self) -> Option { + match self.heap.pop() { + Some(mut current) => { + match current.lines.next() { + Some(Ok(next_line)) => { + let ret = replace(&mut current.current_line, next_line); + self.heap.push(current); + Some(ret) + }, + _ => { + // Don't put it back in the heap (it's empty/erroring) + // but its first line is still valid. + Some(current.current_line) + }, + } + }, + None => None, + } + } +} + pub fn uumain(args: Vec) -> i32 { let mut settings: Settings = Default::default(); let mut opts = getopts::Options::new(); @@ -73,6 +156,7 @@ pub fn uumain(args: Vec) -> i32 { opts.optflag("r", "reverse", "reverse the output"); opts.optflag("h", "help", "display this help and exit"); opts.optflag("", "version", "output version information and exit"); + opts.optflag("m", "merge", "merge already sorted files; do not sort"); opts.optopt("o", "output", "write output to FILENAME instead of stdout", "FILENAME"); opts.optflag("s", "stable", "stabilize sort by disabling last-resort comparison"); opts.optflag("u", "unique", "output only the first of an equal run"); @@ -115,6 +199,7 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); SortMode::Default }; + settings.merge = matches.opt_present("merge"); settings.reverse = matches.opt_present("reverse"); settings.outfile = matches.opt_str("output"); settings.stable = matches.opt_present("stable"); @@ -147,6 +232,8 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); fn exec(files: Vec, settings: &Settings) -> i32 { let mut lines = Vec::new(); + let mut file_merger = FileMerger::new(&settings); + for path in &files { let (reader, _) = match open(path) { Some(x) => x, @@ -155,12 +242,17 @@ fn exec(files: Vec, settings: &Settings) -> i32 { let buf_reader = BufReader::new(reader); - for line in buf_reader.lines() { - match line { - Ok(n) => { - lines.push(n); - }, - _ => break + if settings.merge { + file_merger.push_file(buf_reader.lines()); + } + else { + for line in buf_reader.lines() { + if let Ok(n) = line { + lines.push(n); + } + else { + break; + } } } } @@ -185,6 +277,9 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } } } + else if settings.merge { + print_sorted(file_merger, &settings.outfile) + } else { print_sorted(lines.iter(), &settings.outfile) } diff --git a/tests/fixtures/sort/merge_ints_interleaved.expected b/tests/fixtures/sort/merge_ints_interleaved.expected new file mode 100644 index 000000000..071939893 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved.expected @@ -0,0 +1,9 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/fixtures/sort/merge_ints_interleaved_1.txt b/tests/fixtures/sort/merge_ints_interleaved_1.txt new file mode 100644 index 000000000..6e181b92b --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_1.txt @@ -0,0 +1,3 @@ +1 +4 +7 diff --git a/tests/fixtures/sort/merge_ints_interleaved_2.txt b/tests/fixtures/sort/merge_ints_interleaved_2.txt new file mode 100644 index 000000000..62ffd8a69 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_2.txt @@ -0,0 +1,3 @@ +2 +5 +8 diff --git a/tests/fixtures/sort/merge_ints_interleaved_3.txt b/tests/fixtures/sort/merge_ints_interleaved_3.txt new file mode 100644 index 000000000..1e3ac1e93 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_3.txt @@ -0,0 +1,3 @@ +3 +6 +9 diff --git a/tests/test_sort.rs b/tests/test_sort.rs index dbbdb92f5..dde6c4b53 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -70,6 +70,16 @@ fn test_multiple_files() { .succeeds().stdout_is_fixture("multiple_files.expected"); } +#[test] +fn test_merge_interleaved() { + new_ucmd() + .arg("-m") + .arg("merge_ints_interleaved_1.txt") + .arg("merge_ints_interleaved_2.txt") + .arg("merge_ints_interleaved_3.txt") + .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); +} + #[test] fn test_check() { new_ucmd() From e1af1520e7eeeecf6086ccf764e31fd26c97dd1f Mon Sep 17 00:00:00 2001 From: David Laban Date: Thu, 11 Aug 2016 20:30:58 +0100 Subject: [PATCH 3/6] sort: make compare_by honour settings.reverse This allows sort --merge --reverse to work as well. --- src/sort/sort.rs | 11 ++++++----- tests/fixtures/sort/merge_ints_reversed.expected | 9 +++++++++ tests/fixtures/sort/merge_ints_reversed_1.txt | 3 +++ tests/fixtures/sort/merge_ints_reversed_2.txt | 3 +++ tests/fixtures/sort/merge_ints_reversed_3.txt | 3 +++ tests/test_sort.rs | 11 +++++++++++ 6 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 tests/fixtures/sort/merge_ints_reversed.expected create mode 100644 tests/fixtures/sort/merge_ints_reversed_1.txt create mode 100644 tests/fixtures/sort/merge_ints_reversed_2.txt create mode 100644 tests/fixtures/sort/merge_ints_reversed_3.txt diff --git a/src/sort/sort.rs b/src/sort/sort.rs index af2c61559..15d655ecf 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -265,10 +265,6 @@ fn exec(files: Vec, settings: &Settings) -> i32 { lines.dedup() } - if settings.reverse { - lines.reverse() - } - if settings.check { for (i, line) in lines.iter().enumerate() { if line != &original_lines[i] { @@ -298,7 +294,12 @@ fn compare_by(a: &String, b: &String, settings: &Settings) -> Ordering { for compare_fn in &settings.compare_fns { let cmp = compare_fn(a, b); if cmp != Ordering::Equal { - return cmp; + if settings.reverse { + return cmp.reverse(); + } + else { + return cmp; + } } } return Ordering::Equal; diff --git a/tests/fixtures/sort/merge_ints_reversed.expected b/tests/fixtures/sort/merge_ints_reversed.expected new file mode 100644 index 000000000..abb8f7739 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_reversed.expected @@ -0,0 +1,9 @@ +9 +8 +7 +6 +5 +4 +3 +2 +1 diff --git a/tests/fixtures/sort/merge_ints_reversed_1.txt b/tests/fixtures/sort/merge_ints_reversed_1.txt new file mode 100644 index 000000000..8313069f4 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_reversed_1.txt @@ -0,0 +1,3 @@ +7 +4 +1 diff --git a/tests/fixtures/sort/merge_ints_reversed_2.txt b/tests/fixtures/sort/merge_ints_reversed_2.txt new file mode 100644 index 000000000..c0416aa97 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_reversed_2.txt @@ -0,0 +1,3 @@ +8 +5 +2 diff --git a/tests/fixtures/sort/merge_ints_reversed_3.txt b/tests/fixtures/sort/merge_ints_reversed_3.txt new file mode 100644 index 000000000..bd33aa425 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_reversed_3.txt @@ -0,0 +1,3 @@ +9 +6 +3 diff --git a/tests/test_sort.rs b/tests/test_sort.rs index dde6c4b53..0075ddbe4 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -80,6 +80,17 @@ fn test_merge_interleaved() { .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); } +#[test] +fn test_merge_reversed() { + new_ucmd() + .arg("-m") + .arg("--reverse") + .arg("merge_ints_reversed_1.txt") + .arg("merge_ints_reversed_2.txt") + .arg("merge_ints_reversed_3.txt") + .succeeds().stdout_is_fixture("merge_ints_reversed.expected"); +} + #[test] fn test_check() { new_ucmd() From 3531c46fb8d0661447d96b3539b7de33e7581c95 Mon Sep 17 00:00:00 2001 From: David Laban Date: Thu, 11 Aug 2016 21:09:29 +0100 Subject: [PATCH 4/6] sort --merge --unique This uses Itertools' dedup() rather than Vec::dedup(). There is probably a cleaner, more polymorphic way to do this. Suggestions welcome. --- src/sort/Cargo.toml | 1 + src/sort/sort.rs | 21 +++++++++++++++------ tests/test_sort.rs | 14 ++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/sort/Cargo.toml b/src/sort/Cargo.toml index 545206f57..f8de86672 100644 --- a/src/sort/Cargo.toml +++ b/src/sort/Cargo.toml @@ -11,6 +11,7 @@ path = "sort.rs" getopts = "*" libc = "*" semver = "*" +itertools = "*" uucore = { path="../uucore" } [[bin]] diff --git a/src/sort/sort.rs b/src/sort/sort.rs index 15d655ecf..2fda643fc 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -17,6 +17,8 @@ extern crate semver; #[macro_use] extern crate uucore; +#[macro_use] +extern crate itertools; use std::cmp::Ordering; use std::collections::BinaryHeap; @@ -26,6 +28,7 @@ use std::mem::replace; use std::path::Path; use uucore::fs::is_stdin_interactive; use semver::Version; +use itertools::Itertools; // for Iterator::dedup() static NAME: &'static str = "sort"; static VERSION: &'static str = env!("CARGO_PKG_VERSION"); @@ -261,10 +264,6 @@ fn exec(files: Vec, settings: &Settings) -> i32 { sort_by(&mut lines, &settings); - if settings.unique { - lines.dedup() - } - if settings.check { for (i, line) in lines.iter().enumerate() { if line != &original_lines[i] { @@ -274,10 +273,20 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } } else if settings.merge { - print_sorted(file_merger, &settings.outfile) + if settings.unique { + print_sorted(file_merger.dedup(), &settings.outfile) + } + else { + print_sorted(file_merger, &settings.outfile) + } } else { - print_sorted(lines.iter(), &settings.outfile) + if settings.unique { + print_sorted(lines.iter().dedup(), &settings.outfile) + } + else { + print_sorted(lines.iter(), &settings.outfile) + } } 0 diff --git a/tests/test_sort.rs b/tests/test_sort.rs index 0075ddbe4..9c6ccff71 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -80,6 +80,20 @@ fn test_merge_interleaved() { .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); } +#[test] +fn test_merge_unique() { + new_ucmd() + .arg("-m") + .arg("--unique") + .arg("merge_ints_interleaved_1.txt") + .arg("merge_ints_interleaved_2.txt") + .arg("merge_ints_interleaved_3.txt") + .arg("merge_ints_interleaved_3.txt") + .arg("merge_ints_interleaved_2.txt") + .arg("merge_ints_interleaved_1.txt") + .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); +} + #[test] fn test_merge_reversed() { new_ucmd() From 87daf9dd8d9aa34786324bb1a0d756bb1c838bb7 Mon Sep 17 00:00:00 2001 From: David Laban Date: Fri, 12 Aug 2016 21:35:54 +0100 Subject: [PATCH 5/6] sort: use stdout_only_fixture in tests --- tests/test_sort.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_sort.rs b/tests/test_sort.rs index 9c6ccff71..18180d4a4 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -67,7 +67,7 @@ fn test_multiple_files() { .arg("-n") .arg("multiple_files1.txt") .arg("multiple_files2.txt") - .succeeds().stdout_is_fixture("multiple_files.expected"); + .succeeds().stdout_only_fixture("multiple_files.expected"); } #[test] @@ -77,7 +77,7 @@ fn test_merge_interleaved() { .arg("merge_ints_interleaved_1.txt") .arg("merge_ints_interleaved_2.txt") .arg("merge_ints_interleaved_3.txt") - .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); + .succeeds().stdout_only_fixture("merge_ints_interleaved.expected"); } #[test] @@ -91,7 +91,7 @@ fn test_merge_unique() { .arg("merge_ints_interleaved_3.txt") .arg("merge_ints_interleaved_2.txt") .arg("merge_ints_interleaved_1.txt") - .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); + .succeeds().stdout_only_fixture("merge_ints_interleaved.expected"); } #[test] @@ -102,7 +102,7 @@ fn test_merge_reversed() { .arg("merge_ints_reversed_1.txt") .arg("merge_ints_reversed_2.txt") .arg("merge_ints_reversed_3.txt") - .succeeds().stdout_is_fixture("merge_ints_reversed.expected"); + .succeeds().stdout_only_fixture("merge_ints_reversed.expected"); } #[test] From cf93826eeee72741ab652b1db2a699c6084d3111 Mon Sep 17 00:00:00 2001 From: David Laban Date: Sat, 13 Aug 2016 00:18:58 +0100 Subject: [PATCH 6/6] sort --check refactor to use iterator This allows us to check files without bringing them entirely into memory. Also makes it easier to find the disorder in (seq 9; echo 0) | sort --check (points at the end of the file, where our previous version would point at the start of the file) Itertools' .coalesce() was the most useful helper that I could find for comparing adjacent values in an iterator. It is designed for implementing things like .dedup(), so the resulting code is a little unintuitive. --- src/sort/sort.rs | 57 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/src/sort/sort.rs b/src/sort/sort.rs index 2fda643fc..47d7d4943 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -214,6 +214,10 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); /* if no file, default to stdin */ files.push("-".to_owned()); } + else if settings.check && files.len() != 1 { + crash!(1, "sort: extra operand `{}' not allowed with -c", files[1]) + + } settings.compare_fns.push(match settings.mode { SortMode::Numeric => numeric_compare, @@ -248,6 +252,9 @@ fn exec(files: Vec, settings: &Settings) -> i32 { if settings.merge { file_merger.push_file(buf_reader.lines()); } + else if settings.check { + return exec_check_file(buf_reader.lines(), &settings) + } else { for line in buf_reader.lines() { if let Ok(n) = line { @@ -260,19 +267,9 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } } - let original_lines = lines.to_vec(); - sort_by(&mut lines, &settings); - if settings.check { - for (i, line) in lines.iter().enumerate() { - if line != &original_lines[i] { - println!("sort: disorder in line {}", i); - return 1; - } - } - } - else if settings.merge { + if settings.merge { if settings.unique { print_sorted(file_merger.dedup(), &settings.outfile) } @@ -293,6 +290,44 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } +fn exec_check_file(lines: Lines>>, settings: &Settings) -> i32 { + // errors yields the line before each disorder, + // plus the last line (quirk of .coalesce()) + let unwrapped_lines = lines.filter_map(|maybe_line| { + if let Ok(line) = maybe_line { + Some(line) + } + else { + None + } + }); + let mut errors = unwrapped_lines.enumerate().coalesce( + |(last_i, last_line), (i, line)| { + if compare_by(&last_line, &line, &settings) == Ordering::Greater { + Err(((last_i, last_line), (i, line))) + } + else { + Ok((i, line)) + } + }); + if let Some((first_error_index, _line)) = errors.next() { + // Check for a second "error", as .coalesce() always returns the last + // line, no matter what our merging function does. + if let Some(_last_line_or_next_error) = errors.next() { + println!("sort: disorder in line {}", first_error_index); + return 1; + } + else { + // first "error" was actually the last line. + return 0; + } + } + else { + // unwrapped_lines was empty. Empty files are defined to be sorted. + return 0; + } +} + fn sort_by(lines: &mut Vec, settings: &Settings) { lines.sort_by(|a, b| { compare_by(a, b, &settings)