From 8a8319a337c14b88af11d33610316cf96adcf923 Mon Sep 17 00:00:00 2001 From: David Laban Date: Wed, 3 Aug 2016 22:26:58 +0100 Subject: [PATCH] sort --merge works, but ignores --unique and --reverse FileMerger receives Lines iterators over the pre-sorted input files via push_file(). It implements Iterator, which yields lines from the input files in (merged) sorted order. If the input files are not sorted, then the behavior is undefined. Internally, FileMerger uses a std::collections::BinaryHeap. MergeableFile is an internal helper that implements Ord in a way that BinaryHeap can use (note that we want smallest-first, but BinaryHeap returns largest first, so MergeableFile::cmp() calls reverse() on whatever compare_by() returns). --- src/sort/sort.rs | 109 ++++++++++++++++-- .../sort/merge_ints_interleaved.expected | 9 ++ .../sort/merge_ints_interleaved_1.txt | 3 + .../sort/merge_ints_interleaved_2.txt | 3 + .../sort/merge_ints_interleaved_3.txt | 3 + tests/test_sort.rs | 10 ++ 6 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 tests/fixtures/sort/merge_ints_interleaved.expected create mode 100644 tests/fixtures/sort/merge_ints_interleaved_1.txt create mode 100644 tests/fixtures/sort/merge_ints_interleaved_2.txt create mode 100644 tests/fixtures/sort/merge_ints_interleaved_3.txt diff --git a/src/sort/sort.rs b/src/sort/sort.rs index 9c0da029e..af2c61559 100644 --- a/src/sort/sort.rs +++ b/src/sort/sort.rs @@ -19,8 +19,10 @@ extern crate semver; extern crate uucore; use std::cmp::Ordering; +use std::collections::BinaryHeap; use std::fs::File; -use std::io::{BufRead, BufReader, BufWriter, Read, stdin, stdout, Write}; +use std::io::{BufRead, BufReader, BufWriter, Lines, Read, stdin, stdout, Write}; +use std::mem::replace; use std::path::Path; use uucore::fs::is_stdin_interactive; use semver::Version; @@ -41,6 +43,7 @@ enum SortMode { struct Settings { mode: SortMode, + merge: bool, reverse: bool, outfile: Option, stable: bool, @@ -53,6 +56,7 @@ impl 
Default for Settings { fn default() -> Settings { Settings { mode: SortMode::Default, + merge: false, reverse: false, outfile: None, stable: false, @@ -63,6 +67,85 @@ impl Default for Settings { } } +struct MergeableFile<'a> { + lines: Lines>>, + current_line: String, + settings: &'a Settings, +} + +// BinaryHeap depends on `Ord`. Note that we want to pop smallest items +// from the heap first, and BinaryHeap.pop() returns the largest, so we +// trick it into the right order by calling reverse() here. +impl<'a> Ord for MergeableFile<'a> { + fn cmp(&self, other: &MergeableFile) -> Ordering { + compare_by(&self.current_line, &other.current_line, &self.settings).reverse() + } +} + +impl<'a> PartialOrd for MergeableFile<'a> { + fn partial_cmp(&self, other: &MergeableFile) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> PartialEq for MergeableFile<'a> { + fn eq(&self, other: &MergeableFile) -> bool { + Ordering::Equal == compare_by(&self.current_line, &other.current_line, &self.settings) + } +} + +impl<'a> Eq for MergeableFile<'a> {} + +struct FileMerger<'a> { + heap: BinaryHeap>, + settings: &'a Settings, +} + +impl<'a> FileMerger<'a> { + fn new(settings: &'a Settings) -> FileMerger<'a> { + FileMerger { + heap: BinaryHeap::new(), + settings: settings, + } + } + fn push_file(&mut self, mut lines: Lines>>){ + match lines.next() { + Some(Ok(next_line)) => { + let mergeable_file = MergeableFile { + lines: lines, + current_line: next_line, + settings: &self.settings, + }; + self.heap.push(mergeable_file); + } + _ => {} + } + } +} + +impl<'a> Iterator for FileMerger<'a> { + type Item = String; + fn next(&mut self) -> Option { + match self.heap.pop() { + Some(mut current) => { + match current.lines.next() { + Some(Ok(next_line)) => { + let ret = replace(&mut current.current_line, next_line); + self.heap.push(current); + Some(ret) + }, + _ => { + // Don't put it back in the heap (it's empty/erroring) + // but its first line is still valid. 
+ Some(current.current_line) + }, + } + }, + None => None, + } + } +} + pub fn uumain(args: Vec) -> i32 { let mut settings: Settings = Default::default(); let mut opts = getopts::Options::new(); @@ -73,6 +156,7 @@ pub fn uumain(args: Vec) -> i32 { opts.optflag("r", "reverse", "reverse the output"); opts.optflag("h", "help", "display this help and exit"); opts.optflag("", "version", "output version information and exit"); + opts.optflag("m", "merge", "merge already sorted files; do not sort"); opts.optopt("o", "output", "write output to FILENAME instead of stdout", "FILENAME"); opts.optflag("s", "stable", "stabilize sort by disabling last-resort comparison"); opts.optflag("u", "unique", "output only the first of an equal run"); @@ -115,6 +199,7 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); SortMode::Default }; + settings.merge = matches.opt_present("merge"); settings.reverse = matches.opt_present("reverse"); settings.outfile = matches.opt_str("output"); settings.stable = matches.opt_present("stable"); @@ -147,6 +232,8 @@ With no FILE, or when FILE is -, read standard input.", NAME, VERSION); fn exec(files: Vec, settings: &Settings) -> i32 { let mut lines = Vec::new(); + let mut file_merger = FileMerger::new(&settings); + for path in &files { let (reader, _) = match open(path) { Some(x) => x, @@ -155,12 +242,17 @@ fn exec(files: Vec, settings: &Settings) -> i32 { let buf_reader = BufReader::new(reader); - for line in buf_reader.lines() { - match line { - Ok(n) => { - lines.push(n); - }, - _ => break + if settings.merge { + file_merger.push_file(buf_reader.lines()); + } + else { + for line in buf_reader.lines() { + if let Ok(n) = line { + lines.push(n); + } + else { + break; + } } } } @@ -185,6 +277,9 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } } } + else if settings.merge { + print_sorted(file_merger, &settings.outfile) + } else { print_sorted(lines.iter(), &settings.outfile) } diff --git 
a/tests/fixtures/sort/merge_ints_interleaved.expected b/tests/fixtures/sort/merge_ints_interleaved.expected new file mode 100644 index 000000000..071939893 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved.expected @@ -0,0 +1,9 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/fixtures/sort/merge_ints_interleaved_1.txt b/tests/fixtures/sort/merge_ints_interleaved_1.txt new file mode 100644 index 000000000..6e181b92b --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_1.txt @@ -0,0 +1,3 @@ +1 +4 +7 diff --git a/tests/fixtures/sort/merge_ints_interleaved_2.txt b/tests/fixtures/sort/merge_ints_interleaved_2.txt new file mode 100644 index 000000000..62ffd8a69 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_2.txt @@ -0,0 +1,3 @@ +2 +5 +8 diff --git a/tests/fixtures/sort/merge_ints_interleaved_3.txt b/tests/fixtures/sort/merge_ints_interleaved_3.txt new file mode 100644 index 000000000..1e3ac1e93 --- /dev/null +++ b/tests/fixtures/sort/merge_ints_interleaved_3.txt @@ -0,0 +1,3 @@ +3 +6 +9 diff --git a/tests/test_sort.rs b/tests/test_sort.rs index dbbdb92f5..dde6c4b53 100644 --- a/tests/test_sort.rs +++ b/tests/test_sort.rs @@ -70,6 +70,16 @@ fn test_multiple_files() { .succeeds().stdout_is_fixture("multiple_files.expected"); } +#[test] +fn test_merge_interleaved() { + new_ucmd() + .arg("-m") + .arg("merge_ints_interleaved_1.txt") + .arg("merge_ints_interleaved_2.txt") + .arg("merge_ints_interleaved_3.txt") + .succeeds().stdout_is_fixture("merge_ints_interleaved.expected"); +} + #[test] fn test_check() { new_ucmd()