1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-30 12:37:49 +00:00

uniq: avoid building list of duplicate lines

This reduces memory usage by storing only two lines of the input file at
a time. The previous implementation first built a list of all duplicate
lines (a 'group') and only then decided which lines of the group should be
printed.
This commit is contained in:
Chirag Jadwani 2021-04-19 17:02:24 +05:30
parent 879ab2ecb0
commit 3bb99e7047

View file

@ -61,34 +61,43 @@ impl Uniq {
reader: &mut BufReader<R>, reader: &mut BufReader<R>,
writer: &mut BufWriter<W>, writer: &mut BufWriter<W>,
) { ) {
let mut lines: Vec<String> = vec![];
let mut first_line_printed = false; let mut first_line_printed = false;
let delimiters = self.delimiters; let mut group_count = 1;
let line_terminator = self.get_line_terminator(); let line_terminator = self.get_line_terminator();
// Don't print any delimiting lines before, after or between groups if delimiting method is 'none' let mut lines = reader.split(line_terminator).map(get_line_string);
let no_delimiters = delimiters == Delimiters::None; let mut line = match lines.next() {
// The 'prepend' and 'both' delimit methods will cause output to start with delimiter line Some(l) => l,
let prepend_delimiter = delimiters == Delimiters::Prepend || delimiters == Delimiters::Both; None => return,
// The 'append' and 'both' delimit methods will cause output to end with delimiter line };
let append_delimiter = delimiters == Delimiters::Append || delimiters == Delimiters::Both;
for line in reader.split(line_terminator).map(get_line_string) { // compare current `line` with consecutive lines (`next_line`) of the input
if !lines.is_empty() && self.cmp_keys(&lines[0], &line) { // and if needed, print `line` based on the command line options provided
// Print delimiter if delimit method is not 'none' and any line has been output for next_line in lines {
// before or if we need to start output with delimiter if self.cmp_keys(&line, &next_line) {
let print_delimiter = !no_delimiters && (prepend_delimiter || first_line_printed); if (group_count == 1 && !self.repeats_only)
first_line_printed |= self.print_lines(writer, &lines, print_delimiter); || (group_count > 1 && !self.uniques_only)
lines.truncate(0); {
self.print_line(writer, &line, group_count, first_line_printed);
first_line_printed = true;
} }
lines.push(line); line = next_line;
group_count = 1;
} else {
if self.all_repeated {
self.print_line(writer, &line, group_count, first_line_printed);
first_line_printed = true;
line = next_line;
} }
if !lines.is_empty() { group_count += 1;
// Print delimiter if delimit method is not 'none' and any line has been output
// before or if we need to start output with delimiter
let print_delimiter = !no_delimiters && (prepend_delimiter || first_line_printed);
first_line_printed |= self.print_lines(writer, &lines, print_delimiter);
} }
if append_delimiter && first_line_printed { }
if (group_count == 1 && !self.repeats_only) || (group_count > 1 && !self.uniques_only) {
self.print_line(writer, &line, group_count, first_line_printed);
first_line_printed = true;
}
if (self.delimiters == Delimiters::Append || self.delimiters == Delimiters::Both)
&& first_line_printed
{
crash_if_err!(1, writer.write_all(&[line_terminator])); crash_if_err!(1, writer.write_all(&[line_terminator]));
} }
} }
@ -163,27 +172,17 @@ impl Uniq {
} }
} }
fn print_lines<W: Write>( fn should_print_delimiter(&self, group_count: usize, first_line_printed: bool) -> bool {
&self, // if no delimiter option is selected then no other checks needed
writer: &mut BufWriter<W>, self.delimiters != Delimiters::None
lines: &[String], // print delimiter only before the first line of a group, not between lines of a group
print_delimiter: bool, && group_count == 1
) -> bool { // if at least one line has been output before current group then print delimiter
let mut first_line_printed = false; && (first_line_printed
let mut count = if self.all_repeated { 1 } else { lines.len() }; // or if we need to prepend delimiter then print it even at the start of the output
if lines.len() == 1 && !self.repeats_only || lines.len() > 1 && !self.uniques_only { || self.delimiters == Delimiters::Prepend
self.print_line(writer, &lines[0], count, print_delimiter); // the 'both' delimit mode should prepend and append delimiters
first_line_printed = true; || self.delimiters == Delimiters::Both)
count += 1;
}
if self.all_repeated {
for line in lines[1..].iter() {
self.print_line(writer, line, count, print_delimiter && !first_line_printed);
first_line_printed = true;
count += 1;
}
}
first_line_printed
} }
fn print_line<W: Write>( fn print_line<W: Write>(
@ -191,11 +190,11 @@ impl Uniq {
writer: &mut BufWriter<W>, writer: &mut BufWriter<W>,
line: &str, line: &str,
count: usize, count: usize,
print_delimiter: bool, first_line_printed: bool,
) { ) {
let line_terminator = self.get_line_terminator(); let line_terminator = self.get_line_terminator();
if print_delimiter { if self.should_print_delimiter(count, first_line_printed) {
crash_if_err!(1, writer.write_all(&[line_terminator])); crash_if_err!(1, writer.write_all(&[line_terminator]));
} }