From 116e253cc0391e18e145170dcff222f07d2b00a2 Mon Sep 17 00:00:00 2001 From: Chirag Jadwani Date: Mon, 15 Mar 2021 18:11:33 +0530 Subject: [PATCH] uniq: Fix skip fields Current implementation of the skip fields logic does not handle multibyte code points correctly. It assumes each code point (`char`) is one byte. If the skipped part of the input line has any multibyte code points then this can cause fields not being skipped correctly (field start index is calculated to be before it actually starts). --- src/uu/uniq/src/uniq.rs | 25 ++++++++++------------ tests/fixtures/uniq/skip-2-fields.expected | 2 +- tests/fixtures/uniq/skip-fields.txt | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index 98260bb8f..a1809f0f0 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -79,22 +79,19 @@ impl Uniq { fn skip_fields<'a>(&self, line: &'a str) -> &'a str { if let Some(skip_fields) = self.skip_fields { - if line.split_whitespace().count() > skip_fields { - let mut field = 0; - let mut i = 0; - while field < skip_fields && i < line.len() { - while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - field += 1; + let mut i = 0; + let mut char_indices = line.char_indices(); + for _ in 0..skip_fields { + if char_indices.find(|(_, c)| !c.is_whitespace()) == None { + return ""; + } + match char_indices.find(|(_, c)| c.is_whitespace()) { + None => return "", + + Some((next_field_i, _)) => i = next_field_i, } - &line[i..] - } else { - "" } + &line[i..] } else { line } diff --git a/tests/fixtures/uniq/skip-2-fields.expected b/tests/fixtures/uniq/skip-2-fields.expected index 8cffc2ebc..b971c2b2b 100644 --- a/tests/fixtures/uniq/skip-2-fields.expected +++ b/tests/fixtures/uniq/skip-2-fields.expected @@ -1,2 +1,2 @@ - aaa aa a + aaa ⟪⟫ a aa a diff --git a/tests/fixtures/uniq/skip-fields.txt b/tests/fixtures/uniq/skip-fields.txt index f84e377a0..4ec2744c6 100644 --- a/tests/fixtures/uniq/skip-fields.txt +++ b/tests/fixtures/uniq/skip-fields.txt @@ -1,4 +1,4 @@ - aaa aa a + aaa ⟪⟫ a ZZZ aa a ZZZ aa a ZZZ bb a