mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
cut: refactor (#4255)
Refactors `cut field` logic to reduce code duplication by factoring out the common `Searcher`, which is _templatized_ on a specific `Matcher` -- `ExactMatcher` for an explicit delimiter and `WhitespaceMatcher` for a whitespace delimiter. Before: code duplication in `Searcher` and `WhitespaceSearcher`; code duplication in `cut_fields` and `cut_fields_whitespace`. After: two versions of `Matcher`; one `Searcher`; `cut_fields` simplified by delegating the actual work to specific functions.
This commit is contained in:
parent
8c6d0e7630
commit
3ad36a49cb
4 changed files with 354 additions and 308 deletions
|
@ -16,12 +16,12 @@ use uucore::display::Quotable;
|
||||||
use uucore::error::{FromIo, UResult, USimpleError};
|
use uucore::error::{FromIo, UResult, USimpleError};
|
||||||
|
|
||||||
use self::searcher::Searcher;
|
use self::searcher::Searcher;
|
||||||
use self::whitespace_searcher::WhitespaceSearcher;
|
use matcher::{ExactMatcher, Matcher, WhitespaceMatcher};
|
||||||
use uucore::ranges::Range;
|
use uucore::ranges::Range;
|
||||||
use uucore::{format_usage, show, show_error, show_if_err};
|
use uucore::{format_usage, show, show_error, show_if_err};
|
||||||
|
|
||||||
|
mod matcher;
|
||||||
mod searcher;
|
mod searcher;
|
||||||
mod whitespace_searcher;
|
|
||||||
|
|
||||||
static USAGE: &str =
|
static USAGE: &str =
|
||||||
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
|
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
|
||||||
|
@ -188,23 +188,22 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::cognitive_complexity)]
|
// Output delimiter is explicitly specified
|
||||||
fn cut_fields_delimiter<R: Read>(
|
fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
|
||||||
reader: R,
|
reader: R,
|
||||||
|
matcher: &M,
|
||||||
ranges: &[Range],
|
ranges: &[Range],
|
||||||
delim: &str,
|
|
||||||
only_delimited: bool,
|
only_delimited: bool,
|
||||||
newline_char: u8,
|
newline_char: u8,
|
||||||
out_delim: &str,
|
out_delim: &str,
|
||||||
) -> UResult<()> {
|
) -> UResult<()> {
|
||||||
let mut buf_in = BufReader::new(reader);
|
let mut buf_in = BufReader::new(reader);
|
||||||
let mut out = stdout_writer();
|
let mut out = stdout_writer();
|
||||||
let input_delim_len = delim.len();
|
|
||||||
|
|
||||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||||
let mut fields_pos = 1;
|
let mut fields_pos = 1;
|
||||||
let mut low_idx = 0;
|
let mut low_idx = 0;
|
||||||
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
|
let mut delim_search = Searcher::new(matcher, line).peekable();
|
||||||
let mut print_delim = false;
|
let mut print_delim = false;
|
||||||
|
|
||||||
if delim_search.peek().is_none() {
|
if delim_search.peek().is_none() {
|
||||||
|
@ -218,85 +217,6 @@ fn cut_fields_delimiter<R: Read>(
|
||||||
return Ok(true);
|
return Ok(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
for &Range { low, high } in ranges {
|
|
||||||
if low - fields_pos > 0 {
|
|
||||||
low_idx = match delim_search.nth(low - fields_pos - 1) {
|
|
||||||
Some(index) => index + input_delim_len,
|
|
||||||
None => break,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
for _ in 0..=high - low {
|
|
||||||
if print_delim {
|
|
||||||
out.write_all(out_delim.as_bytes())?;
|
|
||||||
} else {
|
|
||||||
print_delim = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
match delim_search.next() {
|
|
||||||
Some(high_idx) => {
|
|
||||||
let segment = &line[low_idx..high_idx];
|
|
||||||
|
|
||||||
out.write_all(segment)?;
|
|
||||||
|
|
||||||
low_idx = high_idx + input_delim_len;
|
|
||||||
fields_pos = high + 1;
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
let segment = &line[low_idx..];
|
|
||||||
|
|
||||||
out.write_all(segment)?;
|
|
||||||
|
|
||||||
if line[line.len() - 1] == newline_char {
|
|
||||||
return Ok(true);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
out.write_all(&[newline_char])?;
|
|
||||||
Ok(true)
|
|
||||||
});
|
|
||||||
|
|
||||||
if let Err(e) = result {
|
|
||||||
return Err(USimpleError::new(1, e.to_string()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn cut_fields_whitespace<R: Read>(
|
|
||||||
reader: R,
|
|
||||||
ranges: &[Range],
|
|
||||||
only_delimited: bool,
|
|
||||||
newline_char: u8,
|
|
||||||
out_delim: &str,
|
|
||||||
) -> UResult<()> {
|
|
||||||
let mut buf_in = BufReader::new(reader);
|
|
||||||
let mut out = stdout_writer();
|
|
||||||
|
|
||||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
|
||||||
let mut fields_pos = 1;
|
|
||||||
let mut low_idx = 0;
|
|
||||||
let mut delim_search = WhitespaceSearcher::new(line).peekable();
|
|
||||||
let mut print_delim = false;
|
|
||||||
|
|
||||||
if delim_search.peek().is_none() {
|
|
||||||
if !only_delimited {
|
|
||||||
out.write_all(line)?;
|
|
||||||
if line[line.len() - 1] != newline_char {
|
|
||||||
out.write_all(&[newline_char])?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(true);
|
|
||||||
}
|
|
||||||
// The logic is identical to `cut_fields_delimiter` function above, which uses
|
|
||||||
// `Searcher` that iterates over and returns the first position of the delimiter character.
|
|
||||||
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
|
|
||||||
// delimiter character positions, since each delimiter sequence length can vary.
|
|
||||||
for &Range { low, high } in ranges {
|
for &Range { low, high } in ranges {
|
||||||
if low - fields_pos > 0 {
|
if low - fields_pos > 0 {
|
||||||
// current field is not in the range, so jump to the field corresponding to the
|
// current field is not in the range, so jump to the field corresponding to the
|
||||||
|
@ -317,7 +237,7 @@ fn cut_fields_whitespace<R: Read>(
|
||||||
}
|
}
|
||||||
|
|
||||||
match delim_search.next() {
|
match delim_search.next() {
|
||||||
// print the current field up to the next whitespace
|
// print the current field up to the next field delim
|
||||||
Some((first, last)) => {
|
Some((first, last)) => {
|
||||||
let segment = &line[low_idx..first];
|
let segment = &line[low_idx..first];
|
||||||
|
|
||||||
|
@ -352,40 +272,25 @@ fn cut_fields_whitespace<R: Read>(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
// Output delimiter is the same as input delimiter
|
||||||
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
|
||||||
match opts.delimiter {
|
reader: R,
|
||||||
Delimiter::Whitespace => cut_fields_whitespace(
|
matcher: &M,
|
||||||
reader,
|
ranges: &[Range],
|
||||||
ranges,
|
only_delimited: bool,
|
||||||
opts.only_delimited,
|
newline_char: u8,
|
||||||
newline_char,
|
) -> UResult<()> {
|
||||||
opts.out_delimiter.as_deref().unwrap_or("\t"),
|
|
||||||
),
|
|
||||||
Delimiter::String(ref delimiter) => {
|
|
||||||
if let Some(ref o_delim) = opts.out_delimiter {
|
|
||||||
return cut_fields_delimiter(
|
|
||||||
reader,
|
|
||||||
ranges,
|
|
||||||
delimiter,
|
|
||||||
opts.only_delimited,
|
|
||||||
newline_char,
|
|
||||||
o_delim,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut buf_in = BufReader::new(reader);
|
let mut buf_in = BufReader::new(reader);
|
||||||
let mut out = stdout_writer();
|
let mut out = stdout_writer();
|
||||||
let delim_len = delimiter.len();
|
|
||||||
|
|
||||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||||
let mut fields_pos = 1;
|
let mut fields_pos = 1;
|
||||||
let mut low_idx = 0;
|
let mut low_idx = 0;
|
||||||
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
|
let mut delim_search = Searcher::new(matcher, line).peekable();
|
||||||
let mut print_delim = false;
|
let mut print_delim = false;
|
||||||
|
|
||||||
if delim_search.peek().is_none() {
|
if delim_search.peek().is_none() {
|
||||||
if !opts.only_delimited {
|
if !only_delimited {
|
||||||
out.write_all(line)?;
|
out.write_all(line)?;
|
||||||
if line[line.len() - 1] != newline_char {
|
if line[line.len() - 1] != newline_char {
|
||||||
out.write_all(&[newline_char])?;
|
out.write_all(&[newline_char])?;
|
||||||
|
@ -397,25 +302,21 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
||||||
|
|
||||||
for &Range { low, high } in ranges {
|
for &Range { low, high } in ranges {
|
||||||
if low - fields_pos > 0 {
|
if low - fields_pos > 0 {
|
||||||
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
|
if let Some((first, last)) = delim_search.nth(low - fields_pos - 1) {
|
||||||
low_idx = if print_delim {
|
low_idx = if print_delim { first } else { last }
|
||||||
delim_pos
|
|
||||||
} else {
|
|
||||||
delim_pos + delim_len
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
match delim_search.nth(high - low) {
|
match delim_search.nth(high - low) {
|
||||||
Some(high_idx) => {
|
Some((first, _)) => {
|
||||||
let segment = &line[low_idx..high_idx];
|
let segment = &line[low_idx..first];
|
||||||
|
|
||||||
out.write_all(segment)?;
|
out.write_all(segment)?;
|
||||||
|
|
||||||
print_delim = true;
|
print_delim = true;
|
||||||
low_idx = high_idx;
|
low_idx = first;
|
||||||
fields_pos = high + 1;
|
fields_pos = high + 1;
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
@ -440,6 +341,42 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
||||||
|
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||||
|
match opts.delimiter {
|
||||||
|
Delimiter::String(ref delim) => {
|
||||||
|
let matcher = ExactMatcher::new(delim.as_bytes());
|
||||||
|
match opts.out_delimiter {
|
||||||
|
Some(ref out_delim) => cut_fields_explicit_out_delim(
|
||||||
|
reader,
|
||||||
|
&matcher,
|
||||||
|
ranges,
|
||||||
|
opts.only_delimited,
|
||||||
|
newline_char,
|
||||||
|
out_delim,
|
||||||
|
),
|
||||||
|
None => cut_fields_implicit_out_delim(
|
||||||
|
reader,
|
||||||
|
&matcher,
|
||||||
|
ranges,
|
||||||
|
opts.only_delimited,
|
||||||
|
newline_char,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Delimiter::Whitespace => {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
|
||||||
|
cut_fields_explicit_out_delim(
|
||||||
|
reader,
|
||||||
|
&matcher,
|
||||||
|
ranges,
|
||||||
|
opts.only_delimited,
|
||||||
|
newline_char,
|
||||||
|
out_delim,
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
126
src/uu/cut/src/matcher.rs
Normal file
126
src/uu/cut/src/matcher.rs
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
// This file is part of the uutils coreutils package.
|
||||||
|
//
|
||||||
|
// For the full copyright and license information, please view the LICENSE
|
||||||
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
|
use memchr::{memchr, memchr2};
|
||||||
|
|
||||||
|
// Find the next matching byte sequence positions
|
||||||
|
// Return (first, last) where haystack[first..last] corresponds to the matched pattern
|
||||||
|
pub trait Matcher {
|
||||||
|
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Matches for the exact byte sequence pattern
|
||||||
|
pub struct ExactMatcher<'a> {
|
||||||
|
needle: &'a [u8],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ExactMatcher<'a> {
|
||||||
|
pub fn new(needle: &'a [u8]) -> Self {
|
||||||
|
assert!(!needle.is_empty());
|
||||||
|
Self { needle }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Matcher for ExactMatcher<'a> {
|
||||||
|
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
|
||||||
|
let mut pos = 0usize;
|
||||||
|
loop {
|
||||||
|
match memchr(self.needle[0], &haystack[pos..]) {
|
||||||
|
Some(match_idx) => {
|
||||||
|
let match_idx = match_idx + pos; // account for starting from pos
|
||||||
|
if self.needle.len() == 1
|
||||||
|
|| haystack[match_idx + 1..].starts_with(&self.needle[1..])
|
||||||
|
{
|
||||||
|
return Some((match_idx, match_idx + self.needle.len()));
|
||||||
|
} else {
|
||||||
|
pos = match_idx + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Matches for any number of SPACE or TAB
|
||||||
|
pub struct WhitespaceMatcher {}
|
||||||
|
|
||||||
|
impl Matcher for WhitespaceMatcher {
|
||||||
|
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
|
||||||
|
match memchr2(b' ', b'\t', haystack) {
|
||||||
|
Some(match_idx) => {
|
||||||
|
let mut skip = match_idx + 1;
|
||||||
|
while skip < haystack.len() {
|
||||||
|
match haystack[skip] {
|
||||||
|
b' ' | b'\t' => skip += 1,
|
||||||
|
_ => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some((match_idx, skip))
|
||||||
|
}
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod matcher_tests {
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_exact_matcher_single_byte() {
|
||||||
|
let matcher = ExactMatcher::new(":".as_bytes());
|
||||||
|
// spell-checker:disable
|
||||||
|
assert_eq!(matcher.next_match("".as_bytes()), None);
|
||||||
|
assert_eq!(matcher.next_match(":".as_bytes()), Some((0, 1)));
|
||||||
|
assert_eq!(matcher.next_match(":abcxyz".as_bytes()), Some((0, 1)));
|
||||||
|
assert_eq!(matcher.next_match("abc:xyz".as_bytes()), Some((3, 4)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz:".as_bytes()), Some((6, 7)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz".as_bytes()), None);
|
||||||
|
// spell-checker:enable
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_exact_matcher_multi_bytes() {
|
||||||
|
let matcher = ExactMatcher::new("<>".as_bytes());
|
||||||
|
// spell-checker:disable
|
||||||
|
assert_eq!(matcher.next_match("".as_bytes()), None);
|
||||||
|
assert_eq!(matcher.next_match("<>".as_bytes()), Some((0, 2)));
|
||||||
|
assert_eq!(matcher.next_match("<>abcxyz".as_bytes()), Some((0, 2)));
|
||||||
|
assert_eq!(matcher.next_match("abc<>xyz".as_bytes()), Some((3, 5)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz<>".as_bytes()), Some((6, 8)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz".as_bytes()), None);
|
||||||
|
// spell-checker:enable
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_whitespace_matcher_single_space() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
// spell-checker:disable
|
||||||
|
assert_eq!(matcher.next_match("".as_bytes()), None);
|
||||||
|
assert_eq!(matcher.next_match(" ".as_bytes()), Some((0, 1)));
|
||||||
|
assert_eq!(matcher.next_match("\tabcxyz".as_bytes()), Some((0, 1)));
|
||||||
|
assert_eq!(matcher.next_match("abc\txyz".as_bytes()), Some((3, 4)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz ".as_bytes()), Some((6, 7)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz".as_bytes()), None);
|
||||||
|
// spell-checker:enable
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_whitespace_matcher_multi_spaces() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
// spell-checker:disable
|
||||||
|
assert_eq!(matcher.next_match("".as_bytes()), None);
|
||||||
|
assert_eq!(matcher.next_match(" \t ".as_bytes()), Some((0, 3)));
|
||||||
|
assert_eq!(matcher.next_match("\t\tabcxyz".as_bytes()), Some((0, 2)));
|
||||||
|
assert_eq!(matcher.next_match("abc \txyz".as_bytes()), Some((3, 5)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz ".as_bytes()), Some((6, 8)));
|
||||||
|
assert_eq!(matcher.next_match("abcxyz".as_bytes()), None);
|
||||||
|
// spell-checker:enable
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,82 +5,77 @@
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
use memchr::memchr;
|
// spell-checker:ignore multispace
|
||||||
|
|
||||||
pub struct Searcher<'a> {
|
use super::matcher::Matcher;
|
||||||
haystack: &'a [u8],
|
|
||||||
needle: &'a [u8],
|
// Generic searcher that relies on a specific matcher
|
||||||
|
pub struct Searcher<'a, 'b, M: Matcher> {
|
||||||
|
matcher: &'a M,
|
||||||
|
haystack: &'b [u8],
|
||||||
position: usize,
|
position: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Searcher<'a> {
|
impl<'a, 'b, M: Matcher> Searcher<'a, 'b, M> {
|
||||||
pub fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> {
|
pub fn new(matcher: &'a M, haystack: &'b [u8]) -> Self {
|
||||||
assert!(!needle.is_empty());
|
Self {
|
||||||
Searcher {
|
matcher,
|
||||||
haystack,
|
haystack,
|
||||||
needle,
|
|
||||||
position: 0,
|
position: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator for Searcher<'a> {
|
// Iterate over field delimiters
|
||||||
type Item = usize;
|
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
|
||||||
|
// corresponds to the delimiter.
|
||||||
|
impl<'a, 'b, M: Matcher> Iterator for Searcher<'a, 'b, M> {
|
||||||
|
type Item = (usize, usize);
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
loop {
|
match self.matcher.next_match(&self.haystack[self.position..]) {
|
||||||
if let Some(match_idx) = memchr(self.needle[0], self.haystack) {
|
Some((first, last)) => {
|
||||||
if self.needle.len() == 1
|
let result = (first + self.position, last + self.position);
|
||||||
|| self.haystack[match_idx + 1..].starts_with(&self.needle[1..])
|
self.position += last;
|
||||||
{
|
Some(result)
|
||||||
let match_pos = self.position + match_idx;
|
|
||||||
let skip = match_idx + self.needle.len();
|
|
||||||
self.haystack = &self.haystack[skip..];
|
|
||||||
self.position += skip;
|
|
||||||
return Some(match_pos);
|
|
||||||
} else {
|
|
||||||
let skip = match_idx + 1;
|
|
||||||
self.haystack = &self.haystack[skip..];
|
|
||||||
self.position += skip;
|
|
||||||
// continue
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return None;
|
|
||||||
}
|
}
|
||||||
|
None => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod exact_searcher_tests {
|
||||||
|
|
||||||
|
use super::super::matcher::ExactMatcher;
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
const NEEDLE: &[u8] = "ab".as_bytes();
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_normal() {
|
fn test_normal() {
|
||||||
let iter = Searcher::new("a.a.a".as_bytes(), "a".as_bytes());
|
let matcher = ExactMatcher::new("a".as_bytes());
|
||||||
let items: Vec<usize> = iter.collect();
|
let iter = Searcher::new(&matcher, "a.a.a".as_bytes());
|
||||||
assert_eq!(vec![0, 2, 4], items);
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_empty() {
|
fn test_empty() {
|
||||||
let iter = Searcher::new("".as_bytes(), "a".as_bytes());
|
let matcher = ExactMatcher::new("a".as_bytes());
|
||||||
let items: Vec<usize> = iter.collect();
|
let iter = Searcher::new(&matcher, "".as_bytes());
|
||||||
assert_eq!(vec![] as Vec<usize>, items);
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(vec![] as Vec<(usize, usize)>, items);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_multibyte(line: &[u8], expected: &[usize]) {
|
fn test_multibyte(line: &[u8], expected: &[(usize, usize)]) {
|
||||||
let iter = Searcher::new(line, NEEDLE);
|
let matcher = ExactMatcher::new("ab".as_bytes());
|
||||||
let items: Vec<usize> = iter.collect();
|
let iter = Searcher::new(&matcher, line);
|
||||||
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
assert_eq!(expected, items);
|
assert_eq!(expected, items);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multibyte_normal() {
|
fn test_multibyte_normal() {
|
||||||
test_multibyte("...ab...ab...".as_bytes(), &[3, 8]);
|
test_multibyte("...ab...ab...".as_bytes(), &[(3, 5), (8, 10)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -90,16 +85,101 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multibyte_starting_needle() {
|
fn test_multibyte_starting_needle() {
|
||||||
test_multibyte("ab...ab...".as_bytes(), &[0, 5]);
|
test_multibyte("ab...ab...".as_bytes(), &[(0, 2), (5, 7)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multibyte_trailing_needle() {
|
fn test_multibyte_trailing_needle() {
|
||||||
test_multibyte("...ab...ab".as_bytes(), &[3, 8]);
|
test_multibyte("...ab...ab".as_bytes(), &[(3, 5), (8, 10)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multibyte_first_byte_false_match() {
|
fn test_multibyte_first_byte_false_match() {
|
||||||
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[10]);
|
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[(10, 12)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_searcher_with_exact_matcher() {
|
||||||
|
let matcher = ExactMatcher::new("<>".as_bytes());
|
||||||
|
let haystack = "<><>a<>b<><>cd<><>".as_bytes();
|
||||||
|
let mut searcher = Searcher::new(&matcher, haystack);
|
||||||
|
assert_eq!(searcher.next(), Some((0, 2)));
|
||||||
|
assert_eq!(searcher.next(), Some((2, 4)));
|
||||||
|
assert_eq!(searcher.next(), Some((5, 7)));
|
||||||
|
assert_eq!(searcher.next(), Some((8, 10)));
|
||||||
|
assert_eq!(searcher.next(), Some((10, 12)));
|
||||||
|
assert_eq!(searcher.next(), Some((14, 16)));
|
||||||
|
assert_eq!(searcher.next(), Some((16, 18)));
|
||||||
|
assert_eq!(searcher.next(), None);
|
||||||
|
assert_eq!(searcher.next(), None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod whitespace_searcher_tests {
|
||||||
|
|
||||||
|
use super::super::matcher::WhitespaceMatcher;
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_space() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let iter = Searcher::new(&matcher, " . . ".as_bytes());
|
||||||
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tab() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let iter = Searcher::new(&matcher, "\t.\t.\t".as_bytes());
|
||||||
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_empty() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let iter = Searcher::new(&matcher, "".as_bytes());
|
||||||
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(vec![] as Vec<(usize, usize)>, items);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let iter = Searcher::new(&matcher, line);
|
||||||
|
let items: Vec<(usize, usize)> = iter.collect();
|
||||||
|
assert_eq!(expected, items);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multispace_normal() {
|
||||||
|
test_multispace(
|
||||||
|
"... ... \t...\t ... \t ...".as_bytes(),
|
||||||
|
&[(3, 5), (8, 10), (13, 15), (18, 21)],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multispace_begin() {
|
||||||
|
test_multispace(" \t\t...".as_bytes(), &[(0, 3)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multispace_end() {
|
||||||
|
test_multispace("...\t ".as_bytes(), &[(3, 6)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_searcher_with_whitespace_matcher() {
|
||||||
|
let matcher = WhitespaceMatcher {};
|
||||||
|
let haystack = "\t a b \t cd\t\t".as_bytes();
|
||||||
|
let mut searcher = Searcher::new(&matcher, haystack);
|
||||||
|
assert_eq!(searcher.next(), Some((0, 2)));
|
||||||
|
assert_eq!(searcher.next(), Some((3, 4)));
|
||||||
|
assert_eq!(searcher.next(), Some((5, 8)));
|
||||||
|
assert_eq!(searcher.next(), Some((10, 12)));
|
||||||
|
assert_eq!(searcher.next(), None);
|
||||||
|
assert_eq!(searcher.next(), None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,97 +0,0 @@
|
||||||
// This file is part of the uutils coreutils package.
|
|
||||||
//
|
|
||||||
// For the full copyright and license information, please view the LICENSE
|
|
||||||
// file that was distributed with this source code.
|
|
||||||
|
|
||||||
// spell-checker:ignore multispace
|
|
||||||
|
|
||||||
use memchr::memchr2;
|
|
||||||
|
|
||||||
pub struct WhitespaceSearcher<'a> {
|
|
||||||
haystack: &'a [u8],
|
|
||||||
position: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> WhitespaceSearcher<'a> {
|
|
||||||
pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> {
|
|
||||||
WhitespaceSearcher {
|
|
||||||
haystack,
|
|
||||||
position: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Iterator for WhitespaceSearcher<'a> {
|
|
||||||
type Item = (usize, usize);
|
|
||||||
|
|
||||||
// Iterate over sequences of consecutive whitespace (space and/or tab) characters.
|
|
||||||
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
|
|
||||||
// corresponds to the delimiter.
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) {
|
|
||||||
let mut skip = match_idx + 1;
|
|
||||||
while skip < self.haystack.len()
|
|
||||||
&& (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t')
|
|
||||||
{
|
|
||||||
skip += 1;
|
|
||||||
}
|
|
||||||
let match_pos = self.position + match_idx;
|
|
||||||
self.haystack = &self.haystack[skip..];
|
|
||||||
self.position += skip;
|
|
||||||
Some((match_pos, self.position))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_space() {
|
|
||||||
let iter = WhitespaceSearcher::new(" . . ".as_bytes());
|
|
||||||
let items: Vec<(usize, usize)> = iter.collect();
|
|
||||||
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_tab() {
|
|
||||||
let iter = WhitespaceSearcher::new("\t.\t.\t".as_bytes());
|
|
||||||
let items: Vec<(usize, usize)> = iter.collect();
|
|
||||||
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_empty() {
|
|
||||||
let iter = WhitespaceSearcher::new("".as_bytes());
|
|
||||||
let items: Vec<(usize, usize)> = iter.collect();
|
|
||||||
assert_eq!(vec![] as Vec<(usize, usize)>, items);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
|
|
||||||
let iter = WhitespaceSearcher::new(line);
|
|
||||||
let items: Vec<(usize, usize)> = iter.collect();
|
|
||||||
assert_eq!(expected, items);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_multispace_normal() {
|
|
||||||
test_multispace(
|
|
||||||
"... ... \t...\t ... \t ...".as_bytes(),
|
|
||||||
&[(3, 5), (8, 10), (13, 15), (18, 21)],
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_multispace_begin() {
|
|
||||||
test_multispace(" \t\t...".as_bytes(), &[(0, 3)]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_multispace_end() {
|
|
||||||
test_multispace("...\t ".as_bytes(), &[(3, 6)]);
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Add table
Add a link
Reference in a new issue