From 4555e6fe488572a5fc6474bc69e40c4a5aca8103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teemu=20P=C3=A4tsi?= Date: Sat, 24 May 2025 01:27:09 +0300 Subject: [PATCH 1/5] expr: Handle trailing backslash error --- src/uu/expr/src/expr.rs | 2 ++ src/uu/expr/src/syntax_tree.rs | 6 ++++++ tests/by-util/test_expr.rs | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/uu/expr/src/expr.rs b/src/uu/expr/src/expr.rs index 073bf501a..fa165f9f3 100644 --- a/src/uu/expr/src/expr.rs +++ b/src/uu/expr/src/expr.rs @@ -50,6 +50,8 @@ pub enum ExprError { UnmatchedClosingBrace, #[error("Invalid content of \\{{\\}}")] InvalidBracketContent, + #[error("Trailing backslash")] + TrailingBackslash, } impl UError for ExprError { diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index 106b4bd68..11103ee4b 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -161,6 +161,7 @@ impl StringOp { match first { Some('^') => {} // Start of string anchor is already added Some('*') => re_string.push_str(r"\*"), + Some('\\') if right.len() == 1 => return Err(ExprError::TrailingBackslash), Some(char) => re_string.push(char), None => return Ok(0.into()), }; @@ -169,6 +170,8 @@ impl StringOp { let mut prev = first.unwrap_or_default(); let mut prev_is_escaped = false; while let Some(curr) = pattern_chars.next() { + let curr_is_escaped = prev == '\\' && !prev_is_escaped; + match curr { '^' => match (prev, prev_is_escaped) { // Start of a capturing group @@ -201,6 +204,9 @@ impl StringOp { re_string.push('$'); } } + '\\' if !curr_is_escaped && pattern_chars.peek().is_none() => { + return Err(ExprError::TrailingBackslash); + } _ => re_string.push(curr), } diff --git a/tests/by-util/test_expr.rs b/tests/by-util/test_expr.rs index c5fb96c3d..e301c2470 100644 --- a/tests/by-util/test_expr.rs +++ b/tests/by-util/test_expr.rs @@ -365,6 +365,26 @@ fn test_regex() { .stdout_only("0\n"); } +#[test] +fn test_regex_trailing_backslash() { + new_ucmd!() + .args(&["\\", ":", "\\\\"]) + .succeeds() + .stdout_only("1\n"); + new_ucmd!() + .args(&["\\", ":", "\\"]) + .fails() + .stderr_only("expr: Trailing backslash\n"); + new_ucmd!() + .args(&["abc\\", ":", "abc\\\\"]) + .succeeds() + .stdout_only("4\n"); + new_ucmd!() + .args(&["abc\\", ":", "abc\\"]) + .fails() + .stderr_only("expr: Trailing backslash\n"); +} + #[test] fn test_substr() { new_ucmd!() From b0390fe36e5a7404f7ddb83fb1131974b61e835d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teemu=20P=C3=A4tsi?= Date: Sat, 24 May 2025 01:28:53 +0300 Subject: [PATCH 2/5] expr: Handle `$` at the beginning of the regex pattern --- src/uu/expr/src/syntax_tree.rs | 39 +++++++++++++++++++++------------- tests/by-util/test_expr.rs | 16 ++++++++++++++ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index 11103ee4b..9b80d40c2 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -161,6 +161,7 @@ impl StringOp { match first { Some('^') => {} // Start of string anchor is already added Some('*') => re_string.push_str(r"\*"), + Some('$') if !is_end_of_expression(&pattern_chars) => re_string.push_str(r"\$"), Some('\\') if right.len() == 1 => return Err(ExprError::TrailingBackslash), Some(char) => re_string.push(char), None => return Ok(0.into()), @@ -185,23 +186,12 @@ impl StringOp { _ => re_string.push_str(r"\^"), }, '$' => { - if let Some('\\') = pattern_chars.peek() { - // The next character was checked to be a backslash - let backslash = pattern_chars.next().unwrap_or_default(); - match pattern_chars.peek() { - // End of a capturing group - Some(')') => re_string.push('$'), - // End of an alternative pattern - Some('|') => re_string.push('$'), - _ => re_string.push_str(r"\$"), - } - re_string.push(backslash); - } else if (prev_is_escaped || prev != '\\') - && pattern_chars.peek().is_some() - { + if is_end_of_expression(&pattern_chars) { + re_string.push(curr); + } else if !curr_is_escaped { re_string.push_str(r"\$"); } else { - re_string.push('$'); + re_string.push(curr); } } '\\' if !curr_is_escaped && pattern_chars.peek().is_none() => { @@ -247,6 +237,25 @@ impl StringOp { } } +/// Check if regex pattern character iterator is at the end of a regex expression or subexpression +fn is_end_of_expression(pattern_chars: &I) -> bool +where + I: Iterator + Clone, +{ + let mut pattern_chars_clone = pattern_chars.clone(); + match pattern_chars_clone.next() { + Some('\\') => { + match pattern_chars_clone.next() { + Some(')') // End of a capturing group + | Some('|') => true, // End of an alternative pattern + _ => false, + } + } + None => true, // No characters left + _ => false, + } +} + /// Check for errors in a supplied regular expression /// /// GNU coreutils shows messages for invalid regular expressions diff --git a/tests/by-util/test_expr.rs b/tests/by-util/test_expr.rs index e301c2470..2eee64555 100644 --- a/tests/by-util/test_expr.rs +++ b/tests/by-util/test_expr.rs @@ -318,6 +318,14 @@ fn test_regex() { .args(&["a$c", ":", "a$\\c"]) .succeeds() .stdout_only("3\n"); + new_ucmd!() + .args(&["$a", ":", "$a"]) + .succeeds() + .stdout_only("2\n"); + new_ucmd!() + .args(&["a", ":", "a$\\|b"]) + .succeeds() + .stdout_only("1\n"); new_ucmd!() .args(&["^^^^^^^^^", ":", "^^^"]) .succeeds() @@ -363,6 +371,14 @@ fn test_regex() { .args(&["abc", ":", "ab[^c]"]) .fails() .stdout_only("0\n"); + new_ucmd!() + .args(&["$", ":", "$"]) + .fails() + .stdout_only("0\n"); + new_ucmd!() + .args(&["a$", ":", "a$\\|b"]) + .fails() + .stdout_only("0\n"); } #[test] From 63ce37cf6e8318ecb4b678e719a086dfb5bc15b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teemu=20P=C3=A4tsi?= Date: Sat, 24 May 2025 11:14:32 +0300 Subject: [PATCH 3/5] expr: Refactor regex tests into multiple targeted functions --- tests/by-util/test_expr.rs | 207 +++++++++++++++++++------------------ 1 file changed, 106 insertions(+), 101 deletions(-) diff --git a/tests/by-util/test_expr.rs b/tests/by-util/test_expr.rs index 2eee64555..2c0eafe32 100644 --- a/tests/by-util/test_expr.rs +++ b/tests/by-util/test_expr.rs @@ -273,112 +273,12 @@ fn test_length_mb() { } #[test] -fn test_regex() { - new_ucmd!() - .args(&["a^b", ":", "a^b"]) - .succeeds() - .stdout_only("3\n"); - new_ucmd!() - .args(&["a^b", ":", "a\\^b"]) - .succeeds() - .stdout_only("3\n"); - new_ucmd!() - .args(&["b", ":", "a\\|^b"]) - .succeeds() - .stdout_only("1\n"); - new_ucmd!() - .args(&["ab", ":", "\\(^a\\)b"]) - .succeeds() - .stdout_only("a\n"); - new_ucmd!() - .args(&["a$b", ":", "a\\$b"]) - .succeeds() - .stdout_only("3\n"); - new_ucmd!() - .args(&["a", ":", "a$\\|b"]) - .succeeds() - .stdout_only("1\n"); - new_ucmd!() - .args(&["ab", ":", "a\\(b$\\)"]) - .succeeds() - .stdout_only("b\n"); - new_ucmd!() - .args(&["abc", ":", "^abc"]) - .succeeds() - .stdout_only("3\n"); - new_ucmd!() - .args(&["^abc", ":", "^^abc"]) - .succeeds() - .stdout_only("4\n"); - new_ucmd!() - .args(&["b^$ic", ":", "b^\\$ic"]) - .succeeds() - .stdout_only("5\n"); - new_ucmd!() - .args(&["a$c", ":", "a$\\c"]) - .succeeds() - .stdout_only("3\n"); - new_ucmd!() - .args(&["$a", ":", "$a"]) - .succeeds() - .stdout_only("2\n"); - new_ucmd!() - .args(&["a", ":", "a$\\|b"]) - .succeeds() - .stdout_only("1\n"); - new_ucmd!() - .args(&["^^^^^^^^^", ":", "^^^"]) - .succeeds() - .stdout_only("2\n"); - new_ucmd!() - .args(&["ab[^c]", ":", "ab[^c]"]) - .succeeds() - .stdout_only("3\n"); // Matches "ab[" - new_ucmd!() - .args(&["ab[^c]", ":", "ab\\[^c]"]) - .succeeds() - .stdout_only("6\n"); - new_ucmd!() - .args(&["[^a]", ":", "\\[^a]"]) - .succeeds() - .stdout_only("4\n"); - new_ucmd!() - .args(&["\\a", ":", "\\\\[^^]"]) - .succeeds() - .stdout_only("2\n"); - new_ucmd!() - .args(&["^a", ":", "^^[^^]"]) - .succeeds() - .stdout_only("2\n"); - new_ucmd!() - .args(&["-5", ":", "-\\{0,1\\}[0-9]*$"]) - .succeeds() - .stdout_only("2\n"); +fn test_regex_empty() { new_ucmd!().args(&["", ":", ""]).fails().stdout_only("0\n"); new_ucmd!() .args(&["abc", ":", ""]) .fails() .stdout_only("0\n"); - new_ucmd!() - .args(&["abc", ":", "bc"]) - .fails() - .stdout_only("0\n"); - new_ucmd!() - .args(&["^abc", ":", "^abc"]) - .fails() - .stdout_only("0\n"); - new_ucmd!() - .args(&["abc", ":", "ab[^c]"]) - .fails() - .stdout_only("0\n"); - new_ucmd!() - .args(&["$", ":", "$"]) - .fails() - .stdout_only("0\n"); - new_ucmd!() - .args(&["a$", ":", "a$\\|b"]) - .fails() - .stdout_only("0\n"); } #[test] @@ -401,6 +301,111 @@ fn test_regex_trailing_backslash() { .stderr_only("expr: Trailing backslash\n"); } +#[test] +fn test_regex_caret() { + new_ucmd!() + .args(&["a^b", ":", "a^b"]) + .succeeds() + .stdout_only("3\n"); + new_ucmd!() + .args(&["a^b", ":", "a\\^b"]) + .succeeds() + .stdout_only("3\n"); + new_ucmd!() + .args(&["abc", ":", "^abc"]) + .succeeds() + .stdout_only("3\n"); + new_ucmd!() + .args(&["^abc", ":", "^^abc"]) + .succeeds() + .stdout_only("4\n"); + new_ucmd!() + .args(&["b", ":", "a\\|^b"]) + .succeeds() + .stdout_only("1\n"); + new_ucmd!() + .args(&["ab", ":", "\\(^a\\)b"]) + .succeeds() + .stdout_only("a\n"); + new_ucmd!() + .args(&["^abc", ":", "^abc"]) + .fails() + .stdout_only("0\n"); + new_ucmd!() + .args(&["^^^^^^^^^", ":", "^^^"]) + .succeeds() + .stdout_only("2\n"); + new_ucmd!() + .args(&["ab[^c]", ":", "ab[^c]"]) + .succeeds() + .stdout_only("3\n"); // Matches "ab[" + new_ucmd!() + .args(&["ab[^c]", ":", "ab\\[^c]"]) + .succeeds() + .stdout_only("6\n"); + new_ucmd!() + .args(&["[^a]", ":", "\\[^a]"]) + .succeeds() + .stdout_only("4\n"); + new_ucmd!() + .args(&["\\a", ":", "\\\\[^^]"]) + .succeeds() + .stdout_only("2\n"); + // Patterns are anchored to the beginning of the pattern "^bc" + new_ucmd!() + .args(&["abc", ":", "bc"]) + .fails() + .stdout_only("0\n"); + new_ucmd!() + .args(&["^a", ":", "^^[^^]"]) + .succeeds() + .stdout_only("2\n"); + new_ucmd!() + .args(&["abc", ":", "ab[^c]"]) + .fails() + .stdout_only("0\n"); +} + +#[test] +fn test_regex_dollar() { + new_ucmd!() + .args(&["a$b", ":", "a\\$b"]) + .succeeds() + .stdout_only("3\n"); + new_ucmd!() + .args(&["a", ":", "a$\\|b"]) + .succeeds() + .stdout_only("1\n"); + new_ucmd!() + .args(&["ab", ":", "a\\(b$\\)"]) + .succeeds() + .stdout_only("b\n"); + new_ucmd!() + .args(&["a$c", ":", "a$\\c"]) + .succeeds() + .stdout_only("3\n"); + new_ucmd!() + .args(&["$a", ":", "$a"]) + .succeeds() + .stdout_only("2\n"); + new_ucmd!() + .args(&["a", ":", "a$\\|b"]) + .succeeds() + .stdout_only("1\n"); + new_ucmd!() + .args(&["-5", ":", "-\\{0,1\\}[0-9]*$"]) + .succeeds() + .stdout_only("2\n"); + new_ucmd!() + .args(&["$", ":", "$"]) + .fails() + .stdout_only("0\n"); + new_ucmd!() + .args(&["a$", ":", "a$\\|b"]) + .fails() + .stdout_only("0\n"); +} + #[test] fn test_substr() { new_ucmd!() From 2a862bc385a6ac4ec15465776b100c84115d7654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teemu=20P=C3=A4tsi?= <44954973+frendsick@users.noreply.github.com> Date: Sat, 24 May 2025 21:15:53 +0300 Subject: [PATCH 4/5] expr: Simplify checking of the end of an expression Co-authored-by: Daniel Hofstetter --- src/uu/expr/src/syntax_tree.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index 9b80d40c2..544b3b773 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -244,13 +244,7 @@ where { let mut pattern_chars_clone = pattern_chars.clone(); match pattern_chars_clone.next() { - Some('\\') => { - match pattern_chars_clone.next() { - Some(')') // End of a capturing group - | Some('|') => true, // End of an alternative pattern - _ => false, - } - } + Some('\\') => matches!(pattern_chars_clone.next(), Some(')' | '|')), None => true, // No characters left _ => false, } From ab5cf741850fdd05a68212842341096a5f562a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teemu=20P=C3=A4tsi?= Date: Sat, 24 May 2025 21:17:20 +0300 Subject: [PATCH 5/5] expr: Simplify parsing special cases for `$` in regex --- src/uu/expr/src/syntax_tree.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index 544b3b773..b0326f7b6 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -185,14 +185,8 @@ impl StringOp { | ('\\', false) => re_string.push(curr), _ => re_string.push_str(r"\^"), }, - '$' => { - if is_end_of_expression(&pattern_chars) { - re_string.push(curr); - } else if !curr_is_escaped { - re_string.push_str(r"\$"); - } else { - re_string.push(curr); - } + '$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => { + re_string.push_str(r"\$"); } '\\' if !curr_is_escaped && pattern_chars.peek().is_none() => { return Err(ExprError::TrailingBackslash);