1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 11:07:44 +00:00

Merge pull request #8010 from frendsick/fix/expr-regex-range-start-of-expression

expr: Fix handling of regex range quantifiers
This commit is contained in:
Daniel Hofstetter 2025-05-27 17:45:58 +02:00 committed by GitHub
commit 5fd4ab5e33
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 226 additions and 178 deletions

View file

@ -46,12 +46,12 @@ pub enum ExprError {
UnmatchedClosingParenthesis, UnmatchedClosingParenthesis,
#[error("Unmatched \\{{")] #[error("Unmatched \\{{")]
UnmatchedOpeningBrace, UnmatchedOpeningBrace,
#[error("Unmatched ) or \\}}")]
UnmatchedClosingBrace,
#[error("Invalid content of \\{{\\}}")] #[error("Invalid content of \\{{\\}}")]
InvalidBracketContent, InvalidBracketContent,
#[error("Trailing backslash")] #[error("Trailing backslash")]
TrailingBackslash, TrailingBackslash,
#[error("Regular expression too big")]
TooBigRangeQuantifierIndex,
} }
impl UError for ExprError { impl UError for ExprError {

View file

@ -8,7 +8,7 @@
use std::{cell::Cell, collections::BTreeMap}; use std::{cell::Cell, collections::BTreeMap};
use num_bigint::{BigInt, ParseBigIntError}; use num_bigint::{BigInt, ParseBigIntError};
use num_traits::{ToPrimitive, Zero}; use num_traits::ToPrimitive;
use onig::{Regex, RegexOptions, Syntax}; use onig::{Regex, RegexOptions, Syntax};
use crate::{ExprError, ExprResult}; use crate::{ExprError, ExprResult};
@ -151,55 +151,61 @@ impl StringOp {
let right = right?.eval_as_string(); let right = right?.eval_as_string();
check_posix_regex_errors(&right)?; check_posix_regex_errors(&right)?;
// All patterns are anchored so they begin with a caret (^) // Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
let mut re_string = String::with_capacity(right.len() + 1); let mut re_string = String::with_capacity(right.len() + 1);
re_string.push('^');
// Handle first character from the input pattern
let mut pattern_chars = right.chars().peekable(); let mut pattern_chars = right.chars().peekable();
let first = pattern_chars.next(); let mut prev = '\0';
match first {
Some('^') => {} // Start of string anchor is already added
Some('$') if !is_end_of_expression(&pattern_chars) => re_string.push_str(r"\$"),
Some('\\') if right.len() == 1 => return Err(ExprError::TrailingBackslash),
Some(char) => re_string.push(char),
None => return Ok(0.into()),
};
// Handle the rest of the input pattern.
let mut prev = first.unwrap_or_default();
let mut prev_is_escaped = false; let mut prev_is_escaped = false;
let mut is_start_of_expression = true;
// All patterns are anchored so they begin with a caret (^)
if pattern_chars.peek() != Some(&'^') {
re_string.push('^');
}
while let Some(curr) = pattern_chars.next() { while let Some(curr) = pattern_chars.next() {
let curr_is_escaped = prev == '\\' && !prev_is_escaped; let curr_is_escaped = prev == '\\' && !prev_is_escaped;
let is_first_character = prev == '\0';
match curr { match curr {
'^' => match (prev, prev_is_escaped) { // Character class negation "[^a]"
// Start of a capturing group // Explicitly escaped caret "\^"
('(', true) '^' if !is_start_of_expression && !matches!(prev, '[' | '\\') => {
// Start of an alternative pattern re_string.push_str(r"\^");
| ('|', true) }
// Character class negation "[^a]"
| ('[', false)
// Explicitly escaped caret
| ('\\', false) => re_string.push(curr),
_ => re_string.push_str(r"\^"),
},
'$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => { '$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => {
re_string.push_str(r"\$"); re_string.push_str(r"\$");
} }
'\\' if !curr_is_escaped && pattern_chars.peek().is_none() => { '\\' if !curr_is_escaped && pattern_chars.peek().is_none() => {
return Err(ExprError::TrailingBackslash); return Err(ExprError::TrailingBackslash);
} }
'{' if curr_is_escaped && is_valid_range_quantifier(&pattern_chars) => { '{' if curr_is_escaped => {
re_string.push(curr); // Handle '{' literally at the start of an expression
// Set the lower bound of range quantifier to 0 if it is missing if is_start_of_expression {
if pattern_chars.peek() == Some(&',') { if re_string.ends_with('\\') {
re_string.push('0'); let _ = re_string.pop();
}
re_string.push(curr);
} else {
// Check if the following section is a valid range quantifier
verify_range_quantifier(&pattern_chars)?;
re_string.push(curr);
// Set the lower bound of range quantifier to 0 if it is missing
if pattern_chars.peek() == Some(&',') {
re_string.push('0');
}
} }
} }
_ => re_string.push(curr), _ => re_string.push(curr),
} }
// Capturing group "\(abc\)"
// Alternative pattern "a\|b"
is_start_of_expression = curr == '\\' && is_first_character
|| curr_is_escaped && matches!(curr, '(' | '|')
|| curr == '\\' && prev_is_escaped && matches!(prev, '(' | '|');
prev_is_escaped = curr_is_escaped; prev_is_escaped = curr_is_escaped;
prev = curr; prev = curr;
} }
@ -209,7 +215,14 @@ impl StringOp {
RegexOptions::REGEX_OPTION_SINGLELINE, RegexOptions::REGEX_OPTION_SINGLELINE,
Syntax::grep(), Syntax::grep(),
) )
.map_err(|_| ExprError::InvalidRegexExpression)?; .map_err(|error| match error.code() {
// "invalid repeat range {lower,upper}"
-123 => ExprError::InvalidBracketContent,
// "too big number for repeat range"
-201 => ExprError::TooBigRangeQuantifierIndex,
_ => ExprError::InvalidRegexExpression,
})?;
Ok(if re.captures_len() > 0 { Ok(if re.captures_len() > 0 {
re.captures(&left) re.captures(&left)
.and_then(|captures| captures.at(1)) .and_then(|captures| captures.at(1))
@ -261,33 +274,52 @@ where
/// - `r"\{,6\}"` /// - `r"\{,6\}"`
/// - `r"\{3,6\}"` /// - `r"\{3,6\}"`
/// - `r"\{,\}"` /// - `r"\{,\}"`
fn is_valid_range_quantifier<I>(pattern_chars: &I) -> bool fn verify_range_quantifier<I>(pattern_chars: &I) -> Result<(), ExprError>
where where
I: Iterator<Item = char> + Clone, I: Iterator<Item = char> + Clone,
{ {
let mut pattern_chars_clone = pattern_chars.clone().peekable();
if pattern_chars_clone.peek().is_none() {
return Err(ExprError::UnmatchedOpeningBrace);
}
// Parse the string between braces // Parse the string between braces
let mut quantifier = String::new(); let mut quantifier = String::new();
let mut pattern_chars_clone = pattern_chars.clone().peekable(); let mut prev = '\0';
let Some(mut prev) = pattern_chars_clone.next() else { let mut curr_is_escaped = false;
return false;
};
let mut prev_is_escaped = false;
while let Some(curr) = pattern_chars_clone.next() { while let Some(curr) = pattern_chars_clone.next() {
if prev == '\\' && curr == '}' && !prev_is_escaped { curr_is_escaped = prev == '\\' && !curr_is_escaped;
if curr_is_escaped && curr == '}' {
break; break;
} }
if pattern_chars_clone.peek().is_none() { if pattern_chars_clone.peek().is_none() {
return false; return Err(ExprError::UnmatchedOpeningBrace);
}
if prev != '\0' {
quantifier.push(prev);
} }
quantifier.push(prev);
prev_is_escaped = prev == '\\' && !prev_is_escaped;
prev = curr; prev = curr;
} }
// Check if parsed quantifier is valid // Check if parsed quantifier is valid
let re = Regex::new(r"(\d+|\d*,\d*)").expect("valid regular expression"); let re = Regex::new(r"^([0-9]*,[0-9]*|[0-9]+)$").expect("valid regular expression");
re.is_match(&quantifier) if let Some(captures) = re.captures(&quantifier) {
let matched = captures.at(0).unwrap_or_default();
match matched.split_once(',') {
Some(("", "")) => Ok(()),
Some((x, "") | ("", x)) if x.parse::<i16>().is_ok() => Ok(()),
Some((_, "") | ("", _)) => Err(ExprError::TooBigRangeQuantifierIndex),
Some((f, l)) => match (f.parse::<i16>(), l.parse::<i16>()) {
(Ok(f), Ok(l)) if f > l => Err(ExprError::InvalidBracketContent),
(Ok(_), Ok(_)) => Ok(()),
_ => Err(ExprError::TooBigRangeQuantifierIndex),
},
None if matched.parse::<i16>().is_ok() => Ok(()),
None => Err(ExprError::TooBigRangeQuantifierIndex),
}
} else {
Err(ExprError::InvalidBracketContent)
}
} }
/// Check for errors in a supplied regular expression /// Check for errors in a supplied regular expression
@ -305,78 +337,26 @@ where
/// has specific error messages. /// has specific error messages.
fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> { fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> {
let mut escaped_parens: u64 = 0; let mut escaped_parens: u64 = 0;
let mut escaped_braces: u64 = 0; let mut prev = '\0';
let mut escaped = false; let mut curr_is_escaped = false;
let mut repeating_pattern_text = String::new(); for curr in pattern.chars() {
let mut invalid_content_error = false; curr_is_escaped = prev == '\\' && !curr_is_escaped;
match (curr_is_escaped, curr) {
for c in pattern.chars() { (true, '(') => escaped_parens += 1,
match (escaped, c) {
(true, ')') => { (true, ')') => {
escaped_parens = escaped_parens escaped_parens = escaped_parens
.checked_sub(1) .checked_sub(1)
.ok_or(ExprError::UnmatchedClosingParenthesis)?; .ok_or(ExprError::UnmatchedClosingParenthesis)?;
} }
(true, '(') => { _ => {}
escaped_parens += 1;
}
(true, '}') => {
escaped_braces = escaped_braces
.checked_sub(1)
.ok_or(ExprError::UnmatchedClosingBrace)?;
let mut repetition =
repeating_pattern_text[..repeating_pattern_text.len() - 1].splitn(2, ',');
match (
repetition
.next()
.expect("splitn always returns at least one string"),
repetition.next(),
) {
("", Some("")) => {}
(x, None | Some("")) => {
if x.parse::<i16>().is_err() {
invalid_content_error = true;
}
}
("", Some(x)) => {
if x.parse::<i16>().is_err() {
invalid_content_error = true;
}
}
(f, Some(l)) => {
if let (Ok(f), Ok(l)) = (f.parse::<i16>(), l.parse::<i16>()) {
invalid_content_error = invalid_content_error || f > l;
} else {
invalid_content_error = true;
}
}
}
repeating_pattern_text.clear();
}
(true, '{') => {
escaped_braces += 1;
}
_ => {
if escaped_braces > 0 && repeating_pattern_text.len() <= 13 {
repeating_pattern_text.push(c);
}
if escaped_braces > 0 && !(c.is_ascii_digit() || c == '\\' || c == ',') {
invalid_content_error = true;
}
}
} }
escaped = !escaped && c == '\\'; prev = curr;
} }
match (
escaped_parens.is_zero(), match escaped_parens {
escaped_braces.is_zero(), 0 => Ok(()),
invalid_content_error, _ => Err(ExprError::UnmatchedOpeningParenthesis),
) {
(true, true, false) => Ok(()),
(_, false, _) => Err(ExprError::UnmatchedOpeningBrace),
(false, _, _) => Err(ExprError::UnmatchedOpeningParenthesis),
(true, true, true) => Err(ExprError::InvalidBracketContent),
} }
} }
@ -792,8 +772,7 @@ pub fn is_truthy(s: &NumOrStr) -> bool {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::ExprError; use crate::ExprError;
use crate::ExprError::InvalidBracketContent; use crate::syntax_tree::verify_range_quantifier;
use crate::syntax_tree::is_valid_range_quantifier;
use super::{ use super::{
AstNode, AstNodeInner, BinOp, NumericOp, RelationOp, StringOp, check_posix_regex_errors, AstNode, AstNodeInner, BinOp, NumericOp, RelationOp, StringOp, check_posix_regex_errors,
@ -986,11 +965,6 @@ mod test {
check_posix_regex_errors(r"\(abc"), check_posix_regex_errors(r"\(abc"),
Err(ExprError::UnmatchedOpeningParenthesis) Err(ExprError::UnmatchedOpeningParenthesis)
); );
assert_eq!(
check_posix_regex_errors(r"\{1,2"),
Err(ExprError::UnmatchedOpeningBrace)
);
} }
#[test] #[test]
@ -999,65 +973,51 @@ mod test {
check_posix_regex_errors(r"abc\)"), check_posix_regex_errors(r"abc\)"),
Err(ExprError::UnmatchedClosingParenthesis) Err(ExprError::UnmatchedClosingParenthesis)
); );
assert_eq!(
check_posix_regex_errors(r"abc\}"),
Err(ExprError::UnmatchedClosingBrace)
);
}
#[test]
fn check_regex_empty_repeating_pattern() {
assert_eq!(
check_posix_regex_errors("ab\\{\\}"),
Err(InvalidBracketContent)
);
}
#[test]
fn check_regex_intervals_two_numbers() {
assert_eq!(
// out of order
check_posix_regex_errors("ab\\{1,0\\}"),
Err(InvalidBracketContent)
);
assert_eq!(
check_posix_regex_errors("ab\\{1,a\\}"),
Err(InvalidBracketContent)
);
assert_eq!(
check_posix_regex_errors("ab\\{a,3\\}"),
Err(InvalidBracketContent)
);
assert_eq!(
check_posix_regex_errors("ab\\{a,b\\}"),
Err(InvalidBracketContent)
);
assert_eq!(
check_posix_regex_errors("ab\\{a,\\}"),
Err(InvalidBracketContent)
);
assert_eq!(
check_posix_regex_errors("ab\\{,b\\}"),
Err(InvalidBracketContent)
);
} }
#[test] #[test]
fn test_is_valid_range_quantifier() { fn test_is_valid_range_quantifier() {
assert!(is_valid_range_quantifier(&"3\\}".chars())); assert!(verify_range_quantifier(&"3\\}".chars()).is_ok());
assert!(is_valid_range_quantifier(&"3,\\}".chars())); assert!(verify_range_quantifier(&"3,\\}".chars()).is_ok());
assert!(is_valid_range_quantifier(&",6\\}".chars())); assert!(verify_range_quantifier(&",6\\}".chars()).is_ok());
assert!(is_valid_range_quantifier(&"3,6\\}".chars())); assert!(verify_range_quantifier(&"3,6\\}".chars()).is_ok());
assert!(is_valid_range_quantifier(&",\\}".chars())); assert!(verify_range_quantifier(&",\\}".chars()).is_ok());
assert!(is_valid_range_quantifier(&"3,6\\}anything".chars())); assert!(verify_range_quantifier(&"32767\\}anything".chars()).is_ok());
assert!(!is_valid_range_quantifier(&"\\{3,6\\}".chars())); assert_eq!(
assert!(!is_valid_range_quantifier(&"\\}".chars())); verify_range_quantifier(&"\\{3,6\\}".chars()),
assert!(!is_valid_range_quantifier(&"".chars())); Err(ExprError::InvalidBracketContent)
assert!(!is_valid_range_quantifier(&"3".chars())); );
assert!(!is_valid_range_quantifier(&"3,".chars())); assert_eq!(
assert!(!is_valid_range_quantifier(&",6".chars())); verify_range_quantifier(&"\\}".chars()),
assert!(!is_valid_range_quantifier(&"3,6".chars())); Err(ExprError::InvalidBracketContent)
assert!(!is_valid_range_quantifier(&",".chars())); );
assert_eq!(
verify_range_quantifier(&"".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&"3".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&"3,".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&",6".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&"3,6".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&",".chars()),
Err(ExprError::UnmatchedOpeningBrace)
);
assert_eq!(
verify_range_quantifier(&"32768\\}".chars()),
Err(ExprError::TooBigRangeQuantifierIndex)
);
} }
} }

View file

@ -3,8 +3,8 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore αbcdef ; (people) kkos // spell-checker:ignore αbcdef ; (people) kkos
// spell-checker:ignore aabcccd aabcd aabd abbbd abbcabc abbcac abbcbbbd abbcbd // spell-checker:ignore aabcccd aabcd aabd abbb abbbd abbcabc abbcac abbcbbbd abbcbd
// spell-checker:ignore abbccd abcac acabc andand bigcmp bignum emptysub // spell-checker:ignore abbccd abcabc abcac acabc andand bigcmp bignum emptysub
// spell-checker:ignore orempty oror // spell-checker:ignore orempty oror
use uutests::new_ucmd; use uutests::new_ucmd;
@ -406,6 +406,94 @@ fn test_regex_dollar() {
.stdout_only("0\n"); .stdout_only("0\n");
} }
#[test]
fn test_regex_range_quantifier() {
new_ucmd!()
.args(&["a", ":", "a\\{1\\}"])
.succeeds()
.stdout_only("1\n");
new_ucmd!()
.args(&["aaaaaaaaaa", ":", "a\\{1,\\}"])
.succeeds()
.stdout_only("10\n");
new_ucmd!()
.args(&["aaa", ":", "a\\{,3\\}"])
.succeeds()
.stdout_only("3\n");
new_ucmd!()
.args(&["aa", ":", "a\\{1,3\\}"])
.succeeds()
.stdout_only("2\n");
new_ucmd!()
.args(&["aaaa", ":", "a\\{,\\}"])
.succeeds()
.stdout_only("4\n");
new_ucmd!()
.args(&["a", ":", "ab\\{,3\\}"])
.succeeds()
.stdout_only("1\n");
new_ucmd!()
.args(&["abbb", ":", "ab\\{,3\\}"])
.succeeds()
.stdout_only("4\n");
new_ucmd!()
.args(&["abcabc", ":", "\\(abc\\)\\{,\\}"])
.succeeds()
.stdout_only("abc\n");
new_ucmd!()
.args(&["a", ":", "a\\{,6\\}"])
.succeeds()
.stdout_only("1\n");
new_ucmd!()
.args(&["{abc}", ":", "\\{abc\\}"])
.succeeds()
.stdout_only("5\n");
new_ucmd!()
.args(&["a{bc}", ":", "a\\(\\{bc\\}\\)"])
.succeeds()
.stdout_only("{bc}\n");
new_ucmd!()
.args(&["{b}", ":", "a\\|\\{b\\}"])
.succeeds()
.stdout_only("3\n");
new_ucmd!()
.args(&["{", ":", "a\\|\\{"])
.succeeds()
.stdout_only("1\n");
new_ucmd!()
.args(&["{}}}", ":", "\\{\\}\\}\\}"])
.succeeds()
.stdout_only("4\n");
new_ucmd!()
.args(&["a{}}}", ":", "a\\{\\}\\}\\}"])
.fails()
.stderr_only("expr: Invalid content of \\{\\}\n");
new_ucmd!()
.args(&["ab", ":", "ab\\{\\}"])
.fails()
.stderr_only("expr: Invalid content of \\{\\}\n");
new_ucmd!()
.args(&["_", ":", "a\\{12345678901234567890\\}"])
.fails()
.stderr_only("expr: Regular expression too big\n");
new_ucmd!()
.args(&["_", ":", "a\\{12345678901234567890,\\}"])
.fails()
.stderr_only("expr: Regular expression too big\n");
new_ucmd!()
.args(&["_", ":", "a\\{,12345678901234567890\\}"])
.fails()
.stderr_only("expr: Regular expression too big\n");
new_ucmd!()
.args(&["_", ":", "a\\{1,12345678901234567890\\}"])
.fails()
.stderr_only("expr: Regular expression too big\n");
new_ucmd!()
.args(&["_", ":", "a\\{1,1234567890abcdef\\}"])
.fails()
.stderr_only("expr: Invalid content of \\{\\}\n");
}
#[test] #[test]
fn test_substr() { fn test_substr() {
new_ucmd!() new_ucmd!()
@ -1142,7 +1230,7 @@ mod gnu_expr {
.args(&["_", ":", "a\\{32768\\}"]) .args(&["_", ":", "a\\{32768\\}"])
.fails_with_code(2) .fails_with_code(2)
.no_stdout() .no_stdout()
.stderr_contains("Invalid content of \\{\\}"); .stderr_contains("Regular expression too big\n");
} }
#[test] #[test]