1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-29 20:17:45 +00:00

Merge pull request #1602 from wishawa/master

ptx: Improve performance: remove N^2 loop and reduce repetitive regex compilations.
This commit is contained in:
Sylvestre Ledru 2021-04-07 23:14:23 +02:00 committed by GitHub
commit ac43052cc5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -169,9 +169,9 @@ fn get_config(matches: &clap::ArgMatches) -> Config {
.expect(err_msg) .expect(err_msg)
.to_string(); .to_string();
} }
if matches.is_present(options::IGNORE_CASE) { if matches.is_present(options::FLAG_TRUNCATION) {
config.trunc_str = matches config.trunc_str = matches
.value_of(options::IGNORE_CASE) .value_of(options::FLAG_TRUNCATION)
.expect(err_msg) .expect(err_msg)
.to_string(); .to_string();
} }
@ -195,8 +195,16 @@ fn get_config(matches: &clap::ArgMatches) -> Config {
config config
} }
fn read_input(input_files: &[String], config: &Config) -> HashMap<String, (Vec<String>, usize)> { struct FileContent {
let mut file_map: HashMap<String, (Vec<String>, usize)> = HashMap::new(); lines: Vec<String>,
chars_lines: Vec<Vec<char>>,
offset: usize,
}
type FileMap = HashMap<String, FileContent>;
fn read_input(input_files: &[String], config: &Config) -> FileMap {
let mut file_map: FileMap = HashMap::new();
let mut files = Vec::new(); let mut files = Vec::new();
if input_files.is_empty() { if input_files.is_empty() {
files.push("-"); files.push("-");
@ -207,7 +215,7 @@ fn read_input(input_files: &[String], config: &Config) -> HashMap<String, (Vec<S
} else { } else {
files.push(&input_files[0]); files.push(&input_files[0]);
} }
let mut lines_so_far: usize = 0; let mut offset: usize = 0;
for filename in files { for filename in files {
let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" { let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
Box::new(stdin()) Box::new(stdin())
@ -216,25 +224,33 @@ fn read_input(input_files: &[String], config: &Config) -> HashMap<String, (Vec<S
Box::new(file) Box::new(file)
}); });
let lines: Vec<String> = reader.lines().map(|x| crash_if_err!(1, x)).collect(); let lines: Vec<String> = reader.lines().map(|x| crash_if_err!(1, x)).collect();
// Indexing a UTF-8 string requires walking from the beginning, which can hurt performance badly when the line is long.
// Since we will be jumping around the line a lot, we dump the content into a Vec<char>, which can be indexed in constant time.
let chars_lines: Vec<Vec<char>> = lines.iter().map(|x| x.chars().collect()).collect();
let size = lines.len(); let size = lines.len();
file_map.insert(filename.to_owned(), (lines, lines_so_far)); file_map.insert(
lines_so_far += size filename.to_owned(),
FileContent {
lines,
chars_lines,
offset,
},
);
offset += size
} }
file_map file_map
} }
fn create_word_set( /// Go through every line in the input files and record each match occurrence as a `WordRef`.
config: &Config, fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet<WordRef> {
filter: &WordFilter,
file_map: &HashMap<String, (Vec<String>, usize)>,
) -> BTreeSet<WordRef> {
let reg = Regex::new(&filter.word_regex).unwrap(); let reg = Regex::new(&filter.word_regex).unwrap();
let ref_reg = Regex::new(&config.context_regex).unwrap(); let ref_reg = Regex::new(&config.context_regex).unwrap();
let mut word_set: BTreeSet<WordRef> = BTreeSet::new(); let mut word_set: BTreeSet<WordRef> = BTreeSet::new();
for (file, lines) in file_map.iter() { for (file, lines) in file_map.iter() {
let mut count: usize = 0; let mut count: usize = 0;
let offs = lines.1; let offs = lines.offset;
for line in &lines.0 { for line in &lines.lines {
// if -r, exclude reference from word set // if -r, exclude reference from word set
let (ref_beg, ref_end) = match ref_reg.find(line) { let (ref_beg, ref_end) = match ref_reg.find(line) {
Some(x) => (x.start(), x.end()), Some(x) => (x.start(), x.end()),
@ -271,12 +287,11 @@ fn create_word_set(
word_set word_set
} }
fn get_reference(config: &Config, word_ref: &WordRef, line: &str) -> String { fn get_reference(config: &Config, word_ref: &WordRef, line: &str, context_reg: &Regex) -> String {
if config.auto_ref { if config.auto_ref {
format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1) format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1)
} else if config.input_ref { } else if config.input_ref {
let reg = Regex::new(&config.context_regex).unwrap(); let (beg, end) = match context_reg.find(line) {
let (beg, end) = match reg.find(line) {
Some(x) => (x.start(), x.end()), Some(x) => (x.start(), x.end()),
None => (0, 0), None => (0, 0),
}; };
@ -329,57 +344,107 @@ fn trim_idx(s: &[char], beg: usize, end: usize) -> (usize, usize) {
} }
fn get_output_chunks( fn get_output_chunks(
all_before: &str, all_before: &[char],
keyword: &str, keyword: &str,
all_after: &str, all_after: &[char],
config: &Config, config: &Config,
) -> (String, String, String, String) { ) -> (String, String, String, String) {
assert_eq!(all_before.trim(), all_before); // Chunk size logic is mostly copied from the GNU ptx source.
assert_eq!(keyword.trim(), keyword); // https://github.com/MaiZure/coreutils-8.3/blob/master/src/ptx.c#L1234
assert_eq!(all_after.trim(), all_after); let half_line_size = (config.line_width / 2) as usize;
let mut head = String::new(); let max_before_size = cmp::max(half_line_size as isize - config.gap_size as isize, 0) as usize;
let mut before = String::new(); let max_after_size = cmp::max(
let mut after = String::new(); half_line_size as isize
let mut tail = String::new(); - (2 * config.trunc_str.len()) as isize
- keyword.len() as isize
let half_line_size = cmp::max( - 1,
(config.line_width / 2) as isize - (2 * config.trunc_str.len()) as isize,
0, 0,
) as usize; ) as usize;
let max_after_size = cmp::max(half_line_size as isize - keyword.len() as isize - 1, 0) as usize;
let max_before_size = half_line_size;
let all_before_vec: Vec<char> = all_before.chars().collect();
let all_after_vec: Vec<char> = all_after.chars().collect();
// get before // Allocate plenty space for all the chunks.
let mut bb_tmp = cmp::max(all_before.len() as isize - max_before_size as isize, 0) as usize; let mut head = String::with_capacity(half_line_size);
bb_tmp = trim_broken_word_left(&all_before_vec, bb_tmp, all_before.len()); let mut before = String::with_capacity(half_line_size);
let (before_beg, before_end) = trim_idx(&all_before_vec, bb_tmp, all_before.len()); let mut after = String::with_capacity(half_line_size);
before.push_str(&all_before[before_beg..before_end]); let mut tail = String::with_capacity(half_line_size);
// the before chunk
// trim whitespace away from all_before to get the index where the before chunk should end.
let (_, before_end) = trim_idx(all_before, 0, all_before.len());
// the minimum possible begin index of the before_chunk is the end index minus the length.
let before_beg = cmp::max(before_end as isize - max_before_size as isize, 0) as usize;
// in case that falls in the middle of a word, trim away the word.
let before_beg = trim_broken_word_left(all_before, before_beg, before_end);
// trim away white space.
let (before_beg, before_end) = trim_idx(all_before, before_beg, before_end);
// and get the string.
let before_str: String = all_before[before_beg..before_end].iter().collect();
before.push_str(&before_str);
assert!(max_before_size >= before.len()); assert!(max_before_size >= before.len());
// get after // the after chunk
let mut ae_tmp = cmp::min(max_after_size, all_after.len());
ae_tmp = trim_broken_word_right(&all_after_vec, 0, ae_tmp); // must be no longer than the minimum between the max size and the total available string.
let (after_beg, after_end) = trim_idx(&all_after_vec, 0, ae_tmp); let after_end = cmp::min(max_after_size, all_after.len());
after.push_str(&all_after[after_beg..after_end]); // in case that falls in the middle of a word, trim away the word.
let after_end = trim_broken_word_right(all_after, 0, after_end);
// trim away white space.
let (_, after_end) = trim_idx(all_after, 0, after_end);
// and get the string
let after_str: String = all_after[0..after_end].iter().collect();
after.push_str(&after_str);
assert!(max_after_size >= after.len()); assert!(max_after_size >= after.len());
// get tail // the tail chunk
let max_tail_size = max_before_size - before.len();
let (tb, _) = trim_idx(&all_after_vec, after_end, all_after.len());
let mut te_tmp = cmp::min(tb + max_tail_size, all_after.len());
te_tmp = trim_broken_word_right(&all_after_vec, tb, te_tmp);
let (tail_beg, tail_end) = trim_idx(&all_after_vec, tb, te_tmp);
tail.push_str(&all_after[tail_beg..tail_end]);
// get head // max size of the tail chunk = max size of left half - space taken by before chunk - gap size.
let max_head_size = max_after_size - after.len(); let max_tail_size = cmp::max(
let (_, he) = trim_idx(&all_before_vec, 0, before_beg); max_before_size as isize - before.len() as isize - config.gap_size as isize,
let mut hb_tmp = cmp::max(he as isize - max_head_size as isize, 0) as usize; 0,
hb_tmp = trim_broken_word_left(&all_before_vec, hb_tmp, he); ) as usize;
let (head_beg, head_end) = trim_idx(&all_before_vec, hb_tmp, he);
head.push_str(&all_before[head_beg..head_end]); // the tail chunk takes text starting from where the after chunk ends (with whitespaces trimmed).
let (tail_beg, _) = trim_idx(all_after, after_end, all_after.len());
// end = begin + max length
let tail_end = cmp::min(all_after.len(), tail_beg + max_tail_size) as usize;
// in case that falls in the middle of a word, trim away the word.
let tail_end = trim_broken_word_right(all_after, tail_beg, tail_end);
// trim away whitespace again.
let (tail_beg, tail_end) = trim_idx(all_after, tail_beg, tail_end);
// and get the string
let tail_str: String = all_after[tail_beg..tail_end].iter().collect();
tail.push_str(&tail_str);
// the head chunk
// max size of the head chunk = max size of right half - space taken by after chunk - gap size.
let max_head_size = cmp::max(
max_after_size as isize - after.len() as isize - config.gap_size as isize,
0,
) as usize;
// the head chunk takes text from before the before chunk
let (_, head_end) = trim_idx(all_before, 0, before_beg);
// begin = end - max length
let head_beg = cmp::max(head_end as isize - max_head_size as isize, 0) as usize;
// in case that falls in the middle of a word, trim away the word.
let head_beg = trim_broken_word_left(all_before, head_beg, head_end);
// trim away white space again.
let (head_beg, head_end) = trim_idx(all_before, head_beg, head_end);
// and get the string.
let head_str: String = all_before[head_beg..head_end].iter().collect();
head.push_str(&head_str);
// put right context truncation string if needed // put right context truncation string if needed
if after_end != all_after.len() && tail_beg == tail_end { if after_end != all_after.len() && tail_beg == tail_end {
@ -395,11 +460,6 @@ fn get_output_chunks(
head = format!("{}{}", config.trunc_str, head); head = format!("{}{}", config.trunc_str, head);
} }
// add space before "after" if needed
if !after.is_empty() {
after = format!(" {}", after);
}
(tail, before, after, head) (tail, before, after, head)
} }
@ -412,70 +472,95 @@ fn tex_mapper(x: char) -> String {
} }
} }
fn adjust_tex_str(context: &str) -> String { /// Escape special characters for TeX.
let ws_reg = Regex::new(r"[\t\n\v\f\r ]").unwrap(); fn format_tex_field(s: &str) -> String {
let mut fix: String = ws_reg.replace_all(context, " ").trim().to_owned(); let mapped_chunks: Vec<String> = s.chars().map(tex_mapper).collect();
let mapped_chunks: Vec<String> = fix.chars().map(tex_mapper).collect(); mapped_chunks.join("")
fix = mapped_chunks.join("");
fix
} }
fn format_tex_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String { fn format_tex_line(
config: &Config,
word_ref: &WordRef,
line: &str,
chars_line: &[char],
reference: &str,
) -> String {
let mut output = String::new(); let mut output = String::new();
output.push_str(&format!("\\{} ", config.macro_name)); output.push_str(&format!("\\{} ", config.macro_name));
let all_before = if config.input_ref { let all_before = if config.input_ref {
let before = &line[0..word_ref.position]; let before = &line[0..word_ref.position];
adjust_tex_str(before.trim().trim_start_matches(reference)) let before_start_trimoff =
word_ref.position - before.trim_start_matches(reference).trim_start().len();
let before_end_index = before.len();
&chars_line[before_start_trimoff..cmp::max(before_end_index, before_start_trimoff)]
} else { } else {
adjust_tex_str(&line[0..word_ref.position]) let before_chars_trim_idx = (0, word_ref.position);
&chars_line[before_chars_trim_idx.0..before_chars_trim_idx.1]
}; };
let keyword = adjust_tex_str(&line[word_ref.position..word_ref.position_end]); let keyword = &line[word_ref.position..word_ref.position_end];
let all_after = adjust_tex_str(&line[word_ref.position_end..line.len()]); let after_chars_trim_idx = (word_ref.position_end, chars_line.len());
let all_after = &chars_line[after_chars_trim_idx.0..after_chars_trim_idx.1];
let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config); let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config);
output.push_str(&format!( output.push_str(&format!(
"{5}{0}{6}{5}{1}{6}{5}{2}{6}{5}{3}{6}{5}{4}{6}", "{5}{0}{6}{5}{1}{6}{5}{2}{6}{5}{3}{6}{5}{4}{6}",
tail, before, keyword, after, head, "{", "}" format_tex_field(&tail),
format_tex_field(&before),
format_tex_field(keyword),
format_tex_field(&after),
format_tex_field(&head),
"{",
"}"
)); ));
if config.auto_ref || config.input_ref { if config.auto_ref || config.input_ref {
output.push_str(&format!("{}{}{}", "{", adjust_tex_str(&reference), "}")); output.push_str(&format!("{}{}{}", "{", format_tex_field(&reference), "}"));
} }
output output
} }
fn adjust_roff_str(context: &str) -> String { fn format_roff_field(s: &str) -> String {
let ws_reg = Regex::new(r"[\t\n\v\f\r]").unwrap(); s.replace("\"", "\"\"")
ws_reg
.replace_all(context, " ")
.replace("\"", "\"\"")
.trim()
.to_owned()
} }
fn format_roff_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String { fn format_roff_line(
config: &Config,
word_ref: &WordRef,
line: &str,
chars_line: &[char],
reference: &str,
) -> String {
let mut output = String::new(); let mut output = String::new();
output.push_str(&format!(".{}", config.macro_name)); output.push_str(&format!(".{}", config.macro_name));
let all_before = if config.input_ref { let all_before = if config.input_ref {
let before = &line[0..word_ref.position]; let before = &line[0..word_ref.position];
adjust_roff_str(before.trim().trim_start_matches(reference)) let before_start_trimoff =
word_ref.position - before.trim_start_matches(reference).trim_start().len();
let before_end_index = before.len();
&chars_line[before_start_trimoff..cmp::max(before_end_index, before_start_trimoff)]
} else { } else {
adjust_roff_str(&line[0..word_ref.position]) let before_chars_trim_idx = (0, word_ref.position);
&chars_line[before_chars_trim_idx.0..before_chars_trim_idx.1]
}; };
let keyword = adjust_roff_str(&line[word_ref.position..word_ref.position_end]); let keyword = &line[word_ref.position..word_ref.position_end];
let all_after = adjust_roff_str(&line[word_ref.position_end..line.len()]); let after_chars_trim_idx = (word_ref.position_end, chars_line.len());
let all_after = &chars_line[after_chars_trim_idx.0..after_chars_trim_idx.1];
let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config); let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config);
output.push_str(&format!( output.push_str(&format!(
" \"{}\" \"{}\" \"{}{}\" \"{}\"", " \"{}\" \"{}\" \"{}{}\" \"{}\"",
tail, before, keyword, after, head format_roff_field(&tail),
format_roff_field(&before),
format_roff_field(keyword),
format_roff_field(&after),
format_roff_field(&head)
)); ));
if config.auto_ref || config.input_ref { if config.auto_ref || config.input_ref {
output.push_str(&format!(" \"{}\"", adjust_roff_str(&reference))); output.push_str(&format!(" \"{}\"", format_roff_field(&reference)));
} }
output output
} }
fn write_traditional_output( fn write_traditional_output(
config: &Config, config: &Config,
file_map: &HashMap<String, (Vec<String>, usize)>, file_map: &FileMap,
words: &BTreeSet<WordRef>, words: &BTreeSet<WordRef>,
output_filename: &str, output_filename: &str,
) { ) {
@ -485,19 +570,39 @@ fn write_traditional_output(
let file = crash_if_err!(1, File::create(output_filename)); let file = crash_if_err!(1, File::create(output_filename));
Box::new(file) Box::new(file)
}); });
let context_reg = Regex::new(&config.context_regex).unwrap();
for word_ref in words.iter() { for word_ref in words.iter() {
let file_map_value: &(Vec<String>, usize) = file_map let file_map_value: &FileContent = file_map
.get(&(word_ref.filename)) .get(&(word_ref.filename))
.expect("Missing file in file map"); .expect("Missing file in file map");
let (ref lines, _) = *(file_map_value); let FileContent {
let reference = get_reference(config, word_ref, &lines[word_ref.local_line_nr]); ref lines,
ref chars_lines,
offset: _,
} = *(file_map_value);
let reference = get_reference(
config,
word_ref,
&lines[word_ref.local_line_nr],
&context_reg,
);
let output_line: String = match config.format { let output_line: String = match config.format {
OutFormat::Tex => { OutFormat::Tex => format_tex_line(
format_tex_line(config, word_ref, &lines[word_ref.local_line_nr], &reference) config,
} word_ref,
OutFormat::Roff => { &lines[word_ref.local_line_nr],
format_roff_line(config, word_ref, &lines[word_ref.local_line_nr], &reference) &chars_lines[word_ref.local_line_nr],
} &reference,
),
OutFormat::Roff => format_roff_line(
config,
word_ref,
&lines[word_ref.local_line_nr],
&chars_lines[word_ref.local_line_nr],
&reference,
),
OutFormat::Dumb => crash!(1, "There is no dumb format with GNU extensions disabled"), OutFormat::Dumb => crash!(1, "There is no dumb format with GNU extensions disabled"),
}; };
crash_if_err!(1, writeln!(writer, "{}", output_line)); crash_if_err!(1, writeln!(writer, "{}", output_line));