diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index 989ab52ef..38327e4e3 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -169,9 +169,9 @@ fn get_config(matches: &clap::ArgMatches) -> Config { .expect(err_msg) .to_string(); } - if matches.is_present(options::IGNORE_CASE) { + if matches.is_present(options::FLAG_TRUNCATION) { config.trunc_str = matches - .value_of(options::IGNORE_CASE) + .value_of(options::FLAG_TRUNCATION) .expect(err_msg) .to_string(); } @@ -195,8 +195,16 @@ fn get_config(matches: &clap::ArgMatches) -> Config { config } -fn read_input(input_files: &[String], config: &Config) -> HashMap, usize)> { - let mut file_map: HashMap, usize)> = HashMap::new(); +struct FileContent { + lines: Vec, + chars_lines: Vec>, + offset: usize, +} + +type FileMap = HashMap; + +fn read_input(input_files: &[String], config: &Config) -> FileMap { + let mut file_map: FileMap = HashMap::new(); let mut files = Vec::new(); if input_files.is_empty() { files.push("-"); @@ -207,7 +215,7 @@ fn read_input(input_files: &[String], config: &Config) -> HashMap> = BufReader::new(if filename == "-" { Box::new(stdin()) @@ -216,25 +224,33 @@ fn read_input(input_files: &[String], config: &Config) -> HashMap = reader.lines().map(|x| crash_if_err!(1, x)).collect(); + + // Indexing UTF-8 string requires walking from the beginning, which can hurts performance badly when the line is long. + // Since we will be jumping around the line a lot, we dump the content into a Vec, which can be indexed in constant time. + let chars_lines: Vec> = lines.iter().map(|x| x.chars().collect()).collect(); let size = lines.len(); - file_map.insert(filename.to_owned(), (lines, lines_so_far)); - lines_so_far += size + file_map.insert( + filename.to_owned(), + FileContent { + lines, + chars_lines, + offset, + }, + ); + offset += size } file_map } -fn create_word_set( - config: &Config, - filter: &WordFilter, - file_map: &HashMap, usize)>, -) -> BTreeSet { +/// Go through every lines in the input files and record each match occurance as a `WordRef`. +fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet { let reg = Regex::new(&filter.word_regex).unwrap(); let ref_reg = Regex::new(&config.context_regex).unwrap(); let mut word_set: BTreeSet = BTreeSet::new(); for (file, lines) in file_map.iter() { let mut count: usize = 0; - let offs = lines.1; - for line in &lines.0 { + let offs = lines.offset; + for line in &lines.lines { // if -r, exclude reference from word set let (ref_beg, ref_end) = match ref_reg.find(line) { Some(x) => (x.start(), x.end()), @@ -271,12 +287,11 @@ fn create_word_set( word_set } -fn get_reference(config: &Config, word_ref: &WordRef, line: &str) -> String { +fn get_reference(config: &Config, word_ref: &WordRef, line: &str, context_reg: &Regex) -> String { if config.auto_ref { format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1) } else if config.input_ref { - let reg = Regex::new(&config.context_regex).unwrap(); - let (beg, end) = match reg.find(line) { + let (beg, end) = match context_reg.find(line) { Some(x) => (x.start(), x.end()), None => (0, 0), }; @@ -329,57 +344,107 @@ fn trim_idx(s: &[char], beg: usize, end: usize) -> (usize, usize) { } fn get_output_chunks( - all_before: &str, + all_before: &[char], keyword: &str, - all_after: &str, + all_after: &[char], config: &Config, ) -> (String, String, String, String) { - assert_eq!(all_before.trim(), all_before); - assert_eq!(keyword.trim(), keyword); - assert_eq!(all_after.trim(), all_after); - let mut head = String::new(); - let mut before = String::new(); - let mut after = String::new(); - let mut tail = String::new(); - - let half_line_size = cmp::max( - (config.line_width / 2) as isize - (2 * config.trunc_str.len()) as isize, + // Chunk size logics are mostly copied from the GNU ptx source. + // https://github.com/MaiZure/coreutils-8.3/blob/master/src/ptx.c#L1234 + let half_line_size = (config.line_width / 2) as usize; + let max_before_size = cmp::max(half_line_size as isize - config.gap_size as isize, 0) as usize; + let max_after_size = cmp::max( + half_line_size as isize + - (2 * config.trunc_str.len()) as isize + - keyword.len() as isize + - 1, 0, ) as usize; - let max_after_size = cmp::max(half_line_size as isize - keyword.len() as isize - 1, 0) as usize; - let max_before_size = half_line_size; - let all_before_vec: Vec = all_before.chars().collect(); - let all_after_vec: Vec = all_after.chars().collect(); - // get before - let mut bb_tmp = cmp::max(all_before.len() as isize - max_before_size as isize, 0) as usize; - bb_tmp = trim_broken_word_left(&all_before_vec, bb_tmp, all_before.len()); - let (before_beg, before_end) = trim_idx(&all_before_vec, bb_tmp, all_before.len()); - before.push_str(&all_before[before_beg..before_end]); + // Allocate plenty space for all the chunks. + let mut head = String::with_capacity(half_line_size); + let mut before = String::with_capacity(half_line_size); + let mut after = String::with_capacity(half_line_size); + let mut tail = String::with_capacity(half_line_size); + + // the before chunk + + // trim whitespace away from all_before to get the index where the before chunk should end. + let (_, before_end) = trim_idx(all_before, 0, all_before.len()); + + // the minimum possible begin index of the before_chunk is the end index minus the length. + let before_beg = cmp::max(before_end as isize - max_before_size as isize, 0) as usize; + // in case that falls in the middle of a word, trim away the word. + let before_beg = trim_broken_word_left(all_before, before_beg, before_end); + + // trim away white space. + let (before_beg, before_end) = trim_idx(all_before, before_beg, before_end); + + // and get the string. + let before_str: String = all_before[before_beg..before_end].iter().collect(); + before.push_str(&before_str); assert!(max_before_size >= before.len()); - // get after - let mut ae_tmp = cmp::min(max_after_size, all_after.len()); - ae_tmp = trim_broken_word_right(&all_after_vec, 0, ae_tmp); - let (after_beg, after_end) = trim_idx(&all_after_vec, 0, ae_tmp); - after.push_str(&all_after[after_beg..after_end]); + // the after chunk + + // must be no longer than the minimum between the max size and the total available string. + let after_end = cmp::min(max_after_size, all_after.len()); + // in case that falls in the middle of a word, trim away the word. + let after_end = trim_broken_word_right(all_after, 0, after_end); + + // trim away white space. + let (_, after_end) = trim_idx(all_after, 0, after_end); + + // and get the string + let after_str: String = all_after[0..after_end].iter().collect(); + after.push_str(&after_str); assert!(max_after_size >= after.len()); - // get tail - let max_tail_size = max_before_size - before.len(); - let (tb, _) = trim_idx(&all_after_vec, after_end, all_after.len()); - let mut te_tmp = cmp::min(tb + max_tail_size, all_after.len()); - te_tmp = trim_broken_word_right(&all_after_vec, tb, te_tmp); - let (tail_beg, tail_end) = trim_idx(&all_after_vec, tb, te_tmp); - tail.push_str(&all_after[tail_beg..tail_end]); + // the tail chunk - // get head - let max_head_size = max_after_size - after.len(); - let (_, he) = trim_idx(&all_before_vec, 0, before_beg); - let mut hb_tmp = cmp::max(he as isize - max_head_size as isize, 0) as usize; - hb_tmp = trim_broken_word_left(&all_before_vec, hb_tmp, he); - let (head_beg, head_end) = trim_idx(&all_before_vec, hb_tmp, he); - head.push_str(&all_before[head_beg..head_end]); + // max size of the tail chunk = max size of left half - space taken by before chunk - gap size. + let max_tail_size = cmp::max( + max_before_size as isize - before.len() as isize - config.gap_size as isize, + 0, + ) as usize; + + // the tail chunk takes text starting from where the after chunk ends (with whitespaces trimmed). + let (tail_beg, _) = trim_idx(all_after, after_end, all_after.len()); + + // end = begin + max length + let tail_end = cmp::min(all_after.len(), tail_beg + max_tail_size) as usize; + // in case that falls in the middle of a word, trim away the word. + let tail_end = trim_broken_word_right(all_after, tail_beg, tail_end); + + // trim away whitespace again. + let (tail_beg, tail_end) = trim_idx(all_after, tail_beg, tail_end); + + // and get the string + let tail_str: String = all_after[tail_beg..tail_end].iter().collect(); + tail.push_str(&tail_str); + + // the head chunk + + // max size of the head chunk = max size of right half - space taken by after chunk - gap size. + let max_head_size = cmp::max( + max_after_size as isize - after.len() as isize - config.gap_size as isize, + 0, + ) as usize; + + // the head chunk takes text from before the before chunk + let (_, head_end) = trim_idx(all_before, 0, before_beg); + + // begin = end - max length + let head_beg = cmp::max(head_end as isize - max_head_size as isize, 0) as usize; + // in case that falls in the middle of a word, trim away the word. + let head_beg = trim_broken_word_left(all_before, head_beg, head_end); + + // trim away white space again. + let (head_beg, head_end) = trim_idx(all_before, head_beg, head_end); + + // and get the string. + let head_str: String = all_before[head_beg..head_end].iter().collect(); + head.push_str(&head_str); // put right context truncation string if needed if after_end != all_after.len() && tail_beg == tail_end { @@ -395,11 +460,6 @@ fn get_output_chunks( head = format!("{}{}", config.trunc_str, head); } - // add space before "after" if needed - if !after.is_empty() { - after = format!(" {}", after); - } - (tail, before, after, head) } @@ -412,70 +472,95 @@ fn tex_mapper(x: char) -> String { } } -fn adjust_tex_str(context: &str) -> String { - let ws_reg = Regex::new(r"[\t\n\v\f\r ]").unwrap(); - let mut fix: String = ws_reg.replace_all(context, " ").trim().to_owned(); - let mapped_chunks: Vec = fix.chars().map(tex_mapper).collect(); - fix = mapped_chunks.join(""); - fix +/// Escape special characters for TeX. +fn format_tex_field(s: &str) -> String { + let mapped_chunks: Vec = s.chars().map(tex_mapper).collect(); + mapped_chunks.join("") } -fn format_tex_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String { +fn format_tex_line( + config: &Config, + word_ref: &WordRef, + line: &str, + chars_line: &[char], + reference: &str, +) -> String { let mut output = String::new(); output.push_str(&format!("\\{} ", config.macro_name)); let all_before = if config.input_ref { let before = &line[0..word_ref.position]; - adjust_tex_str(before.trim().trim_start_matches(reference)) + let before_start_trimoff = + word_ref.position - before.trim_start_matches(reference).trim_start().len(); + let before_end_index = before.len(); + &chars_line[before_start_trimoff..cmp::max(before_end_index, before_start_trimoff)] } else { - adjust_tex_str(&line[0..word_ref.position]) + let before_chars_trim_idx = (0, word_ref.position); + &chars_line[before_chars_trim_idx.0..before_chars_trim_idx.1] }; - let keyword = adjust_tex_str(&line[word_ref.position..word_ref.position_end]); - let all_after = adjust_tex_str(&line[word_ref.position_end..line.len()]); + let keyword = &line[word_ref.position..word_ref.position_end]; + let after_chars_trim_idx = (word_ref.position_end, chars_line.len()); + let all_after = &chars_line[after_chars_trim_idx.0..after_chars_trim_idx.1]; let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config); output.push_str(&format!( "{5}{0}{6}{5}{1}{6}{5}{2}{6}{5}{3}{6}{5}{4}{6}", - tail, before, keyword, after, head, "{", "}" + format_tex_field(&tail), + format_tex_field(&before), + format_tex_field(keyword), + format_tex_field(&after), + format_tex_field(&head), + "{", + "}" )); if config.auto_ref || config.input_ref { - output.push_str(&format!("{}{}{}", "{", adjust_tex_str(&reference), "}")); + output.push_str(&format!("{}{}{}", "{", format_tex_field(&reference), "}")); } output } -fn adjust_roff_str(context: &str) -> String { - let ws_reg = Regex::new(r"[\t\n\v\f\r]").unwrap(); - ws_reg - .replace_all(context, " ") - .replace("\"", "\"\"") - .trim() - .to_owned() +fn format_roff_field(s: &str) -> String { + s.replace("\"", "\"\"") } -fn format_roff_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String { +fn format_roff_line( + config: &Config, + word_ref: &WordRef, + line: &str, + chars_line: &[char], + reference: &str, +) -> String { let mut output = String::new(); output.push_str(&format!(".{}", config.macro_name)); let all_before = if config.input_ref { let before = &line[0..word_ref.position]; - adjust_roff_str(before.trim().trim_start_matches(reference)) + let before_start_trimoff = + word_ref.position - before.trim_start_matches(reference).trim_start().len(); + let before_end_index = before.len(); + &chars_line[before_start_trimoff..cmp::max(before_end_index, before_start_trimoff)] } else { - adjust_roff_str(&line[0..word_ref.position]) + let before_chars_trim_idx = (0, word_ref.position); + &chars_line[before_chars_trim_idx.0..before_chars_trim_idx.1] }; - let keyword = adjust_roff_str(&line[word_ref.position..word_ref.position_end]); - let all_after = adjust_roff_str(&line[word_ref.position_end..line.len()]); + let keyword = &line[word_ref.position..word_ref.position_end]; + let after_chars_trim_idx = (word_ref.position_end, chars_line.len()); + let all_after = &chars_line[after_chars_trim_idx.0..after_chars_trim_idx.1]; let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config); output.push_str(&format!( " \"{}\" \"{}\" \"{}{}\" \"{}\"", - tail, before, keyword, after, head + format_roff_field(&tail), + format_roff_field(&before), + format_roff_field(keyword), + format_roff_field(&after), + format_roff_field(&head) )); if config.auto_ref || config.input_ref { - output.push_str(&format!(" \"{}\"", adjust_roff_str(&reference))); + output.push_str(&format!(" \"{}\"", format_roff_field(&reference))); } output } fn write_traditional_output( config: &Config, - file_map: &HashMap, usize)>, + file_map: &FileMap, words: &BTreeSet, output_filename: &str, ) { @@ -485,19 +570,39 @@ fn write_traditional_output( let file = crash_if_err!(1, File::create(output_filename)); Box::new(file) }); + + let context_reg = Regex::new(&config.context_regex).unwrap(); + for word_ref in words.iter() { - let file_map_value: &(Vec, usize) = file_map + let file_map_value: &FileContent = file_map .get(&(word_ref.filename)) .expect("Missing file in file map"); - let (ref lines, _) = *(file_map_value); - let reference = get_reference(config, word_ref, &lines[word_ref.local_line_nr]); + let FileContent { + ref lines, + ref chars_lines, + offset: _, + } = *(file_map_value); + let reference = get_reference( + config, + word_ref, + &lines[word_ref.local_line_nr], + &context_reg, + ); let output_line: String = match config.format { - OutFormat::Tex => { - format_tex_line(config, word_ref, &lines[word_ref.local_line_nr], &reference) - } - OutFormat::Roff => { - format_roff_line(config, word_ref, &lines[word_ref.local_line_nr], &reference) - } + OutFormat::Tex => format_tex_line( + config, + word_ref, + &lines[word_ref.local_line_nr], + &chars_lines[word_ref.local_line_nr], + &reference, + ), + OutFormat::Roff => format_roff_line( + config, + word_ref, + &lines[word_ref.local_line_nr], + &chars_lines[word_ref.local_line_nr], + &reference, + ), OutFormat::Dumb => crash!(1, "There is no dumb format with GNU extensions disabled"), }; crash_if_err!(1, writeln!(writer, "{}", output_line));