From ea7df03c0aa4db5f5355b0217e2c4c67a854ada2 Mon Sep 17 00:00:00 2001 From: dokaptur Date: Tue, 27 Jan 2015 16:37:07 +0100 Subject: [PATCH] initial ptx commit --- Makefile | 1 + src/ptx/deps.mk | 1 + src/ptx/ptx.rs | 561 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 563 insertions(+) create mode 100644 src/ptx/deps.mk create mode 100644 src/ptx/ptx.rs diff --git a/Makefile b/Makefile index b8a16495a..7ab1b0921 100644 --- a/Makefile +++ b/Makefile @@ -67,6 +67,7 @@ PROGS := \ od \ paste \ printenv \ + ptx \ pwd \ readlink \ realpath \ diff --git a/src/ptx/deps.mk b/src/ptx/deps.mk new file mode 100644 index 000000000..3115648f6 --- /dev/null +++ b/src/ptx/deps.mk @@ -0,0 +1 @@ +DEPLIBS += regex \ No newline at end of file diff --git a/src/ptx/ptx.rs b/src/ptx/ptx.rs new file mode 100644 index 000000000..a029bb144 --- /dev/null +++ b/src/ptx/ptx.rs @@ -0,0 +1,561 @@ +#![crate_name = "ptx"] +#![feature(convert, collections)] + +/* + * This file is part of the uutils coreutils package. + * + * (c) Dorota Kapturkiewicz + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +extern crate getopts; +extern crate regex; + +use std::collections::{HashMap, HashSet, BTreeSet}; +use std::default::Default; +use std::fs::File; +use getopts::{Options, Matches}; +use std::io::{stdin, stdout, BufReader, BufWriter, BufRead, Read, Write}; +use regex::Regex; +use std::cmp; + + +#[path = "../common/util.rs"] +#[macro_use] +mod util; + +static NAME: &'static str = "ptx"; +static VERSION: &'static str = "1.0.0"; + +#[derive(Debug)] +enum OutFormat { + Dumb, + Roff, + Tex, +} + +#[derive(Debug)] +struct Config { + format : OutFormat, + gnu_ext : bool, + auto_ref : bool, + input_ref : bool, + right_ref : bool, + ignore_case : bool, + macro_name : String, + trunc_str : String, + context_regex : String, + line_width : usize, + gap_size : usize, +} + +impl Default for Config { + fn default() -> Config { + Config { + format : OutFormat::Dumb, + gnu_ext : true, + auto_ref : false, + input_ref : false, + right_ref : false, + ignore_case : false, + macro_name : "xx".to_string(), + trunc_str : "/".to_string(), + context_regex : "\\w+".to_string(), + line_width : 72, + gap_size : 3 + } + } +} + +fn read_word_filter_file(matches: &Matches, option: &str) -> HashSet { + let filename = matches.opt_str(option).expect("parsing options failed!"); + let reader = BufReader::new(crash_if_err!(1, File::open(filename))); + let mut words: HashSet = HashSet::new(); + for word in reader.lines() { + words.insert(crash_if_err!(1, word)); + } + words +} + +#[derive(Debug)] +struct WordFilter { + only_specified: bool, + ignore_specified: bool, + only_set: HashSet, + ignore_set: HashSet, + word_regex: String, +} + +impl WordFilter { + fn new(matches: &Matches, config: &Config) -> WordFilter { + let (o, oset): (bool, HashSet) = + if matches.opt_present("o") { + (true, read_word_filter_file(matches, "o")) + } else { + (false, HashSet::new()) + }; + let (i, iset): (bool, HashSet) = + if matches.opt_present("i") { + (true, read_word_filter_file(matches, "i")) + } else { + (false, HashSet::new()) + }; + if matches.opt_present("b") { + crash!(1, "-b not implemented yet"); + } + let reg = + if matches.opt_present("W") { + matches.opt_str("W").expect("parsing options failed!") + } else if config.gnu_ext { + "\\w+".to_string() + } else { + "[^ \t\n]+".to_string() + }; + WordFilter { + only_specified: o, + ignore_specified: i, + only_set: oset, + ignore_set: iset, + word_regex: reg + } + } +} + +#[derive(Debug, PartialOrd, PartialEq, Eq, Ord)] +struct WordRef { + word: String, + global_line_nr: usize, + local_line_nr: usize, + position: usize, + position_end: usize, + filename: String, +} + +fn print_version() { + println!("{} version {}", NAME, VERSION); +} + +fn print_usage(opts: &Options) { + let brief = "Usage: ptx [OPTION]... [INPUT]... (without -G) or: \ + ptx -G [OPTION]... [INPUT [OUTPUT]] \n Output a permuted index, \ + including context, of the words in the input files. \n\n Mandatory \ + arguments to long options are mandatory for short options too."; + let explaination = "With no FILE, or when FILE is -, read standard input. \ + Default is '-F /'."; + println!("{}\n{}", opts.usage(&brief), explaination); +} + + +fn get_config(matches: &Matches) -> Config { + let mut config: Config = Default::default(); + let err_msg = "parsing options failed"; + if matches.opt_present("G") { + config.gnu_ext = false; + config.format = OutFormat::Roff; + config.context_regex = "[^ \t\n]+".to_string(); + } else { + crash!(1, "GNU extensions not implemented yet"); + } + if matches.opt_present("S") { + crash!(1, "-S not implemented yet"); + } + config.auto_ref = matches.opt_present("A"); + config.input_ref = matches.opt_present("r"); + config.right_ref &= matches.opt_present("R"); + config.ignore_case = matches.opt_present("f"); + if matches.opt_present("M") { + config.macro_name = + matches.opt_str("M").expect(err_msg).to_string(); + } + if matches.opt_present("F") { + config.trunc_str = + matches.opt_str("F").expect(err_msg).to_string(); + } + if matches.opt_present("w") { + let width_str = matches.opt_str("w").expect(err_msg); + config.line_width = crash_if_err!( + 1, usize::from_str_radix(width_str.as_str(), 10)); + } + if matches.opt_present("g") { + let gap_str = matches.opt_str("g").expect(err_msg); + config.gap_size = crash_if_err!( + 1, usize::from_str_radix(gap_str.as_str(), 10)); + } + if matches.opt_present("O") { + config.format = OutFormat::Roff; + } + if matches.opt_present("T") { + config.format = OutFormat::Tex; + } + config +} + +fn read_input(input_files: Vec<&str>, config: &Config) -> + HashMap, usize)> { + let mut file_map : HashMap, usize)> = + HashMap::new(); + let mut files = Vec::new(); + if input_files.is_empty() { + files.push("-"); + } else { + if config.gnu_ext { + files.push_all(input_files.as_slice()); + } else { + files.push(input_files[0]); + } + } + let mut lines_so_far: usize = 0; + for filename in files { + let reader: BufReader> = BufReader::new( + if filename == "-" { + Box::new(stdin()) + } else { + let file = crash_if_err!(1, File::open(filename)); + Box::new(file) + }); + let lines: Vec = reader.lines().map(|x| crash_if_err!(1, x)) + .collect(); + let size = lines.len(); + file_map.insert(filename.to_string(), (lines, lines_so_far)); + lines_so_far += size + } + file_map +} + +fn create_word_set(config: &Config, filter: &WordFilter, + file_map: &HashMap, usize)>)-> + BTreeSet { + let reg = Regex::new(filter.word_regex.as_str()).unwrap(); + let ref_reg = Regex::new(config.context_regex.as_str()).unwrap(); + let mut word_set: BTreeSet = BTreeSet::new(); + for (file, lines) in file_map.iter() { + let mut count: usize = 0; + let offs = lines.1; + for line in (lines.0).iter() { + // if -r, exclude reference from word set + let (ref_beg, ref_end) = match ref_reg.find(line) { + Some(x) => x, + None => (0,0) + }; + // match words with given regex + for (beg, end) in reg.find_iter(line) { + if config.input_ref && ((beg, end) == (ref_beg, ref_end)) { + continue; + } + let mut word = line.slice_chars(beg, end).to_string(); + if filter.only_specified && + !(filter.only_set.contains(&word)) { + continue; + } + if filter.ignore_specified && + filter.ignore_set.contains(&word) { + continue; + } + if config.ignore_case { + word = word.to_lowercase(); + } + word_set.insert(WordRef{ + word: word, + filename: String::from(file.as_str()), + global_line_nr: offs + count, + local_line_nr: count, + position: beg, + position_end: end + }); + } + count += 1; + } + } + word_set +} + +fn get_reference(config: &Config, word_ref: &WordRef, line: &String) -> + String { + if config.auto_ref { + format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1) + } else if config.input_ref { + let reg = Regex::new(config.context_regex.as_str()).unwrap(); + let (beg, end) = match reg.find(line) { + Some(x) => x, + None => (0,0) + }; + format!("{}", line.slice_chars(beg, end)) + } else { + String::new() + } +} + +fn assert_str_integrity(s: &Vec, beg: usize, end: usize) { + assert!(beg <= end); + assert!(end <= s.len()); +} + +fn trim_broken_word_left(s: &Vec, beg: usize, end: usize) -> usize { + assert_str_integrity(s, beg, end); + if beg == end || beg == 0 || s[beg].is_whitespace() || + s[beg-1].is_whitespace() { + return beg; + } + let mut b = beg; + while b < end && !s[b].is_whitespace() { + b += 1; + } + b +} + +fn trim_broken_word_right(s: &Vec, beg: usize, end: usize) -> usize { + assert_str_integrity(s, beg, end); + if beg == end || end == s.len() || s[end-1].is_whitespace() || + s[end].is_whitespace() { + return end; + } + let mut e = end; + while beg < e && !s[e-1].is_whitespace() { + e -= 1; + } + e +} + +fn trim_idx(s: &Vec, beg: usize, end: usize) -> (usize, usize) { + assert_str_integrity(s, beg, end); + let mut b = beg; + let mut e = end; + while b < e && s[b].is_whitespace() { + b += 1; + } + while b < e && s[e-1].is_whitespace() { + e -= 1; + } + (b,e) +} + + +fn get_output_chunks(all_before: &String, keyword: &String, all_after: &String, + config: &Config) -> (String, String, String, String) { + assert!(all_before.trim() == all_before.as_str()); + assert!(keyword.trim() == keyword.as_str()); + assert!(all_after.trim() == all_after.as_str()); + let mut head = String::new(); + let mut before = String::new(); + let mut after = String::new(); + let mut tail = String::new(); + + let half_line_size = cmp::max((config.line_width/2) as isize - + (2*config.trunc_str.len()) as isize, 0) as usize; + let max_after_size = cmp::max(half_line_size as isize - + keyword.len() as isize - 1, 0) as usize; + let max_before_size = half_line_size; + let all_before_vec: Vec = all_before.chars().collect(); + let all_after_vec: Vec = all_after.chars().collect(); + + // get before + let mut bb_tmp = + cmp::max(all_before.len() as isize - max_before_size as isize, 0) as usize; + bb_tmp = trim_broken_word_left(&all_before_vec, bb_tmp, all_before.len()); + let (before_beg, before_end) = + trim_idx(&all_before_vec, bb_tmp, all_before.len()); + before.push_str(all_before.slice_chars(before_beg, before_end)); + assert!(max_before_size >= before.len()); + + // get after + let mut ae_tmp = cmp::min(max_after_size, all_after.len()); + ae_tmp = trim_broken_word_right(&all_after_vec, 0, ae_tmp); + let (after_beg, after_end) = trim_idx(&all_after_vec, 0, ae_tmp); + after.push_str(all_after.slice_chars(after_beg, after_end)); + assert!(max_after_size >= after.len()); + + // get tail + let max_tail_size = max_before_size - before.len(); + let (tb, _) = trim_idx(&all_after_vec, after_end, all_after.len()); + let mut te_tmp = cmp::min(tb + max_tail_size, all_after.len()); + te_tmp = trim_broken_word_right(&all_after_vec, tb, te_tmp); + let (tail_beg, tail_end) = trim_idx(&all_after_vec, tb, te_tmp); + tail.push_str(all_after.slice_chars(tail_beg, tail_end)); + + // get head + let max_head_size = max_after_size - after.len(); + let (_, he) = trim_idx(&all_before_vec, 0, before_beg); + let mut hb_tmp = + cmp::max(he as isize - max_head_size as isize, 0) as usize; + hb_tmp = trim_broken_word_left(&all_before_vec, hb_tmp, he); + let (head_beg, head_end) = trim_idx(&all_before_vec, hb_tmp, he); + head.push_str(all_before.slice_chars(head_beg, head_end)); + + // put right context truncation string if needed + if after_end != all_after.len() && tail_beg == tail_end { + after.push_str(config.trunc_str.as_str()); + } else if after_end != all_after.len() && tail_end != all_after.len() { + tail.push_str(config.trunc_str.as_str()); + } + + // put left context truncation string if needed + if before_beg != 0 && head_beg == head_end { + before = format!("{}{}", config.trunc_str, before); + } else if before_beg != 0 && head_beg != 0 { + head = format!("{}{}", config.trunc_str, head); + } + + // add space before "after" if needed + if after.len() > 0 { + after = format!(" {}", after); + } + + (tail, before, after, head) +} + +fn tex_mapper(x: char) -> String { + match x { + '\\' => "\\backslash{}".to_string(), + '$' | '%' | '#' | '&' | '_' => format!("\\{}", x), + '^' | '~' => format!("{}\\{}{}", x, "{", "}"), + '}' | '{' => format!("$\\{}$", x), + _ => x.to_string() + } +} + +fn adjust_tex_str(context: &str) -> String { + let ws_reg = Regex::new(r"[\t\n\v\f\r ]").unwrap(); + let mut fix: String = ws_reg.replace_all(context, " ").trim().to_string(); + let mapped_chunks: Vec = fix.chars().map(tex_mapper).collect(); + fix = mapped_chunks.connect(""); + fix +} + +fn format_tex_line(config: &Config, word_ref: &WordRef, line: &String, + reference: &String) -> String { + let mut output = String::new(); + output.push_str(&format!("\\{} ", config.macro_name)); + let all_before = if config.input_ref { + let before = line.slice_chars(0, word_ref.position); + adjust_tex_str(before.trim().trim_left_matches(reference)) + } else { + adjust_tex_str(line.slice_chars(0, word_ref.position)) + }; + let keyword = adjust_tex_str( + line.slice_chars(word_ref.position, word_ref.position_end)); + let all_after = adjust_tex_str( + line.slice_chars(word_ref.position_end, line.len())); + let (tail, before, after, head) = + get_output_chunks(&all_before, &keyword, &all_after, &config); + output.push_str(format!("{5}{0}{6}{5}{1}{6}{5}{2}{6}{5}{3}{6}{5}{4}{6}", + tail, before, keyword, after, head, "{", "}").as_str()); + if config.auto_ref || config.input_ref { + output.push_str(&format!("{}{}{}", "{", reference, "}")); + } + output +} + +fn adjust_roff_str(context: &str) -> String { + let ws_reg = Regex::new(r"[\t\n\v\f\r]").unwrap(); + ws_reg.replace_all(context, " ").replace("\"", "\"\"").trim().to_string() +} + +fn format_roff_line(config: &Config, word_ref: &WordRef, line: &str, + reference: &str) -> String { + let mut output = String::new(); + output.push_str(&format!(".{}", config.macro_name)); + let all_before = if config.input_ref { + let before = line.slice_chars(0, word_ref.position); + adjust_roff_str(before.trim().trim_left_matches(reference)) + } else { + adjust_roff_str(line.slice_chars(0, word_ref.position)) + }; + let keyword = adjust_roff_str( + line.slice_chars(word_ref.position, word_ref.position_end)); + let all_after = adjust_roff_str( + line.slice_chars(word_ref.position_end, line.len())); + let (tail, before, after, head) = + get_output_chunks(&all_before, &keyword, &all_after, &config); + output.push_str(format!(" \"{}\" \"{}\" \"{}{}\" \"{}\"", + tail, before, keyword, after, head).as_str()); + if config.auto_ref || config.input_ref { + output.push_str(&format!(" \"{}\"", reference)); + } + output +} + +fn write_traditional_output(config: &Config, + file_map: &HashMap,usize)>, + words: &BTreeSet, output_filename: &str) { + let mut writer: BufWriter> = BufWriter::new( + if output_filename == "-" { + Box::new(stdout()) + } else { + let file = crash_if_err!(1, File::create(output_filename)); + Box::new(file) + }); + for word_ref in words.iter() { + let file_map_value : &(Vec, usize) = + file_map.get(&(word_ref.filename)) + .expect("Missing file in file map"); + let (ref lines, _) = *(file_map_value); + let reference = + get_reference(config, word_ref, &lines[word_ref.local_line_nr]); + let output_line: String = match config.format { + OutFormat::Tex => format_tex_line( + config, word_ref, &lines[word_ref.local_line_nr], &reference), + OutFormat::Roff => format_roff_line( + config, word_ref, &lines[word_ref.local_line_nr], &reference), + OutFormat::Dumb => crash!( + 1, "There is no dumb format with GNU extensions disabled") + }; + crash_if_err!(1, writeln!(writer, "{}", output_line)); + } +} + +pub fn uumain(args: Vec) -> i32 { + let mut opts = Options::new(); + opts.optflag("A", "auto-reference", + "output automatically generated references"); + opts.optflag("G", "traditional", "behave more like System V 'ptx'"); + opts.optopt("F", "flag-truncation", + "use STRING for flagging line truncations", "STRING"); + opts.optopt("M", "macro-name", "macro name to use instead of 'xx'", + "STRING"); + opts.optflag("O", "format=roff", "generate output as roff directives"); + opts.optflag("R", "right-side-refs", + "put references at right, not counted in -w"); + opts.optopt("S", "sentence-regexp", "for end of lines or end of sentences", + "REGEXP"); + opts.optflag("T", "format=tex", "generate output as TeX directives"); + opts.optopt("W", "word-regexp", "use REGEXP to match each keyword", + "REGEXP"); + opts.optopt("b", "break-file", "word break characters in this FILE", + "FILE"); + opts.optflag("f", "ignore-case", + "fold lower case to upper case for sorting"); + opts.optopt("g", "gap-size", "gap size in columns between output fields", + "NUMBER"); + opts.optopt("i", "ignore-file", "read ignore word list from FILE", "FILE"); + opts.optopt("o", "only-file", "read only word list from this FILE", + "FILE"); + opts.optflag("r", "references", "first field of each line is a reference"); + opts.optopt("w", "width", "output width in columns, reference excluded", + "NUMBER"); + opts.optflag("", "help", "display this help and exit"); + opts.optflag("", "version", "output version information and exit"); + + let matches = return_if_err!(1, opts.parse(&args[1..])); + + if matches.opt_present("help") { + print_usage(&opts); + return 0; + } + if matches.opt_present("version") { + print_version(); + return 0; + } + let config = get_config(&matches); + let word_filter = WordFilter::new(&matches, &config); + let file_map = + read_input(matches.free.iter().map(|x| x.as_str()).collect(), &config); + let word_set = create_word_set(&config, &word_filter, &file_map); + let output_file = if !config.gnu_ext && matches.free.len() == 2 { + matches.free[1].as_str() + } else { + "-" + }; + write_traditional_output(&config, &file_map, &word_set, output_file); + 0 +}