diff --git a/Makefile b/Makefile
index d2c81ae6c..98278d6d5 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ PROGS := \
   du \
   factor \
   false \
+  fmt \
   fold \
   md5sum \
   mkdir \
diff --git a/README.md b/README.md
index 498344e9e..f88779a09 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,6 @@ To do
 - dircolors
 - expand (in progress)
 - expr
-- fmt
 - getlimits
 - install
 - join
diff --git a/fmt/fmt.rs b/fmt/fmt.rs
new file mode 100644
index 000000000..206b89f26
--- /dev/null
+++ b/fmt/fmt.rs
@@ -0,0 +1,267 @@
+#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+#![feature(macro_rules)]
+
+extern crate core;
+extern crate getopts;
+extern crate libc;
+
+use std::io::{BufferedReader, BufferedWriter, File, IoResult};
+use std::io::stdio::{stdin_raw, stdout_raw, stdout};
+use std::os;
+use linebreak::break_simple;
+use parasplit::{ParagraphStream, ParaWords};
+
+#[macro_export]
+macro_rules! silent_unwrap(
+    ($exp:expr) => (
+        match $exp {
+            Ok(_) => (),
+            Err(_) => unsafe { ::libc::exit(1) }
+        }
+    )
+)
+#[path = "../common/util.rs"]
+mod util;
+mod linebreak;
+mod parasplit;
+
+// program's NAME and VERSION are used for -V and -h
+static NAME: &'static str = "fmt";
+static VERSION: &'static str = "0.0.1";
+
+struct FmtOptions {
+    crown           : bool,
+    tagged          : bool,
+    mail            : bool,
+    split_only      : bool,
+    use_prefix      : bool,
+    prefix          : String,
+    xprefix         : bool,
+    prefix_len      : uint,
+    use_anti_prefix : bool,
+    anti_prefix     : String,
+    xanti_prefix    : bool,
+    uniform         : bool,
+    width           : uint,
+    goal            : uint,
+    tabwidth        : uint,
+}
+
+#[allow(dead_code)]
+fn main() { os::set_exit_status(uumain(os::args())) }
+
+fn uumain(args: Vec<String>) -> int {
+
+    let opts = [
+        getopts::optflag("c", "crown-margin", "First and second line of paragraph may have different indentations, in which case the first line's indentation is preserved, and each subsequent line's indentation matches the second line."),
+        getopts::optflag("t", "tagged-paragraph", "Like -c, except that the first and second line of a paragraph *must* have different indentation or they are treated as separate paragraphs."),
+        getopts::optflag("m", "preserve-headers", "Attempt to detect and preserve mail headers in the input. Be careful when combining this flag with -p."),
+        getopts::optflag("s", "split-only", "Split lines only, do not reflow."),
+        getopts::optflag("u", "uniform-spacing", "Insert exactly one space between words, and two between sentences. Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline; other punctuation is not interpreted as a sentence break."),
+
+        getopts::optopt("p", "prefix", "Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines. Unless -x is specified, leading whitespace will be ignored when matching PREFIX.", "PREFIX"),
+        getopts::optopt("P", "skip-prefix", "Do not reformat lines beginning with PSKIP. Unless -X is specified, leading whitespace will be ignored when matching PSKIP", "PSKIP"),
+
+        getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
+        getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
+
+        getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
+        getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
+
+        getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
+
+        getopts::optflag("V", "version", "Output version information and exit."),
+        getopts::optflag("h", "help", "Display this help message and exit.")
+            ];
+
+    let matches = match getopts::getopts(args.tail(), opts.as_slice()) {
+        Ok(m) => m,
+        Err(f) => crash!(1, "{}\nTry `{} --help' for more information.", f, args.get(0))
+    };
+
+    if matches.opt_present("h") {
+        print_usage(args.get(0).as_slice(), opts.as_slice(), "");
+    }
+
+    if matches.opt_present("V") || matches.opt_present("h") {
+        println!("uutils {} v{}", NAME, VERSION);
+        return 0
+    }
+
+    let mut fmt_opts = FmtOptions {
+        crown           : false,
+        tagged          : false,
+        mail            : false,
+        uniform         : false,
+        split_only      : false,
+        use_prefix      : false,
+        prefix          : String::new(),
+        xprefix         : false,
+        prefix_len      : 0,
+        use_anti_prefix : false,
+        anti_prefix     : String::new(),
+        xanti_prefix    : false,
+        width           : 78,
+        goal            : 72,
+        tabwidth        : 8,
+    };
+
+    if matches.opt_present("t") { fmt_opts.tagged = true; }
+    if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; }
+    if matches.opt_present("m") { fmt_opts.mail = true; }
+    if matches.opt_present("u") { fmt_opts.uniform = true; }
+    if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; }
+    if matches.opt_present("x") { fmt_opts.xprefix = true; }
+    if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
+
+    match matches.opt_str("p") {
+        Some(s) => {
+            fmt_opts.prefix = s;
+            fmt_opts.use_prefix = true;
+            fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len()
+        }
+        None => ()
+    };
+
+    match matches.opt_str("P") {
+        Some(s) => {
+            fmt_opts.anti_prefix = s;
+            fmt_opts.use_anti_prefix = true;
+        }
+        None => ()
+    };
+
+    match matches.opt_str("w") {
+        Some(s) => {
+            fmt_opts.width =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
+                };
+            fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);
+        }
+        None => ()
+    };
+
+    match matches.opt_str("g") {
+        Some(s) => {
+            fmt_opts.goal =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid GOAL specification: `{}'", s); }
+                };
+            if !matches.opt_present("w") {
+                fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);
+            } else if fmt_opts.goal > fmt_opts.width {
+                crash!(1, "GOAL cannot be greater than WIDTH.");
+            }
+        }
+        None => ()
+    };
+
+    match matches.opt_str("T") {
+        Some(s) => {
+            fmt_opts.tabwidth =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid TABWIDTH specification: `{}'", s); }
+                };
+        }
+        None => ()
+    };
+
+    if fmt_opts.tabwidth < 1 {
+        fmt_opts.tabwidth = 1;
+    }
+
+    // immutable now
+    let fmt_opts = fmt_opts;
+
+    let mut files = matches.free;
+    if files.is_empty() {
+        files.push("-".to_string());
+    }
+
+    let mut ostream = box BufferedWriter::new(stdout_raw()) as Box<Writer>;
+
+    for i in files.iter().map(|x| x.as_slice()) {
+        let mut fp =
+            match open_file(i) {
+                Err(e) => {
+                    show_warning!("{}: {}",i,e);
+                    continue;
+                }
+                Ok(f) => f
+            };
+        let mut pStream = ParagraphStream::new(&fmt_opts, &mut fp);
+        for paraResult in pStream {
+            match paraResult {
+                Err(s) => silent_unwrap!(ostream.write(s.as_bytes())),
+                Ok(para) => {
+                    // indent
+                    let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
+                    let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
+
+                    // words
+                    let pWords = ParaWords::new(&fmt_opts, &para);
+                    let mut pWords_words = pWords.words().map(|&x| x);
+
+                    // print the init, if it exists, and get its length
+                    let pInitLen =
+                        if fmt_opts.crown || fmt_opts.tagged {
+                            // handle "init" portion
+                            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
+                            para.init_len
+                        } else if !para.mail_header {
+                            // for non-(crown, tagged) that's the same as a normal indent
+                            silent_unwrap!(ostream.write(pIndent.as_bytes()));
+                            pIndentLen
+                        } else {
+                            // except that mail headers get no indent at all
+                            0
+                        };
+
+                    // does this paragraph require uniform spacing?
+                    let uniform = para.mail_header || fmt_opts.uniform;
+
+                    break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);
+                    silent_unwrap!(ostream.write("\n".as_bytes()));
+                }
+            }
+        }
+
+        // flush the output after each file
+        silent_unwrap!(ostream.flush());
+    }
+
+    0
+}
+
+fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
+    break_simple(&mut getopts::short_usage(arg0, opts).as_slice().words(), 64, "       ", 7, 0, true, &mut(box stdout() as Box<Writer>)); // continuation indent string must be exactly 7 columns wide
+    println!("\n\n{}{}", getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts), errmsg);
+}
+
+// uniform interface for opening files
+// since we don't need seeking
+type FileOrStdReader = BufferedReader<Box<Reader>>;
+
+fn open_file(filename: &str) -> IoResult<FileOrStdReader> {
+    if filename == "-" {
+        Ok(BufferedReader::new(box stdin_raw() as Box<Reader>))
+    } else {
+        match File::open(&Path::new(filename)) {
+            Ok(f) => Ok(BufferedReader::new(box f as Box<Reader>)),
+            Err(e) => return Err(e)
+        }
+    }
+}
diff --git a/fmt/linebreak.rs b/fmt/linebreak.rs
new file mode 100644
index 000000000..7537a2c91
--- /dev/null
+++ b/fmt/linebreak.rs
@@ -0,0 +1,33 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+// break_simple implements the "tight" breaking algorithm: print words until
+// maxlength would be exceeded, then print a linebreak and indent and continue.
+// Note that any first line indent should already have been printed before
+// calling this function, and the length of said indent should be passed as
+// init_len. Returns the length of the last line written.
+pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
+    s.fold(init_len, |l, w| accum_words_simple(maxlen, indent_len, indent_str, ostream, uniform, l, w))
+}
+
+fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
+    let wlen = w.char_len(); // measure in characters, not bytes, like every other length used here
+    let lnew =
+        if l + wlen > maxlen {
+            silent_unwrap!(ostream.write("\n".as_bytes()));
+            silent_unwrap!(ostream.write(indent_str.as_bytes()));
+            indent_len
+        } else {
+            l
+        };
+
+    silent_unwrap!(ostream.write(w.as_bytes()));
+    if uniform { silent_unwrap!(ostream.write(" ".as_bytes())); }
+    lnew + wlen + 1
+}
diff --git a/fmt/parasplit.rs b/fmt/parasplit.rs
new file mode 100644
index 000000000..583bb306c
--- /dev/null
+++ b/fmt/parasplit.rs
@@ -0,0 +1,532 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use core::iter::Peekable;
+use std::io::Lines;
+use std::slice::Items;
+use std::str::CharRange;
+use FileOrStdReader;
+use FmtOptions;
+
+// lines with PSKIP, lacking PREFIX, or which are entirely blank are
+// NoFormatLines; otherwise, they are FormatLines
+#[deriving(Show)]
+enum Line {
+    FormatLine(FileLine),
+    NoFormatLine(String, bool)
+}
+
+impl Line {
+    // when we know that it's a FormatLine, as in the ParagraphStream iterator
+    fn get_fileline(self) -> FileLine {
+        match self {
+            FormatLine(fl) => fl,
+            NoFormatLine(..) => fail!("Found NoFormatLine when expecting FormatLine")
+        }
+    }
+
+    // when we know that it's a NoFormatLine, as in the ParagraphStream iterator
+    fn get_noformatline(self) -> (String, bool) {
+        match self {
+            NoFormatLine(s, b) => (s, b),
+            FormatLine(..) => fail!("Found FormatLine when expecting NoFormatLine")
+        }
+    }
+}
+
+// each line's prefix has to be considered to know whether to merge it with
+// the next line or not
+#[deriving(Show)]
+struct FileLine {
+    line       : String,
+    indent_end : uint,     // the end of the indent, always the start of the text
+    prefix_end : uint,     // the end of the PREFIX
+    pfxind_end : uint,     // the end of the PREFIX's indent, that is, the spaces before the prefix
+    indent_len : uint,     // display length of indent taking into account TABWIDTH
+    pfxind_len : uint,     // PREFIX indent length taking into account TABWIDTH
+}
+
+// iterator that produces a stream of Lines from a file
+struct FileLines<'a> {
+    opts  : &'a FmtOptions,
+    lines : Lines<'a, FileOrStdReader>,
+}
+
+impl<'a> FileLines<'a> {
+    fn new<'a>(opts: &'a FmtOptions, lines: Lines<'a, FileOrStdReader>) -> FileLines<'a> {
+        FileLines { opts: opts, lines: lines }
+    }
+
+    // returns (true, byte offset of PREFIX) if this line should be formatted
+    fn match_prefix(&self, line: &str) -> (bool, uint) {
+        if !self.opts.use_prefix { return (true, 0u); }
+
+        FileLines::match_prefix_generic(self.opts.prefix.as_slice(), line, self.opts.xprefix)
+    }
+
+    // returns true if this line should be formatted, i.e. it does NOT start with PSKIP
+    fn match_anti_prefix(&self, line: &str) -> bool {
+        if !self.opts.use_anti_prefix { return true; }
+
+        match FileLines::match_prefix_generic(self.opts.anti_prefix.as_slice(), line, self.opts.xanti_prefix) {
+            (true, _) => false,
+            (_   , _) => true
+        }
+    }
+
+    fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, uint) {
+        if line.starts_with(pfx) {
+            return (true, 0);
+        }
+
+        if !exact {
+            // we do it this way rather than byte indexing to support unicode whitespace chars
+            let mut i = 0u;
+            while (i < line.len()) && line.char_at(i).is_whitespace() {
+                i = match line.char_range_at(i) { CharRange { ch: _ , next: nxi } => nxi };
+                if line.slice_from(i).starts_with(pfx) {
+                    return (true, i);
+                }
+            }
+        }
+
+        (false, 0)
+    }
+
+    fn displayed_length(&self, s: &str) -> uint {
+        s.char_len() + (self.opts.tabwidth - 1) * s.chars().filter(|x| x == &'\t').count()
+    }
+}
+
+impl<'a> Iterator<Line> for FileLines<'a> {
+    fn next(&mut self) -> Option<Line> {
+        let mut n =
+            match self.lines.next() {
+                Some(t) => match t {
+                    Ok(tt) => tt,
+                    Err(_) => return None
+                },
+                None => return None
+            };
+
+        // if this line is entirely whitespace,
+        // emit a blank line
+        // NoFormatLine(_, true) indicates that this was a linebreak,
+        // which is important to know when detecting mail headers
+        if n.as_slice().is_whitespace() {
+            return Some(NoFormatLine("\n".to_string(), true));
+        }
+
+        // if this line does not match the prefix,
+        // emit the line unprocessed and iterate again
+        let (pmatch, poffset) = self.match_prefix(n.as_slice());
+        if !pmatch {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // if this line matches the anti_prefix
+        // (NOTE definition of match_anti_prefix is TRUE if we should process)
+        if !self.match_anti_prefix(n.as_slice()) {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // replace trailing newline, if any, with space
+        let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
+        if ch == '\n' {
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                nmut[i] = ' ' as u8;
+            }
+            if i > 0 {
+                let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
+                if ch == '.' || ch == '!' || ch == '?' { // [?!.] + newline is a sentence break: make it two spaces
+                    n.push_char(' ');
+                }
+            }
+        }
+
+        let nLen = n.len();
+        // figure out the indent, prefix, and prefixindent ending points
+        let (indEnd, pfxEnd, pfxIndEnd) =
+            if self.opts.use_prefix {
+                let pfxEnd = poffset + self.opts.prefix.len();
+                let nSlice = n.as_slice().slice_from(pfxEnd);
+                let nSlice2 = nSlice.trim_left();
+                (pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
+            } else {
+                let nSlice = n.as_slice().trim_left();
+                (nLen - nSlice.len(), 0, 0)
+            };
+
+        // indent length
+        let indLen =
+            if indEnd > 0 {
+                self.displayed_length(n.as_slice().slice(pfxEnd, indEnd))
+            } else {
+                0
+            };
+
+        // prefix indent length
+        let pfxIndLen =
+            if pfxIndEnd > 0 {
+                self.displayed_length(n.as_slice().slice_to(pfxIndEnd))
+            } else {
+                0
+            };
+
+        // if we are in uniform mode, all tabs after the indent should be replaced by spaces.
+        // NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
+        // [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
+        // sentence ending
+        if self.opts.uniform {
+            let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i, c)| if c == '\t' { Some(i) } else { None }).collect();
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                for i in tabinds.iter() {
+                    nmut[indEnd + *i] = ' ' as u8; // tabinds holds offsets relative to indEnd, not to the line start
+                }
+            }
+        }
+
+        Some(FormatLine(FileLine {
+            line       : n,
+            indent_end : indEnd,
+            prefix_end : pfxEnd,
+            pfxind_end : pfxIndEnd,
+            indent_len : indLen,
+            pfxind_len : pfxIndLen,
+        }))
+    }
+}
+
+// a paragraph : a collection of FileLines that are to be formatted
+// plus info about the paragraph's indentation
+// (but we only retain the String from the FileLine; the other info
+// is only there to help us in deciding how to merge lines into Paragraphs)
+#[deriving(Show)]
+pub struct Paragraph {
+        lines       : Vec<String>,  // the lines of the file
+    pub init_str    : String,       // string representing the init, that is, the first line's indent
+    pub init_len    : uint,         // printable length of the init string considering TABWIDTH
+        init_end    : uint,         // byte location of end of init in first line String
+    pub indent_str  : String,       // string representing indent
+    pub indent_len  : uint,         // length of above
+        indent_end  : uint,         // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
+    pub pfxind_str  : String,       // string representing the prefix indent
+    pub pfxind_len  : uint,         // length of above
+    pub mail_header : bool          // we need to know if this is a mail header because we do word splitting differently in that case
+}
+
+// an iterator producing a stream of paragraphs from a stream of lines
+// given a set of options.
+// NOTE as you iterate through the paragraphs, any NoFormatLines are
+// yielded as Err(text) and should be written to the output immediately, as-is
+pub struct ParagraphStream<'a> {
+    lines     : Peekable<Line, FileLines<'a>>,
+    next_mail : bool,
+    opts      : &'a FmtOptions,
+}
+
+impl<'a> ParagraphStream<'a> {
+    pub fn new<'a>(opts: &'a FmtOptions, reader: &'a mut FileOrStdReader) -> ParagraphStream<'a> {
+        let lines = FileLines::new(opts, reader.lines()).peekable();
+        // at the beginning of the file, we might find mail headers
+        ParagraphStream { lines: lines, next_mail: true, opts: opts }
+    }
+
+    // detect RFC822 mail header
+    fn is_mail_header(line: &FileLine) -> bool {
+        // a mail header begins with either "From " (envelope sender line)
+        // or with a sequence of printable ASCII chars (33 to 126, inclusive,
+        // except colon) followed by a colon.
+        if line.indent_end > 0 {
+            false
+        } else {
+            let lSlice = line.line.as_slice();
+            if lSlice.starts_with("From ") {
+                true
+            } else {
+                let colonPosn =
+                    match lSlice.find(':') {
+                        Some(n) => n,
+                        None => return false
+                    };
+
+                // header field must be nonzero length
+                if colonPosn == 0 { return false; }
+
+                return lSlice.slice_to(colonPosn).chars().all(|x| match x as uint {
+                    y if y < 33 || y > 126 => false,
+                    _ => true
+                });
+            }
+        }
+    }
+}
+
+impl<'a> Iterator<Result<Paragraph, String>> for ParagraphStream<'a> {
+    fn next(&mut self) -> Option<Result<Paragraph, String>> {
+        // return a NoFormatLine in an Err; it should immediately be output
+        let noformat =
+            match self.lines.peek() {
+                None => return None,
+                Some(l) => match l {
+                    &FormatLine(_) => false,
+                    &NoFormatLine(_, _) => true
+                }
+            };
+
+        // found a NoFormatLine, immediately dump it out
+        if noformat {
+            let (s, nm) = self.lines.next().unwrap().get_noformatline();
+            self.next_mail = nm;
+            return Some(Err(s));
+        }
+
+        // found a FormatLine, now build a paragraph
+        let mut init_str = String::new();
+        let mut init_end = 0;
+        let mut init_len = 0;
+        let mut indent_str = String::new();
+        let mut indent_end = 0;
+        let mut indent_len = 0;
+        let mut pfxind_str = String::new();
+        let mut pfxind_len = 0;
+        let mut pLines = Vec::new();
+
+        let mut in_mail = false;
+        let mut second_done = false; // for when we use crown or tagged mode
+        loop {
+            { // peek ahead
+                // need to explicitly force fl out of scope before we can call self.lines.next()
+                let fl =
+                    match self.lines.peek() {
+                        None => break,
+                        Some(l) => {
+                            match l {
+                                &FormatLine(ref x) => x,
+                                &NoFormatLine(..) => break
+                            }
+                        }
+                    };
+
+                if pLines.len() == 0 {
+                    // first time through the loop, get things set up
+                    // detect mail header
+                    if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
+                        in_mail = true;
+                        // there can't be any indent or pfxind because otherwise is_mail_header would fail
+                        // since there cannot be any whitespace before the colon in a valid header field
+                        indent_str.push_str("  ");
+                        indent_len = 2;
+                    } else {
+                        if self.opts.crown || self.opts.tagged {
+                            init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
+                            init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
+                            init_end = fl.indent_end;
+                        }
+
+                        // these will be overwritten in the 2nd line of crown or tagged mode, but
+                        // we are not guaranteed to get to the 2nd line, e.g., if the next line
+                        // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+
+                        // in tagged mode, add 4 spaces of additional indenting by default
+                        // (gnu fmt's behavior is different: it seems to find the closest column to
+                        // indent_end that is divisible by 3. But honestly that behavior seems
+                        // pretty arbitrary.)
+                        // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.
+                        if self.opts.tagged {
+                            indent_str.push_str("    ");
+                            indent_len += 4;
+                        }
+
+                        if self.opts.use_prefix {
+                            pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
+                            pfxind_len = fl.pfxind_len;
+                        }
+                    }
+                } else if in_mail {
+                    // lines following mail headers must begin with spaces
+                    if (self.opts.use_prefix && fl.pfxind_end == 0) || (!self.opts.use_prefix && fl.indent_end == 0) {
+                        break; // this line does not begin with spaces
+                    }
+                } else if !second_done && (self.opts.crown || self.opts.tagged) {
+                    // now we have enough info to handle crown margin and tagged mode
+                    if pfxind_len != fl.pfxind_len {
+                        // in both crown and tagged modes we require that pfxind is the same
+                        break;
+                    } else if self.opts.tagged && (indent_end == fl.indent_end) {
+                        // in tagged mode, indent also has to be different
+                        break;
+                    } else {
+                        // this is part of the same paragraph, get the indent info from this line
+                        indent_str.clear();
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+                    }
+                    second_done = true;
+                } else {
+                    // detect mismatch
+                    if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
+                        break;
+                    }
+                }
+            }
+
+            pLines.push(self.lines.next().unwrap().get_fileline().line);
+
+            // when we're in split-only mode, we never join lines, so stop here
+            if self.opts.split_only {
+                break;
+            }
+        }
+
+        // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
+        // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
+        // NoFormatLine.
+        self.next_mail = in_mail;
+
+        Some(Ok(Paragraph {
+            lines       : pLines,
+            init_str    : init_str,
+            init_len    : init_len,
+            init_end    : init_end,
+            indent_str  : indent_str,
+            indent_len  : indent_len,
+            indent_end  : indent_end,
+            pfxind_str  : pfxind_str,
+            pfxind_len  : pfxind_len,
+            mail_header : in_mail
+        }))
+    }
+}
+
+pub struct ParaWords<'a> {
+    opts  : &'a FmtOptions,
+    para  : &'a Paragraph,
+    words : Vec<&'a str>
+}
+
+impl<'a> ParaWords<'a> {
+    pub fn new<'a>(opts: &'a FmtOptions, para: &'a Paragraph) -> ParaWords<'a> {
+        let mut pw = ParaWords { opts: opts, para: para, words: Vec::new() };
+        pw.create_words();
+        pw
+    }
+
+    fn create_words<'r>(&'r mut self) {
+        if self.para.mail_header {
+            // no extra spacing for mail headers; always exactly 1 space
+            // safe to trim_left on every line of a mail header, since the
+            // first line is guaranteed not to have any spaces
+            self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().trim_left().words()).collect());
+        } else {
+            // first line
+            self.words.push_all_move(
+                if self.opts.crown || self.opts.tagged {
+                    // crown and tagged mode has the "init" in the first line, so slice from there
+                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
+                } else {
+                    // otherwise we slice from the indent
+                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
+                }.collect());
+
+            if self.para.lines.len() > 1 {
+                let indent_end = self.para.indent_end;
+                let uniform = self.opts.uniform;
+                self.words.push_all_move(
+                    self.para.lines.iter().skip(1)
+                                          .flat_map(|x| WordSplit::new(uniform, x.as_slice().slice_from(indent_end)))
+                                          .collect());
+            }
+        }
+    }
+
+    pub fn words(&'a self) -> Items<'a,&'a str> { return self.words.iter() }
+}
+
+struct WordSplit<'a> {
+    uniform  : bool,
+    string   : &'a str,
+    length   : uint,
+    position : uint
+}
+
+impl<'a> WordSplit<'a> {
+    fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
+        // wordsplits *must* start at a non-whitespace character; length describes the trimmed string
+        let trim_string = string.trim_left();
+        WordSplit { uniform: uniform, string: trim_string, length: trim_string.len(), position: 0 }
+    }
+
+    fn is_punctuation(c: char) -> bool {
+        match c {
+            '!' | '.' | '?' => true,
+            _ => false
+        }
+    }
+}
+
+impl<'a> Iterator<&'a str> for WordSplit<'a> {
+    fn next(&mut self) -> Option<&'a str> {
+        if self.position >= self.length {
+            return None
+        }
+
+        let old_position = self.position;
+
+        // find the start of the next whitespace segment
+        let ws_start =
+            match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
+                None => self.length,
+                Some(s) => s + old_position
+            };
+
+        if ws_start == self.length {
+            self.position = self.length;
+            return Some(self.string.slice_from(old_position));
+        }
+
+        // find the end of the next whitespace segment
+        // note that this preserves the invariant that self.position points to
+        // non-whitespace character OR end of string
+        self.position =
+            match self.string.slice_from(ws_start).find(|x: char| !x.is_whitespace()) {
+                None => self.length,
+                Some(s) => s + ws_start
+            };
+
+        let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
+            CharRange { ch, next: _ } if WordSplit::is_punctuation(ch) => self.position - ws_start >= 2,
+            _ => false
+        };
+
+        Some(
+            if self.uniform {
+                // if the last non-whitespace character is a [?!.] and
+                // there are two or more spaces, this is the end of a
+                // sentence, so keep one extra space.
+                if is_sentence_end {
+                    self.string.slice(old_position, ws_start + 1)
+                } else {
+                    self.string.slice(old_position, ws_start)
+                }
+            } else {
+                // in non-uniform mode, we just keep the whole thing
+                // eventually we will want to annotate where the sentence boundaries are
+                // so that we can give preference to splitting lines appropriately
+                self.string.slice(old_position, self.position)
+            }
+        )
+    }
+}