Merge pull request #272 from kwantam/master

implementation of fmt
2025-09-14 19:16:17 +00:00 · 2014-06-18 20:16:14 -07:00 · 2014-06-18 20:16:14 -07:00 · 99b0a11fe0
commit 99b0a11fe0
parent 6039626490 5d2a2b6a0b
5 changed files with 833 additions and 1 deletions
--- a/1
+++ b/1
@ -19,6 +19,7 @@ PROGS       := \
  du \
  factor \
  false \
+  fmt \
  fold \
  md5sum \
  mkdir \
--- a/README.md
+++ b/README.md
@ -134,7 +134,6 @@ To do
 - dircolors
 - expand (in progress)
 - expr
- fmt
 - getlimits
 - install
 - join
--- a/fmt/fmt.rs
+++ b/fmt/fmt.rs
@ -0,0 +1,267 @@
+#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+#![feature(macro_rules)]
+
+extern crate core;
+extern crate getopts;
+extern crate libc;
+
+use std::io::{BufferedReader, BufferedWriter, File, IoResult};
+use std::io::stdio::{stdin_raw, stdout_raw, stdout};
+use std::os;
+use linebreak::break_simple;
+use parasplit::{ParagraphStream, ParaWords};
+
+// Evaluate an expression yielding a Result. If it is an Err, terminate the
+// process immediately with exit status 1 and print nothing; an Ok value is
+// discarded. Used for writes to stdout, where a failure cannot be reported.
+#[macro_export]
+macro_rules! silent_unwrap(
+    ($exp:expr) => (
+        if $exp.is_err() {
+            unsafe { ::libc::exit(1) }
+        }
+    )
+)
+#[path = "../common/util.rs"]
+mod util;
+mod linebreak;
+mod parasplit;
+
+// Program name and version string. Both are printed by uumain when -V or -h
+// is given.
+static NAME: &'static str = "fmt";
+static VERSION: &'static str = "0.0.1";
+
+// Formatting options collected from the command line by uumain and shared
+// (read-only) with the paragraph splitter and word splitter.
+struct FmtOptions {
+    crown           : bool,     // -c: crown-margin mode (cleared again by -s)
+    tagged          : bool,     // -t: tagged-paragraph mode (cleared by -c and by -s)
+    mail            : bool,     // -m: detect and preserve mail headers
+    split_only      : bool,     // -s: split long lines but never join lines
+    use_prefix      : bool,     // true when -p PREFIX was given
+    prefix          : String,   // the PREFIX string from -p (empty when unused)
+    xprefix         : bool,     // -x: PREFIX must match with no preceding whitespace
+    prefix_len      : uint,     // length of `prefix` in characters
+    use_anti_prefix : bool,     // true when -P PSKIP was given
+    anti_prefix     : String,   // the PSKIP string from -P (empty when unused)
+    xanti_prefix    : bool,     // -X: PSKIP must match with no preceding whitespace
+    uniform         : bool,     // -u: uniform spacing
+    width           : uint,     // -w: maximum output line width, default 78
+    goal            : uint,     // -g: goal width, default 72 (derived from width when only -w is given)
+    tabwidth        : uint,     // -T: columns per tab for length calculations, default 8, clamped to >= 1
+}
+
+// Binary entry point: run uumain on the process arguments and publish its
+// return value as the exit status.
+#[allow(dead_code)]
+fn main() {
+    let status = uumain(os::args());
+    os::set_exit_status(status)
+}
+
+// Parse the command line in `args` (args[0] is the program name), build the
+// FmtOptions, then reformat every named input (stdin when no file is given or
+// when the name is "-") to stdout. Returns the value main uses as the exit
+// status; every path visible here returns 0. Invalid options go through
+// crash!, which is defined in ../common/util.rs (not shown here) and
+// presumably exits with the given status.
+fn uumain(args: Vec<String>) -> int {
+
+    let opts = [
+        getopts::optflag("c", "crown-margin", "First and second line of paragraph may have different indentations, in which case the first line's indentation is preserved, and each subsequent line's indentation matches the second line."),
+        getopts::optflag("t", "tagged-paragraph", "Like -c, except that the first and second line of a paragraph *must* have different indentation or they are treated as separate paragraphs."),
+        getopts::optflag("m", "preserve-headers", "Attempt to detect and preserve mail headers in the input. Be careful when combining this flag with -p."),
+        getopts::optflag("s", "split-only", "Split lines only, do not reflow."),
+        getopts::optflag("u", "uniform-spacing", "Insert exactly one space between words, and two between sentences. Sentence breaks in the input are detected as [?!.] followed by two spaces or a newline; other punctuation is not interpreted as a sentence break."),
+
+        getopts::optopt("p", "prefix", "Reformat only lines beginning with PREFIX, reattaching PREFIX to reformatted lines. Unless -x is specified, leading whitespace will be ignored when matching PREFIX.", "PREFIX"),
+        getopts::optopt("P", "skip-prefix", "Do not reformat lines beginning with PSKIP. Unless -X is specified, leading whitespace will be ignored when matching PSKIP", "PSKIP"),
+
+        getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
+        getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
+
+        getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
+        getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
+
+        getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
+
+        getopts::optflag("V", "version", "Output version information and exit."),
+        getopts::optflag("h", "help", "Display this help message and exit.")
+            ];
+
+    let matches = match getopts::getopts(args.tail(), opts.as_slice()) {
+        Ok(m) => m,
+        Err(f) => crash!(1, "{}\nTry `{} --help' for more information.", f, args.get(0))
+    };
+
+    // -h prints the usage text and then falls through to the version line below
+    if matches.opt_present("h") {
+        print_usage(args.get(0).as_slice(), opts.as_slice(), "");
+    }
+
+    if matches.opt_present("V") || matches.opt_present("h") {
+        println!("uutils {} v{}", NAME, VERSION);
+        return 0
+    }
+
+    // defaults; overridden below by whichever options were given
+    let mut fmt_opts = FmtOptions {
+        crown           : false,
+        tagged          : false,
+        mail            : false,
+        uniform         : false,
+        split_only      : false,
+        use_prefix      : false,
+        prefix          : String::new(),
+        xprefix         : false,
+        prefix_len      : 0,
+        use_anti_prefix : false,
+        anti_prefix     : String::new(),
+        xanti_prefix    : false,
+        width           : 78,
+        goal            : 72,
+        tabwidth        : 8,
+    };
+
+    // the order matters: -c overrides -t, and -s overrides both -c and -t
+    if matches.opt_present("t") { fmt_opts.tagged       = true; }
+    if matches.opt_present("c") { fmt_opts.crown        = true; fmt_opts.tagged = false; }
+    if matches.opt_present("m") { fmt_opts.mail         = true; }
+    if matches.opt_present("u") { fmt_opts.uniform      = true; }
+    if matches.opt_present("s") { fmt_opts.split_only   = true; fmt_opts.crown  = false; fmt_opts.tagged = false; }
+    if matches.opt_present("x") { fmt_opts.xprefix      = true; }
+    if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
+
+    match matches.opt_str("p") {
+        Some(s) => {
+            fmt_opts.prefix = s;
+            fmt_opts.use_prefix = true;
+            // prefix length is counted in characters, not bytes
+            fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len()
+        }
+        None => ()
+    };
+
+    match matches.opt_str("P") {
+        Some(s) => {
+            fmt_opts.anti_prefix = s;
+            fmt_opts.use_anti_prefix = true;
+        }
+        None => ()
+    };
+
+    match matches.opt_str("w") {
+        Some(s) => {
+            fmt_opts.width =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
+                };
+            // default goal for an explicit width: the smaller of 92% of width and width - 4
+            fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);
+        }
+        None => ()
+    };
+
+    match matches.opt_str("g") {
+        Some(s) => {
+            fmt_opts.goal =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid GOAL specification: `{}'", s); }
+                };
+            if !matches.opt_present("w") {
+                // only a goal was given: derive the width from it (inverse of the rule above)
+                fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);
+            } else if fmt_opts.goal > fmt_opts.width {
+                crash!(1, "GOAL cannot be greater than WIDTH.");
+            }
+        }
+        None => ()
+    };
+
+    match matches.opt_str("T") {
+        Some(s) => {
+            fmt_opts.tabwidth =
+                match from_str(s.as_slice()) {
+                    Some(t) => t,
+                    None => { crash!(1, "Invalid TABWIDTH specification: `{}'", s); }
+                };
+        }
+        None => ()
+    };
+
+    // a tab width of 0 is clamped to 1 (displayed_length computes tabwidth - 1)
+    if fmt_opts.tabwidth < 1 {
+        fmt_opts.tabwidth = 1;
+    }
+
+    // immutable now
+    let fmt_opts = fmt_opts;
+
+    // remaining free arguments are input file names; default to stdin
+    let mut files = matches.free;
+    if files.is_empty() {
+        files.push("-".to_string());
+    }
+
+    let mut ostream = box BufferedWriter::new(stdout_raw()) as Box<Writer>;
+
+    for i in files.iter().map(|x| x.as_slice()) {
+        // a file that cannot be opened is reported and skipped; processing continues
+        let mut fp =
+            match open_file(i) {
+                Err(e) => {
+                    show_warning!("{}: {}",i,e);
+                    continue;
+                }
+                Ok(f) => f
+            };
+        let mut pStream = ParagraphStream::new(&fmt_opts, &mut fp);
+        for paraResult in pStream {
+            match paraResult {
+                // Err carries a line that must not be formatted: write it out verbatim
+                Err(s) => silent_unwrap!(ostream.write(s.as_bytes())),
+                Ok(para) => {
+                    // indent
+                    let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
+                    let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
+
+                    // words
+                    let pWords = ParaWords::new(&fmt_opts, &para);
+                    let mut pWords_words = pWords.words().map(|&x| x);
+
+                    // print the init, if it exists, and get its length
+                    let pInitLen =
+                        if fmt_opts.crown || fmt_opts.tagged {
+                            // handle "init" portion
+                            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
+                            para.init_len
+                        } else if !para.mail_header {
+                            // for non-(crown, tagged) that's the same as a normal indent
+                            silent_unwrap!(ostream.write(pIndent.as_bytes()));
+                            pIndentLen
+                        } else {
+                            // except that mail headers get no indent at all
+                            0
+                        };
+
+                    // does this paragraph require uniform spacing?
+                    let uniform = para.mail_header || fmt_opts.uniform;
+
+                    break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);
+                    silent_unwrap!(ostream.write("\n".as_bytes()));
+                }
+            }
+        }
+
+        // flush the output after each file
+        silent_unwrap!(ostream.flush());
+    }
+
+    0
+}
+
+// Print the help text: the short usage synopsis wrapped at 64 columns with a
+// 7-space continuation indent, then the full option descriptions followed by
+// `errmsg`.
+fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
+    let synopsis = getopts::short_usage(arg0, opts);
+    {
+        // scoped so that the writer is dropped before the println! below,
+        // exactly as a statement-level temporary would be
+        let mut out = box stdout() as Box<Writer>;
+        let mut synopsis_words = synopsis.as_slice().words();
+        break_simple(&mut synopsis_words, 64, "       ", 7, 0, true, &mut out);
+    }
+    let details = getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts);
+    println!("\n\n{}{}", details, errmsg);
+}
+
+// A single reader type for both regular files and stdin. Seeking is never
+// needed, so both are hidden behind a boxed Reader and buffered.
+type FileOrStdReader = BufferedReader<Box<Reader>>;
+
+// Open `filename` for buffered reading; the name "-" selects stdin. An error
+// from opening a file is returned to the caller unchanged.
+fn open_file(filename: &str) -> IoResult<FileOrStdReader> {
+    let source =
+        if filename == "-" {
+            box stdin_raw() as Box<Reader>
+        } else {
+            box try!(File::open(&Path::new(filename))) as Box<Reader>
+        };
+    Ok(BufferedReader::new(source))
+}
--- a/fmt/linebreak.rs
+++ b/fmt/linebreak.rs
@ -0,0 +1,33 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+// break_simple is the "tight" line breaker: words are written one after the
+// other, and whenever the next word would push the line past maxlen a newline
+// and indent_str are written first. The caller must already have written any
+// first-line indent and pass its length as init_len. The return value is the
+// column count tracked after the last word.
+pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
+    let mut column = init_len;
+    loop {
+        match s.next() {
+            Some(word) => { column = accum_words_simple(maxlen, indent_len, indent_str, &mut *ostream, uniform, column, word); }
+            None => break
+        }
+    }
+    column
+}
+
+// Write one word `w` to `ostream`, first breaking the line (newline followed
+// by `indent_str`) if the word would not fit within `maxlen` columns.
+//
+//   l          - columns already used on the current line
+//   indent_len - columns occupied by `indent_str`
+//   uniform    - when true a single separating space is written after the word;
+//                otherwise the word is expected to carry its own trailing
+//                whitespace and nothing is appended
+//
+// Returns the number of columns used on the current line after the write.
+fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
+    // measure in characters rather than bytes, consistent with how prefix and
+    // indent lengths are computed elsewhere in the program
+    let wlen = w.char_len();
+    let lnew =
+        if l + wlen > maxlen {
+            silent_unwrap!(ostream.write("\n".as_bytes()));
+            silent_unwrap!(ostream.write(indent_str.as_bytes()));
+            indent_len
+        } else {
+            l
+        };
+
+    silent_unwrap!(ostream.write(w.as_bytes()));
+    if uniform {
+        // only account for the separator when one is actually written
+        silent_unwrap!(ostream.write(" ".as_bytes()));
+        lnew + wlen + 1
+    } else {
+        lnew + wlen
+    }
+}
--- a/fmt/parasplit.rs
+++ b/fmt/parasplit.rs
@ -0,0 +1,532 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam <kwantam@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+use core::iter::Peekable;
+use std::io::Lines;
+use std::slice::Items;
+use std::str::CharRange;
+use FileOrStdReader;
+use FmtOptions;
+
+// lines with PSKIP, lacking PREFIX, or which are entirely blank are
+// NoFormatLines; otherwise, they are FormatLines.
+// A NoFormatLine carries the text to emit verbatim and a flag that is true
+// when the line was blank (FileLines emits "\n" with true for those); the flag
+// is used by ParagraphStream to decide whether a mail header may follow.
+#[deriving(Show)]
+enum Line {
+    FormatLine(FileLine),
+    NoFormatLine(String, bool)
+}
+
+impl Line {
+    // Unwrap a FormatLine into its FileLine. Only for call sites (such as the
+    // ParagraphStream iterator) that have already checked the variant; fails
+    // otherwise.
+    fn get_fileline(self) -> FileLine {
+        match self {
+            NoFormatLine(..) => fail!("Found NoFormatLine when expecting FormatLine"),
+            FormatLine(file_line) => file_line
+        }
+    }
+
+    // Unwrap a NoFormatLine into its (text, blank-line flag) pair. Only for
+    // call sites that have already checked the variant; fails otherwise.
+    fn get_noformatline(self) -> (String, bool) {
+        match self {
+            FormatLine(..) => fail!("Found FormatLine when expecting NoFormatLine"),
+            NoFormatLine(text, was_blank) => (text, was_blank)
+        }
+    }
+}
+
+// each line's prefix has to be considered to know whether to merge it with
+// the next line or not.
+// The *_end fields are byte offsets into `line`; the *_len fields are display
+// widths as computed by FileLines::displayed_length.
+#[deriving(Show)]
+struct FileLine {
+    line       : String,
+    indent_end : uint,     // the end of the indent, always the start of the text
+    prefix_end : uint,     // the end of the PREFIX
+    pfxind_end : uint,     // the end of the PREFIX's indent, that is, the spaces before the prefix
+    indent_len : uint,     // display length of indent taking into account TABWIDTH
+    pfxind_len : uint,     // PREFIX indent length taking into account TABWIDTH
+}
+
+// iterator that produces a stream of Lines from a file: each input line is
+// classified as a FormatLine or a NoFormatLine according to `opts`
+struct FileLines<'a> {
+    opts  : &'a FmtOptions,                 // formatting options (prefix, skip-prefix, tab width, ...)
+    lines : Lines<'a, FileOrStdReader>,     // underlying line reader
+}
+
+impl<'a> FileLines<'a> {
+    // Wrap a line reader together with the options used to classify lines.
+    fn new<'a>(opts: &'a FmtOptions, lines: Lines<'a, FileOrStdReader>) -> FileLines<'a> {
+        FileLines { opts: opts, lines: lines }
+    }
+
+    // Check the line against PREFIX. The bool is true if the line should be
+    // formatted (always true when no PREFIX is in use); the uint is the byte
+    // offset at which PREFIX starts in the line.
+    fn match_prefix(&self, line: &str) -> (bool, uint) {
+        if self.opts.use_prefix {
+            FileLines::match_prefix_generic(self.opts.prefix.as_slice(), line, self.opts.xprefix)
+        } else {
+            (true, 0u)
+        }
+    }
+
+    // Check the line against PSKIP. Returns true if the line should be
+    // formatted, that is, when PSKIP is unused or does NOT match.
+    fn match_anti_prefix(&self, line: &str) -> bool {
+        if self.opts.use_anti_prefix {
+            let (skip, _) = FileLines::match_prefix_generic(self.opts.anti_prefix.as_slice(), line, self.opts.xanti_prefix);
+            !skip
+        } else {
+            true
+        }
+    }
+
+    // Shared matcher: does `line` begin with `pfx`? Unless `exact` is set,
+    // leading whitespace in `line` may be skipped before the match. Returns
+    // (matched, byte offset of the match); the offset is 0 when nothing matched.
+    fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, uint) {
+        if line.starts_with(pfx) {
+            return (true, 0);
+        }
+        if exact {
+            return (false, 0);
+        }
+
+        // step over leading whitespace one character (not byte) at a time so
+        // that unicode whitespace is supported, testing after each step
+        let mut offset = 0u;
+        while offset < line.len() {
+            let CharRange { ch, next } = line.char_range_at(offset);
+            if !ch.is_whitespace() {
+                break;
+            }
+            offset = next;
+            if line.slice_from(offset).starts_with(pfx) {
+                return (true, offset);
+            }
+        }
+
+        (false, 0)
+    }
+
+    // Width of `s` in columns: one per character, with every tab counted as
+    // TABWIDTH columns instead of one.
+    fn displayed_length(&self, s: &str) -> uint {
+        let tabs = s.chars().filter(|c| *c == '\t').count();
+        let extra_per_tab = self.opts.tabwidth - 1;
+        s.char_len() + extra_per_tab * tabs
+    }
+}
+
+impl<'a> Iterator<Line> for FileLines<'a> {
+    // Read the next input line and classify it.
+    //
+    // Returns None at end of input (a read error is treated the same way).
+    // Blank lines, lines that do not match PREFIX and lines that match PSKIP
+    // come back as NoFormatLine; everything else comes back as a FormatLine
+    // with its trailing newline turned into a space and its indent/prefix
+    // boundaries measured.
+    fn next(&mut self) -> Option<Line> {
+        let mut n =
+            match self.lines.next() {
+                Some(t) => match t {
+                    Ok(tt) => tt,
+                    Err(_) => return None
+                },
+                None => return None
+            };
+
+        // if this line is entirely whitespace,
+        // emit a blank line
+        // NoFormatLine(_, true) indicates that this was a linebreak,
+        // which is important to know when detecting mail headers
+        if n.as_slice().is_whitespace() {
+            return Some(NoFormatLine("\n".to_string(), true));
+        }
+
+        // if this line does not match the prefix,
+        // emit the line unprocessed and iterate again
+        let (pmatch, poffset) = self.match_prefix(n.as_slice());
+        if !pmatch {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // if this line matches the anti_prefix
+        // (NOTE definition of match_anti_prefix is TRUE if we should process)
+        if !self.match_anti_prefix(n.as_slice()) {
+            return Some(NoFormatLine(n, false));
+        }
+
+        // replace trailing newline, if any, with space
+        let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
+        if ch == '\n' {
+            // SAFETY: '\n' is a single ASCII byte at index i and is replaced by
+            // another single ASCII byte, so the string stays valid UTF-8
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                nmut[i] = ' ' as u8;
+            }
+            if i > 0 {
+                // a line ending in sentence punctuation gets a second space so
+                // that the word splitter sees a sentence break; all of [?!.]
+                // are handled, matching WordSplit::is_punctuation and the -u help
+                let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
+                if ch == '.' || ch == '!' || ch == '?' {
+                    n.push_char(' ');
+                }
+            }
+        }
+
+        let nLen = n.len();
+        // figure out the indent, prefix, and prefixindent ending points
+        let (indEnd, pfxEnd, pfxIndEnd) =
+            if self.opts.use_prefix {
+                let pfxEnd = poffset + self.opts.prefix.len();
+                let nSlice = n.as_slice().slice_from(pfxEnd);
+                let nSlice2 = nSlice.trim_left();
+                (pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
+            } else {
+                let nSlice = n.as_slice().trim_left();
+                (nLen - nSlice.len(), 0, 0)
+            };
+
+        // indent length
+        let indLen =
+            if indEnd > 0 {
+                self.displayed_length(n.as_slice().slice(pfxEnd, indEnd))
+            } else {
+                0
+            };
+
+        // prefix indent length
+        let pfxIndLen =
+            if pfxIndEnd > 0 {
+                self.displayed_length(n.as_slice().slice_to(pfxIndEnd))
+            } else {
+                0
+            };
+
+        // if we are in uniform mode, all tabs after the indent should be replaced by spaces.
+        // NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
+        // [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
+        // sentence ending
+        if self.opts.uniform {
+            // char_indices() on the sub-slice yields offsets relative to indEnd,
+            // so shift them back to absolute byte positions within the line
+            let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i, c)| if c == '\t' { Some(i + indEnd) } else { None }).collect();
+            // SAFETY: every index points at a '\t' byte, which is replaced by
+            // another single ASCII byte, so the string stays valid UTF-8
+            unsafe {
+                let nmut = n.as_mut_bytes();
+                for i in tabinds.iter() {
+                    nmut[*i] = ' ' as u8;
+                }
+            }
+        }
+
+        Some(FormatLine(FileLine {
+            line       : n,
+            indent_end : indEnd,
+            prefix_end : pfxEnd,
+            pfxind_end : pfxIndEnd,
+            indent_len : indLen,
+            pfxind_len : pfxIndLen,
+        }))
+    }
+}
+
+// a paragraph : a collection of FileLines that are to be formatted
+// plus info about the paragraph's indentation
+// (but we only retain the String from the FileLine; the other info
+// is only there to help us in deciding how to merge lines into Paragraphs)
+#[deriving(Show)]
+pub struct Paragraph {
+    lines           : Vec<String>,  // the lines of the file
+    pub init_str    : String,       // string representing the init, that is, the first line's indent
+    pub init_len    : uint,         // printable length of the init string considering TABWIDTH
+    init_end        : uint,         // byte location of end of init in first line String
+    pub indent_str  : String,       // string representing indent
+    pub indent_len  : uint,         // length of above
+    indent_end      : uint,         // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
+    pub pfxind_str  : String,       // string representing the prefix indent
+    pub pfxind_len  : uint,         // length of above
+    pub mail_header : bool          // we need to know if this is a mail header because we do word splitting differently in that case
+}
+
+// an iterator producing a stream of paragraphs from a stream of lines
+// given a set of options.
+// NOTE as you iterate through the paragraphs, any NoFormatLines are yielded
+// as Err(text); the consumer is expected to write that text out unchanged.
+pub struct ParagraphStream<'a> {
+    lines     : Peekable<Line,FileLines<'a>>,   // classified input lines, with one line of lookahead
+    next_mail : bool,                           // true when the next paragraph may be a mail header
+    opts      : &'a FmtOptions,                 // formatting options
+}
+
+impl<'a> ParagraphStream<'a> {
+    // Build a paragraph stream over the lines of `reader`. Mail headers may
+    // appear at the very start of a file, so next_mail starts out true.
+    pub fn new<'a>(opts: &'a FmtOptions, reader: &'a mut FileOrStdReader) -> ParagraphStream<'a> {
+        ParagraphStream {
+            lines: FileLines::new(opts, reader.lines()).peekable(),
+            next_mail: true,
+            opts: opts
+        }
+    }
+
+    // Detect an RFC822 mail header line: an unindented line that either starts
+    // with "From " (envelope sender) or has a non-empty field name made only
+    // of printable ASCII (codes 33 to 126) in front of its first colon.
+    fn is_mail_header(line: &FileLine) -> bool {
+        // any indent rules the line out
+        if line.indent_end > 0 {
+            return false;
+        }
+
+        let text = line.line.as_slice();
+        if text.starts_with("From ") {
+            return true;
+        }
+
+        match text.find(':') {
+            // no colon at all, or an empty field name
+            None | Some(0) => false,
+            Some(colon) => text.slice_to(colon).chars().all(|c| {
+                let code = c as uint;
+                code >= 33 && code <= 126
+            })
+        }
+    }
+}
+
+impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
+    // Produce the next item of the stream:
+    //   None          - the input is exhausted
+    //   Some(Err(s))  - a NoFormatLine whose text `s` must be emitted unchanged
+    //   Some(Ok(p))   - a paragraph assembled from consecutive FormatLines that
+    //                   share compatible prefix/indent settings
+    fn next(&mut self) -> Option<Result<Paragraph,String>> {
+        // return a NoFormatLine in an Err; it should immediately be output
+        let noformat =
+            match self.lines.peek() {
+                None => return None,
+                Some(l) => match l {
+                    &FormatLine(_) => false,
+                    &NoFormatLine(_, _) => true
+                }
+            };
+
+        // found a NoFormatLine, immediately dump it out
+        if noformat {
+            let (s, nm) = self.lines.next().unwrap().get_noformatline();
+            // the flag is true only for blank lines, after which a mail header may start
+            self.next_mail = nm;
+            return Some(Err(s));
+        }
+
+        // found a FormatLine, now build a paragraph
+        let mut init_str = String::new();
+        let mut init_end = 0;
+        let mut init_len = 0;
+        let mut indent_str = String::new();
+        let mut indent_end = 0;
+        let mut indent_len = 0;
+        let mut pfxind_str = String::new();
+        let mut pfxind_len = 0;
+        let mut pLines = Vec::new();
+
+        let mut in_mail = false;
+        let mut second_done = false;    // for when we use crown or tagged mode
+        loop {
+            {   // peek ahead
+            // need to explicitly force fl out of scope before we can call self.lines.next()
+                let fl =
+                    match self.lines.peek() {
+                        None => break,
+                        Some(l) => {
+                            match l {
+                                &FormatLine(ref x) => x,
+                                &NoFormatLine(..) => break
+                            }
+                        }
+                    };
+
+                if pLines.len() == 0 {
+                    // first time through the loop, get things set up
+                    // detect mail header
+                    if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
+                        in_mail = true;
+                        // there can't be any indent or pfxind because otherwise is_mail_header would fail
+                        // since there cannot be any whitespace before the colon in a valid header field
+                        indent_str.push_str("  ");
+                        indent_len = 2;
+                    } else {
+                        if self.opts.crown || self.opts.tagged {
+                            init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
+                            init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
+                            init_end = fl.indent_end;
+                        } 
+
+                        // these will be overwritten in the 2nd line of crown or tagged mode, but
+                        // we are not guaranteed to get to the 2nd line, e.g., if the next line
+                        // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+
+                        // in tagged mode, add 4 spaces of additional indenting by default
+                        // (gnu fmt's behavior is different: it seems to find the closest column to
+                        // indent_end that is divisible by 3. But honestly that behavior seems
+                        // pretty arbitrary.
+                        // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.)
+                        if self.opts.tagged {
+                            indent_str.push_str("    ");
+                            indent_len += 4;
+                        }
+
+                        if self.opts.use_prefix {
+                            pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
+                            pfxind_len = fl.pfxind_len;
+                        }
+                    }
+                } else if in_mail {
+                    // lines following mail headers must begin with spaces
+                    if (self.opts.use_prefix && fl.pfxind_end == 0) || (!self.opts.use_prefix && fl.indent_end == 0) {
+                        break;  // this line does not begin with spaces
+                    }
+                } else if !second_done && (self.opts.crown || self.opts.tagged) {
+                    // now we have enough info to handle crown margin and tagged mode
+                    if pfxind_len != fl.pfxind_len {
+                        // in both crown and tagged modes we require that pfxind is the same
+                        break;
+                    } else if self.opts.tagged && (indent_end == fl.indent_end) {
+                        // in tagged mode, indent also has to be different
+                        break;
+                    } else {
+                        // this is part of the same paragraph, get the indent info from this line
+                        indent_str.clear();
+                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_len = fl.indent_len;
+                        indent_end = fl.indent_end;
+                    }
+                    second_done = true;
+                } else {
+                    // detect mismatch: a line whose indent or prefix indent differs starts a new paragraph
+                    if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
+                        break;
+                    }
+                }
+            }
+
+            // the peeked line belongs to this paragraph: consume it, keeping only its text
+            pLines.push(self.lines.next().unwrap().get_fileline().line);
+
+            // when we're in split-only mode, we never join lines, so stop here
+            if self.opts.split_only {
+                break;
+            }
+        }
+
+        // if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
+        // NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
+        // NoFormatLine.
+        self.next_mail = in_mail;
+
+        Some(Ok(Paragraph {
+            lines       : pLines,
+            init_str    : init_str,
+            init_len    : init_len,
+            init_end    : init_end,
+            indent_str  : indent_str,
+            indent_len  : indent_len,
+            indent_end  : indent_end,
+            pfxind_str  : pfxind_str,
+            pfxind_len  : pfxind_len,
+            mail_header : in_mail
+        }))
+    }
+}
+
+// The words of one Paragraph, split according to the formatting options.
+// The word slices borrow from the paragraph's lines.
+pub struct ParaWords<'a> {
+    opts  : &'a FmtOptions,     // formatting options (crown/tagged/uniform are consulted)
+    para  : &'a Paragraph,      // the paragraph being split
+    words : Vec<&'a str>        // the resulting words, in order
+}
+
+impl<'a> ParaWords<'a> {
+    // Split `para` into words right away; the result is available via words().
+    pub fn new<'a>(opts: &'a FmtOptions, para: &'a Paragraph) -> ParaWords<'a> {
+        let mut pw = ParaWords { opts: opts, para: para, words: Vec::new() };
+        pw.create_words();
+        pw
+    }
+
+    // Fill self.words from the paragraph's lines. Mail headers are split on
+    // plain whitespace; all other paragraphs go through WordSplit, skipping
+    // the init (first line in crown/tagged mode) or the indent of each line.
+    fn create_words<'r>(&'r mut self) {
+        if self.para.mail_header {
+            // no extra spacing for mail headers; always exactly 1 space
+            // safe to trim_left on every line of a mail header, since the
+            // first line is guaranteed not to have any spaces
+            self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().trim_left().words()).collect());
+        } else {
+            // first line
+            self.words.push_all_move(
+                if self.opts.crown || self.opts.tagged {
+                    // crown and tagged mode has the "init" in the first line, so slice from there
+                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
+                } else {
+                    // otherwise we slice from the indent
+                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
+                }.collect());
+
+            // remaining lines all share the paragraph's indent
+            if self.para.lines.len() > 1 {
+                let indent_end = self.para.indent_end;
+                let uniform = self.opts.uniform;
+                self.words.push_all_move(
+                    self.para.lines.iter().skip(1)
+                    .flat_map(|x| WordSplit::new(uniform, x.as_slice().slice_from(indent_end)))
+                    .collect());
+            }
+        }
+    }
+
+    // Iterate over the words in paragraph order.
+    pub fn words(&'a self) -> Items<'a,&'a str> { return self.words.iter() }
+}
+
+// Iterator that splits one line of text into words. In uniform mode the
+// whitespace after a word is dropped (except one space after a sentence end);
+// otherwise each word keeps the whitespace that follows it.
+struct WordSplit<'a> {
+    uniform  : bool,        // uniform-spacing mode
+    string   : &'a str,     // the text being split, with leading whitespace removed
+    length   : uint,        // byte length used as the end-of-text marker
+    position : uint         // byte offset of the next word (or the end marker)
+}
+
+impl<'a> WordSplit<'a> {
+    // Create a splitter over `string`. Leading whitespace is removed first so
+    // that the splitter always starts at a non-whitespace character.
+    fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
+        // wordsplits *must* start at a non-whitespace character
+        let trim_string = string.trim_left();
+        // the length must describe the stored (trimmed) slice: every offset
+        // the iterator computes is relative to it, and using the untrimmed
+        // length would let `position` run past the end of `string`
+        WordSplit { uniform: uniform, string: trim_string, length: trim_string.len(), position: 0 }
+    }
+
+    // True for the characters that can end a sentence: '!', '.' and '?'.
+    fn is_punctuation(c: char) -> bool {
+        match c {
+            '!' | '.' | '?' => true,
+            _ => false
+        }
+    }
+}
+
+impl<'a> Iterator<&'a str> for WordSplit<'a> {
+    // Return the next word, or None once the text is exhausted.
+    //
+    // In uniform mode the word is returned without trailing whitespace, except
+    // that a word ending a sentence keeps exactly one whitespace character. In
+    // non-uniform mode the word is returned together with all the whitespace
+    // that follows it.
+    fn next(&mut self) -> Option<&'a str> {
+        if self.position >= self.length {
+            return None
+        }
+
+        let old_position = self.position;
+
+        // find the start of the next whitespace segment
+        let ws_start =
+            match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
+                None => self.length,
+                Some(s) => s + old_position
+            };
+
+        // no whitespace left: the rest of the text is the final word
+        if ws_start == self.length {
+            self.position = self.length;
+            return Some(self.string.slice_from(old_position));
+        }
+
+        // find the end of the next whitespace segment
+        // note that this preserves the invariant that self.position points to
+        // non-whitespace character OR end of string
+        self.position =
+            match self.string.slice_from(ws_start).find(|x: char| !x.is_whitespace()) {
+                None => self.length,
+                Some(s) => s + ws_start
+            };
+
+        // a sentence ends at [?!.] followed by two or more whitespace bytes
+        // (a newline after punctuation has already been padded to two spaces
+        // by the line reader)
+        let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
+            CharRange { ch, next: _ } if WordSplit::is_punctuation(ch) => self.position - ws_start >= 2,
+            _ => false
+        };
+
+        Some(
+            if self.uniform {
+                // if the last non-whitespace character is a [?!.] and
+                // there are two or more spaces, this is the end of a
+                // sentence, so keep one extra space.
+                if is_sentence_end {
+                    // step over one whole whitespace character so that the
+                    // slice always ends on a character boundary
+                    let CharRange { ch: _, next: ws_next } = self.string.char_range_at(ws_start);
+                    self.string.slice(old_position, ws_next)
+                } else {
+                    self.string.slice(old_position, ws_start)
+                }
+            } else {
+                // in non-uniform mode, we just keep the whole thing
+                // eventually we will want to annotate where the sentence boundaries are
+                // so that we can give preference to splitting lines appropriately
+                self.string.slice(old_position, self.position)
+            }
+        )
+    }
+}