diff --git a/cut/buffer.rs b/cut/buffer.rs
new file mode 100644
index 000000000..9fe691c3e
--- /dev/null
+++ b/cut/buffer.rs
@@ -0,0 +1,160 @@
+//! Fixed-size read buffer that hands out the bytes of a line in segments.
+
+use std;
+use std::io::{IoResult, IoError};
+
+/// Reader wrapper with a 4096-byte buffer; `start..end` is the unread data.
+pub struct BufReader<R> {
+    reader: R,
+    buffer: [u8, ..4096],
+    start: uint,
+    end: uint,  // exclusive
+}
+
+pub mod Bytes {
+    /// Source of byte segments that never run past a newline.
+    pub trait Select {
+        /// Returns the next segment: at most `bytes` bytes, plus the line's
+        /// `\n` when it lies within or directly after them.
+        fn select<'a>(&'a mut self, bytes: uint) -> Selected<'a>;
+    }
+
+    /// Outcome of one `Select::select` call.
+    pub enum Selected<'a> {
+        /// The segment ends with `\n` (included); the line is finished.
+        NewlineFound(&'a [u8]),
+        /// Exactly the requested bytes; no newline in or right after them.
+        Complete(&'a [u8]),
+        /// All buffered bytes, none a newline; the request is not met yet.
+        Partial(&'a [u8]),
+        /// Nothing is buffered and the reader supplied no more data.
+        EndOfFile,
+    }
+}
+
+impl<R: Reader> BufReader<R> {
+    /// Creates an empty buffer around `reader`; nothing is read yet.
+    pub fn new(reader: R) -> BufReader<R> {
+        BufReader {
+            reader: reader,
+            // Zero-filled rather than `mem::uninitialized`: needs no `unsafe`
+            // and never exposes uninitialized memory to the wrapped reader.
+            buffer: [0u8, ..4096],
+            start: 0,
+            end: 0,
+        }
+    }
+
+    /// Reads into the unused tail of the buffer and advances `end`.
+    fn read(&mut self) -> IoResult<uint> {
+        let buf_len = self.buffer.len();
+        let buffer_fill = self.buffer.mut_slice(self.end, buf_len);
+
+        match self.reader.read(buffer_fill) {
+            Ok(nread) => {
+                self.end += nread;
+                Ok(nread)
+            }
+            error => error
+        }
+    }
+
+    /// Rewinds an exhausted buffer, then reads more only while `end` is
+    /// still within the first half of the buffer (`<= 2048`).
+    #[inline]
+    fn maybe_fill_buf(&mut self) -> IoResult<uint> {
+        if self.end == self.start {
+            self.start = 0;
+            self.end = 0;
+        }
+
+        if self.end <= 2048 { self.read() } else { Ok(0) }
+    }
+
+    /// Discards input through the next `\n` (or to end of input) and
+    /// returns the number of bytes discarded.
+    ///
+    /// Fails the task on any read error other than end of file.
+    pub fn consume_line(&mut self) -> uint {
+        let mut bytes_consumed = 0;
+
+        loop {
+            match self.maybe_fill_buf() {
+                Err(IoError { kind: std::io::EndOfFile, .. }) => (),
+                Err(err) => fail!("read error: {}", err.desc),
+                _ => ()
+            }
+
+            let buffer_used = self.end - self.start;
+
+            if buffer_used == 0 { return bytes_consumed; }
+
+            for idx in range(self.start, self.end) {
+                if self.buffer[idx] == b'\n' {
+                    // `idx` is an absolute buffer index, so count from the
+                    // old `start`; adding `idx + 1` over-counted by `start`.
+                    bytes_consumed += idx + 1 - self.start;
+                    self.start = idx + 1;
+                    return bytes_consumed;
+                }
+            }
+
+            bytes_consumed += buffer_used;
+
+            self.start = 0;
+            self.end = 0;
+        }
+    }
+}
+
+impl<R: Reader> Bytes::Select for BufReader<R> {
+    /// Serves the next segment of the current line from the buffer,
+    /// refilling it first when needed.
+    ///
+    /// Fails the task on any read error other than end of file.
+    fn select<'a>(&'a mut self, bytes: uint) -> Bytes::Selected<'a> {
+        match self.maybe_fill_buf() {
+            Err(IoError { kind: std::io::EndOfFile, .. }) => (),
+            Err(err) => fail!("read error: {}", err.desc),
+            _ => ()
+        }
+
+        let buffer_used = self.end - self.start;
+
+        if buffer_used == 0 { return Bytes::EndOfFile; }
+
+        // Scan one byte past the request so that a newline directly after
+        // the requested bytes still terminates the line.
+        let (complete, max_segment_len) = {
+            if bytes < buffer_used {
+                (true, bytes + 1)
+            } else {
+                (false, buffer_used)
+            }
+        };
+
+        for idx in range(self.start, self.start + max_segment_len) {
+            if self.buffer[idx] == b'\n' {
+                let segment = self.buffer.slice(self.start, idx + 1);
+
+                self.start = idx + 1;
+
+                return Bytes::NewlineFound(segment);
+            }
+        }
+
+        if complete {
+            let segment = self.buffer.slice(self.start,
+                                            self.start + bytes);
+
+            self.start += bytes;
+            Bytes::Complete(segment)
+        } else {
+            let segment = self.buffer.slice(self.start, self.end);
+
+            self.start = 0;
+            self.end = 0;
+            Bytes::Partial(segment)
+        }
+    }
+}
diff --git a/cut/cut.rs b/cut/cut.rs
index d53e30f00..dce917473 100644
--- a/cut/cut.rs
+++ b/cut/cut.rs
@@ -14,7 +14,7 @@
 extern crate getopts;
 extern crate libc;
 
-use std::io::{File, BufferedWriter, BufferedReader, stdin, print};
+use std::io::{stdio, File, BufferedWriter, BufferedReader, print};
 use getopts::{optopt, optflag, getopts, usage};
 
 use ranges::Range;
@@ -22,6 +22,7 @@ use ranges::Range;
 #[path = "../common/util.rs"]
 mod util;
 mod ranges;
+mod buffer;
 
 static NAME: &'static str = "cut";
 static VERSION: &'static str = "1.0.0";
@@ -50,67 +51,98 @@ fn list_to_ranges(list: &str, complement: bool) -> Result, String> {
     }
 }
 
-fn cut_bytes(mut reader: BufferedReader,
+/// Writes the selected byte ranges of every input line to stdout.
+///
+/// Lines are streamed through `buffer::BufReader` in segments rather than
+/// read whole. Always returns 0; write errors fail the task via `unwrap`.
+fn cut_bytes<R: Reader>(reader: R,
                         ranges: &Vec<Range>,
                         opts: &Options) -> int {
-    let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
-    let (use_delim, out_delim) = match opts.out_delim.clone() {
-        Some(delim) => (true, delim),
-        None => (false, "".to_string())
-    };
+    use buffer::Bytes::{Select, NewlineFound, Complete, Partial, EndOfFile};
+
+    let mut buf_read = buffer::BufReader::new(reader);
+    let mut out = BufferedWriter::new(stdio::stdout_raw());
 
     'newline: loop {
-        let line = match reader.read_until(b'\n') {
-            Ok(line) => line,
-            Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
-            _ => fail!(),
-        };
-
-        let line_len = line.len();
+        let mut cur_pos = 1;  // 1-based position of the next unread byte
         let mut print_delim = false;
 
         for &Range { low: low, high: high } in ranges.iter() {
-            if low > line_len { break; }
+            // skip up to low
+            let orig_pos = cur_pos;
+            loop {
+                match buf_read.select(low - cur_pos) {
+                    NewlineFound(_) => {
+                        out.write(&[b'\n']).unwrap();
+                        continue 'newline
+                    }
+                    Complete(bytes) => {
+                        cur_pos += bytes.len();
+                        break
+                    }
+                    Partial(bytes) => cur_pos += bytes.len(),
+                    EndOfFile => {
+                        if orig_pos != cur_pos {
+                            out.write(&[b'\n']).unwrap();
+                        }
 
-            if use_delim {
-                if print_delim {
-                    out.write_str(out_delim.as_slice()).unwrap();
+                        break 'newline
+                    }
                 }
-                print_delim = true;
             }
 
-            if high >= line_len {
-                let segment = line.slice(low - 1, line_len);
-
-                out.write(segment).unwrap();
-
-                if *line.get(line_len - 1) == b'\n' {
-                    continue 'newline
+            match opts.out_delim {
+                Some(ref delim) => {
+                    if print_delim {
+                        out.write(delim.as_bytes()).unwrap();
+                    }
+                    print_delim = true;
                 }
-            } else {
-                let segment = line.slice(low - 1, high);
+                None => ()
+            }
 
-                out.write(segment).unwrap();
+            // write out from low to high
+            loop {
+                match buf_read.select(high - cur_pos + 1) {
+                    NewlineFound(bytes) => {
+                        out.write(bytes).unwrap();
+                        continue 'newline
+                    }
+                    Complete(bytes) => {
+                        out.write(bytes).unwrap();
+                        cur_pos = high + 1;
+                        break
+                    }
+                    Partial(bytes) => {
+                        cur_pos += bytes.len();
+                        out.write(bytes).unwrap();
+                    }
+                    EndOfFile => {
+                        if cur_pos != low || low == high {
+                            out.write(&[b'\n']).unwrap();
+                        }
+
+                        break 'newline
+                    }
+                }
             }
         }
 
+        buf_read.consume_line();
         out.write(&[b'\n']).unwrap();
     }
 
     0
 }
 
-fn cut_characters(mut reader: BufferedReader,
+fn cut_characters<R: Reader>(reader: R,
                              ranges: &Vec<Range>,
                              opts: &Options) -> int {
-    let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
-    let (use_delim, out_delim) = match opts.out_delim.clone() {
-        Some(delim) => (true, delim),
-        None => (false, "".to_string())
-    };
+    let mut buf_in = BufferedReader::new(reader);
+    let mut out = BufferedWriter::new(stdio::stdout_raw());
 
     'newline: loop {
-        let line = match reader.read_line() {
+        let line = match buf_in.read_line() {
             Ok(line) => line,
             Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
             _ => fail!(),
@@ -126,11 +158,14 @@ fn cut_characters(mut reader: BufferedReader,
                 None => break
             };
 
-            if use_delim {
-                if print_delim {
-                    out.write_str(out_delim.as_slice()).unwrap();
+            match opts.out_delim {
+                Some(ref delim) => {
+                    if print_delim {
+                        out.write(delim.as_bytes()).unwrap();
+                    }
+                    print_delim = true;
                 }
-                print_delim = true;
+                None => ()
             }
 
             match char_indices.nth(high - low) {
@@ -204,15 +239,16 @@ impl<'a> Iterator<(uint, uint)> for Searcher<'a> {
     }
 }
 
-fn cut_fields_delimiter(mut reader: BufferedReader,
+fn cut_fields_delimiter<R: Reader>(reader: R,
                                    ranges: &Vec<Range>,
                                    delim: &String,
                                    only_delimited: bool,
                                    out_delim: &String) -> int {
-    let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
+    let mut buf_in = BufferedReader::new(reader);
+    let mut out = BufferedWriter::new(stdio::stdout_raw());
 
     'newline: loop {
-        let line = match reader.read_until(b'\n') {
+        let line = match buf_in.read_until(b'\n') {
             Ok(line) => line,
             Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
             _ => fail!(),
@@ -279,7 +315,7 @@ fn cut_fields_delimiter(mut reader: BufferedReader,
     0
 }
 
-fn cut_fields(mut reader: BufferedReader,
+fn cut_fields<R: Reader>(reader: R,
                          ranges: &Vec<Range>,
                          opts: &FieldOptions) -> int {
     match opts.out_delimeter {
@@ -290,10 +326,11 @@ fn cut_fields(mut reader: BufferedReader,
         None => ()
     }
 
-    let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
+    let mut buf_in = BufferedReader::new(reader);
+    let mut out = BufferedWriter::new(stdio::stdout_raw());
 
     'newline: loop {
-        let line = match reader.read_until(b'\n') {
+        let line = match buf_in.read_until(b'\n') {
             Ok(line) => line,
             Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
             _ => fail!(),
@@ -367,17 +404,17 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int {
 
     for filename in filenames.iter() {
         if filename.as_slice() == "-" {
-            if stdin_read { continue; }
+            if stdin_read { continue }
 
             exit_code |= match mode {
                 Bytes(ref ranges, ref opts) => {
-                    cut_bytes(stdin(), ranges, opts)
+                    cut_bytes(stdio::stdin_raw(), ranges, opts)
                 }
                 Characters(ref ranges, ref opts) => {
-                    cut_characters(stdin(), ranges, opts)
+                    cut_characters(stdio::stdin_raw(), ranges, opts)
                 }
                 Fields(ref ranges, ref opts) => {
-                    cut_fields(stdin(), ranges, opts)
+                    cut_fields(stdio::stdin_raw(), ranges, opts)
                 }
             };
 
@@ -387,11 +424,11 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int {
 
             if ! path.exists() {
                 show_error!("{}: No such file or directory", filename);
-                continue;
+                continue
             }
 
-            let buf_file = match File::open(&path) {
-                Ok(file) => BufferedReader::new(file),
+            let file = match File::open(&path) {
+                Ok(f) => f,
                 Err(e) => {
                     show_error!("{}: {}", filename, e.desc);
                     continue
@@ -399,13 +436,11 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int {
             };
 
             exit_code |= match mode {
-                Bytes(ref ranges, ref opts) => cut_bytes(buf_file, ranges, opts),
+                Bytes(ref ranges, ref opts) => cut_bytes(file, ranges, opts),
                 Characters(ref ranges, ref opts) => {
-                    cut_characters(buf_file, ranges, opts)
-                }
-                Fields(ref ranges, ref opts) => {
-                    cut_fields(buf_file, ranges, opts)
+                    cut_characters(file, ranges, opts)
                 }
+                Fields(ref ranges, ref opts) => cut_fields(file, ranges, opts)
             };
         }
     }