From f06d9fe1303c37adaf1c82dee8b49deb813d1e7a Mon Sep 17 00:00:00 2001 From: polyphemus Date: Sat, 5 Jul 2014 19:57:18 +0200 Subject: [PATCH] Rewrite cut_bytes(), more performant than GNU Creates BufReader in buffer.rs. BufReader uses a stack-allocated buffer to read into and returns selected slices into the buffer. This does away with any dynamic allocations in the 'newline loop. 1.5 to 2.5 times more performant than the previous version. 1.5 to 2.0 times more performant than GNU. --- cut/buffer.rs | 134 +++++++++++++++++++++++++++++++++++++++++++++ cut/cut.rs | 149 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 224 insertions(+), 59 deletions(-) create mode 100644 cut/buffer.rs diff --git a/cut/buffer.rs b/cut/buffer.rs new file mode 100644 index 000000000..9fe691c3e --- /dev/null +++ b/cut/buffer.rs @@ -0,0 +1,134 @@ +use std; +use std::io::{IoResult, IoError}; + +pub struct BufReader { + reader: R, + buffer: [u8, ..4096], + start: uint, + end: uint, // exclusive +} + +pub mod Bytes { + pub trait Select { + fn select<'a>(&'a mut self, bytes: uint) -> Selected<'a>; + } + + pub enum Selected<'a> { + NewlineFound(&'a [u8]), + Complete(&'a [u8]), + Partial(&'a [u8]), + EndOfFile, + } +} + +impl BufReader { + pub fn new(reader: R) -> BufReader { + let empty_buffer = unsafe { + std::mem::uninitialized::<[u8, ..4096]>() + }; + + BufReader { + reader: reader, + buffer: empty_buffer, + start: 0, + end: 0, + } + } + + fn read(&mut self) -> IoResult { + let buf_len = self.buffer.len(); + let buffer_fill = self.buffer.mut_slice(self.end, buf_len); + + match self.reader.read(buffer_fill) { + Ok(nread) => { + self.end += nread; + Ok(nread) + } + error => error + } + } + + #[inline] + fn maybe_fill_buf(&mut self) -> IoResult { + if self.end == self.start { + self.start = 0; + self.end = 0; + } + + if self.end <= 2048 { self.read() } else { Ok(0) } + } + + pub fn consume_line(&mut self) -> uint { + let mut bytes_consumed = 0; + + loop { + match self.maybe_fill_buf() 
{ + Err(IoError { kind: std::io::EndOfFile, .. }) => (), + Err(err) => fail!("read error: {}", err.desc), + _ => () + } + + let buffer_used = self.end - self.start; + + if buffer_used == 0 { return bytes_consumed; } + + for idx in range(self.start, self.end) { + if self.buffer[idx] == b'\n' { + self.start = idx + 1; + return bytes_consumed + idx + 1; + } + } + + bytes_consumed += buffer_used; + + self.start = 0; + self.end = 0; + } + } +} + +impl Bytes::Select for BufReader { + fn select<'a>(&'a mut self, bytes: uint) -> Bytes::Selected<'a> { + match self.maybe_fill_buf() { + Err(IoError { kind: std::io::EndOfFile, .. }) => (), + Err(err) => fail!("read error: {}", err.desc), + _ => () + } + + let buffer_used = self.end - self.start; + + if buffer_used == 0 { return Bytes::EndOfFile; } + + let (complete, max_segment_len) = { + if bytes < buffer_used { + (true, bytes + 1) + } else { + (false, buffer_used) + } + }; + + for idx in range(self.start, self.start + max_segment_len) { + if self.buffer[idx] == b'\n' { + let segment = self.buffer.slice(self.start, idx + 1); + + self.start = idx + 1; + + return Bytes::NewlineFound(segment); + } + } + + if complete { + let segment = self.buffer.slice(self.start, + self.start + bytes); + + self.start += bytes; + Bytes::Complete(segment) + } else { + let segment = self.buffer.slice(self.start, self.end); + + self.start = 0; + self.end = 0; + Bytes::Partial(segment) + } + } +} diff --git a/cut/cut.rs b/cut/cut.rs index d53e30f00..dce917473 100644 --- a/cut/cut.rs +++ b/cut/cut.rs @@ -14,7 +14,7 @@ extern crate getopts; extern crate libc; -use std::io::{File, BufferedWriter, BufferedReader, stdin, print}; +use std::io::{stdio, File, BufferedWriter, BufferedReader, print}; use getopts::{optopt, optflag, getopts, usage}; use ranges::Range; @@ -22,6 +22,7 @@ use ranges::Range; #[path = "../common/util.rs"] mod util; mod ranges; +mod buffer; static NAME: &'static str = "cut"; static VERSION: &'static str = "1.0.0"; @@ -50,67 +51,94 @@ 
fn list_to_ranges(list: &str, complement: bool) -> Result, String> { } } -fn cut_bytes(mut reader: BufferedReader, +fn cut_bytes(reader: R, ranges: &Vec, opts: &Options) -> int { - let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); - let (use_delim, out_delim) = match opts.out_delim.clone() { - Some(delim) => (true, delim), - None => (false, "".to_string()) - }; + use buffer::Bytes::{Select, NewlineFound, Complete, Partial, EndOfFile}; + + let mut buf_read = buffer::BufReader::new(reader); + let mut out = BufferedWriter::new(stdio::stdout_raw()); 'newline: loop { - let line = match reader.read_until(b'\n') { - Ok(line) => line, - Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, - _ => fail!(), - }; - - let line_len = line.len(); + let mut cur_pos = 1; let mut print_delim = false; for &Range { low: low, high: high } in ranges.iter() { - if low > line_len { break; } + // skip upto low + let orig_pos = cur_pos; + loop { + match buf_read.select(low - cur_pos) { + NewlineFound(_) => { + out.write(&[b'\n']).unwrap(); + continue 'newline + } + Complete(bytes) => { + cur_pos += bytes.len(); + break + } + Partial(bytes) => cur_pos += bytes.len(), + EndOfFile => { + if orig_pos != cur_pos { + out.write(&[b'\n']).unwrap(); + } - if use_delim { - if print_delim { - out.write_str(out_delim.as_slice()).unwrap(); + break 'newline + } } - print_delim = true; } - if high >= line_len { - let segment = line.slice(low - 1, line_len); - - out.write(segment).unwrap(); - - if *line.get(line_len - 1) == b'\n' { - continue 'newline + match opts.out_delim { + Some(ref delim) => { + if print_delim { + out.write(delim.as_bytes()).unwrap(); + } + print_delim = true; } - } else { - let segment = line.slice(low - 1, high); + None => () + } - out.write(segment).unwrap(); + // write out from low to high + loop { + match buf_read.select(high - cur_pos + 1) { + NewlineFound(bytes) => { + out.write(bytes).unwrap(); + continue 'newline + } + Complete(bytes) => { + 
out.write(bytes).unwrap(); + cur_pos = high + 1; + break + } + Partial(bytes) => { + cur_pos += bytes.len(); + out.write(bytes).unwrap(); + } + EndOfFile => { + if cur_pos != low || low == high { + out.write(&[b'\n']).unwrap(); + } + + break 'newline + } + } } } - out.write(&[b'\n']).unwrap(); + buf_read.consume_line(); + out.write([b'\n']).unwrap(); } 0 } -fn cut_characters(mut reader: BufferedReader, +fn cut_characters(reader: R, ranges: &Vec, opts: &Options) -> int { - let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); - let (use_delim, out_delim) = match opts.out_delim.clone() { - Some(delim) => (true, delim), - None => (false, "".to_string()) - }; + let mut buf_in = BufferedReader::new(reader); + let mut out = BufferedWriter::new(stdio::stdout_raw()); 'newline: loop { - let line = match reader.read_line() { + let line = match buf_in.read_line() { Ok(line) => line, Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, _ => fail!(), @@ -126,11 +154,14 @@ fn cut_characters(mut reader: BufferedReader, None => break }; - if use_delim { - if print_delim { - out.write_str(out_delim.as_slice()).unwrap(); + match opts.out_delim { + Some(ref delim) => { + if print_delim { + out.write(delim.as_bytes()).unwrap(); + } + print_delim = true; } - print_delim = true; + None => () } match char_indices.nth(high - low) { @@ -204,15 +235,16 @@ impl<'a> Iterator<(uint, uint)> for Searcher<'a> { } } -fn cut_fields_delimiter(mut reader: BufferedReader, +fn cut_fields_delimiter(reader: R, ranges: &Vec, delim: &String, only_delimited: bool, out_delim: &String) -> int { - let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let mut buf_in = BufferedReader::new(reader); + let mut out = BufferedWriter::new(stdio::stdout_raw()); 'newline: loop { - let line = match reader.read_until(b'\n') { + let line = match buf_in.read_until(b'\n') { Ok(line) => line, Err(std::io::IoError { kind: std::io::EndOfFile, .. 
}) => break, _ => fail!(), @@ -279,7 +311,7 @@ fn cut_fields_delimiter(mut reader: BufferedReader, 0 } -fn cut_fields(mut reader: BufferedReader, +fn cut_fields(reader: R, ranges: &Vec, opts: &FieldOptions) -> int { match opts.out_delimeter { @@ -290,10 +322,11 @@ fn cut_fields(mut reader: BufferedReader, None => () } - let mut out = BufferedWriter::new(std::io::stdio::stdout_raw()); + let mut buf_in = BufferedReader::new(reader); + let mut out = BufferedWriter::new(stdio::stdout_raw()); 'newline: loop { - let line = match reader.read_until(b'\n') { + let line = match buf_in.read_until(b'\n') { Ok(line) => line, Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break, _ => fail!(), @@ -367,17 +400,17 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int { for filename in filenames.iter() { if filename.as_slice() == "-" { - if stdin_read { continue; } + if stdin_read { continue } exit_code |= match mode { Bytes(ref ranges, ref opts) => { - cut_bytes(stdin(), ranges, opts) + cut_bytes(stdio::stdin_raw(), ranges, opts) } Characters(ref ranges, ref opts) => { - cut_characters(stdin(), ranges, opts) + cut_characters(stdio::stdin_raw(), ranges, opts) } Fields(ref ranges, ref opts) => { - cut_fields(stdin(), ranges, opts) + cut_fields(stdio::stdin_raw(), ranges, opts) } }; @@ -387,11 +420,11 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int { if ! 
path.exists() { show_error!("{}: No such file or directory", filename); - continue; + continue } - let buf_file = match File::open(&path) { - Ok(file) => BufferedReader::new(file), + let file = match File::open(&path) { + Ok(f) => f, Err(e) => { show_error!("{}: {}", filename, e.desc); continue @@ -399,13 +432,11 @@ fn cut_files(mut filenames: Vec, mode: Mode) -> int { }; exit_code |= match mode { - Bytes(ref ranges, ref opts) => cut_bytes(buf_file, ranges, opts), + Bytes(ref ranges, ref opts) => cut_bytes(file, ranges, opts), Characters(ref ranges, ref opts) => { - cut_characters(buf_file, ranges, opts) - } - Fields(ref ranges, ref opts) => { - cut_fields(buf_file, ranges, opts) + cut_characters(file, ranges, opts) } + Fields(ref ranges, ref opts) => cut_fields(file, ranges, opts) }; } }