mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-30 04:27:45 +00:00
Rewrite cut_bytes(), more performant than GNU
Creates BufReader in buffer.rs. BufReader uses a stack-allocated buffer to read into and returns selected slices into the buffer. This does away with any dynamic allocations in the 'newline loop. 1.5 to 2.5 times more performant than the previous version. 1.5 to 2.0 times more performant than GNU.
This commit is contained in:
parent
b8def68668
commit
f06d9fe130
2 changed files with 224 additions and 59 deletions
134
cut/buffer.rs
Normal file
134
cut/buffer.rs
Normal file
|
@ -0,0 +1,134 @@
|
||||||
|
use std;
|
||||||
|
use std::io::{IoResult, IoError};
|
||||||
|
|
||||||
|
pub struct BufReader<R> {
|
||||||
|
reader: R,
|
||||||
|
buffer: [u8, ..4096],
|
||||||
|
start: uint,
|
||||||
|
end: uint, // exclusive
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod Bytes {
|
||||||
|
pub trait Select {
|
||||||
|
fn select<'a>(&'a mut self, bytes: uint) -> Selected<'a>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum Selected<'a> {
|
||||||
|
NewlineFound(&'a [u8]),
|
||||||
|
Complete(&'a [u8]),
|
||||||
|
Partial(&'a [u8]),
|
||||||
|
EndOfFile,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: Reader> BufReader<R> {
|
||||||
|
pub fn new(reader: R) -> BufReader<R> {
|
||||||
|
let empty_buffer = unsafe {
|
||||||
|
std::mem::uninitialized::<[u8, ..4096]>()
|
||||||
|
};
|
||||||
|
|
||||||
|
BufReader {
|
||||||
|
reader: reader,
|
||||||
|
buffer: empty_buffer,
|
||||||
|
start: 0,
|
||||||
|
end: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read(&mut self) -> IoResult<uint> {
|
||||||
|
let buf_len = self.buffer.len();
|
||||||
|
let buffer_fill = self.buffer.mut_slice(self.end, buf_len);
|
||||||
|
|
||||||
|
match self.reader.read(buffer_fill) {
|
||||||
|
Ok(nread) => {
|
||||||
|
self.end += nread;
|
||||||
|
Ok(nread)
|
||||||
|
}
|
||||||
|
error => error
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn maybe_fill_buf(&mut self) -> IoResult<uint> {
|
||||||
|
if self.end == self.start {
|
||||||
|
self.start = 0;
|
||||||
|
self.end = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.end <= 2048 { self.read() } else { Ok(0) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn consume_line(&mut self) -> uint {
|
||||||
|
let mut bytes_consumed = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match self.maybe_fill_buf() {
|
||||||
|
Err(IoError { kind: std::io::EndOfFile, .. }) => (),
|
||||||
|
Err(err) => fail!("read error: {}", err.desc),
|
||||||
|
_ => ()
|
||||||
|
}
|
||||||
|
|
||||||
|
let buffer_used = self.end - self.start;
|
||||||
|
|
||||||
|
if buffer_used == 0 { return bytes_consumed; }
|
||||||
|
|
||||||
|
for idx in range(self.start, self.end) {
|
||||||
|
if self.buffer[idx] == b'\n' {
|
||||||
|
self.start = idx + 1;
|
||||||
|
return bytes_consumed + idx + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bytes_consumed += buffer_used;
|
||||||
|
|
||||||
|
self.start = 0;
|
||||||
|
self.end = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: Reader> Bytes::Select for BufReader<R> {
|
||||||
|
fn select<'a>(&'a mut self, bytes: uint) -> Bytes::Selected<'a> {
|
||||||
|
match self.maybe_fill_buf() {
|
||||||
|
Err(IoError { kind: std::io::EndOfFile, .. }) => (),
|
||||||
|
Err(err) => fail!("read error: {}", err.desc),
|
||||||
|
_ => ()
|
||||||
|
}
|
||||||
|
|
||||||
|
let buffer_used = self.end - self.start;
|
||||||
|
|
||||||
|
if buffer_used == 0 { return Bytes::EndOfFile; }
|
||||||
|
|
||||||
|
let (complete, max_segment_len) = {
|
||||||
|
if bytes < buffer_used {
|
||||||
|
(true, bytes + 1)
|
||||||
|
} else {
|
||||||
|
(false, buffer_used)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for idx in range(self.start, self.start + max_segment_len) {
|
||||||
|
if self.buffer[idx] == b'\n' {
|
||||||
|
let segment = self.buffer.slice(self.start, idx + 1);
|
||||||
|
|
||||||
|
self.start = idx + 1;
|
||||||
|
|
||||||
|
return Bytes::NewlineFound(segment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if complete {
|
||||||
|
let segment = self.buffer.slice(self.start,
|
||||||
|
self.start + bytes);
|
||||||
|
|
||||||
|
self.start += bytes;
|
||||||
|
Bytes::Complete(segment)
|
||||||
|
} else {
|
||||||
|
let segment = self.buffer.slice(self.start, self.end);
|
||||||
|
|
||||||
|
self.start = 0;
|
||||||
|
self.end = 0;
|
||||||
|
Bytes::Partial(segment)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
149
cut/cut.rs
149
cut/cut.rs
|
@ -14,7 +14,7 @@
|
||||||
extern crate getopts;
|
extern crate getopts;
|
||||||
extern crate libc;
|
extern crate libc;
|
||||||
|
|
||||||
use std::io::{File, BufferedWriter, BufferedReader, stdin, print};
|
use std::io::{stdio, File, BufferedWriter, BufferedReader, print};
|
||||||
use getopts::{optopt, optflag, getopts, usage};
|
use getopts::{optopt, optflag, getopts, usage};
|
||||||
|
|
||||||
use ranges::Range;
|
use ranges::Range;
|
||||||
|
@ -22,6 +22,7 @@ use ranges::Range;
|
||||||
#[path = "../common/util.rs"]
|
#[path = "../common/util.rs"]
|
||||||
mod util;
|
mod util;
|
||||||
mod ranges;
|
mod ranges;
|
||||||
|
mod buffer;
|
||||||
|
|
||||||
static NAME: &'static str = "cut";
|
static NAME: &'static str = "cut";
|
||||||
static VERSION: &'static str = "1.0.0";
|
static VERSION: &'static str = "1.0.0";
|
||||||
|
@ -50,67 +51,94 @@ fn list_to_ranges(list: &str, complement: bool) -> Result<Vec<Range>, String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_bytes<T: Reader>(mut reader: BufferedReader<T>,
|
fn cut_bytes<R: Reader>(reader: R,
|
||||||
ranges: &Vec<Range>,
|
ranges: &Vec<Range>,
|
||||||
opts: &Options) -> int {
|
opts: &Options) -> int {
|
||||||
let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
|
use buffer::Bytes::{Select, NewlineFound, Complete, Partial, EndOfFile};
|
||||||
let (use_delim, out_delim) = match opts.out_delim.clone() {
|
|
||||||
Some(delim) => (true, delim),
|
let mut buf_read = buffer::BufReader::new(reader);
|
||||||
None => (false, "".to_string())
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
||||||
};
|
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
let line = match reader.read_until(b'\n') {
|
let mut cur_pos = 1;
|
||||||
Ok(line) => line,
|
|
||||||
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
|
||||||
_ => fail!(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let line_len = line.len();
|
|
||||||
let mut print_delim = false;
|
let mut print_delim = false;
|
||||||
|
|
||||||
for &Range { low: low, high: high } in ranges.iter() {
|
for &Range { low: low, high: high } in ranges.iter() {
|
||||||
if low > line_len { break; }
|
// skip upto low
|
||||||
|
let orig_pos = cur_pos;
|
||||||
|
loop {
|
||||||
|
match buf_read.select(low - cur_pos) {
|
||||||
|
NewlineFound(_) => {
|
||||||
|
out.write(&[b'\n']).unwrap();
|
||||||
|
continue 'newline
|
||||||
|
}
|
||||||
|
Complete(bytes) => {
|
||||||
|
cur_pos += bytes.len();
|
||||||
|
break
|
||||||
|
}
|
||||||
|
Partial(bytes) => cur_pos += bytes.len(),
|
||||||
|
EndOfFile => {
|
||||||
|
if orig_pos != cur_pos {
|
||||||
|
out.write(&[b'\n']).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
if use_delim {
|
break 'newline
|
||||||
if print_delim {
|
}
|
||||||
out.write_str(out_delim.as_slice()).unwrap();
|
|
||||||
}
|
}
|
||||||
print_delim = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if high >= line_len {
|
match opts.out_delim {
|
||||||
let segment = line.slice(low - 1, line_len);
|
Some(ref delim) => {
|
||||||
|
if print_delim {
|
||||||
out.write(segment).unwrap();
|
out.write(delim.as_bytes()).unwrap();
|
||||||
|
}
|
||||||
if *line.get(line_len - 1) == b'\n' {
|
print_delim = true;
|
||||||
continue 'newline
|
|
||||||
}
|
}
|
||||||
} else {
|
None => ()
|
||||||
let segment = line.slice(low - 1, high);
|
}
|
||||||
|
|
||||||
out.write(segment).unwrap();
|
// write out from low to high
|
||||||
|
loop {
|
||||||
|
match buf_read.select(high - cur_pos + 1) {
|
||||||
|
NewlineFound(bytes) => {
|
||||||
|
out.write(bytes).unwrap();
|
||||||
|
continue 'newline
|
||||||
|
}
|
||||||
|
Complete(bytes) => {
|
||||||
|
out.write(bytes).unwrap();
|
||||||
|
cur_pos = high + 1;
|
||||||
|
break
|
||||||
|
}
|
||||||
|
Partial(bytes) => {
|
||||||
|
cur_pos += bytes.len();
|
||||||
|
out.write(bytes).unwrap();
|
||||||
|
}
|
||||||
|
EndOfFile => {
|
||||||
|
if cur_pos != low || low == high {
|
||||||
|
out.write(&[b'\n']).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
break 'newline
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out.write(&[b'\n']).unwrap();
|
buf_read.consume_line();
|
||||||
|
out.write([b'\n']).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_characters<T: Reader>(mut reader: BufferedReader<T>,
|
fn cut_characters<R: Reader>(reader: R,
|
||||||
ranges: &Vec<Range>,
|
ranges: &Vec<Range>,
|
||||||
opts: &Options) -> int {
|
opts: &Options) -> int {
|
||||||
let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
|
let mut buf_in = BufferedReader::new(reader);
|
||||||
let (use_delim, out_delim) = match opts.out_delim.clone() {
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
||||||
Some(delim) => (true, delim),
|
|
||||||
None => (false, "".to_string())
|
|
||||||
};
|
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
let line = match reader.read_line() {
|
let line = match buf_in.read_line() {
|
||||||
Ok(line) => line,
|
Ok(line) => line,
|
||||||
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
||||||
_ => fail!(),
|
_ => fail!(),
|
||||||
|
@ -126,11 +154,14 @@ fn cut_characters<T: Reader>(mut reader: BufferedReader<T>,
|
||||||
None => break
|
None => break
|
||||||
};
|
};
|
||||||
|
|
||||||
if use_delim {
|
match opts.out_delim {
|
||||||
if print_delim {
|
Some(ref delim) => {
|
||||||
out.write_str(out_delim.as_slice()).unwrap();
|
if print_delim {
|
||||||
|
out.write(delim.as_bytes()).unwrap();
|
||||||
|
}
|
||||||
|
print_delim = true;
|
||||||
}
|
}
|
||||||
print_delim = true;
|
None => ()
|
||||||
}
|
}
|
||||||
|
|
||||||
match char_indices.nth(high - low) {
|
match char_indices.nth(high - low) {
|
||||||
|
@ -204,15 +235,16 @@ impl<'a> Iterator<(uint, uint)> for Searcher<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_fields_delimiter<T: Reader>(mut reader: BufferedReader<T>,
|
fn cut_fields_delimiter<R: Reader>(reader: R,
|
||||||
ranges: &Vec<Range>,
|
ranges: &Vec<Range>,
|
||||||
delim: &String,
|
delim: &String,
|
||||||
only_delimited: bool,
|
only_delimited: bool,
|
||||||
out_delim: &String) -> int {
|
out_delim: &String) -> int {
|
||||||
let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
|
let mut buf_in = BufferedReader::new(reader);
|
||||||
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
let line = match reader.read_until(b'\n') {
|
let line = match buf_in.read_until(b'\n') {
|
||||||
Ok(line) => line,
|
Ok(line) => line,
|
||||||
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
||||||
_ => fail!(),
|
_ => fail!(),
|
||||||
|
@ -279,7 +311,7 @@ fn cut_fields_delimiter<T: Reader>(mut reader: BufferedReader<T>,
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_fields<T: Reader>(mut reader: BufferedReader<T>,
|
fn cut_fields<R: Reader>(reader: R,
|
||||||
ranges: &Vec<Range>,
|
ranges: &Vec<Range>,
|
||||||
opts: &FieldOptions) -> int {
|
opts: &FieldOptions) -> int {
|
||||||
match opts.out_delimeter {
|
match opts.out_delimeter {
|
||||||
|
@ -290,10 +322,11 @@ fn cut_fields<T: Reader>(mut reader: BufferedReader<T>,
|
||||||
None => ()
|
None => ()
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut out = BufferedWriter::new(std::io::stdio::stdout_raw());
|
let mut buf_in = BufferedReader::new(reader);
|
||||||
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
let line = match reader.read_until(b'\n') {
|
let line = match buf_in.read_until(b'\n') {
|
||||||
Ok(line) => line,
|
Ok(line) => line,
|
||||||
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
||||||
_ => fail!(),
|
_ => fail!(),
|
||||||
|
@ -367,17 +400,17 @@ fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
|
||||||
|
|
||||||
for filename in filenames.iter() {
|
for filename in filenames.iter() {
|
||||||
if filename.as_slice() == "-" {
|
if filename.as_slice() == "-" {
|
||||||
if stdin_read { continue; }
|
if stdin_read { continue }
|
||||||
|
|
||||||
exit_code |= match mode {
|
exit_code |= match mode {
|
||||||
Bytes(ref ranges, ref opts) => {
|
Bytes(ref ranges, ref opts) => {
|
||||||
cut_bytes(stdin(), ranges, opts)
|
cut_bytes(stdio::stdin_raw(), ranges, opts)
|
||||||
}
|
}
|
||||||
Characters(ref ranges, ref opts) => {
|
Characters(ref ranges, ref opts) => {
|
||||||
cut_characters(stdin(), ranges, opts)
|
cut_characters(stdio::stdin_raw(), ranges, opts)
|
||||||
}
|
}
|
||||||
Fields(ref ranges, ref opts) => {
|
Fields(ref ranges, ref opts) => {
|
||||||
cut_fields(stdin(), ranges, opts)
|
cut_fields(stdio::stdin_raw(), ranges, opts)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -387,11 +420,11 @@ fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
|
||||||
|
|
||||||
if ! path.exists() {
|
if ! path.exists() {
|
||||||
show_error!("{}: No such file or directory", filename);
|
show_error!("{}: No such file or directory", filename);
|
||||||
continue;
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
let buf_file = match File::open(&path) {
|
let file = match File::open(&path) {
|
||||||
Ok(file) => BufferedReader::new(file),
|
Ok(f) => f,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
show_error!("{}: {}", filename, e.desc);
|
show_error!("{}: {}", filename, e.desc);
|
||||||
continue
|
continue
|
||||||
|
@ -399,13 +432,11 @@ fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
|
||||||
};
|
};
|
||||||
|
|
||||||
exit_code |= match mode {
|
exit_code |= match mode {
|
||||||
Bytes(ref ranges, ref opts) => cut_bytes(buf_file, ranges, opts),
|
Bytes(ref ranges, ref opts) => cut_bytes(file, ranges, opts),
|
||||||
Characters(ref ranges, ref opts) => {
|
Characters(ref ranges, ref opts) => {
|
||||||
cut_characters(buf_file, ranges, opts)
|
cut_characters(file, ranges, opts)
|
||||||
}
|
|
||||||
Fields(ref ranges, ref opts) => {
|
|
||||||
cut_fields(buf_file, ranges, opts)
|
|
||||||
}
|
}
|
||||||
|
Fields(ref ranges, ref opts) => cut_fields(file, ranges, opts)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue