Rewrite cut_characters

This follows the cut_bytes() approach of letting read_line() create a buffer and find the newline. read_line() guarantees our buffer is a string of utf8 characters. When writing out the bytes segment we need to make sure we are cutting on utf8 boundaries, there for we must iterate over the buffer from read_line(). This implementation is(/should be) efficient as it only iterates once over the buffer. The previous performance was about 4x as slow as cut_bytes() and now it is about 2x as slow as cut_bytes().
2025-09-14 11:07:59 +00:00 · 2014-06-18 21:25:58 +02:00 · 2014-06-18 21:25:58 +02:00 · 0e46d453b7
commit 0e46d453b7
parent b1c2d7ac7c
1 changed files with 39 additions and 41 deletions
--- a/cut/cut.rs
+++ b/cut/cut.rs
@ -121,51 +121,51 @@ fn cut_characters<T: Reader>(mut reader: BufferedReader<T>,
        None => (false, "".to_str())
    };

-    let mut char_pos = 0;
-    let mut print_delim = false;
-    let mut range_pos = 0;
-
-    loop {
-        let character = match reader.read_char() {
-            Ok(character) => character,
-            Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => {
-                if char_pos > 0 {
-                    out.write_u8('\n' as u8).unwrap();
-                }
-                break
-            }
-            Err(std::io::IoError{ kind: std::io::InvalidInput, ..}) => {
-                fail!("Invalid utf8");
-            }
+    'newline: loop {
+        let line = match reader.read_line() {
+            Ok(line) => line,
+            Err(std::io::IoError{ kind: std::io::EndOfFile, ..}) => break,
            _ => fail!(),
        };

-        if character == '\n' {
-            out.write_u8('\n' as u8).unwrap();
-            char_pos = 0;
-            print_delim = false;
-            range_pos = 0;
-        } else {
-            char_pos += 1;
+        let mut char_pos = 0;
+        let mut char_indices = line.as_slice().char_indices();
+        let mut print_delim = false;

-            if char_pos > ranges.get(range_pos).high {
-                range_pos += 1;
-            }
+        for &Range{ low: low, high: high } in ranges.iter() {
+            let low_idx = match char_indices.nth(low - char_pos - 1) {
+                Some((low_idx, _)) => low_idx,
+                None => break
+            };

-            let cur_range = *ranges.get(range_pos);
-
-            if char_pos >= cur_range.low {
-                if use_delim {
-                    if print_delim && char_pos == cur_range.low {
-                        out.write_str(out_delim.as_slice()).unwrap();
-                    }
-
-                    print_delim = true;
+            if use_delim {
+                if print_delim {
+                    out.write_str(out_delim.as_slice());
                }
-
-                out.write_char(character).unwrap();
+                print_delim = true;
            }
+
+            match char_indices.nth(high - low) {
+                Some((high_idx, _)) => {
+                    let segment = line.as_bytes().slice(low_idx, high_idx);
+
+                    out.write(segment);
+                }
+                None => {
+                    let bytes = line.as_bytes();
+                    let segment = bytes.slice(low_idx, bytes.len());
+
+                    out.write(segment);
+
+                    if line.as_bytes()[bytes.len() - 1] == b'\n' {
+                        continue 'newline
+                    }
+                }
+            }
+
+            char_pos = high + 1;
        }
+        out.write(&[b'\n']);
    }

    0
@ -215,8 +215,7 @@ fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
            let buf_file = match File::open(&path) {
                Ok(file) => BufferedReader::new(file),
                Err(e) => {
-                    show_error!("{0:s}: {1:s}", filename.as_slice(),
-                                e.desc.to_str());
+                    show_error!("{}: {}", filename, e.desc);
                    continue
                }
            };
@ -240,7 +239,6 @@ fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
 fn main() { os::set_exit_status(uumain(os::args())); }

 pub fn uumain(args: Vec<String>) -> int {
-    let program = args.get(0).clone();
    let opts = [
        optopt("b", "bytes", "select only these bytes", "LIST"),
        optopt("c", "characters", "select only these characters", "LIST"),
@ -264,7 +262,7 @@ pub fn uumain(args: Vec<String>) -> int {

    if matches.opt_present("help") {
        println!("Usage:");
-        println!("  {0:s} OPTION... [FILE]...", program);
+        println!("  {0} OPTION... [FILE]...", args.get(0));
        println!("");
        print(usage("Print selected parts of lines from each FILE to standard output.", opts).as_slice());
        println!("");