diff --git a/Cargo.lock b/Cargo.lock index e0a5a45bd..db6789500 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,6 +48,7 @@ dependencies = [ "paste 0.0.1", "primal 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "printenv 0.0.1", + "printf 0.0.1", "ptx 0.0.1", "pwd 0.0.1", "rand 0.3.12 (registry+https://github.com/rust-lang/crates.io-index)", @@ -387,6 +388,11 @@ dependencies = [ "uucore 0.0.1", ] +[[package]] +name = "itertools" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "kernel32-sys" version = "0.2.1" @@ -615,6 +621,14 @@ dependencies = [ "uucore 0.0.1", ] +[[package]] +name = "printf" +version = "0.0.1" +dependencies = [ + "itertools 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)", + "uucore 0.0.1", +] + [[package]] name = "ptx" version = "0.0.1" diff --git a/Cargo.toml b/Cargo.toml index 530e2139b..0fa2cac40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,7 @@ generic = [ "od", "paste", "printenv", + "printf", "ptx", "pwd", "readlink", @@ -129,6 +130,7 @@ nproc = { optional=true, path="src/nproc" } od = { optional=true, path="src/od" } paste = { optional=true, path="src/paste" } printenv = { optional=true, path="src/printenv" } +printf = { optional=true, path="src/printf" } ptx = { optional=true, path="src/ptx" } pwd = { optional=true, path="src/pwd" } readlink = { optional=true, path="src/readlink" } diff --git a/Makefile b/Makefile index 8c7cd2a6d..9a580c379 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,7 @@ PROGS := \ od \ paste \ printenv \ + printf \ ptx \ pwd \ readlink \ @@ -149,6 +150,7 @@ TEST_PROGS := \ mv \ nl \ paste \ + printf \ ptx \ pwd \ readlink \ diff --git a/src/printf/Cargo.toml b/src/printf/Cargo.toml new file mode 100644 index 000000000..a2e1dfdad --- /dev/null +++ b/src/printf/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "printf" +version = "0.0.1" +authors = ["Nathan Ross"] + +[lib] +name = "uu_printf" +path = "printf.rs" + 
+[dependencies] +"itertools" = "*" +uucore = { path="../uucore" } + +[[bin]] +name = "printf" +path = "main.rs" diff --git a/src/printf/cli.rs b/src/printf/cli.rs new file mode 100644 index 000000000..a7a47c1d9 --- /dev/null +++ b/src/printf/cli.rs @@ -0,0 +1,34 @@ +//! stdio convenience fns +#[allow(unused_must_use)] + +use std::io::{stderr, stdout, Write}; +use std::env; + +pub static EXIT_OK: i32 = 0; +pub static EXIT_ERR: i32 = 1; + +pub fn err_msg(msg: &str) { + let exe_path = match env::current_exe() { + Ok(p) => p.to_string_lossy().into_owned(), + _ => String::from(""), + }; + writeln!(&mut stderr(), "{}: {}", exe_path, msg).unwrap(); +} + +// by default stdout only flushes +// to console when a newline is passed. +#[allow(unused_must_use)] +pub fn flush_char(c: &char) { + print!("{}", c); + stdout().flush(); +} +#[allow(unused_must_use)] +pub fn flush_str(s: &str) { + print!("{}", s); + stdout().flush(); +} +#[allow(unused_must_use)] +pub fn flush_bytes(bslice: &[u8]) { + stdout().write(bslice); + stdout().flush(); +} diff --git a/src/printf/main.rs b/src/printf/main.rs new file mode 100644 index 000000000..073aa250d --- /dev/null +++ b/src/printf/main.rs @@ -0,0 +1,5 @@ +extern crate uu_printf; + +fn main() { + std::process::exit(uu_printf::uumain(std::env::args().collect())); +} diff --git a/src/printf/memo.rs b/src/printf/memo.rs new file mode 100644 index 000000000..924a10bea --- /dev/null +++ b/src/printf/memo.rs @@ -0,0 +1,85 @@ +//! Memo runner of printf +//! Takes a format string and arguments +//! 1. tokenizes format string into tokens, consuming +//! any subst. arguments along the way. +//! 2. feeds remaining arguments into function +//! that prints tokens. 
+ +use std::iter::Peekable; +use std::slice::Iter; +use itertools::PutBackN; +use cli; +use tokenize::token::{Token, Tokenizer}; +use tokenize::unescaped_text::UnescapedText; +use tokenize::sub::Sub; + +pub struct Memo { + tokens: Vec>, +} + +fn warn_excess_args(first_arg: &str) { + cli::err_msg(&format!("warning: ignoring excess arguments, starting with '{}'", + first_arg)); +} + +impl Memo { + pub fn new(pf_string: &String, pf_args_it: &mut Peekable>) -> Memo { + let mut pm = Memo { tokens: Vec::new() }; + let mut tmp_token: Option>; + let mut it = PutBackN::new(pf_string.chars()); + let mut has_sub = false; + loop { + tmp_token = UnescapedText::from_it(&mut it, pf_args_it); + match tmp_token { + Some(x) => pm.tokens.push(x), + None => {} + } + tmp_token = Sub::from_it(&mut it, pf_args_it); + match tmp_token { + Some(x) => { + if !has_sub { + has_sub = true; + } + pm.tokens.push(x); + } + None => {} + } + if let Some(x) = it.next() { + it.put_back(x); + } else { + break; + } + } + if !has_sub { + let mut drain = false; + if let Some(first_arg) = pf_args_it.peek() { + warn_excess_args(first_arg); + drain = true; + } + if drain { + loop { + // drain remaining args; + if pf_args_it.next().is_none() { + break; + } + } + } + } + pm + } + pub fn apply(&self, pf_args_it: &mut Peekable>) { + for tkn in self.tokens.iter() { + tkn.print(pf_args_it); + } + } + pub fn run_all(pf_string: &String, pf_args: &[String]) { + let mut arg_it = pf_args.iter().peekable(); + let pm = Memo::new(pf_string, &mut arg_it); + loop { + if arg_it.peek().is_none() { + break; + } + pm.apply(&mut arg_it); + } + } +} diff --git a/src/printf/mod.rs b/src/printf/mod.rs new file mode 100644 index 000000000..5ed2ecea8 --- /dev/null +++ b/src/printf/mod.rs @@ -0,0 +1,3 @@ +mod cli; +mod memo; +mod tokenize; diff --git a/src/printf/printf.rs b/src/printf/printf.rs new file mode 100644 index 000000000..f7a732df6 --- /dev/null +++ b/src/printf/printf.rs @@ -0,0 +1,288 @@ +#![crate_name = "uu_printf"] + 
+#![allow(dead_code)] + +extern crate itertools; + +mod cli; +mod memo; +mod tokenize; + +#[macro_use] +extern crate uucore; + +static NAME: &'static str = "printf"; +static VERSION: &'static str = "0.0.1"; +static SHORT_USAGE: &'static str = "printf: usage: printf [-v var] format [arguments]"; +static LONGHELP_LEAD: &'static str = "printf + + USAGE: printf FORMATSTRING [ARGUMENT]... + + basic anonymous string templating: + + prints format string at least once, repeating as long as there are remaining arguments + output prints escaped literals in the format string as character literals + output replaces anonymous fields with the next unused argument, formatted according to the field. + +Options: + --help display this help and exit + --version output version information and exit + +"; +static LONGHELP_BODY: &'static str = " + Prints the FORMATSTRING, replacing escaped character sequences with character literals + and substitution field sequences with passed arguments + + The rest of FORMATSTRING is printed literally, with the exception of the below + escaped character sequences, and the substitution sequences described further down. 
+ + ESCAPE SEQUENCES + + The following escape sequences, organized here in alphabetical order, + will print the corresponding character literal: + + \\\" double quote + + \\\\ backslash + + \\a alert (BEL) + + \\b backspace + + \\c End-of-Input + + \\e escape + + \\f form feed + + \\n new line + + \\r carriage return + + \\t horizontal tab + + \\v vertical tab + + \\NNN byte with value expressed in octal value NNN (1 to 3 digits) + values greater than 256 will be treated + + \\xHH byte with value expressed in hexadecimal value HH (1 to 2 digits) + + \\uHHHH Unicode (ISO/IEC 10646) character with value expressed in hexadecimal value HHHH (4 digits) + + \\UHHHHHHHH Unicode character with value expressed in hexadecimal value HHHHHHHH (8 digits) + + %% a single % + + SUBSTITUTIONS + + SUBSTITUTION QUICK REFERENCE + + Fields + + %s - string + %b - string parsed for literals + second parameter is max length + + %c - char + no second parameter + + %i or %d - 64-bit integer + %u - 64-bit unsigned integer + %x or %X - 64-bit unsigned integer as hex + %o - 64-bit unsigned integer as octal + second parameter is min-width, integer + output below that width is padded with leading zeroes + + %f or %F - decimal floating point value + %e or %E - scientific notation floating point value + %g or %G - shorter of specially interpreted decimal or SciNote floating point value. 
+ second parameter is + -max places after decimal point for floating point output + -max number of significant digits for scientific notation output + + parameterizing fields + + examples: + + printf '%4.3i' 7 + has a first parameter of 4 + and a second parameter of 3 + will result in ' 007' + + printf '%.1s' abcde + has no first parameter + and a second parameter of 1 + will result in 'a' + + printf '%4c' q + has a first parameter of 4 + and no second parameter + will result in ' q' + + The first parameter of a field is the minimum width to pad the output to + if the output is less than the absolute value of this width, + it will be padded with leading spaces, or, if the parameter is negative, + with trailing spaces. The default is zero. + + The second parameter of a field is particular to the output field type. + defaults can be found in the full substitution help below + + special prefixes to numeric arguments + 0 (e.g. 010) - interpret argument as octal (integer output fields only) + 0x (e.g. 0xABC) - interpret argument as hex (numeric output fields only) + \' (e.g. \'a) - interpret argument as a character constant + + HOW TO USE SUBSTITUTIONS + + Substitutions are used to pass additional argument(s) into the FORMAT string, to be formatted a + particular way. E.g. 
+ + printf 'the letter %X comes before the letter %X' 10 11 + + will print + + 'the letter A comes before the letter B' + + because the substitution field %X means + 'take an integer argument and write it as a hexadecimal number' + + Passing more arguments than are in the format string will cause the format string to be + repeated for the remaining substitutions + + printf 'it is %i F in %s \\n' 22 Portland 25 Boston 27 'New York' + + will print + + 'it is 22 F in Portland + it is 25 F in Boston + it is 27 F in New York + ' + If a format string is printed but there are fewer arguments remaining + than there are substitution fields, substitution fields without + an argument will default to empty strings, or for numeric fields + the value 0 + + AVAILABLE SUBSTITUTIONS + + This program, like GNU coreutils printf, + interprets a modified subset of the POSIX C printf spec, + a quick reference to substitutions is below. + + STRING SUBSTITUTIONS + All string fields have a 'max width' parameter + %.3s means 'print no more than three characters of the original input' + + %s - string + + %b - escaped string - the string will be checked for any escaped literals from + the escaped literal list above, and they will be translated to literal characters. + e.g. \\n will be transformed into a newline character. + + One special rule about %b mode is that octal literals are interpreted differently + In arguments passed to %b, octal-interpreted literals must be in the form of \\0NNN instead of \\NNN + (Although, for legacy reasons, octal literals in the form of \\NNN will still be interpreted and not throw a warning, you will have problems if you use this for a literal whose code begins with zero, as it will be viewed as in \\0NNN form.) + + CHAR SUBSTITUTIONS + The character field does not have a secondary parameter. 
+ + %c - a single character + + INTEGER SUBSTITUTIONS + All integer fields have a 'pad with zero' parameter + %.4i means an integer which if it is less than 4 digits in length, + is padded with leading zeros until it is 4 digits in length. + + %d or %i - 64-bit integer + + %u - 64 bit unsigned integer + + %x or %X - 64 bit unsigned integer printed in Hexadecimal (base 16) + %X instead of %x means to use uppercase letters for 'a' through 'f' + + %o - 64 bit unsigned integer printed in octal (base 8) + + FLOATING POINT SUBSTITUTIONS + + All floating point fields have a 'max decimal places / max significant digits' parameter + %.10f means a decimal floating point with 10 decimal places + %.10e means a scientific notation number with 10 significant digits + %.10g means the same behavior for decimal and Sci. Note, respectively, and provides the shorter of each's output. + + Like with GNU coreutils, the value after the decimal point in these outputs is parsed as a double first before being rendered to text. For both implementations, do not expect meaningful precision past the 18th decimal place. When using a number of decimal places that is 18 or higher, you can expect variation in output between GNU coreutils printf and this printf at the 18th decimal place of +/- 1 + + %f - floating point value presented in decimal, truncated and displayed to 6 decimal places by default. + There is not past-double behavior parity with Coreutils printf, values are not estimated or adjusted beyond input values. + + %e or %E - floating point value presented in scientific notation + 7 significant digits by default + %E means to use an uppercase E for the exponent. 
+ + %g or %G - floating point value presented in the shorter of decimal and scientific notation + behaves differently from %f and %E, please see posix printf spec for full details, + some examples of different behavior: + + Sci Note has 6 significant digits by default + Trailing zeroes are removed + Instead of being truncated, digit after last is rounded + + Like other behavior in this utility, the design choices of floating point + behavior in this utility are selected to reproduce exactly + the behavior of GNU coreutils' printf from an inputs and outputs standpoint. + + USING PARAMETERS + Most substitution fields can be parameterized using up to 2 numbers that can + be passed to the field, between the % sign and the field letter. + + The 1st parameter always indicates the minimum width of output, it is useful for creating + columnar output. Any output that would be less than this minimum width is padded with + leading spaces + The 2nd parameter is preceded by a dot. + You do not have to use parameters + + SPECIAL FORMS OF INPUT + For numeric input, the following additional forms of input are accepted besides decimal: + + Octal (only with integer): if the argument begins with a 0 the following characters + will be interpreted as octal (base 8) for integer fields + + Hexadecimal: if the argument begins with 0x the following characters will be + interpreted as hex (base 16) for any numeric fields + for float fields, hexadecimal input results in a precision + limit (in converting input past the decimal point) of 10^-15 + + Character Constant: if the argument begins with a single quote character, the first byte + of the next character will be interpreted as an 8-bit unsigned integer. If there are + additional bytes, they will throw an error (unless the environment variable POSIXLY_CORRECT is set) + +WRITTEN BY : + Nathan E. Ross, et al. 
for the uutils project + +MORE INFO : + https://github.com/uutils/coreutils + +COPYRIGHT : + Copyright 2015 uutils project. + Licensed under the MIT License, please see LICENSE file for details + +"; + +pub fn uumain(args: Vec) -> i32 { + let location = &args[0]; + if args.len() <= 1 { + println!("{0}: missing operand\nTry '{0} --help' for more information.", + location); + return 1; + } + let ref formatstr = args[1]; + + if formatstr == "--help" { + print!("{} {}", LONGHELP_LEAD, LONGHELP_BODY); + } else if formatstr == "--version" { + println!("{} {}", NAME, VERSION); + } else { + let printf_args = &args[2..]; + memo::Memo::run_all(formatstr, printf_args); + } + return 0; +} diff --git a/src/printf/tokenize/mod.rs b/src/printf/tokenize/mod.rs new file mode 100644 index 000000000..0570b7489 --- /dev/null +++ b/src/printf/tokenize/mod.rs @@ -0,0 +1,4 @@ +pub mod token; +pub mod sub; +pub mod unescaped_text; +mod num_format; diff --git a/src/printf/tokenize/num_format/format_field.rs b/src/printf/tokenize/num_format/format_field.rs new file mode 100644 index 000000000..eb59fde05 --- /dev/null +++ b/src/printf/tokenize/num_format/format_field.rs @@ -0,0 +1,41 @@ +//! Primitives used by Sub Tokenizer +//! 
and num_format modules +#[derive(Clone)] +pub enum FieldType { + Strf, + Floatf, + CninetyNineHexFloatf, + Scif, + Decf, + Intf, + Charf, +} + +// #[allow(non_camel_case_types)] +// pub enum FChar { +// d, +// e, +// E, +// i, +// f, +// F, +// g, +// G, +// u, +// x, +// X, +// o +// } +// + +// a Sub Tokens' fields are stored +// as a single object so they can be more simply +// passed by ref to num_format in a Sub method +#[derive(Clone)] +pub struct FormatField<'a> { + pub min_width: Option, + pub second_field: Option, + pub field_char: &'a char, + pub field_type: &'a FieldType, + pub orig: &'a String, +} diff --git a/src/printf/tokenize/num_format/formatter.rs b/src/printf/tokenize/num_format/formatter.rs new file mode 100644 index 000000000..8145001e7 --- /dev/null +++ b/src/printf/tokenize/num_format/formatter.rs @@ -0,0 +1,67 @@ +//! Primitives used by num_format and sub_modules. +//! never dealt with above (e.g. Sub Tokenizer never uses these) +use std::str::Chars; +use itertools::PutBackN; +use cli; +use super::format_field::FormatField; + +// contains the rough ingredients to final +// output for a number, organized together +// to allow for easy generalization of output manipulation +// (e.g. 
max number of digits after decimal) +pub struct FormatPrimitive { + pub prefix: Option, + pub pre_decimal: Option, + pub post_decimal: Option, + pub suffix: Option, +} + +impl Default for FormatPrimitive { + fn default() -> FormatPrimitive { + FormatPrimitive { + prefix: None, + pre_decimal: None, + post_decimal: None, + suffix: None, + } + } +} + +#[derive(Clone)] +#[derive(PartialEq)] +pub enum Base { + Ten = 10, + Hex = 16, + Octal = 8, +} + +// information from the beginning of a numeric argument +// the precedes the beginning of a numeric value +pub struct InPrefix { + pub radix_in: Base, + pub sign: i8, + pub offset: usize, +} + +pub trait Formatter { + // return a FormatPrimitive for + // particular field char(s), given the argument + // string and prefix information (sign, radix) + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option; + // return a string from a formatprimitive, + // given information about the field + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String; +} +pub fn get_it_at(offset: usize, str_in: &str) -> PutBackN { + PutBackN::new(str_in[offset..].chars()) +} + +// TODO: put this somewhere better +pub fn warn_incomplete_conv(pf_arg: &str) { + // important: keep println here not print + cli::err_msg(&format!("{}: value not completely converted", pf_arg)) +} diff --git a/src/printf/tokenize/num_format/formatters/base_conv/mod.rs b/src/printf/tokenize/num_format/formatters/base_conv/mod.rs new file mode 100644 index 000000000..c851ecda5 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/base_conv/mod.rs @@ -0,0 +1,331 @@ +pub fn arrnum_int_mult(arr_num: &Vec, basenum: u8, base_ten_int_fact: u8) -> Vec { + let mut carry: u16 = 0; + let mut rem: u16; + let mut new_amount: u16; + let fact: u16 = base_ten_int_fact as u16; + let base: u16 = basenum as u16; + + let mut ret_rev: Vec = Vec::new(); + let mut it = arr_num.iter().rev(); + loop { + let i = it.next(); + 
match i { + Some(u) => { + new_amount = ((u.clone() as u16) * fact) + carry; + rem = new_amount % base; + carry = (new_amount - rem) / base; + ret_rev.push(rem as u8) + } + None => { + while carry != 0 { + rem = carry % base; + carry = (carry - rem) / base; + ret_rev.push(rem as u8); + } + break; + } + } + } + let ret: Vec = ret_rev.iter().rev().map(|x| x.clone()).collect(); + ret +} + +pub struct Remainder<'a> { + pub position: usize, + pub replace: Vec, + pub arr_num: &'a Vec, +} + +pub struct DivOut<'a> { + pub quotient: u8, + pub remainder: Remainder<'a>, +} + +pub fn arrnum_int_div_step<'a>(rem_in: Remainder<'a>, + radix_in: u8, + base_ten_int_divisor: u8, + after_decimal: bool) + -> DivOut<'a> { + + let mut rem_out = Remainder { + position: rem_in.position, + replace: Vec::new(), + arr_num: rem_in.arr_num, + }; + + let mut bufferval: u16 = 0; + let base: u16 = radix_in as u16; + let divisor: u16 = base_ten_int_divisor as u16; + let mut traversed = 0; + + let mut quotient = 0; + let refd_vals = &rem_in.arr_num[rem_in.position + rem_in.replace.len()..]; + let mut it_replace = rem_in.replace.iter(); + let mut it_f = refd_vals.iter(); + loop { + let u = match it_replace.next() { + Some(u_rep) => u_rep.clone() as u16, + None => { + match it_f.next() { + Some(u_orig) => u_orig.clone() as u16, + None => { + if !after_decimal { + break; + } + 0 + } + } + } + }; + traversed += 1; + bufferval += u; + if bufferval > divisor { + while bufferval >= divisor { + quotient += 1; + bufferval -= divisor; + } + rem_out.replace = if bufferval == 0 { + Vec::new() + } else { + let remainder_as_arrnum = unsigned_to_arrnum(bufferval); + let remainder_as_base_arrnum = base_conv_vec(&remainder_as_arrnum, 10, radix_in); + remainder_as_base_arrnum + }; + rem_out.position += 1 + (traversed - rem_out.replace.len()); + break; + } else { + bufferval *= base; + } + } + DivOut { + quotient: quotient, + remainder: rem_out, + } +} +// pub struct ArrFloat { +// pub leading_zeros: u8, +// pub 
values: Vec, +// pub basenum: u8 +// } +// +// pub struct ArrFloatDivOut { +// pub quotient: u8, +// pub remainder: ArrFloat +// } +// +// pub fn arrfloat_int_div( +// arrfloat_in : &ArrFloat, +// base_ten_int_divisor : u8, +// precision : u16 +// ) -> DivOut { +// +// let mut remainder = ArrFloat { +// basenum: arrfloat_in.basenum, +// leading_zeros: arrfloat_in.leading_zeroes, +// values: Vec::new() +// } +// let mut quotient = 0; +// +// let mut bufferval : u16 = 0; +// let base : u16 = arrfloat_in.basenum as u16; +// let divisor : u16 = base_ten_int_divisor as u16; +// +// let mut it_f = arrfloat_in.values.iter(); +// let mut position = 0 + arrfloat_in.leading_zeroes as u16; +// let mut at_end = false; +// while position< precision { +// let next_digit = match it_f.next() { +// Some(c) => {} +// None => { 0 } +// } +// match u_cur { +// Some(u) => { +// bufferval += u.clone() as u16; +// if bufferval > divisor { +// while bufferval >= divisor { +// quotient+=1; +// bufferval -= divisor; +// } +// if bufferval == 0 { +// rem_out.position +=1; +// } else { +// rem_out.replace = Some(bufferval as u8); +// } +// break; +// } else { +// bufferval *= base; +// } +// }, +// None => { +// break; +// } +// } +// u_cur = it_f.next().clone(); +// rem_out.position+=1; +// } +// ArrFloatDivOut { quotient: quotient, remainder: remainder } +// } +// +pub fn arrnum_int_add(arrnum: &Vec, basenum: u8, base_ten_int_term: u8) -> Vec { + let mut carry: u16 = base_ten_int_term as u16; + let mut rem: u16; + let mut new_amount: u16; + let base: u16 = basenum as u16; + + let mut ret_rev: Vec = Vec::new(); + let mut it = arrnum.iter().rev(); + loop { + let i = it.next(); + match i { + Some(u) => { + new_amount = (u.clone() as u16) + carry; + rem = new_amount % base; + carry = (new_amount - rem) / base; + ret_rev.push(rem as u8) + } + None => { + while carry != 0 { + rem = carry % base; + carry = (carry - rem) / base; + ret_rev.push(rem as u8); + } + break; + } + } + } + let ret: Vec = 
ret_rev.iter().rev().map(|x| x.clone()).collect(); + ret +} + +pub fn base_conv_vec(src: &Vec, radix_src: u8, radix_dest: u8) -> Vec { + let mut result: Vec = Vec::new(); + result.push(0); + for i in src { + result = arrnum_int_mult(&result, radix_dest, radix_src); + result = arrnum_int_add(&result, radix_dest, i.clone()); + } + result +} + +pub fn unsigned_to_arrnum(src: u16) -> Vec { + let mut result: Vec = Vec::new(); + let mut src_tmp: u16 = src.clone(); + while src_tmp > 0 { + result.push((src_tmp % 10) as u8); + src_tmp /= 10; + } + result.reverse(); + result +} + + +// temporary needs-improvement-function +#[allow(unused_variables)] +pub fn base_conv_float(src: &Vec, radix_src: u8, radix_dest: u8) -> f64 { + // it would require a lot of addl code + // to implement this for arbitrary string input. + // until then, the below operates as an outline + // of how it would work. + let mut result: Vec = Vec::new(); + result.push(0); + let mut factor: f64 = 1.; + let radix_src_float: f64 = radix_src as f64; + let mut i = 0; + let mut r: f64 = 0 as f64; + for u in src { + if i > 15 { + break; + } + i += 1; + factor /= radix_src_float; + r += factor * (u.clone() as f64) + } + r +} + +pub fn str_to_arrnum(src: &str, radix_def_src: &RadixDef) -> Vec { + let mut intermed_in: Vec = Vec::new(); + for c in src.chars() { + match radix_def_src.from_char(c) { + Some(u) => { + intermed_in.push(u); + } + None => {} //todo err msg on incorrect + } + } + intermed_in +} + +pub fn arrnum_to_str(src: &Vec, radix_def_dest: &RadixDef) -> String { + let mut str_out = String::new(); + for u in src.iter() { + match radix_def_dest.from_u8(u.clone()) { + Some(c) => { + str_out.push(c); + } + None => {} //todo + } + } + str_out +} + +#[allow(unused_variables)] +pub fn base_conv_str(src: &str, radix_def_src: &RadixDef, radix_def_dest: &RadixDef) -> String { + let intermed_in: Vec = str_to_arrnum(src, radix_def_src); + let intermed_out = base_conv_vec(&intermed_in, + radix_def_src.get_max(), + 
radix_def_dest.get_max()); + arrnum_to_str(&intermed_out, radix_def_dest) +} + +pub trait RadixDef { + fn get_max(&self) -> u8; + fn from_char(&self, x: char) -> Option; + fn from_u8(&self, x: u8) -> Option; +} +pub struct RadixTen; + +const ZERO_ASC: u8 = '0' as u8; +const UPPER_A_ASC: u8 = 'A' as u8; +const LOWER_A_ASC: u8 = 'a' as u8; + +impl RadixDef for RadixTen { + fn get_max(&self) -> u8 { + 10 + } + fn from_char(&self, c: char) -> Option { + match c { + '0'...'9' => Some(c as u8 - ZERO_ASC), + _ => None, + } + } + fn from_u8(&self, u: u8) -> Option { + match u { + 0...9 => Some((ZERO_ASC + u) as char), + _ => None, + } + } +} +pub struct RadixHex; +impl RadixDef for RadixHex { + fn get_max(&self) -> u8 { + 16 + } + fn from_char(&self, c: char) -> Option { + match c { + '0'...'9' => Some(c as u8 - ZERO_ASC), + 'A'...'F' => Some(c as u8 + 10 - UPPER_A_ASC), + 'a'...'f' => Some(c as u8 + 10 - LOWER_A_ASC), + _ => None, + } + } + fn from_u8(&self, u: u8) -> Option { + match u { + 0...9 => Some((ZERO_ASC + u) as char), + 10...15 => Some((UPPER_A_ASC + (u - 10)) as char), + _ => None, + } + } +} + +mod tests; diff --git a/src/printf/tokenize/num_format/formatters/base_conv/tests.rs b/src/printf/tokenize/num_format/formatters/base_conv/tests.rs new file mode 100644 index 000000000..b69dcdb65 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/base_conv/tests.rs @@ -0,0 +1,55 @@ +#[cfg(test)] + +use super::*; + +#[test] +fn test_arrnum_int_mult() { + // (in base 10) 12 * 4 = 48 + let factor: Vec = vec![1, 2]; + let base_num = 10; + let base_ten_int_fact: u8 = 4; + let should_output: Vec = vec![4, 8]; + + let product = arrnum_int_mult(&factor, base_num, base_ten_int_fact); + assert!(product == should_output); +} + +#[test] +fn test_arrnum_int_non_base_10() { + // (in base 3) + // 5 * 4 = 20 + let factor: Vec = vec![1, 2]; + let base_num = 3; + let base_ten_int_fact: u8 = 4; + let should_output: Vec = vec![2, 0, 2]; + + let product = 
arrnum_int_mult(&factor, base_num, base_ten_int_fact); + assert!(product == should_output); +} + +#[test] +fn test_arrnum_int_div_shortcircuit() { + // ( + let arrnum: Vec = vec![5, 5, 5, 5, 0]; + let base_num = 10; + let base_ten_int_divisor: u8 = 41; + let remainder_passed_in = Remainder { + position: 1, + replace: vec![1, 3], + arr_num: &arrnum, + }; + + // the "replace" should mean the number being divided + // is 1350, the first time you can get 41 to go into + // 1350, its at 135, where you can get a quotient of + // 3 and a remainder of 12; + + let quotient_should_be: u8 = 3; + let remainder_position_should_be: usize = 3; + let remainder_replace_should_be = vec![1, 2]; + + let result = arrnum_int_div_step(remainder_passed_in, base_num, base_ten_int_divisor, false); + assert!(quotient_should_be == result.quotient); + assert!(remainder_position_should_be == result.remainder.position); + assert!(remainder_replace_should_be == result.remainder.replace); +} diff --git a/src/printf/tokenize/num_format/formatters/cninetyninehexfloatf.rs b/src/printf/tokenize/num_format/formatters/cninetyninehexfloatf.rs new file mode 100644 index 000000000..3c65d436e --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/cninetyninehexfloatf.rs @@ -0,0 +1,134 @@ +//! 
formatter for %a %A C99 Hex-floating-point subs +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, FormatPrimitive, Formatter}; +use super::float_common::{FloatAnalysis, primitive_to_str_common}; +use super::base_conv; +use super::base_conv::RadixDef; + + +pub struct CninetyNineHexFloatf { + as_num: f64, +} +impl CninetyNineHexFloatf { + pub fn new() -> CninetyNineHexFloatf { + CninetyNineHexFloatf { as_num: 0.0 } + } +} + +impl Formatter for CninetyNineHexFloatf { + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option { + let second_field = field.second_field.unwrap_or(6) + 1; + let analysis = FloatAnalysis::analyze(&str_in, + inprefix, + Some(second_field as usize), + None, + true); + let f = get_primitive_hex(inprefix, + &str_in[inprefix.offset..], + &analysis, + second_field as usize, + *field.field_char == 'A'); + Some(f) + } + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String { + primitive_to_str_common(prim, &field) + } +} + +// c99 hex has unique requirements of all floating point subs in pretty much every part of building a primitive, from prefix and suffix to need for base conversion (in all other cases if you don't have decimal you must have decimal, here it's the other way around) + +// on the todo list is to have a trait for get_primitive that is implemented by each float formatter and can override a default. when that happens we can take the parts of get_primitive_dec specific to dec and spin them out to their own functions that can be overridden. 
+#[allow(unused_variables)] +#[allow(unused_assignments)] +fn get_primitive_hex(inprefix: &InPrefix, + str_in: &str, + analysis: &FloatAnalysis, + last_dec_place: usize, + capitalized: bool) + -> FormatPrimitive { + + let mut f: FormatPrimitive = Default::default(); + f.prefix = Some(String::from(if inprefix.sign == -1 { + "-0x" + } else { + "0x" + })); + + // assign the digits before and after the decimal points + // to separate slices. If no digits after decimal point, + // assign 0 + let (mut first_segment_raw, second_segment_raw) = match analysis.decimal_pos { + Some(pos) => (&str_in[..pos], &str_in[pos + 1..]), + None => (&str_in[..], "0"), + }; + if first_segment_raw.len() == 0 { + first_segment_raw = "0"; + } + // convert to string, hexifying if input is in dec. + // let (first_segment, second_segment) = + // match inprefix.radix_in { + // Base::Ten => { + // (to_hex(first_segment_raw, true), + // to_hex(second_segment_raw, false)) + // } + // _ => { + // (String::from(first_segment_raw), + // String::from(second_segment_raw)) + // } + // }; + // + // + // f.pre_decimal = Some(first_segment); + // f.post_decimal = Some(second_segment); + // + + // TODO actual conversion, make sure to get back mantissa. + // for hex to hex, it's really just a matter of moving the + // decimal point and calculating the mantissa by its initial + // position and its moves, with every position counting for + // the addition or subtraction of 4 (2**4, because 4 bits in a hex digit) + // to the exponent. + // decimal's going to be a little more complicated. correct simulation + // of glibc will require after-decimal division to a specified precision. + // the difficult part of this (arrnum_int_div_step) is already implemented. + + // the hex float name may be a bit misleading in terms of how to go about the + // conversion. The best way to do it is to just convert the floatnum + // directly to base 2 and then at the end translate back to hex. 
+ let mantissa = 0; + f.suffix = Some({ + let ind = if capitalized { + "P" + } else { + "p" + }; + if mantissa >= 0 { + format!("{}+{}", ind, mantissa) + } else { + format!("{}{}", ind, mantissa) + } + }); + f +} + +fn to_hex(src: &str, before_decimal: bool) -> String { + let rten = base_conv::RadixTen; + let rhex = base_conv::RadixHex; + if before_decimal { + base_conv::base_conv_str(src, &rten, &rhex) + } else { + let as_arrnum_ten = base_conv::str_to_arrnum(src, &rten); + let s = format!("{}", + base_conv::base_conv_float(&as_arrnum_ten, rten.get_max(), rhex.get_max())); + if s.len() > 2 { + String::from(&s[2..]) + } else { + // zero + s + } + } +} diff --git a/src/printf/tokenize/num_format/formatters/decf.rs b/src/printf/tokenize/num_format/formatters/decf.rs new file mode 100644 index 000000000..5567ebf91 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/decf.rs @@ -0,0 +1,83 @@ +//! formatter for %g %G decimal subs +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, FormatPrimitive, Formatter}; +use super::float_common::{FloatAnalysis, get_primitive_dec, primitive_to_str_common}; + +fn get_len_fprim(fprim: &FormatPrimitive) -> usize { + let mut len = 0; + if let Some(ref s) = fprim.prefix { + len += s.len(); + } + if let Some(ref s) = fprim.pre_decimal { + len += s.len(); + } + if let Some(ref s) = fprim.post_decimal { + len += s.len(); + } + if let Some(ref s) = fprim.suffix { + len += s.len(); + } + len +} + +pub struct Decf { + as_num: f64, +} +impl Decf { + pub fn new() -> Decf { + Decf { as_num: 0.0 } + } +} +impl Formatter for Decf { + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option { + let second_field = field.second_field.unwrap_or(6) + 1; + // default to scif interp. 
so as to not truncate input vals + // (that would be displayed in scif) based on relation to decimal place + let analysis = FloatAnalysis::analyze(str_in, + inprefix, + Some(second_field as usize + 1), + None, + false); + let mut f_sci = get_primitive_dec(inprefix, + &str_in[inprefix.offset..], + &analysis, + second_field as usize, + Some(*field.field_char == 'G')); + // strip trailing zeroes + match f_sci.post_decimal.clone() { + Some(ref post_dec) => { + let mut i = post_dec.len(); + { + let mut it = post_dec.chars(); + while let Some(c) = it.next_back() { + if c != '0' { + break; + } + i -= 1; + } + } + if i != post_dec.len() { + f_sci.post_decimal = Some(String::from(&post_dec[0..i])); + } + } + None => {} + } + let f_fl = get_primitive_dec(inprefix, + &str_in[inprefix.offset..], + &analysis, + second_field as usize, + None); + Some(if get_len_fprim(&f_fl) >= get_len_fprim(&f_sci) { + f_sci + } else { + f_fl + }) + } + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String { + primitive_to_str_common(prim, &field) + } +} diff --git a/src/printf/tokenize/num_format/formatters/float_common.rs b/src/printf/tokenize/num_format/formatters/float_common.rs new file mode 100644 index 000000000..9da8fd907 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/float_common.rs @@ -0,0 +1,353 @@ +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, Base, FormatPrimitive, warn_incomplete_conv, get_it_at}; +use super::base_conv; +use super::base_conv::RadixDef; + +// if the memory, copy, and comparison cost of chars +// becomes an issue, we can always operate in vec here +// rather than just at de_hex + +pub struct FloatAnalysis { + pub len_important: usize, + // none means no decimal point. 
+ pub decimal_pos: Option, + pub follow: Option, +} +fn has_enough_digits(hex_input: bool, + hex_output: bool, + string_position: usize, + starting_position: usize, + limit: usize) + -> bool { + // -1s are for rounding + if hex_output { + if hex_input { + ((string_position - 1) - starting_position >= limit) + } else { + false //undecidable without converting + } + } else { + if hex_input { + ((((string_position - 1) - starting_position) * 9) / 8 >= limit) + } else { + ((string_position - 1) - starting_position >= limit) + } + } + +} + +impl FloatAnalysis { + pub fn analyze(str_in: &str, + inprefix: &InPrefix, + max_sd_opt: Option, + max_after_dec_opt: Option, + hex_output: bool) + -> FloatAnalysis { + // this fn assumes + // the input string + // has no leading spaces or 0s + let mut str_it = get_it_at(inprefix.offset, str_in); + let mut ret = FloatAnalysis { + len_important: 0, + decimal_pos: None, + follow: None, + }; + let hex_input = match inprefix.radix_in { + Base::Hex => true, + Base::Ten => false, + Base::Octal => { + panic!("this should never happen: floats should never receive octal input"); + } + }; + let mut i = 0; + let mut pos_before_first_nonzero_after_decimal: Option = None; + while let Some(c) = str_it.next() { + match c { + e @ '0'...'9' | e @ 'A'...'F' | e @ 'a'...'f' => { + if !hex_input { + match e { + '0'...'9' => {} + _ => { + warn_incomplete_conv(str_in); + break; + } + } + } + if ret.decimal_pos.is_some() && + pos_before_first_nonzero_after_decimal.is_none() && + e != '0' { + pos_before_first_nonzero_after_decimal = Some(i - 1); + } + if let Some(max_sd) = max_sd_opt { + if i == max_sd { + // follow is used in cases of %g + // where the character right after the last + // sd is considered is rounded affecting + // the previous digit in 1/2 of instances + ret.follow = Some(e); + } else if ret.decimal_pos.is_some() && i > max_sd { + break; + } + } + if let Some(max_after_dec) = max_after_dec_opt { + if let Some(p) = ret.decimal_pos { + if 
has_enough_digits(hex_input, hex_output, i, p, max_after_dec) { + break; + } + } + } else if let Some(max_sd) = max_sd_opt { + if let Some(p) = pos_before_first_nonzero_after_decimal { + if has_enough_digits(hex_input, hex_output, i, p, max_sd) { + break; + } + } + } + } + '.' => { + if ret.decimal_pos.is_none() { + ret.decimal_pos = Some(i); + } else { + warn_incomplete_conv(str_in); + break; + } + } + _ => { + warn_incomplete_conv(str_in); + break; + } + }; + i += 1; + } + ret.len_important = i; + ret + } +} + +fn de_hex(src: &str, before_decimal: bool) -> String { + let rten = base_conv::RadixTen; + let rhex = base_conv::RadixHex; + if before_decimal { + base_conv::base_conv_str(src, &rhex, &rten) + } else { + let as_arrnum_hex = base_conv::str_to_arrnum(src, &rhex); + let s = format!("{}", + base_conv::base_conv_float(&as_arrnum_hex, rhex.get_max(), rten.get_max())); + if s.len() > 2 { + String::from(&s[2..]) + } else { + // zero + s + } + } +} + +// takes a string in, +// truncates to a position, +// bumps the last digit up one, +// and if the digit was nine +// propagate to the next, etc. 
+fn _round_str_from(in_str: &str, position: usize) -> (String, bool) { + + let mut it = in_str[0..position].chars(); + let mut rev = String::new(); + let mut i = position; + let mut finished_in_dec = false; + while let Some(c) = it.next_back() { + i -= 1; + match c { + '9' => { + rev.push('0'); + } + e @ _ => { + rev.push(((e as u8) + 1) as char); + finished_in_dec = true; + break; + } + } + } + let mut fwd = String::from(&in_str[0..i]); + for ch in rev.chars().rev() { + fwd.push(ch); + } + (fwd, finished_in_dec) +} + +fn round_terminal_digit(before_dec: String, + after_dec: String, + position: usize) + -> (String, String) { + + if position < after_dec.len() { + let digit_at_pos: char; + { + digit_at_pos = (&after_dec[position..position + 1]) + .chars() + .next() + .expect(""); + } + match digit_at_pos { + '5'...'9' => { + let (new_after_dec, finished_in_dec) = _round_str_from(&after_dec, position); + if finished_in_dec { + return (before_dec, new_after_dec); + } else { + let (new_before_dec, _) = _round_str_from(&before_dec, before_dec.len()); + return (new_before_dec, new_after_dec); + } + // TODO + } + _ => {} + } + } + (before_dec, after_dec) +} + +pub fn get_primitive_dec(inprefix: &InPrefix, + str_in: &str, + analysis: &FloatAnalysis, + last_dec_place: usize, + sci_mode: Option) + -> FormatPrimitive { + let mut f: FormatPrimitive = Default::default(); + + // add negative sign section + if inprefix.sign == -1 { + f.prefix = Some(String::from("-")); + } + + // assign the digits before and after the decimal points + // to separate slices. If no digits after decimal point, + // assign 0 + let (mut first_segment_raw, second_segment_raw) = match analysis.decimal_pos { + Some(pos) => (&str_in[..pos], &str_in[pos + 1..]), + None => (&str_in[..], "0"), + }; + if first_segment_raw.len() == 0 { + first_segment_raw = "0"; + } + // convert to string, de_hexifying if input is in hex. 
+ let (first_segment, second_segment) = match inprefix.radix_in { + Base::Hex => { + (de_hex(first_segment_raw, true), + de_hex(second_segment_raw, false)) + } + _ => { + (String::from(first_segment_raw), + String::from(second_segment_raw)) + } + }; + let (pre_dec_unrounded, post_dec_unrounded, mantissa) = if sci_mode.is_some() { + if first_segment.len() > 1 { + let mut post_dec = String::from(&first_segment[1..]); + post_dec.push_str(&second_segment); + (String::from(&first_segment[0..1]), + post_dec, + first_segment.len() as isize - 1) + } else { + match first_segment.chars().next() { + Some('0') => { + let mut it = second_segment.chars().enumerate(); + let mut m: isize = 0; + let mut pre = String::from("0"); + let mut post = String::from("0"); + while let Some((i, c)) = it.next() { + match c { + '0' => {} + _ => { + m = ((i as isize) + 1) * -1; + pre = String::from(&second_segment[i..i + 1]); + post = String::from(&second_segment[i + 1..]); + break; + } + } + } + (pre, post, m) + } + Some(_) => (first_segment, second_segment, 0), + None => { + panic!("float_common: no chars in first segment."); + } + } + } + } else { + (first_segment, second_segment, 0) + }; + + let (pre_dec_draft, post_dec_draft) = round_terminal_digit(pre_dec_unrounded, + post_dec_unrounded, + last_dec_place - 1); + + f.pre_decimal = Some(pre_dec_draft); + f.post_decimal = Some(post_dec_draft); + if let Some(capitalized) = sci_mode { + let si_ind = if capitalized { + 'E' + } else { + 'e' + }; + f.suffix = Some(if mantissa >= 0 { + format!("{}+{:02}", si_ind, mantissa) + } else { + // negative sign is considered in format!s + // leading zeroes + format!("{}{:03}", si_ind, mantissa) + }); + } + + f +} + +pub fn primitive_to_str_common(prim: &FormatPrimitive, field: &FormatField) -> String { + let mut final_str = String::new(); + match prim.prefix { + Some(ref prefix) => { + final_str.push_str(&prefix); + } + None => {} + } + match prim.pre_decimal { + Some(ref pre_decimal) => { + 
final_str.push_str(&pre_decimal); + } + None => { + panic!("error, format primitives provided to int, will, incidentally under correct \ + behavior, always have a pre_dec value."); + } + } + let decimal_places = field.second_field.unwrap_or(6); + match prim.post_decimal { + Some(ref post_decimal) => { + if post_decimal.len() > 0 && decimal_places > 0 { + final_str.push('.'); + let len_avail = post_decimal.len() as u32; + + if decimal_places >= len_avail { + // println!("dec {}, len avail {}", decimal_places, len_avail); + final_str.push_str(post_decimal); + + if *field.field_char != 'g' && *field.field_char != 'G' { + let diff = decimal_places - len_avail; + for _ in 0..diff { + final_str.push('0'); + } + } + } else { + // println!("printing to only {}", decimal_places); + final_str.push_str(&post_decimal[0..decimal_places as usize]); + } + } + } + None => { + panic!("error, format primitives provided to int, will, incidentally under correct \ + behavior, always have a pre_dec value."); + } + } + match prim.suffix { + Some(ref suffix) => { + final_str.push_str(suffix); + } + None => {} + } + + final_str +} diff --git a/src/printf/tokenize/num_format/formatters/floatf.rs b/src/printf/tokenize/num_format/formatters/floatf.rs new file mode 100644 index 000000000..648eda9a4 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/floatf.rs @@ -0,0 +1,36 @@ +//! 
formatter for %f %F common-notation floating-point subs +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, FormatPrimitive, Formatter}; +use super::float_common::{FloatAnalysis, get_primitive_dec, primitive_to_str_common}; + +pub struct Floatf { + as_num: f64, +} +impl Floatf { + pub fn new() -> Floatf { + Floatf { as_num: 0.0 } + } +} +impl Formatter for Floatf { + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option { + let second_field = field.second_field.unwrap_or(6) + 1; + let analysis = FloatAnalysis::analyze(&str_in, + inprefix, + None, + Some(second_field as usize), + false); + let f = get_primitive_dec(inprefix, + &str_in[inprefix.offset..], + &analysis, + second_field as usize, + None); + Some(f) + } + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String { + primitive_to_str_common(prim, &field) + } +} diff --git a/src/printf/tokenize/num_format/formatters/intf.rs b/src/printf/tokenize/num_format/formatters/intf.rs new file mode 100644 index 000000000..8dddc29ac --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/intf.rs @@ -0,0 +1,283 @@ +//! formatter for unsigned and signed int subs +//! unsigned ints: %X %x (hex u64) %o (octal u64) %u (base ten u64) +//! 
signed ints: %i %d (both base ten i64) +use std::u64; +use std::i64; +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, FormatPrimitive, Base, Formatter, warn_incomplete_conv, + get_it_at}; + +pub struct Intf { + a: u32, +} + +// see the Intf::analyze() function below +struct IntAnalysis { + check_past_max: bool, + past_max: bool, + is_zero: bool, + len_digits: u8, +} + +impl Intf { + pub fn new() -> Intf { + Intf { a: 0 } + } + // take a ref to argument string, and basic information + // about prefix (offset, radix, sign), and analyze string + // to gain the IntAnalysis information above + // check_past_max: true if the number *may* be above max, + // but we don't know either way. One of several reasons + // we may have to parse as int. + // past_max: true if the object is past max, false if not + // in the future we should probably combine these into an + // Option + // is_zero: true if number is zero, false otherwise + // len_digits: length of digits used to create the int + // important, for example, if we run into a non-valid character + fn analyze(str_in: &str, signed_out: bool, inprefix: &InPrefix) -> IntAnalysis { + // the maximum number of digits we could conceivably + // have before the decimal point without exceeding the + // max + let mut str_it = get_it_at(inprefix.offset, str_in); + let max_sd_in = if signed_out { + match inprefix.radix_in { + Base::Ten => 19, + Base::Octal => 21, + Base::Hex => 16, + } + } else { + match inprefix.radix_in { + Base::Ten => 20, + Base::Octal => 22, + Base::Hex => 16, + } + }; + let mut ret = IntAnalysis { + check_past_max: false, + past_max: false, + is_zero: false, + len_digits: 0, + }; + + // todo turn this to a while let now that we know + // no special behavior on EOI break + loop { + let c_opt = str_it.next(); + if let Some(c) = c_opt { + match c { + '0'...'9' | 'a'...'f' | 'A'...'F' => { + if ret.len_digits == 0 && c == '0' { + ret.is_zero = true; + } else if ret.is_zero { + 
ret.is_zero = false; + } + ret.len_digits += 1; + if ret.len_digits == max_sd_in { + if let Some(next_ch) = str_it.next() { + match next_ch { + '0'...'9' => { + ret.past_max = true; + } + _ => { + // force conversion + // to check if its above max. + // todo: spin out convert + // into fn, call it here to try + // read val, on Ok() + // save val for reuse later + // that way on same-base in and out + // we don't needlessly convert int + // to str, we can just copy it over. + ret.check_past_max = true; + str_it.put_back(next_ch); + } + } + if ret.past_max { + break; + } + } else { + ret.check_past_max = true; + } + } + } + _ => { + warn_incomplete_conv(str_in); + break; + } + } + } else { + // breaks on EOL + break; + } + } + ret + } + // get a FormatPrimitive of the maximum value for the field char + // and given sign + fn get_max(fchar: char, sign: i8) -> FormatPrimitive { + let mut fmt_prim: FormatPrimitive = Default::default(); + fmt_prim.pre_decimal = Some(String::from(match fchar { + 'd' | 'i' => { + match sign { + 1 => "9223372036854775807", + _ => { + fmt_prim.prefix = Some(String::from("-")); + "9223372036854775808" + } + } + } + 'x' | 'X' => "ffffffffffffffff", + 'o' => "1777777777777777777777", + 'u' | _ => "18446744073709551615", + })); + fmt_prim + } + // conv_from_segment contract: + // 1. takes + // - a string that begins with a non-zero digit, and proceeds + // with zero or more following digits until the end of the string + // - a radix to interpret those digits as + // - a char that communicates: + // whether to interpret+output the string as an i64 or u64 + // what radix to write the parsed number as. + // 2. parses it as a rust integral type + // 3. 
outputs FormatPrimitive with: + // - if the string falls within bounds: + // number parsed and written in the correct radix + // - if the string falls outside bounds: + // for i64 output, the int minimum or int max (depending on sign) + // for u64 output, the u64 max in the output radix + fn conv_from_segment(segment: &str, radix_in: Base, fchar: char, sign: i8) -> FormatPrimitive { + match fchar { + 'i' | 'd' => { + match i64::from_str_radix(segment, radix_in as u32) { + Ok(i) => { + let mut fmt_prim: FormatPrimitive = Default::default(); + if sign == -1 { + fmt_prim.prefix = Some(String::from("-")); + } + fmt_prim.pre_decimal = Some(format!("{}", i)); + fmt_prim + } + Err(_) => Intf::get_max(fchar, sign), + } + } + _ => { + match u64::from_str_radix(segment, radix_in as u32) { + Ok(u) => { + let mut fmt_prim: FormatPrimitive = Default::default(); + let u_f = if sign == -1 { + u64::MAX - (u - 1) + } else { + u + }; + fmt_prim.pre_decimal = Some(match fchar { + 'X' => format!("{:X}", u_f), + 'x' => format!("{:x}", u_f), + 'o' => format!("{:o}", u_f), + _ => format!("{}", u_f), + }); + fmt_prim + } + Err(_) => Intf::get_max(fchar, sign), + } + } + } + } +} +impl Formatter for Intf { + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option { + + let begin = inprefix.offset; + + // get information about the string. see Intf::Analyze + // def above. 
+ let convert_hints = Intf::analyze(str_in, + *field.field_char == 'i' || *field.field_char == 'd', + inprefix); + // We always will have a formatprimitive to return + Some(if convert_hints.len_digits == 0 || convert_hints.is_zero { + // if non-digit or end is reached before a non-zero digit + let mut fmt_prim: FormatPrimitive = Default::default(); + fmt_prim.pre_decimal = Some(String::from("0")); + fmt_prim + } else if !convert_hints.past_max { + // if the number is or may be below the bounds limit + let radix_out = match *field.field_char { + 'd' | 'i' | 'u' => Base::Ten, + 'x' | 'X' => Base::Hex, + 'o' | _ => Base::Octal, + }; + let radix_mismatch = !radix_out.eq(&inprefix.radix_in); + let decr_from_max: bool = inprefix.sign == -1 && *field.field_char != 'i'; + let end = begin + convert_hints.len_digits as usize; + + // convert to int if any one of these is true: + // - number of digits in int indicates it may be past max + // - we're subtracting from the max + // - we're converting the base + if convert_hints.check_past_max || decr_from_max || radix_mismatch { + // radix of in and out is the same. + let segment = String::from(&str_in[begin..end]); + let m = Intf::conv_from_segment(&segment, + inprefix.radix_in.clone(), + *field.field_char, + inprefix.sign); + m + } else { + // otherwise just do a straight string copy. 
+ let mut fmt_prim: FormatPrimitive = Default::default(); + + // this is here and not earlier because + // zero doesn't get a sign, and conv_from_segment + // creates its format primitive separately + if inprefix.sign == -1 && *field.field_char == 'i' { + + fmt_prim.prefix = Some(String::from("-")); + } + fmt_prim.pre_decimal = Some(String::from(&str_in[begin..end])); + fmt_prim + } + } else { + Intf::get_max(*field.field_char, inprefix.sign) + }) + + } + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String { + let mut finalstr: String = String::new(); + match prim.prefix { + Some(ref prefix) => { + finalstr.push_str(&prefix); + } + None => {} + } + // integral second fields is zero-padded minimum-width + // which gets handled before general minimum-width + match prim.pre_decimal { + Some(ref pre_decimal) => { + match field.second_field { + Some(min) => { + let mut i = min; + let len = pre_decimal.len() as u32; + while i > len { + finalstr.push('0'); + i -= 1; + } + } + None => {} + } + finalstr.push_str(&pre_decimal); + } + None => { + panic!("error, format primitives provided to int, will, incidentally under \ + correct behavior, always have a pre_dec value."); + } + } + finalstr + } +} diff --git a/src/printf/tokenize/num_format/formatters/mod.rs b/src/printf/tokenize/num_format/formatters/mod.rs new file mode 100644 index 000000000..329e36d87 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/mod.rs @@ -0,0 +1,7 @@ +pub mod intf; +pub mod floatf; +pub mod cninetyninehexfloatf; +pub mod scif; +pub mod decf; +mod float_common; +mod base_conv; diff --git a/src/printf/tokenize/num_format/formatters/scif.rs b/src/printf/tokenize/num_format/formatters/scif.rs new file mode 100644 index 000000000..78d03f996 --- /dev/null +++ b/src/printf/tokenize/num_format/formatters/scif.rs @@ -0,0 +1,36 @@ +//! 
formatter for %e %E scientific notation subs +use super::super::format_field::FormatField; +use super::super::formatter::{InPrefix, FormatPrimitive, Formatter}; +use super::float_common::{FloatAnalysis, get_primitive_dec, primitive_to_str_common}; + +pub struct Scif { + as_num: f64, +} +impl Scif { + pub fn new() -> Scif { + Scif { as_num: 0.0 } + } +} +impl Formatter for Scif { + fn get_primitive(&self, + field: &FormatField, + inprefix: &InPrefix, + str_in: &str) + -> Option { + let second_field = field.second_field.unwrap_or(6) + 1; + let analysis = FloatAnalysis::analyze(str_in, + inprefix, + Some(second_field as usize + 1), + None, + false); + let f = get_primitive_dec(inprefix, + &str_in[inprefix.offset..], + &analysis, + second_field as usize, + Some(*field.field_char == 'E')); + Some(f) + } + fn primitive_to_str(&self, prim: &FormatPrimitive, field: FormatField) -> String { + primitive_to_str_common(prim, &field) + } +} diff --git a/src/printf/tokenize/num_format/mod.rs b/src/printf/tokenize/num_format/mod.rs new file mode 100644 index 000000000..d40cf92de --- /dev/null +++ b/src/printf/tokenize/num_format/mod.rs @@ -0,0 +1,4 @@ +pub mod format_field; +mod formatter; +mod formatters; +pub mod num_format; diff --git a/src/printf/tokenize/num_format/num_format.rs b/src/printf/tokenize/num_format/num_format.rs new file mode 100644 index 000000000..77a4ef62a --- /dev/null +++ b/src/printf/tokenize/num_format/num_format.rs @@ -0,0 +1,284 @@ +//! 
handles creating printed output for numeric substitutions + +use std::env; +use std::vec::Vec; +use cli; +use super::format_field::{FormatField, FieldType}; +use super::formatter::{Formatter, FormatPrimitive, InPrefix, Base}; +use super::formatters::intf::Intf; +use super::formatters::floatf::Floatf; +use super::formatters::cninetyninehexfloatf::CninetyNineHexFloatf; +use super::formatters::scif::Scif; +use super::formatters::decf::Decf; + +pub fn warn_expected_numeric(pf_arg: &String) { + // important: keep println here not print + cli::err_msg(&format!("{}: expected a numeric value", pf_arg)); +} + +// when character constant arguments have excess characters +// issue a warning when POSIXLY_CORRECT is not set +fn warn_char_constant_ign(remaining_bytes: Vec) { + match env::var("POSIXLY_CORRECT") { + Ok(_) => {} + Err(e) => { + match e { + env::VarError::NotPresent => { + cli::err_msg(&format!("warning: {:?}: character(s) following character \ + constant have been ignored", + &*remaining_bytes)); + } + _ => {} + } + } + } +} + +// this function looks at the first few +// characters of an argument and returns a value if we can learn +// a value from that (e.g. no argument? return 0, char constant? 
ret value) +fn get_provided(str_in_opt: Option<&String>) -> Option { + const C_S_QUOTE: u8 = 39; + const C_D_QUOTE: u8 = 34; + match str_in_opt { + Some(str_in) => { + let mut byte_it = str_in.bytes(); + if let Some(qchar) = byte_it.next() { + match qchar { + C_S_QUOTE | C_D_QUOTE => { + return Some(match byte_it.next() { + Some(second_byte) => { + let mut ignored: Vec = Vec::new(); + while let Some(cont) = byte_it.next() { + ignored.push(cont); + } + if ignored.len() > 0 { + warn_char_constant_ign(ignored); + } + second_byte as u8 + } + // no byte after quote + None => { + let so_far = (qchar as u8 as char).to_string(); + warn_expected_numeric(&so_far); + 0 as u8 + } + }); + } + // first byte is not quote + _ => { + return None; + } + // no first byte + } + } else { + Some(0 as u8) + } + } + None => Some(0), + } +} + +// takes a string and returns +// a sign, +// a base, +// and an offset for index after all +// initial spacing, sign, base prefix, and leading zeroes +fn get_inprefix(str_in: &String, field_type: &FieldType) -> InPrefix { + let mut str_it = str_in.chars(); + let mut ret = InPrefix { + radix_in: Base::Ten, + sign: 1, + offset: 0, + }; + let mut topchar = str_it.next().clone(); + // skip spaces and ensure topchar is the first non-space char + // (or None if none exists) + loop { + match topchar { + Some(' ') => { + ret.offset += 1; + topchar = str_it.next(); + } + _ => { + break; + } + } + } + // parse sign + match topchar { + Some('+') => { + ret.offset += 1; + topchar = str_it.next(); + } + Some('-') => { + ret.sign = -1; + ret.offset += 1; + topchar = str_it.next(); + } + _ => {} + } + // we want to exit with offset being + // the index of the first non-zero + // digit before the decimal point or + // if there is none, the zero before the + // decimal point, or, if there is none, + // the decimal point. 
+ + // while we are determining the offset + // we will ensure as a convention + // the offset is always on the first character + // that we are yet unsure if it is the + // final offset. If the zero could be before + // a decimal point we don't move past the zero. + let mut is_hex = false; + if Some('0') == topchar { + if let Some(base) = str_it.next() { + // lead zeroes can only exist in + // octal and hex base + let mut do_clean_lead_zeroes = false; + match base { + 'x' | 'X' => { + is_hex = true; + ret.offset += 2; + ret.radix_in = Base::Hex; + do_clean_lead_zeroes = true; + } + e @ '0'...'9' => { + ret.offset += 1; + match *field_type { + FieldType::Intf => { + ret.radix_in = Base::Octal; + } + _ => {} + } + if e == '0' { + do_clean_lead_zeroes = true; + } + } + _ => {} + } + if do_clean_lead_zeroes { + let mut first = true; + while let Some(ch_zero) = str_it.next() { + // see notes on offset above: + // this is why the offset for octals and decimals + // that reach this branch is 1 even though + // they have already eaten the characters '00' + // this is also why when hex encounters its + // first zero it does not move its offset + // forward because it does not know for sure + // that its current offset (of that zero) + // is not the final offset, + // whereas at that point octal knows its + // current offset is not the final offset. + match ch_zero { + '0' => { + if !(is_hex && first) { + ret.offset += 1; + } + } + // if decimal, keep last zero if one exists + // (it's possible for last zero to + // not exist at this branch if we're in hex input) + '.' => break, + // other digit, etc. 
+ _ => { + if !(is_hex && first) { + ret.offset += 1; + } + break; + } + } + if first { + first = false; + } + + } + } + } + } + ret +} + +// this is the function a Sub's print will delegate to +// if it is a numeric field, passing the field details +// and an iterator to the argument +pub fn num_format(field: &FormatField, in_str_opt: Option<&String>) -> Option { + + + let fchar = field.field_char.clone(); + + // num format mainly operates by further delegating to one of + // several Formatter structs depending on the field + // see formatter.rs for more details + + // to do switch to static dispatch + let fmtr: Box = match *field.field_type { + FieldType::Intf => Box::new(Intf::new()), + FieldType::Floatf => Box::new(Floatf::new()), + FieldType::CninetyNineHexFloatf => Box::new(CninetyNineHexFloatf::new()), + FieldType::Scif => Box::new(Scif::new()), + FieldType::Decf => Box::new(Decf::new()), + _ => { + panic!("asked to do num format with non-num fieldtype"); + } + }; + let prim_opt= + // if we can get an assumed value from looking at the first + // few characters, use that value to create the FormatPrimitive + if let Some(provided_num) = get_provided(in_str_opt) { + let mut tmp : FormatPrimitive = Default::default(); + match fchar { + 'u' | 'i' | 'd' => { + tmp.pre_decimal = Some( + format!("{}", provided_num)); + }, + 'x' | 'X' => { + tmp.pre_decimal = Some( + format!("{:x}", provided_num)); + }, + 'o' => { + tmp.pre_decimal = Some( + format!("{:o}", provided_num)); + }, + 'e' | 'E' | 'g' | 'G' => { + let as_str = format!("{}", provided_num); + let inprefix = get_inprefix( + &as_str, + &field.field_type + ); + tmp=fmtr.get_primitive(field, &inprefix, &as_str) + .expect("err during default provided num"); + }, + _ => { + tmp.pre_decimal = Some( + format!("{}", provided_num)); + tmp.post_decimal = Some(String::from("0")); + } + } + Some(tmp) + } else { + // otherwise we'll interpret the argument as a number + // using the appropriate Formatter + let in_str = 
in_str_opt.expect( + "please send the devs this message: + \n get_provided is failing to ret as Some(0) on no str "); + // first get information about the beginning of the + // numeric argument that would be useful for + // any formatter (int or float) + let inprefix = get_inprefix( + in_str, + &field.field_type + ); + // then get the FormatPrimitive from the Formatter + fmtr.get_primitive(field, &inprefix, in_str) + }; + // if we have a formatPrimitive, print its results + // according to the field-char appropriate Formatter + if let Some(prim) = prim_opt { + Some(fmtr.primitive_to_str(&prim, field.clone())) + } else { + None + } +} diff --git a/src/printf/tokenize/sub.rs b/src/printf/tokenize/sub.rs new file mode 100644 index 000000000..8c2303bb9 --- /dev/null +++ b/src/printf/tokenize/sub.rs @@ -0,0 +1,425 @@ +//! Sub is a token that represents a +//! segment of the format string that is a substitution +//! it is created by Sub's implementation of the Tokenizer trait +//! Subs which have numeric field chars make use of the num_format +//! submodule +use std::slice::Iter; +use std::iter::Peekable; +use std::str::Chars; +use std::process::exit; +use cli; +use itertools::PutBackN; +use super::token; +use super::unescaped_text::UnescapedText; +use super::num_format::format_field::{FormatField, FieldType}; +use super::num_format::num_format; +// use std::collections::HashSet; + +fn err_conv(sofar: &String) { + cli::err_msg(&format!("%{}: invalid conversion specification", sofar)); + exit(cli::EXIT_ERR); +} + +fn convert_asterisk_arg_int(asterisk_arg: &String) -> isize { + // this is a costly way to parse the + // args used for asterisk values into integers + // from various bases. 
Actually doing it correctly + // (going through the pipeline to intf, but returning + // the integer instead of writing it to string and then + // back) is on the refactoring TODO + let field_type = FieldType::Intf; + let field_char = 'i'; + let field_info = FormatField { + min_width: Some(0), + second_field: Some(0), + orig: asterisk_arg, + field_type: &field_type, + field_char: &field_char, + }; + num_format::num_format(&field_info, Some(asterisk_arg)) + .unwrap() + .parse::() + .unwrap() +} + +pub enum CanAsterisk { + Fixed(T), + Asterisk, +} + +// Sub is a tokenizer which creates tokens +// for substitution segments of a format string +pub struct Sub { + min_width: CanAsterisk>, + second_field: CanAsterisk>, + field_char: char, + field_type: FieldType, + orig: String, +} +impl Sub { + pub fn new(min_width: CanAsterisk>, + second_field: CanAsterisk>, + field_char: char, + orig: String) + -> Sub { + // for more dry printing, field characters are grouped + // in initialization of token. + let field_type = match field_char { + 's' | 'b' => FieldType::Strf, + 'd' | 'i' | 'u' | 'o' | 'x' | 'X' => FieldType::Intf, + 'f' | 'F' => FieldType::Floatf, + 'a' | 'A' => FieldType::CninetyNineHexFloatf, + 'e' | 'E' => FieldType::Scif, + 'g' | 'G' => FieldType::Decf, + 'c' => FieldType::Charf, + _ => { + // should be unreachable. 
+ println!("Invalid fieldtype"); + exit(cli::EXIT_ERR); + } + }; + Sub { + min_width: min_width, + second_field: second_field, + field_char: field_char, + field_type: field_type, + orig: orig, + } + } +} + +struct SubParser { + min_width_tmp: Option, + min_width_is_asterisk: bool, + past_decimal: bool, + second_field_tmp: Option, + second_field_is_asterisk: bool, + specifiers_found: bool, + field_char: Option, + text_so_far: String, +} + +impl SubParser { + fn new() -> SubParser { + SubParser { + min_width_tmp: None, + min_width_is_asterisk: false, + past_decimal: false, + second_field_tmp: None, + second_field_is_asterisk: false, + specifiers_found: false, + field_char: None, + text_so_far: String::new(), + } + } + fn from_it(it: &mut PutBackN, + args: &mut Peekable>) + -> Option> { + let mut parser = SubParser::new(); + if parser.sub_vals_retrieved(it) { + let t: Box = SubParser::build_token(parser); + t.print(args); + Some(t) + } else { + None + } + } + fn build_token(parser: SubParser) -> Box { + // not a self method so as to allow move of subparser vals. + // return new Sub struct as token + let t: Box = Box::new(Sub::new(if parser.min_width_is_asterisk { + CanAsterisk::Asterisk + } else { + CanAsterisk::Fixed(parser.min_width_tmp.map(|x| x.parse::().unwrap())) + }, + if parser.second_field_is_asterisk { + CanAsterisk::Asterisk + } else { + CanAsterisk::Fixed(parser.second_field_tmp.map(|x| x.parse::().unwrap())) + }, + parser.field_char.unwrap(), + parser.text_so_far)); + t + } + fn sub_vals_retrieved(&mut self, it: &mut PutBackN) -> bool { + + if !SubParser::successfully_eat_prefix(it, &mut self.text_so_far) { + return false; + } + // this fn in particular is much longer than it needs to be + // .could get a lot + // of code savings just by cleaning it up. shouldn't use a regex + // though, as we want to mimic the original behavior of printing + // the field as interpreted up until the error in the field. 
+ + let mut legal_fields = vec![// 'a', 'A', //c99 hex float implementation not yet complete + 'b', + 'c', + 'd', + 'e', + 'E', + 'f', + 'F', + 'g', + 'G', + 'i', + 'o', + 's', + 'u', + 'x', + 'X']; + let mut specifiers = vec!['h', 'j', 'l', 'L', 't', 'z']; + legal_fields.sort(); + specifiers.sort(); + + // divide substitution from %([0-9]+)?(.[0-9+])?([a-zA-Z]) + // into min_width, second_field, field_char + while let Some(ch) = it.next() { + self.text_so_far.push(ch); + match ch as char { + '-' | '*' | '0'...'9' => { + if !self.past_decimal { + if self.min_width_is_asterisk || self.specifiers_found { + err_conv(&self.text_so_far); + } + if self.min_width_tmp.is_none() { + self.min_width_tmp = Some(String::new()); + } + match self.min_width_tmp.as_mut() { + Some(x) => { + if (ch == '-' || ch == '*') && x.len() > 0 { + err_conv(&self.text_so_far); + } + if ch == '*' { + self.min_width_is_asterisk = true; + } + x.push(ch); + } + None => { + panic!("should be unreachable"); + } + } + } else { + // second field should never have a + // negative value + if self.second_field_is_asterisk || ch == '-' || self.specifiers_found { + err_conv(&self.text_so_far); + } + if self.second_field_tmp.is_none() { + self.second_field_tmp = Some(String::new()); + } + match self.second_field_tmp.as_mut() { + Some(x) => { + if ch == '*' && x.len() > 0 { + err_conv(&self.text_so_far); + } + if ch == '*' { + self.second_field_is_asterisk = true; + } + x.push(ch); + } + None => { + panic!("should be unreachable"); + } + } + } + } + '.' 
=> { + if !self.past_decimal { + self.past_decimal = true; + } else { + err_conv(&self.text_so_far); + } + } + x if legal_fields.binary_search(&x).is_ok() => { + self.field_char = Some(ch); + self.text_so_far.push(ch); + break; + } + x if specifiers.binary_search(&x).is_ok() => { + if !self.past_decimal { + self.past_decimal = true; + } + if !self.specifiers_found { + self.specifiers_found = true; + } + } + _ => { + err_conv(&self.text_so_far); + } + } + } + if !self.field_char.is_some() { + err_conv(&self.text_so_far); + } + let field_char_retrieved = self.field_char.unwrap(); + if self.past_decimal && self.second_field_tmp.is_none() { + self.second_field_tmp = Some(String::from("0")); + } + self.validate_field_params(field_char_retrieved); + // if the dot is provided without a second field + // printf interprets it as 0. + match self.second_field_tmp.as_mut() { + Some(x) => { + if x.len() == 0 { + self.min_width_tmp = Some(String::from("0")); + } + } + _ => {} + } + + true + } + fn successfully_eat_prefix(it: &mut PutBackN, text_so_far: &mut String) -> bool { + // get next two chars, + // if they're '%%' we're not tokenizing it + // else put chars back + let preface = it.next(); + let n_ch = it.next(); + if preface == Some('%') && n_ch != Some('%') { + match n_ch { + Some(x) => { + it.put_back(x); + true + } + None => { + text_so_far.push('%'); + err_conv(&text_so_far); + false + } + } + } else { + n_ch.map(|x| it.put_back(x)); + preface.map(|x| it.put_back(x)); + false + } + } + fn validate_field_params(&self, field_char: char) { + // check for illegal combinations here when possible vs + // on each application so we check less per application + // to do: move these checks to Sub::new + if (field_char == 's' && self.min_width_tmp == Some(String::from("0"))) || + (field_char == 'c' && + (self.min_width_tmp == Some(String::from("0")) || self.past_decimal)) || + (field_char == 'b' && + (self.min_width_tmp.is_some() || self.past_decimal || + 
self.second_field_tmp.is_some())) { + err_conv(&self.text_so_far); + } + } +} + + + +impl token::Tokenizer for Sub { + fn from_it(it: &mut PutBackN, + args: &mut Peekable>) + -> Option> { + SubParser::from_it(it, args) + } +} +impl token::Token for Sub { + fn print(&self, pf_args_it: &mut Peekable>) { + let field = FormatField { + min_width: match self.min_width { + CanAsterisk::Fixed(x) => x, + CanAsterisk::Asterisk => { + match pf_args_it.next() { + // temporary, use intf.rs instead + Some(x) => Some(convert_asterisk_arg_int(x)), + None => Some(0), + } + } + }, + second_field: match self.second_field { + CanAsterisk::Fixed(x) => x, + CanAsterisk::Asterisk => { + match pf_args_it.next() { + // temporary, use intf.rs instead + Some(x) => { + let result = convert_asterisk_arg_int(x); + if result < 0 { + None + } else { + Some(result as u32) + } + } + None => Some(0), + } + } + }, + field_char: &self.field_char, + field_type: &self.field_type, + orig: &self.orig, + }; + let pf_arg = pf_args_it.next(); + + // minimum width is handled independently of actual + // field char + let pre_min_width_opt: Option = match *field.field_type { + // if %s just return arg + // if %b use UnescapedText module's unescaping-fn + // if %c return first char of arg + FieldType::Strf | FieldType::Charf => { + match pf_arg { + Some(arg_string) => { + match *field.field_char { + 's' => { + Some(match field.second_field { + Some(max) => String::from(&arg_string[..max as usize]), + None => arg_string.clone(), + }) + } + 'b' => { + let mut a_it = PutBackN::new(arg_string.chars()); + UnescapedText::from_it_core(&mut a_it, true); + None + } + // for 'c': get iter of string vals, + // get opt of first val + // and map it to opt + 'c' | _ => arg_string.chars().next().map(|x| x.to_string()), + } + } + None => None, + } + } + _ => { + // non string/char fields are delegated to num_format + num_format::num_format(&field, pf_arg) + } + }; + match pre_min_width_opt { + // if have a string, print it, 
ensuring minimum width is met. + Some(pre_min_width) => { + print!("{}", + match field.min_width { + Some(min_width) => { + let diff: isize = min_width.abs() as isize - + pre_min_width.len() as isize; + if diff > 0 { + let mut final_str = String::new(); + // definitely more efficient ways + // to do this. + let pad_before = min_width > 0; + if !pad_before { + final_str.push_str(&pre_min_width); + } + for _ in 0..diff { + final_str.push(' '); + } + if pad_before { + final_str.push_str(&pre_min_width); + } + final_str + } else { + pre_min_width + } + } + None => pre_min_width, + }); + } + None => {} + } + } +} diff --git a/src/printf/tokenize/token.rs b/src/printf/tokenize/token.rs new file mode 100644 index 000000000..996151cd1 --- /dev/null +++ b/src/printf/tokenize/token.rs @@ -0,0 +1,30 @@ +//! Traits and enums dealing with Tokenization of printf Format String +#[allow(unused_must_use)] + +use std::iter::Peekable; +use std::str::Chars; +use std::slice::Iter; +use itertools::PutBackN; + +// A token object is an object that can print the expected output +// of a contiguous segment of the format string, and +// requires at most 1 argument +pub trait Token { + fn print(&self, args: &mut Peekable>); +} + +// A tokenizer object is an object that takes an iterator +// at a position in a format string, and sees whether +// it can return a token of a type it knows how to produce +// if so, return the token, move the iterator past the +// format string text the token represents, and if an +// argument is used move the argument iter forward one + +// creating token of a format string segment should also cause +// printing of that token's value. 
Essentially tokenizing +// a whole format string will print the format string and consume +// a number of arguments equal to the number of argument-using tokens + +pub trait Tokenizer { + fn from_it(it: &mut PutBackN, args: &mut Peekable>) -> Option>; +} diff --git a/src/printf/tokenize/unescaped_text.rs b/src/printf/tokenize/unescaped_text.rs new file mode 100644 index 000000000..4053d6bba --- /dev/null +++ b/src/printf/tokenize/unescaped_text.rs @@ -0,0 +1,253 @@ +//! UnescapedText is a tokenizer impl +//! for tokenizing character literals, +//! and escaped character literals (of allowed escapes), +//! into an unescaped text byte array + +use std::iter::Peekable; +use std::slice::Iter; +use std::str::Chars; +use std::char::from_u32; +use std::process::exit; +use cli; +use itertools::PutBackN; +use super::token; + +pub struct UnescapedText(Vec); +impl UnescapedText { + fn new() -> UnescapedText { + UnescapedText(Vec::new()) + } + // take an iterator to the format string + // consume between min and max chars + // and return it as a base-X number + fn base_to_u32(min_chars: u8, max_chars: u8, base: u32, it: &mut PutBackN) -> u32 { + let mut retval: u32 = 0; + let mut found = 0; + while found < max_chars { + // if end of input break + let nc = it.next(); + match nc { + Some(digit) => { + // if end of hexchars break + match digit.to_digit(base) { + Some(d) => { + found += 1; + retval *= base; + retval += d; + } + None => { + it.put_back(digit); + break; + } + } + } + None => { + break; + } + } + } + if found < min_chars { + // only ever expected for hex + println!("missing hexadecimal number in escape"); //todo stderr + exit(cli::EXIT_ERR); + } + retval + } + // validates against valid + // IEC 10646 vals - these values + // are pinned against the more popular + // printf so as to not disrupt when + // dropped-in as a replacement. 
+ fn validate_iec(val: u32, eight_word: bool) { + let mut preface = 'u'; + let mut leading_zeros = 4; + if eight_word { + preface = 'U'; + leading_zeros = 8; + } + let err_msg = format!("invalid universal character name {0}{1:02$x}", + preface, + val, + leading_zeros); + if (val < 159 && (val != 36 && val != 64 && val != 96)) || (val > 55296 && val < 57343) { + println!("{}", err_msg);//todo stderr + exit(cli::EXIT_ERR); + } + } + // pass an iterator that succeeds a '\', + // and process the remaining character + // adding the unescaped bytes + // to the passed byte_vec + // in subs_mode change octal behavior + fn handle_escaped(byte_vec: &mut Vec, it: &mut PutBackN, subs_mode: bool) { + let ch = match it.next() { + Some(x) => x, + None => '\\', + }; + match ch { + '0'...'9' | 'x' => { + let min_len = 1; + let mut max_len = 2; + let mut base = 16; + let ignore = false; + match ch { + 'x' => {} + e @ '0'...'9' => { + max_len = 3; + base = 8; + // in practice, gnu coreutils printf + // interprets octals without a + // leading zero in %b + // but it only skips leading zeros + // in %b mode. + // if we ever want to match gnu coreutil + // printf's docs instead of its behavior + // we'd set this to true. + // if subs_mode && e != '0' + // { ignore = true; } + if !subs_mode || e != '0' { + it.put_back(ch); + } + } + _ => {} + } + if !ignore { + let val = (UnescapedText::base_to_u32(min_len, max_len, base, it) % 256) as u8; + byte_vec.push(val); + let bvec = [val]; + cli::flush_bytes(&bvec); + } else { + byte_vec.push(ch as u8); + } + } + e @ _ => { + // only for hex and octal + // is byte encoding specified. + // otherwise, why not leave the door open + // for other encodings unless it turns out + // a bottleneck. 
+ let mut s = String::new(); + let ch = match e { + '\\' => '\\', + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + // bell + 'a' => '\x07', + // backspace + 'b' => '\x08', + // vertical tab + 'v' => '\x0B', + // form feed + 'f' => '\x0C', + // escape character + 'e' => '\x1B', + 'c' => exit(cli::EXIT_OK), + 'u' | 'U' => { + let len = match e { + 'u' => 4, + 'U' | _ => 8, + }; + let val = UnescapedText::base_to_u32(len, len, 16, it); + UnescapedText::validate_iec(val, false); + if let Some(c) = from_u32(val) { + c + } else { + '-' + } + } + _ => { + s.push('\\'); + ch + } + }; + s.push(ch); + cli::flush_str(&s); + byte_vec.extend(s.bytes()); + } + }; + + } + + // take an iterator to a string, + // and return a wrapper around a Vec of unescaped bytes + // break on encounter of sub symbol ('%[^%]') unless called + // through %b subst. + pub fn from_it_core(it: &mut PutBackN, subs_mode: bool) -> Option> { + let mut addchar = false; + let mut new_text = UnescapedText::new(); + let mut tmp_str = String::new(); + { + let mut new_vec: &mut Vec = &mut (new_text.0); + while let Some(ch) = it.next() { + if !addchar { + addchar = true; + } + match ch as char { + x if x != '\\' && x != '%' => { + // lazy branch eval + // remember this fn could be called + // many times in a single exec through %b + cli::flush_char(&ch); + tmp_str.push(ch); + } + '\\' => { + // the literal may be a literal bytecode + // and not valid utf-8. Str only supports + // valid utf-8. + // if we find the unnecessary drain + // on non hex or octal escapes is costly + // then we can make it faster/more complex + // with as-necessary draining. 
+ if tmp_str.len() > 0 { + new_vec.extend(tmp_str.bytes()); + tmp_str = String::new(); + } + UnescapedText::handle_escaped(new_vec, it, subs_mode) + } + x if x == '%' && !subs_mode => { + if let Some(follow) = it.next() { + if follow == '%' { + cli::flush_char(&ch); + tmp_str.push(ch); + } else { + it.put_back(follow); + it.put_back(ch); + break; + } + } else { + it.put_back(ch); + break; + } + } + _ => { + cli::flush_char(&ch); + tmp_str.push(ch); + } + } + } + if tmp_str.len() > 0 { + new_vec.extend(tmp_str.bytes()); + } + } + match addchar { + true => Some(Box::new(new_text)), + false => None, + } + } +} +#[allow(unused_variables)] +impl token::Tokenizer for UnescapedText { + fn from_it(it: &mut PutBackN, + args: &mut Peekable>) + -> Option> { + UnescapedText::from_it_core(it, false) + } +} +#[allow(unused_variables)] +impl token::Token for UnescapedText { + fn print(&self, pf_args_it: &mut Peekable>) { + cli::flush_bytes(&self.0[..]); + } +} diff --git a/tests/printf.rs b/tests/printf.rs new file mode 100644 index 000000000..52370b751 --- /dev/null +++ b/tests/printf.rs @@ -0,0 +1,280 @@ +#[macro_use] +mod common; + +use common::util::*; + +static UTIL_NAME: &'static str = "printf"; + +fn expect_stdout(input: Vec<&str>, expected: &str) { + let (_, mut ucmd) = testing(UTIL_NAME); + let results = ucmd.args(&input).run(); + // assert_empty_stderr!(result); + // assert!(result.success); + assert_eq!(expected, results.stdout); +} + +#[test] +fn basic_literal() { + expect_stdout(vec!["hello world"], "hello world"); +} + +#[test] +fn escaped_tab() { + expect_stdout(vec!["hello\\t world"], "hello\t world"); +} + +#[test] +fn escaped_newline() { + expect_stdout(vec!["hello\\n world"], "hello\n world"); +} + +#[test] +fn escaped_slash() { + expect_stdout(vec!["hello\\\\ world"], "hello\\ world"); +} + +#[test] +fn escaped_hex() { + expect_stdout(vec!["\\x41"], "A"); +} + +#[test] +fn escaped_octal() { + expect_stdout(vec!["\\101"], "A"); +} + +#[test] +fn 
escaped_unicode_fourdigit() { + expect_stdout(vec!["\\u0125"], "ĥ"); +} + +#[test] +fn escaped_unicode_eightdigit() { + expect_stdout(vec!["\\U00000125"], "ĥ"); +} + +#[test] +fn escaped_percent_sign() { + expect_stdout(vec!["hello%% world"], "hello% world"); +} + +#[test] +fn escaped_unrecognized() { + expect_stdout(vec!["c\\d"], "c\\d"); +} + +#[test] +fn sub_string() { + expect_stdout(vec!["hello %s", "world"], "hello world"); +} + +#[test] +fn sub_multifield() { + expect_stdout(vec!["%s %s", "hello", "world"], "hello world"); +} + +#[test] +fn sub_repeat_formatstr() { + expect_stdout(vec!["%s.", "hello", "world"], "hello.world."); +} + +#[test] +fn sub_string_ignore_escapes() { + expect_stdout(vec!["hello %s", "\\tworld"], "hello \\tworld"); +} + +#[test] +fn sub_bstring_handle_escapes() { + expect_stdout(vec!["hello %b", "\\tworld"], "hello \tworld"); +} + +#[test] +fn sub_bstring_ignore_subs() { + expect_stdout(vec!["hello %b", "world %% %i"], "hello world %% %i"); +} + +#[test] +fn sub_char() { + expect_stdout(vec!["the letter %c", "A"], "the letter A"); +} + +#[test] +fn sub_num_int() { + expect_stdout(vec!["twenty is %i", "20"], "twenty is 20"); +} + +#[test] +fn sub_num_int_minwidth() { + expect_stdout(vec!["twenty is %1i", "20"], "twenty is 20"); +} + +#[test] +fn sub_num_int_neg() { + expect_stdout(vec!["neg. twenty is %i", "-20"], "neg. twenty is -20"); +} + +#[test] +fn sub_num_int_oct_in() { + expect_stdout(vec!["twenty is %i", "024"], "twenty is 20"); +} + +#[test] +fn sub_num_int_oct_in_neg() { + expect_stdout(vec!["neg. twenty is %i", "-024"], "neg. twenty is -20"); +} + +#[test] +fn sub_num_int_hex_in() { + expect_stdout(vec!["twenty is %i", "0x14"], "twenty is 20"); +} + +#[test] +fn sub_num_int_hex_in_neg() { + expect_stdout(vec!["neg. twenty is %i", "-0x14"], "neg. 
twenty is -20"); +} + +#[test] +fn sub_num_int_charconst_in() { + expect_stdout(vec!["ninetyseven is %i", "'a"], "ninetyseven is 97"); +} + +#[test] +fn sub_num_uint() { + expect_stdout(vec!["twenty is %u", "20"], "twenty is 20"); +} + +#[test] +fn sub_num_octal() { + expect_stdout(vec!["twenty in octal is %o", "20"], "twenty in octal is 24"); +} + +#[test] +fn sub_num_hex_lower() { + expect_stdout(vec!["thirty in hex is %x", "30"], "thirty in hex is 1e"); +} + +#[test] +fn sub_num_hex_upper() { + expect_stdout(vec!["thirty in hex is %X", "30"], "thirty in hex is 1E"); +} + +#[test] +fn sub_num_float() { + expect_stdout(vec!["twenty is %f", "20"], "twenty is 20.000000"); +} + +#[test] +fn sub_num_float_round() { + expect_stdout(vec!["two is %f", "1.9999995"], "two is 2.000000"); +} + +#[test] +fn sub_num_sci_lower() { + expect_stdout(vec!["twenty is %e", "20"], "twenty is 2.000000e+01"); +} + +#[test] +fn sub_num_sci_upper() { + expect_stdout(vec!["twenty is %E", "20"], "twenty is 2.000000E+01"); +} + +#[test] +fn sub_num_sci_trunc() { + expect_stdout(vec!["pi is ~ %e", "3.1415926535"], "pi is ~ 3.141593e+00"); +} + +#[test] +fn sub_num_dec_trunc() { + expect_stdout(vec!["pi is ~ %g", "3.1415926535"], "pi is ~ 3.141593"); +} + +#[test] +fn sub_minwidth() { + expect_stdout(vec!["hello %7s", "world"], "hello world"); +} + +#[test] +fn sub_minwidth_negative() { + expect_stdout(vec!["hello %-7s", "world"], "hello world "); +} + +#[test] +fn sub_str_max_chars_input() { + expect_stdout(vec!["hello %7.2s", "world"], "hello wo"); +} + +#[test] +fn sub_int_decimal() { + expect_stdout(vec!["%0.i", "11"], "11"); +} + +#[test] +fn sub_int_leading_zeroes() { + expect_stdout(vec!["%.4i", "11"], "0011"); +} + +#[test] +fn sub_int_leading_zeroes_prio() { + expect_stdout(vec!["%5.4i", "11"], " 0011"); +} + +#[test] +fn sub_float_dec_places() { + expect_stdout(vec!["pi is ~ %.11f", "3.1415926535"], + "pi is ~ 3.14159265350"); +} + +#[test] +fn sub_float_hex_in() { + 
expect_stdout(vec!["%f", "0xF1.1F"], "241.121094"); +} + +#[test] +fn sub_float_no_octal_in() { + expect_stdout(vec!["%f", "077"], "77.000000"); +} + +#[test] +fn sub_any_asterisk_firstparam() { + expect_stdout(vec!["%*i", "3", "11", "4", "12"], " 11 12"); +} + +#[test] +fn sub_any_asterisk_second_param() { + expect_stdout(vec!["%.*i", "3", "11", "4", "12"], "0110012"); +} + +#[test] +fn sub_any_asterisk_both_params() { + expect_stdout(vec!["%*.*i", "4", "3", "11", "5", "4", "12"], " 011 0012"); +} + +#[test] +fn sub_any_asterisk_octal_arg() { + expect_stdout(vec!["%.*i", "011", "12345678"], "012345678"); +} + +#[test] +fn sub_any_asterisk_hex_arg() { + expect_stdout(vec!["%.*i", "0xA", "123456789"], "0123456789"); +} + +#[test] +fn sub_any_specifiers_no_params() { + expect_stdout(vec!["%ztlhLji", "3"], "3"); +} + +#[test] +fn sub_any_specifiers_after_first_param() { + expect_stdout(vec!["%0ztlhLji", "3"], "3"); +} + +#[test] +fn sub_any_specifiers_after_period() { + expect_stdout(vec!["%0.ztlhLji", "3"], "3"); +} + +#[test] +fn sub_any_specifiers_after_second_param() { + expect_stdout(vec!["%0.0ztlhLji", "3"], "3"); +}