1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 10:28:10 +00:00

AK: Add an exact and fast hex float parsing algorithm

Similar to decimal floating point parsing the current strtod hex float
parsing gives a lot of incorrect results. We can use a similar technique
as with decimal parsing however hex floats are much simpler as we don't
need to scale with a power of 5.

For hex floats we just provide the parse_first_hexfloat API as there is
currently no need for a parse_hexfloat_completely API.

Again the accepted input for parse_first_hexfloat is very lenient and
any validation should be done before calling this method.
This commit is contained in:
davidot 2022-10-13 02:23:07 +02:00 committed by Linus Groh
parent 53b7f5e6a1
commit 2334cd85a2
3 changed files with 419 additions and 0 deletions

View file

@ -2015,4 +2015,239 @@ template Optional<double> parse_floating_point_completely(char const* start, cha
template Optional<float> parse_floating_point_completely(char const* start, char const* end);
struct HexFloatParseResult {
bool is_negative = false;
bool valid = false;
char const* last_parsed = nullptr;
u64 mantissa = 0;
i64 exponent = 0;
};
static HexFloatParseResult parse_hexfloat(char const* start)
{
HexFloatParseResult result {};
if (start == nullptr || *start == '\0')
return result;
char const* parse_head = start;
bool any_digits = false;
bool truncated_non_zero = false;
if (*parse_head == '-') {
result.is_negative = true;
++parse_head;
if (*parse_head == '\0' || (!is_ascii_hex_digit(*parse_head) && *parse_head != floating_point_decimal_separator))
return result;
} else if (*parse_head == '+') {
++parse_head;
if (*parse_head == '\0' || (!is_ascii_hex_digit(*parse_head) && *parse_head != floating_point_decimal_separator))
return result;
}
if (*parse_head == '0' && (*(parse_head + 1) != '\0') && (*(parse_head + 1) == 'x' || *(parse_head + 1) == 'X')) {
// Skip potential 0[xX], we have to do this here since the sign comes at the front
parse_head += 2;
}
auto add_mantissa_digit = [&] {
any_digits = true;
// We assume you already checked this is actually a digit
auto digit = parse_ascii_hex_digit(*parse_head);
// Because the power of sixteen is just scaling of power of two we don't
// need to keep all the remaining digits beyond the first 52 bits, just because
// it's easy we store the first 16 digits. However for rounding we do need to parse
// all the digits and keep track if we see any non zero one.
if (result.mantissa < (1ull << 60)) {
result.mantissa = (result.mantissa * 16) + digit;
return true;
}
if (digit != 0)
truncated_non_zero = true;
return false;
};
while (*parse_head != '\0' && is_ascii_hex_digit(*parse_head)) {
add_mantissa_digit();
++parse_head;
}
if (*parse_head != '\0' && *parse_head == floating_point_decimal_separator) {
++parse_head;
i64 digits_after_separator = 0;
while (*parse_head != '\0' && is_ascii_hex_digit(*parse_head)) {
// Track how many characters we actually read into the mantissa
digits_after_separator += add_mantissa_digit() ? 1 : 0;
++parse_head;
}
// We parsed x digits after the dot so need to multiply with 2^(-x * 4)
// Since every digit is 4 bits
result.exponent = -digits_after_separator * 4;
}
if (!any_digits)
return result;
if (*parse_head != '\0' && (*parse_head == 'p' || *parse_head == 'P')) {
[&] {
auto const* head_before_p = parse_head;
ArmedScopeGuard reset_ptr { [&] { parse_head = head_before_p; } };
++parse_head;
if (*parse_head == '\0')
return;
bool exponent_is_negative = false;
i64 explicit_exponent = 0;
if (*parse_head == '-' || *parse_head == '+') {
exponent_is_negative = *parse_head == '-';
++parse_head;
if (*parse_head == '\0')
return;
}
if (!is_ascii_digit(*parse_head))
return;
// We have at least one digit (with optional preceding sign) so we will not reset
reset_ptr.disarm();
while (*parse_head != '\0' && is_ascii_digit(*parse_head)) {
// If we hit exponent overflow the number is so huge we are in trouble anyway, see
// a comment in parse_numbers.
if (explicit_exponent < 0x10000000)
explicit_exponent = 10 * explicit_exponent + (*parse_head - '0');
++parse_head;
}
if (exponent_is_negative)
explicit_exponent = -explicit_exponent;
result.exponent += explicit_exponent;
}();
}
result.valid = true;
// Round up exactly halfway with truncated non zeros, but don't if it would cascade up
if (truncated_non_zero && (result.mantissa & 0xF) != 0xF) {
VERIFY(result.mantissa >= 0x1000'0000'0000'0000);
result.mantissa |= 1;
}
result.last_parsed = parse_head;
return result;
}
template<FloatingPoint T>
static FloatingPointBuilder build_hex_float(HexFloatParseResult& parse_result)
{
using FloatingPointRepr = FloatingPointInfo<T>;
VERIFY(parse_result.mantissa != 0);
if (parse_result.exponent >= FloatingPointRepr::infinity_exponent())
return FloatingPointBuilder::infinity<T>();
auto leading_zeros = count_leading_zeroes(parse_result.mantissa);
u64 normalized_mantissa = parse_result.mantissa << leading_zeros;
// No need to multiply with some power of 5 here the exponent is already a power of 2.
u8 upperbit = normalized_mantissa >> 63;
FloatingPointBuilder parts;
parts.mantissa = normalized_mantissa >> (upperbit + 64 - FloatingPointRepr::mantissa_bits() - 3);
parts.exponent = parse_result.exponent + upperbit - leading_zeros + FloatingPointRepr::exponent_bias() + 62;
if (parts.exponent <= 0) {
// subnormal
if (-parts.exponent + 1 >= 64) {
parts.mantissa = 0;
parts.exponent = 0;
return parts;
}
parts.mantissa >>= -parts.exponent + 1;
parts.mantissa += parts.mantissa & 1;
parts.mantissa >>= 1;
if (parts.mantissa < (1ull << FloatingPointRepr::mantissa_bits())) {
parts.exponent = 0;
} else {
parts.exponent = 1;
}
return parts;
}
// Here we don't have to only do this halfway check for some exponents
if ((parts.mantissa & 0b11) == 0b01) {
// effectively all discard bits from z.high are 0
if (normalized_mantissa == (parts.mantissa << (upperbit + 64 - FloatingPointRepr::mantissa_bits() - 3)))
parts.mantissa &= ~u64(1);
}
parts.mantissa += parts.mantissa & 1;
parts.mantissa >>= 1;
if (parts.mantissa >= (2ull << FloatingPointRepr::mantissa_bits())) {
parts.mantissa = 1ull << FloatingPointRepr::mantissa_bits();
++parts.exponent;
}
parts.mantissa &= ~(1ull << FloatingPointRepr::mantissa_bits());
if (parts.exponent >= FloatingPointRepr::infinity_exponent()) {
parts.mantissa = 0;
parts.exponent = FloatingPointRepr::infinity_exponent();
}
return parts;
}
template<FloatingPoint T>
FloatingPointParseResults<T> parse_first_hexfloat_until_zero_character(char const* start)
{
using FloatingPointRepr = FloatingPointInfo<T>;
auto parse_result = parse_hexfloat(start);
if (!parse_result.valid)
return { nullptr, FloatingPointError::NoOrInvalidInput, __builtin_nan("") };
FloatingPointParseResults<T> full_result {};
full_result.end_ptr = parse_result.last_parsed;
// We special case this to be able to differentiate between 0 and values rounded down to 0
if (parse_result.mantissa == 0) {
full_result.value = 0.;
return full_result;
}
auto result = build_hex_float<T>(parse_result);
full_result.value = result.template to_value<T>(parse_result.is_negative);
if (result.exponent == FloatingPointRepr::infinity_exponent()) {
VERIFY(result.mantissa == 0);
full_result.error = FloatingPointError::OutOfRange;
} else if (result.mantissa == 0 && result.exponent == 0) {
full_result.error = FloatingPointError::RoundedDownToZero;
}
return full_result;
}
template FloatingPointParseResults<double> parse_first_hexfloat_until_zero_character(char const* start);
template FloatingPointParseResults<float> parse_first_hexfloat_until_zero_character(char const* start);
}