diff --git a/Userland/Libraries/LibSoftGPU/CMakeLists.txt b/Userland/Libraries/LibSoftGPU/CMakeLists.txt index 11e603e1b8..96c04d4d42 100644 --- a/Userland/Libraries/LibSoftGPU/CMakeLists.txt +++ b/Userland/Libraries/LibSoftGPU/CMakeLists.txt @@ -6,5 +6,6 @@ set(SOURCES Sampler.cpp ) +add_compile_options(-Wno-psabi) serenity_lib(LibSoftGPU softgpu) target_link_libraries(LibSoftGPU LibM LibCore LibGfx) diff --git a/Userland/Libraries/LibSoftGPU/Device.cpp b/Userland/Libraries/LibSoftGPU/Device.cpp index d01e0d9d7d..0ada46423e 100644 --- a/Userland/Libraries/LibSoftGPU/Device.cpp +++ b/Userland/Libraries/LibSoftGPU/Device.cpp @@ -6,12 +6,16 @@ */ #include +#include +#include #include #include #include #include #include #include +#include +#include namespace SoftGPU { @@ -24,13 +28,17 @@ static long long g_num_sampler_calls; using IntVector2 = Gfx::Vector2; using IntVector3 = Gfx::Vector3; +using AK::SIMD::exp; +using AK::SIMD::expand4; +using AK::SIMD::f32x4; + constexpr static int edge_function(const IntVector2& a, const IntVector2& b, const IntVector2& c) { return ((c.x() - a.x()) * (b.y() - a.y()) - (c.y() - a.y()) * (b.x() - a.x())); } -template -constexpr static T interpolate(const T& v0, const T& v1, const T& v2, const FloatVector3& barycentric_coords) +template +constexpr static auto interpolate(const T& v0, const T& v1, const T& v2, const Vector3& barycentric_coords) { return v0 * barycentric_coords.x() + v1 * barycentric_coords.y() + v2 * barycentric_coords.z(); } @@ -369,47 +377,56 @@ static void rasterize_triangle(const RasterizerOptions& options, Gfx::Bitmap& re // Draw the pixels according to the previously generated mask auto coords = b0; - for (int y = 0; y < RASTERIZER_BLOCK_SIZE; y++, coords += step_y) { - if (pixel_mask[y] == 0) { - coords += dbdx * RASTERIZER_BLOCK_SIZE; - continue; - } + for (int y = 0; y < RASTERIZER_BLOCK_SIZE; y += 2, coords += step_y + dbdy) { + for (int x = 0; x < RASTERIZER_BLOCK_SIZE; x += 2, coords += dbdx + dbdx) { - 
auto* pixel = pixel_staging[y]; - for (int x = 0; x < RASTERIZER_BLOCK_SIZE; x++, coords += dbdx, pixel++) { - if (~pixel_mask[y] & (1 << x)) - continue; + PixelQuad quad; + + auto a = coords; + auto b = coords + dbdx; + auto c = coords + dbdy; + auto d = coords + dbdx + dbdy; // Perspective correct barycentric coordinates - auto barycentric = FloatVector3(coords.x(), coords.y(), coords.z()) * one_over_area; - auto const w_coordinates = FloatVector3 { - vertex0.window_coordinates.w(), - vertex1.window_coordinates.w(), - vertex2.window_coordinates.w(), + auto barycentric = Vector3 { + f32x4 { float(a.x()), float(b.x()), float(c.x()), float(d.x()) }, + f32x4 { float(a.y()), float(b.y()), float(c.y()), float(d.y()) }, + f32x4 { float(a.z()), float(b.z()), float(c.z()), float(d.z()) }, + } * one_over_area; + + auto const w_coordinates = Vector3 { + expand4(vertex0.window_coordinates.w()), + expand4(vertex1.window_coordinates.w()), + expand4(vertex2.window_coordinates.w()), }; - float const interpolated_reciprocal_w = interpolate(w_coordinates.x(), w_coordinates.y(), w_coordinates.z(), barycentric); - float const interpolated_w = 1 / interpolated_reciprocal_w; + + auto const interpolated_reciprocal_w = interpolate(w_coordinates.x(), w_coordinates.y(), w_coordinates.z(), barycentric); + auto const interpolated_w = 1.0f / interpolated_reciprocal_w; barycentric = barycentric * w_coordinates * interpolated_w; // FIXME: make this more generic. 
We want to interpolate more than just color and uv - FloatVector4 vertex_color; if (options.shade_smooth) { - vertex_color = interpolate(vertex0.color, vertex1.color, vertex2.color, barycentric); + quad.vertex_color = interpolate(expand4(vertex0.color), expand4(vertex1.color), expand4(vertex2.color), barycentric); } else { - vertex_color = vertex0.color; + quad.vertex_color = expand4(vertex0.color); } - auto uv = interpolate(vertex0.tex_coord, vertex1.tex_coord, vertex2.tex_coord, barycentric); + quad.uv = interpolate(expand4(vertex0.tex_coord), expand4(vertex1.tex_coord), expand4(vertex2.tex_coord), barycentric); // Calculate depth of fragment for fog // // OpenGL 1.5 spec chapter 3.10: "An implementation may choose to approximate the // eye-coordinate distance from the eye to each fragment center by |Ze|." - float fog_fragment_depth = interpolate(vertex0_eye_absz, vertex1_eye_absz, vertex2_eye_absz, barycentric); + quad.fog_depth = interpolate(expand4(vertex0_eye_absz), expand4(vertex1_eye_absz), expand4(vertex2_eye_absz), barycentric); + + pixel_shader(quad); - *pixel = pixel_shader(uv, vertex_color, fog_fragment_depth); INCREASE_STATISTICS_COUNTER(g_num_pixels_shaded, 1); + pixel_staging[y][x] = { quad.out_color.x()[0], quad.out_color.y()[0], quad.out_color.z()[0], quad.out_color.w()[0] }; + pixel_staging[y][x + 1] = { quad.out_color.x()[1], quad.out_color.y()[1], quad.out_color.z()[1], quad.out_color.w()[1] }; + pixel_staging[y + 1][x] = { quad.out_color.x()[2], quad.out_color.y()[2], quad.out_color.z()[2], quad.out_color.w()[2] }; + pixel_staging[y + 1][x + 1] = { quad.out_color.x()[3], quad.out_color.y()[3], quad.out_color.z()[3], quad.out_color.w()[3] }; } } @@ -797,29 +814,29 @@ void Device::draw_primitives(PrimitiveType primitive_type, FloatMatrix4x4 const& void Device::submit_triangle(const Triangle& triangle, Vector const& enabled_texture_units) { - rasterize_triangle(m_options, *m_render_target, *m_depth_buffer, triangle, [this, 
&enabled_texture_units](FloatVector4 const& uv, FloatVector4 const& color, float fog_depth) -> FloatVector4 { - FloatVector4 fragment = color; + rasterize_triangle(m_options, *m_render_target, *m_depth_buffer, triangle, [this, &enabled_texture_units](PixelQuad& quad) { + quad.out_color = quad.vertex_color; for (size_t i : enabled_texture_units) { // FIXME: implement GL_TEXTURE_1D, GL_TEXTURE_3D and GL_TEXTURE_CUBE_MAP auto const& sampler = m_samplers[i]; - FloatVector4 texel = sampler.sample_2d({ uv.x(), uv.y() }); + auto texel = sampler.sample_2d({ quad.uv.x(), quad.uv.y() }); INCREASE_STATISTICS_COUNTER(g_num_sampler_calls, 1); // FIXME: Implement more blend modes switch (sampler.config().fixed_function_texture_env_mode) { case TextureEnvMode::Modulate: - fragment = fragment * texel; + quad.out_color = quad.out_color * texel; break; case TextureEnvMode::Replace: - fragment = texel; + quad.out_color = texel; break; case TextureEnvMode::Decal: { - float src_alpha = fragment.w(); - fragment.set_x(mix(fragment.x(), texel.x(), src_alpha)); - fragment.set_y(mix(fragment.y(), texel.y(), src_alpha)); - fragment.set_z(mix(fragment.z(), texel.z(), src_alpha)); + auto src_alpha = quad.out_color.w(); + quad.out_color.set_x(mix(quad.out_color.x(), texel.x(), src_alpha)); + quad.out_color.set_y(mix(quad.out_color.y(), texel.y(), src_alpha)); + quad.out_color.set_z(mix(quad.out_color.z(), texel.z(), src_alpha)); break; } default: @@ -829,29 +846,33 @@ void Device::submit_triangle(const Triangle& triangle, Vector const& ena // Calculate fog // Math from here: https://opengl-notes.readthedocs.io/en/latest/topics/texturing/aliasing.html + + // FIXME: exponential fog is not vectorized, we should add a SIMD exp function that calculates an approximation. 
if (m_options.fog_enabled) { - float factor = 0.0f; + auto factor = expand4(0.0f); switch (m_options.fog_mode) { case FogMode::Linear: - factor = (m_options.fog_end - fog_depth) / (m_options.fog_end - m_options.fog_start); - break; - case FogMode::Exp: - factor = expf(-m_options.fog_density * fog_depth); - break; - case FogMode::Exp2: - factor = expf(-((m_options.fog_density * fog_depth) * (m_options.fog_density * fog_depth))); + factor = (m_options.fog_end - quad.fog_depth) / (m_options.fog_end - m_options.fog_start); break; + case FogMode::Exp: { + auto argument = -m_options.fog_density * quad.fog_depth; + factor = exp(argument); + } break; + case FogMode::Exp2: { + auto argument = m_options.fog_density * quad.fog_depth; + argument *= -argument; + factor = exp(argument); + } break; default: VERIFY_NOT_REACHED(); } // Mix texel's RGB with fog's RBG - leave alpha alone - fragment.set_x(mix(m_options.fog_color.x(), fragment.x(), factor)); - fragment.set_y(mix(m_options.fog_color.y(), fragment.y(), factor)); - fragment.set_z(mix(m_options.fog_color.z(), fragment.z(), factor)); + auto fog_color = expand4(m_options.fog_color); + quad.out_color.set_x(mix(fog_color.x(), quad.out_color.x(), factor)); + quad.out_color.set_y(mix(fog_color.y(), quad.out_color.y(), factor)); + quad.out_color.set_z(mix(fog_color.z(), quad.out_color.z(), factor)); } - - return fragment; }); } diff --git a/Userland/Libraries/LibSoftGPU/Sampler.cpp b/Userland/Libraries/LibSoftGPU/Sampler.cpp index 296ebc6ba7..973a7e9f0a 100644 --- a/Userland/Libraries/LibSoftGPU/Sampler.cpp +++ b/Userland/Libraries/LibSoftGPU/Sampler.cpp @@ -4,43 +4,55 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include +#include #include #include +#include #include #include namespace SoftGPU { -static constexpr float fracf(float value) +using AK::SIMD::f32x4; +using AK::SIMD::i32x4; +using AK::SIMD::u32x4; + +using AK::SIMD::clamp; +using AK::SIMD::expand4; +using AK::SIMD::floor_int_range; +using AK::SIMD::frac_int_range; 
+using AK::SIMD::maskbits; +using AK::SIMD::to_f32x4; +using AK::SIMD::to_i32x4; +using AK::SIMD::to_u32x4; +using AK::SIMD::truncate_int_range; + +static f32x4 wrap_repeat(f32x4 value) { - return value - floorf(value); + return frac_int_range(value); } -static constexpr float wrap_repeat(float value) +[[maybe_unused]] static f32x4 wrap_clamp(f32x4 value) { - return fracf(value); + return clamp(value, expand4(0.0f), expand4(1.0f)); } -[[maybe_unused]] static constexpr float wrap_clamp(float value) +static f32x4 wrap_clamp_to_edge(f32x4 value, u32x4 num_texels) { - return clamp(value, 0.0f, 1.0f); -} - -static constexpr float wrap_clamp_to_edge(float value, unsigned num_texels) -{ - float const clamp_limit = 1.f / (2 * num_texels); + f32x4 const clamp_limit = 1.f / to_f32x4(2 * num_texels); return clamp(value, clamp_limit, 1.0f - clamp_limit); } -static constexpr float wrap_mirrored_repeat(float value, unsigned num_texels) +static f32x4 wrap_mirrored_repeat(f32x4 value, u32x4 num_texels) { - float integer = floorf(value); - float frac = value - integer; - bool iseven = fmodf(integer, 2.0f) == 0.0f; - return wrap_clamp_to_edge(iseven ? frac : 1 - frac, num_texels); + f32x4 integer = floor_int_range(value); + f32x4 frac = value - integer; + auto is_odd = to_i32x4(integer) & 1; + return wrap_clamp_to_edge(is_odd ? 
1 - frac : frac, num_texels); } -static constexpr float wrap(float value, TextureWrapMode mode, unsigned num_texels) +static f32x4 wrap(f32x4 value, TextureWrapMode mode, u32x4 num_texels) { switch (mode) { case TextureWrapMode::Repeat: @@ -60,59 +72,103 @@ static constexpr float wrap(float value, TextureWrapMode mode, unsigned num_texe } } -FloatVector4 Sampler::sample_2d(FloatVector2 const& uv) const +ALWAYS_INLINE static Vector4 texel4(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z) +{ + auto t0 = image.texel(layer[0], level[0], x[0], y[0], z[0]); + auto t1 = image.texel(layer[1], level[1], x[1], y[1], z[1]); + auto t2 = image.texel(layer[2], level[2], x[2], y[2], z[2]); + auto t3 = image.texel(layer[3], level[3], x[3], y[3], z[3]); + + return Vector4 { + f32x4 { t0.x(), t1.x(), t2.x(), t3.x() }, + f32x4 { t0.y(), t1.y(), t2.y(), t3.y() }, + f32x4 { t0.z(), t1.z(), t2.z(), t3.z() }, + f32x4 { t0.w(), t1.w(), t2.w(), t3.w() }, + }; +} + +ALWAYS_INLINE static Vector4 texel4border(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z, FloatVector4 const& border, u32x4 w, u32x4 h) +{ + auto border_mask = maskbits(x < 0 || x >= w || y < 0 || y >= h); + + auto t0 = border_mask & 1 ? border : image.texel(layer[0], level[0], x[0], y[0], z[0]); + auto t1 = border_mask & 2 ? border : image.texel(layer[1], level[1], x[1], y[1], z[1]); + auto t2 = border_mask & 4 ? border : image.texel(layer[2], level[2], x[2], y[2], z[2]); + auto t3 = border_mask & 8 ? 
border : image.texel(layer[3], level[3], x[3], y[3], z[3]); + + return Vector4 { + f32x4 { t0.x(), t1.x(), t2.x(), t3.x() }, + f32x4 { t0.y(), t1.y(), t2.y(), t3.y() }, + f32x4 { t0.z(), t1.z(), t2.z(), t3.z() }, + f32x4 { t0.w(), t1.w(), t2.w(), t3.w() }, + }; +} + +Vector4 Sampler::sample_2d(Vector2 const& uv) const { if (m_config.bound_image.is_null()) - return { 0, 0, 0, 1 }; + return expand4(FloatVector4 { 0, 0, 0, 1 }); /* No image bound: yield opaque black in every lane rather than a debug colour. */ auto const& image = *m_config.bound_image; - unsigned const layer = 0; + u32x4 const layer = expand4(0u); // FIXME: calculate actual mipmap level to use - unsigned const level = 0; + u32x4 const level = expand4(0u); - unsigned width = image.level_width(level); - unsigned height = image.level_height(level); + u32x4 const width = { + image.level_width(level[0]), + image.level_width(level[1]), + image.level_width(level[2]), + image.level_width(level[3]), + }; + u32x4 const height = { + image.level_height(level[0]), + image.level_height(level[1]), + image.level_height(level[2]), + image.level_height(level[3]), + }; - float s = wrap(uv.x(), m_config.texture_wrap_u, width); - float t = wrap(uv.y(), m_config.texture_wrap_v, height); + f32x4 s = wrap(uv.x(), m_config.texture_wrap_u, width); + f32x4 t = wrap(uv.y(), m_config.texture_wrap_v, height); - float u = s * width; - float v = t * height; + f32x4 u = s * to_f32x4(width); + f32x4 v = t * to_f32x4(height); if (m_config.texture_mag_filter == TextureFilter::Nearest) { - unsigned i = min(static_cast(u), width - 1); - unsigned j = min(static_cast(v), height - 1); - return image.texel(layer, level, i, j, 0); + u32x4 i = to_i32x4(u) % width; + u32x4 j = to_i32x4(v) % height; + u32x4 k = expand4(0u); + + return texel4(image, layer, level, i, j, k); } u -= 0.5f; v -= 0.5f; - int i0 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? static_cast(floorf(u)) % width : floorf(u); - int j0 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? 
static_cast(floorf(v)) % height : floorf(v); + i32x4 i0 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? to_i32x4(to_u32x4(floor_int_range(u)) % width) : to_i32x4(floor_int_range(u)); + i32x4 j0 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? to_i32x4(to_u32x4(floor_int_range(v)) % height) : to_i32x4(floor_int_range(v)); - int i1 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? (i0 + 1) % width : i0 + 1; - int j1 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? (j0 + 1) % height : j0 + 1; + i32x4 i1 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? to_i32x4((i0 + 1) % width) : i0 + 1; + i32x4 j1 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? to_i32x4((j0 + 1) % height) : j0 + 1; - FloatVector4 t0, t1, t2, t3; + u32x4 k = expand4(0u); + + Vector4 t0, t1, t2, t3; if (m_config.texture_wrap_u == TextureWrapMode::Repeat && m_config.texture_wrap_v == TextureWrapMode::Repeat) { - t0 = image.texel(layer, level, i0, j0, 0); - t1 = image.texel(layer, level, i1, j0, 0); - t2 = image.texel(layer, level, i0, j1, 0); - t3 = image.texel(layer, level, i1, j1, 0); + t0 = texel4(image, layer, level, to_u32x4(i0), to_u32x4(j0), k); + t1 = texel4(image, layer, level, to_u32x4(i1), to_u32x4(j0), k); + t2 = texel4(image, layer, level, to_u32x4(i0), to_u32x4(j1), k); + t3 = texel4(image, layer, level, to_u32x4(i1), to_u32x4(j1), k); } else { - int w = static_cast(width); - int h = static_cast(height); - t0 = (i0 < 0 || i0 >= w || j0 < 0 || j0 >= h) ? m_config.border_color : image.texel(layer, level, i0, j0, 0); - t1 = (i1 < 0 || i1 >= w || j0 < 0 || j0 >= h) ? m_config.border_color : image.texel(layer, level, i1, j0, 0); - t2 = (i0 < 0 || i0 >= w || j1 < 0 || j1 >= h) ? m_config.border_color : image.texel(layer, level, i0, j1, 0); - t3 = (i1 < 0 || i1 >= w || j1 < 0 || j1 >= h) ? 
m_config.border_color : image.texel(layer, level, i1, j1, 0); + t0 = texel4border(image, layer, level, to_u32x4(i0), to_u32x4(j0), k, m_config.border_color, width, height); + t1 = texel4border(image, layer, level, to_u32x4(i1), to_u32x4(j0), k, m_config.border_color, width, height); + t2 = texel4border(image, layer, level, to_u32x4(i0), to_u32x4(j1), k, m_config.border_color, width, height); + t3 = texel4border(image, layer, level, to_u32x4(i1), to_u32x4(j1), k, m_config.border_color, width, height); } - float const alpha = fracf(u); - float const beta = fracf(v); + f32x4 const alpha = frac_int_range(u); + f32x4 const beta = frac_int_range(v); auto const lerp_0 = mix(t0, t1, alpha); auto const lerp_1 = mix(t2, t3, alpha); diff --git a/Userland/Libraries/LibSoftGPU/Sampler.h b/Userland/Libraries/LibSoftGPU/Sampler.h index d83a16c5bd..a235cac4ba 100644 --- a/Userland/Libraries/LibSoftGPU/Sampler.h +++ b/Userland/Libraries/LibSoftGPU/Sampler.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -52,7 +53,7 @@ struct SamplerConfig final { class Sampler final { public: - FloatVector4 sample_2d(FloatVector2 const& uv) const; + Vector4 sample_2d(Vector2 const& uv) const; void set_config(SamplerConfig const& config) { m_config = config; } SamplerConfig const& config() const { return m_config; }