Mirror of https://github.com/RGBCube/serenity (synced 2025-10-31 11:52:44 +00:00)

Commit 11c807ebd1
This implements an 8-bit front stencil buffer. Stencil operations are SIMD-optimized.

LibGL changes include:

* New `glStencilMask` and `glStencilMaskSeparate` functions
* New context parameter `GL_STENCIL_CLEAR_VALUE`
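For orientation, a minimal usage sketch of these entry points, assuming standard OpenGL stencil semantics (the snippet is illustrative, not taken from LibGL):

```cpp
#include <GL/gl.h>

// Illustrative only; follows standard OpenGL stencil semantics.
void clear_and_configure_stencil()
{
    // Enable writes to all 8 stencil bits, then clear the stencil buffer.
    glStencilMask(0xFF);
    glClearStencil(0);
    glClear(GL_STENCIL_BUFFER_BIT);

    // Face-separate write masks: only front-facing primitives may write.
    // (This commit implements the front stencil buffer.)
    glStencilMaskSeparate(GL_FRONT, 0xFF);
    glStencilMaskSeparate(GL_BACK, 0x00);

    // The new context parameter is readable like any other GL state.
    GLint stencil_clear_value = 0;
    glGetIntegerv(GL_STENCIL_CLEAR_VALUE, &stencil_clear_value);
}
```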
		
			
				
	
	
		
159 lines · 3.8 KiB · C++
/*
 * Copyright (c) 2021, Stephan Unverwerth <s.unverwerth@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/SIMD.h>

// Functions returning vectors or accepting vector arguments have different calling conventions
// depending on whether the target architecture supports SSE or not. GCC generates warning "psabi"
// when compiling for non-SSE architectures. We disable this warning because these functions
// are static and should never be visible from outside the translation unit that includes this header.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpsabi"

namespace AK::SIMD {

// SIMD Vector Expansion

ALWAYS_INLINE static constexpr f32x4 expand4(float f)
{
    return f32x4 { f, f, f, f };
}

ALWAYS_INLINE static constexpr i32x4 expand4(i32 i)
{
    return i32x4 { i, i, i, i };
}

ALWAYS_INLINE static constexpr u32x4 expand4(u32 u)
{
    return u32x4 { u, u, u, u };
}

// Casting
//
// Note: __builtin_convertvector applies the usual scalar (value) conversions
// lane-wise, so e.g. float-to-integer conversion truncates toward zero.

template<typename TSrc>
ALWAYS_INLINE static u32x4 to_u32x4(TSrc v)
{
    return __builtin_convertvector(v, u32x4);
}

template<typename TSrc>
ALWAYS_INLINE static i32x4 to_i32x4(TSrc v)
{
    return __builtin_convertvector(v, i32x4);
}

template<typename TSrc>
ALWAYS_INLINE static f32x4 to_f32x4(TSrc v)
{
    return __builtin_convertvector(v, f32x4);
}

// Masking
//
// A lane counts as "set" when its sign bit is 1, i.e. when the lane is
// all-ones, as produced by vector comparisons.

ALWAYS_INLINE static i32 maskbits(i32x4 mask)
{
#if defined(__SSE__)
    return __builtin_ia32_movmskps((f32x4)mask);
#else
    // Portable fallback: gather the four sign bits into bits 0..3.
    return ((mask[0] & 0x80000000) >> 31) | ((mask[1] & 0x80000000) >> 30) | ((mask[2] & 0x80000000) >> 29) | ((mask[3] & 0x80000000) >> 28);
#endif
}

ALWAYS_INLINE static bool all(i32x4 mask)
{
    return maskbits(mask) == 15;
}

ALWAYS_INLINE static bool any(i32x4 mask)
{
    return maskbits(mask) != 0;
}

ALWAYS_INLINE static bool none(i32x4 mask)
{
    return maskbits(mask) == 0;
}

ALWAYS_INLINE static int maskcount(i32x4 mask)
{
    // Population count of the 4-bit mask via lookup table.
    constexpr static int count_lut[16] { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
    return count_lut[maskbits(mask)];
}

// Load / Store
//
// The masked variants only touch lanes whose mask bit is set: unselected
// lanes are zero-filled on load and left unwritten on store.

ALWAYS_INLINE static f32x4 load4(float const* a, float const* b, float const* c, float const* d)
{
    return f32x4 { *a, *b, *c, *d };
}

ALWAYS_INLINE static u32x4 load4(u32 const* a, u32 const* b, u32 const* c, u32 const* d)
{
    return u32x4 { *a, *b, *c, *d };
}

ALWAYS_INLINE static f32x4 load4_masked(float const* a, float const* b, float const* c, float const* d, i32x4 mask)
{
    int bits = maskbits(mask);
    return f32x4 {
        bits & 1 ? *a : 0.f,
        bits & 2 ? *b : 0.f,
        bits & 4 ? *c : 0.f,
        bits & 8 ? *d : 0.f,
    };
}

ALWAYS_INLINE static i32x4 load4_masked(u8 const* a, u8 const* b, u8 const* c, u8 const* d, i32x4 mask)
{
    int bits = maskbits(mask);
    return i32x4 {
        bits & 1 ? *a : 0,
        bits & 2 ? *b : 0,
        bits & 4 ? *c : 0,
        bits & 8 ? *d : 0,
    };
}

ALWAYS_INLINE static u32x4 load4_masked(u32 const* a, u32 const* b, u32 const* c, u32 const* d, i32x4 mask)
{
    int bits = maskbits(mask);
    return u32x4 {
        bits & 1 ? *a : 0u,
        bits & 2 ? *b : 0u,
        bits & 4 ? *c : 0u,
        bits & 8 ? *d : 0u,
    };
}

template<typename VectorType, typename UnderlyingType = decltype(declval<VectorType>()[0])>
ALWAYS_INLINE static void store4(VectorType v, UnderlyingType* a, UnderlyingType* b, UnderlyingType* c, UnderlyingType* d)
{
    *a = v[0];
    *b = v[1];
    *c = v[2];
    *d = v[3];
}

template<typename VectorType, typename UnderlyingType = decltype(declval<VectorType>()[0])>
ALWAYS_INLINE static void store4_masked(VectorType v, UnderlyingType* a, UnderlyingType* b, UnderlyingType* c, UnderlyingType* d, i32x4 mask)
{
    int bits = maskbits(mask);
    if (bits & 1)
        *a = v[0];
    if (bits & 2)
        *b = v[1];
    if (bits & 4)
        *c = v[2];
    if (bits & 8)
        *d = v[3];
}

}

#pragma GCC diagnostic pop
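Taken together, these helpers support the usual masked SIMD pattern: broadcast, compare, early-out, then masked read-modify-write across four independent addresses. A hypothetical sketch (`bias_quad` and its parameters are invented for illustration and are not part of the repository):

```cpp
#include <AK/SIMD.h>
// ... plus the header above.

using namespace AK::SIMD;

// Hypothetical example: add a bias to four depth values, but only in the
// lanes that pass a threshold test. Vector comparisons yield all-ones
// (true) or all-zero (false) lanes, matching the mask convention that
// maskbits(), any()/all()/none() and the masked loads/stores expect.
static void bias_quad(float* d0, float* d1, float* d2, float* d3)
{
    f32x4 depth = load4(d0, d1, d2, d3);

    i32x4 mask = depth < expand4(1.0f); // which lanes need the bias?
    if (none(mask))
        return; // early out: no lane selected

    depth = depth + expand4(0.25f);

    // Write back only the selected lanes; the other addresses are untouched.
    store4_masked(depth, d0, d1, d2, d3, mask);
}
```

The casting helpers fit the same lane-wise model: `to_u32x4(f32x4 { 0.f, 127.5f, 255.f, 382.5f })` yields `{ 0, 127, 255, 382 }`, since `__builtin_convertvector` truncates toward zero on float-to-integer conversion.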