4747#define sinm__aligned_var (type , bytes ) type __attribute__((aligned(bytes)))
4848#endif
4949
/*
 * SINM_USE_SIMD selects between the SIMD and the scalar code paths.
 * A value supplied by the user before including this header is respected.
 * Otherwise it defaults to 1 when an x86 / x86-64 target macro is present
 * and to 0 everywhere else, so that `#if SINM_USE_SIMD` never evaluates an
 * undefined identifier.
 */
#ifndef SINM_USE_SIMD
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#define SINM_USE_SIMD 1
#else
#define SINM_USE_SIMD 0
#endif
#endif
55+
5056#ifndef SINM_TYPES
5157#define SINM_TYPES
5258typedef enum
@@ -96,12 +102,15 @@ sinm_composite(const uint32_t *in1, const uint32_t *in2, uint32_t *out, int32_t
96102
97103#else // SI_NORMALMAP_IMPLEMENTATION
98104
105+ #if SINM_USE_SIMD
99106#ifdef _MSC_VER
100107#include <intrin.h>
101108#else
102109#include <x86intrin.h>
103110#endif
111+ #endif
104112
113+ #if SINM_USE_SIMD
105114#ifdef __AVX__
106115#define SINM_SIMD_ALIGNMENT 32
107116#define simd_prefix_float (name ) _mm256_##name
@@ -112,6 +121,8 @@ sinm_composite(const uint32_t *in1, const uint32_t *in2, uint32_t *out, int32_t
112121#define simd__or_ix (a , b ) _mm256_or_si256(a, b)
113122#define simd__loadu_ix (a ) _mm256_loadu_si256(a)
114123#define simd__storeu_ix (ptr , v ) _mm256_storeu_si256(ptr, v)
124+ #define simd__setzero_ix () _mm256_setzero_si256()
125+ #define simd__setzero_ps () _mm256_setzero_ps()
115126#else
116127#define simd_prefix_float (name ) _mm_##name
117128#define SINM_SIMD_ALIGNMENT 16
@@ -122,16 +133,31 @@ sinm_composite(const uint32_t *in1, const uint32_t *in2, uint32_t *out, int32_t
122133#define simd__or_ix (a , b ) _mm_or_si128(a, b)
123134#define simd__loadu_ix (a ) _mm_loadu_si128(a)
124135#define simd__storeu_ix (ptr , v ) _mm_storeu_si128(ptr, v)
136+ #define simd__setzero_ix () _mm_setzero_si128()
137+ #define simd__setzero_ps () _mm_setzero_ps()
125138#endif // __AVX__
126139
/*
 * Width-agnostic wrappers: simd_prefix_float() pastes the register-width
 * prefix selected earlier onto the intrinsic name.
 *
 * simd__setzero_ix / simd__setzero_ps are deliberately NOT defined here.
 * They are defined once per register width next to the other width-specific
 * macros; a prefix-pasted `setzero_si256` would name a nonexistent intrinsic
 * on the 128-bit path and would redefine those macros.
 */
#define simd__set1_epi32(a) simd_prefix_float(set1_epi32(a))
#define simd__andnot_ps(a, b) simd_prefix_float(andnot_ps(a, b))
#define simd__add_epi32(a, b) simd_prefix_float(add_epi32(a, b))
#define simd__sub_epi32(a, b) simd_prefix_float(sub_epi32(a, b))
144+
145+ #if defined(__AVX__ ) || defined(__SSE4_1__ )
133146#define simd__max_epi32 (a , b ) simd_prefix_float(max_epi32(a, b))
134147#define simd__min_epi32 (a , b ) simd_prefix_float(min_epi32(a, b))
148+ #else
149+ static sinm__inline __m128i sinm__sse2_max_epi32 (__m128i a , __m128i b ) {
150+ __m128i mask = _mm_cmpgt_epi32 (a , b );
151+ return _mm_or_si128 (_mm_and_si128 (mask , a ), _mm_andnot_si128 (mask , b ));
152+ }
153+ static sinm__inline __m128i sinm__sse2_min_epi32 (__m128i a , __m128i b ) {
154+ __m128i mask = _mm_cmpgt_epi32 (b , a );
155+ return _mm_or_si128 (_mm_and_si128 (mask , a ), _mm_andnot_si128 (mask , b ));
156+ }
157+ #define simd__max_epi32 (a , b ) sinm__sse2_max_epi32(a, b)
158+ #define simd__min_epi32 (a , b ) sinm__sse2_min_epi32(a, b)
159+ #endif
160+
135161#define simd__loadu_ps (a ) simd_prefix_float(loadu_ps(a))
136162#define simd__srli_epi32 (a , i ) simd_prefix_float(srli_epi32(a, i))
137163#define simd__slli_epi32 (a , i ) simd_prefix_float(slli_epi32(a, i))
@@ -145,6 +171,7 @@ sinm_composite(const uint32_t *in1, const uint32_t *in2, uint32_t *out, int32_t
145171#define simd__div_ps (a , b ) simd_prefix_float(div_ps(a, b))
146172#define simd__hadd_ps (a , b ) simd_prefix_float(hadd_ps(a, b))
147173#define simd__cvtss_f32 (a ) simd_prefix_float(cvtss_f32(a))
174+ #endif // SINM_USE_SIMD
148175
149176#define sinm__min (a , b ) ((a) < (b) ? (a) : (b))
150177#define sinm__max (a , b ) ((a) > (b) ? (a) : (b))
@@ -165,11 +192,13 @@ sinm__length(float x, float y, float z)
165192 return sqrtf (x * x + y * y + z * z );
166193}
167194
195+ #if SINM_USE_SIMD
168196sinm__inline static simd__float
169197sinm__length_simd (simd__float x , simd__float y , simd__float z )
170198{
171199 return simd__sqrt_ps (simd__add_ps (simd__add_ps (simd__mul_ps (x , x ), simd__mul_ps (y , y )), simd__mul_ps (z , z )));
172200}
201+ #endif
173202
174203sinm__inline static sinm__v3
175204sinm__normalized (float x , float y , float z )
@@ -222,6 +251,7 @@ sinm__rgba_to_v3(uint32_t c)
222251 return result ;
223252}
224253
254+ #if SINM_USE_SIMD
225255static sinm__inline void
226256sinm__rgba_to_v3_simd (simd__int c , simd__float * x , simd__float * y , simd__float * z )
227257{
@@ -231,6 +261,7 @@ sinm__rgba_to_v3_simd(simd__int c, simd__float *x, simd__float *y, simd__float *
231261 * y = simd__cvtepi32_ps (simd__sub_epi32 (simd__and_ix (simd__srli_epi32 (c , 8 ), ff ), v127 ));
232262 * z = simd__cvtepi32_ps (simd__sub_epi32 (simd__and_ix (simd__srli_epi32 (c , 16 ), ff ), v127 ));
233263}
264+ #endif
234265
235266static sinm__inline uint32_t
236267sinm__unit_vector_to_rgba (sinm__v3 v )
@@ -241,6 +272,7 @@ sinm__unit_vector_to_rgba(sinm__v3 v)
241272 return r | g << 8u | b << 16u | 255u << 24u ;
242273}
243274
275+ #if SINM_USE_SIMD
244276static sinm__inline simd__int
245277sinm__v3_to_rgba_simd (simd__float x , simd__float y , simd__float z )
246278{
@@ -253,6 +285,7 @@ sinm__v3_to_rgba_simd(simd__float x, simd__float y, simd__float z)
253285 simd__int c = simd__or_ix (simd__or_ix (simd__or_ix (r , simd__slli_epi32 (g , 8 )), simd__slli_epi32 (b , 16 )), a );
254286 return c ;
255287}
288+ #endif
256289
257290SINM_DEF void
258291sinm__generate_gaussian_box (float * outBoxes , int32_t n , float sigma )
@@ -396,6 +429,7 @@ sinm__sobel3x3_normals(const uint32_t *in, uint32_t *out, int32_t w, int32_t h,
396429 sinm__sobel3x3_normals_row_range (in , out , 0 , w , w , h , scale , flipY );
397430}
398431
432+ #if SINM_USE_SIMD
399433static void
400434sinm__sobel3x3_normals_simd (const uint32_t * in , uint32_t * out , int32_t w , int32_t h , float scale , int flipY )
401435{
@@ -473,6 +507,7 @@ sinm__sobel3x3_normals_simd(const uint32_t *in, uint32_t *out, int32_t w, int32_
473507
474508 sinm__sobel3x3_normals_row_range (in , out , w - remainder - 8 , w , w , h , scale , flipY );
475509}
510+ #endif
476511
477512SINM_DEF void
478513sinm__normalize (uint32_t * in , int32_t w , int32_t h , float scale , int flipY )
@@ -485,6 +520,7 @@ sinm__normalize(uint32_t *in, int32_t w, int32_t h, float scale, int flipY)
485520 }
486521}
487522
523+ #if SINM_USE_SIMD
488524SINM_DEF void
489525sinm__normalize_simd (uint32_t * in , int32_t w , int32_t h , float scale , int flipY )
490526{
@@ -505,11 +541,16 @@ sinm__normalize_simd(uint32_t *in, int32_t w, int32_t h, float scale, int flipY)
505541
506542 sinm__normalize (offset_in , 1 , remainder , scale , flipY );
507543}
544+ #endif
508545
509546SINM_DEF sinm__inline void
510547sinm_normalize (uint32_t * in , int32_t w , int32_t h , float scale , int flipY )
511548{
549+ #if SINM_USE_SIMD
512550 sinm__normalize_simd (in , w , h , scale , flipY );
551+ #else
552+ sinm__normalize (in , w , h , scale , flipY );
553+ #endif
513554}
514555
515556SINM_DEF void
@@ -531,6 +572,7 @@ sinm__composite(const uint32_t *in1, const uint32_t *in2, uint32_t *out, int32_t
531572 }
532573}
533574
575+ #if SINM_USE_SIMD
534576SINM_DEF void
535577sinm__composite_simd (const uint32_t * in1 , const uint32_t * in2 , uint32_t * out , int32_t w , int32_t h )
536578{
@@ -563,11 +605,16 @@ sinm__composite_simd(const uint32_t *in1, const uint32_t *in2, uint32_t *out, in
563605
564606 sinm__composite (offset_in1 , offset_in2 , offset_out , 1 , remainder );
565607}
608+ #endif
566609
567610SINM_DEF sinm__inline void
568611sinm_composite (const uint32_t * in1 , const uint32_t * in2 , uint32_t * out , int32_t w , int32_t h )
569612{
613+ #if SINM_USE_SIMD
570614 sinm__composite_simd (in1 , in2 , out , w , h );
615+ #else
616+ sinm__composite (in1 , in2 , out , w , h );
617+ #endif
571618}
572619
573620SINM_DEF sinm__inline uint32_t *
@@ -615,6 +662,7 @@ sinm__greyscale(const uint32_t *in, uint32_t *out, int32_t w, int32_t h, sinm_gr
615662 }
616663}
617664
665+ #if SINM_USE_SIMD
618666static void
619667sinm__simd_greyscale (const uint32_t * in , uint32_t * out , int32_t w , int32_t h , sinm_greyscale_type type )
620668{
@@ -694,12 +742,16 @@ sinm__simd_greyscale(const uint32_t *in, uint32_t *out, int32_t w, int32_t h, si
694742 sinm__greyscale (offset_in , offset_out , 1 , remainder , type );
695743 }
696744}
745+ #endif
697746
698747SINM_DEF void
699748sinm_greyscale (const uint32_t * in , uint32_t * out , int32_t w , int32_t h , sinm_greyscale_type type )
700749{
701- int32_t count = w * h ;
750+ #if SINM_USE_SIMD
702751 sinm__simd_greyscale (in , out , w , h , type );
752+ #else
753+ sinm__greyscale (in , out , w , h , type );
754+ #endif
703755}
704756
705757SINM_DEF int
@@ -729,7 +781,11 @@ sinm_normal_map_buffer(const uint32_t *in,
729781 memcpy (intermediate , out , w * h * sizeof (uint32_t ));
730782 }
731783
784+ #if SINM_USE_SIMD
732785 sinm__sobel3x3_normals_simd (intermediate , out , w , h , scale , flipY );
786+ #else
787+ sinm__sobel3x3_normals (intermediate , out , w , h , scale , flipY );
788+ #endif
733789
734790 free (intermediate );
735791 return 1 ;
0 commit comments