From 33b9da1b748f9ae505ba111aa1ee543b18d18312 Mon Sep 17 00:00:00 2001
From: Yuxuan Shui
Date: Mon, 20 Jun 2016 16:48:49 -0400
Subject: [PATCH 4/4] bx: Convert float4_neon.h to use public neon intrinsics

(cherry picked from commit 8cfa474ab2c27b68224c0c8e61601bd74dca64cc)
Signed-off-by: Jeffy Chen
---
 3rdparty/bx/include/bx/float4_neon.h | 235 +++++++++++++++++------------------
 1 file changed, 117 insertions(+), 118 deletions(-)

diff --git a/3rdparty/bx/include/bx/float4_neon.h b/3rdparty/bx/include/bx/float4_neon.h
index 3b6fa185..7c3feeb2 100644
--- a/3rdparty/bx/include/bx/float4_neon.h
+++ b/3rdparty/bx/include/bx/float4_neon.h
@@ -6,13 +6,12 @@
 #ifndef BX_FLOAT4_NEON_H_HEADER_GUARD
 #define BX_FLOAT4_NEON_H_HEADER_GUARD
 
+#include <arm_neon.h>
+
 namespace bx
 {
-	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
+	typedef float32x4_t float4_t;
 
-	typedef __builtin_neon_sf _f32x2_t __attribute__( (__vector_size__( 8) ) );
-	typedef __builtin_neon_si _i32x4_t __attribute__( (__vector_size__(16) ) );
-	typedef __builtin_neon_usi _u32x4_t __attribute__( (__vector_size__(16) ) );
 
 #define ELEMx 0
 #define ELEMy 1
@@ -21,7 +20,7 @@ namespace bx
 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 			BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
 			{ \
-				return __builtin_shuffle(_a, (_u32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
+				return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
 			}
 
 #include "float4_swizzle.inl"
@@ -56,117 +55,117 @@ IMPLEMENT_TEST(xyzw , xyzw);
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 0, 1, 4, 5 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 4, 5, 0, 1 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 6, 7, 2, 3 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 2, 3, 6, 7 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 0, 4, 1, 5 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 1, 5, 0, 4 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 2, 6, 3, 7 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (_u32x4_t){ 6, 2, 7, 3 });
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
 	}
 
 	BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a)
 	{
-		return __builtin_neon_vget_lanev4sf(_a, 0, 3);
+		return vgetq_lane_f32(_a, 0);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a)
 	{
-		return __builtin_neon_vget_lanev4sf(_a, 1, 3);
+		return vgetq_lane_f32(_a, 1);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a)
 	{
-		return __builtin_neon_vget_lanev4sf(_a, 2, 3);
+		return vgetq_lane_f32(_a, 2);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a)
 	{
-		return __builtin_neon_vget_lanev4sf(_a, 3, 3);
+		return vgetq_lane_f32(_a, 3);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
 	{
-		return __builtin_neon_vld1v4sf( (const __builtin_neon_sf*)_ptr);
+		return vld1q_f32((const float32_t *)_ptr);
 	}
 
 	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a)
 	{
-		__builtin_neon_vst1v4sf( (__builtin_neon_sf*)_ptr, _a);
+		vst1q_f32((float32_t *)_ptr, _a);
 	}
 
 	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a)
 	{
-		__builtin_neon_vst1_lanev4sf( (__builtin_neon_sf*)_ptr, _a, 0);
+		vst1q_lane_f32((float32_t *)_ptr, _a, 0);
 	}
 
 	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a)
 	{
-		__builtin_neon_vst1v4sf( (__builtin_neon_sf*)_ptr, _a);
+		vst1q_f32((float32_t *)_ptr, _a);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
 	{
-		const float4_t val[4] = {_x, _y, _z, _w};
+		const float32_t val[4] = {_x, _y, _z, _w};
 		return float4_ld(val);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
 	{
 		const uint32_t val[4] = {_x, _y, _z, _w};
-		const _i32x4_t tmp = __builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vld1q_u32(val);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
 	{
-		const float4_t tmp0 = __builtin_neon_vld1v4sf( (const __builtin_neon_sf *)_ptr);
-		const _f32x2_t tmp1 = __builtin_neon_vget_lowv4sf(tmp0);
-		const float4_t result = __builtin_neon_vdup_lanev4sf(tmp1, 0);
+		const float4_t tmp0 = vld1q_f32((const float32_t *)_ptr);
+		const float32x2_t tmp1 = vget_low_f32(tmp0);
+		const float4_t result = vdupq_lane_f32(tmp1, 0);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
 	{
-		return __builtin_neon_vdup_nv4sf(_a);
+		return vdupq_n_f32(_a);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
 	{
-		const _i32x4_t tmp = __builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const int32x4_t tmp = vdupq_n_s32(_a);
+		const float4_t result = vreinterpretq_f32_s32(tmp);
 
 		return result;
 	}
 
@@ -178,131 +177,131 @@ IMPLEMENT_TEST(xyzw , xyzw);
 	BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a)
 	{
-		const _i32x4_t itof = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const float4_t result = __builtin_neon_vcvtv4si(itof, 1);
+		const int32x4_t itof = vreinterpretq_s32_f32(_a);
+		const float4_t result = vcvtq_f32_s32(itof);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a)
 	{
-		const _i32x4_t ftoi = __builtin_neon_vcvtv4sf(_a, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(ftoi);
+		const int32x4_t ftoi = vcvtq_s32_f32(_a);
+		const float4_t result = vreinterpretq_f32_s32(ftoi);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b)
 	{
-		return __builtin_neon_vaddv4sf(_a, _b, 3);
+		return vaddq_f32(_a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
 	{
-		return __builtin_neon_vsubv4sf(_a, _b, 3);
+		return vsubq_f32(_a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
 	{
-		return __builtin_neon_vmulv4sf(_a, _b, 3);
+		return vmulq_f32(_a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a)
 	{
-		return __builtin_neon_vrecpev4sf(_a, 3);
+		return vrecpeq_f32(_a);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a)
 	{
-		return __builtin_neon_vrsqrtev4sf(_a, 3);
+		return vrsqrteq_f32(_a);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp = __builtin_neon_vceqv4sf(_a, _b, 3);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vceqq_f32(_a, _b);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp = __builtin_neon_vcgtv4sf(_b, _a, 3);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vcltq_f32(_a, _b);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp = __builtin_neon_vcgev4sf(_b, _a, 3);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vcleq_f32(_a, _b);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp = __builtin_neon_vcgtv4sf(_a, _b, 3);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vcgtq_f32(_a, _b);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp = __builtin_neon_vcgev4sf(_a, _b, 3);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+		const uint32x4_t tmp = vcgeq_f32(_a, _b);
+		const float4_t result = vreinterpretq_f32_u32(tmp);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b)
 	{
-		return __builtin_neon_vminv4sf(_a, _b, 3);
+		return vminq_f32(_a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b)
 	{
-		return __builtin_neon_vmaxv4sf(_a, _b, 3);
+		return vmaxq_f32(_a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vandv4si(tmp0, tmp1, 0);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vandq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vbicv4si(tmp0, tmp1, 0);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vorrv4si(tmp0, tmp1, 0);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_veorv4si(tmp0, tmp1, 0);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = veorq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
@@ -311,17 +310,17 @@ IMPLEMENT_TEST(xyzw , xyzw);
 	{
 		if (__builtin_constant_p(_count) )
 		{
-			const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-			const _i32x4_t tmp1 = __builtin_neon_vshl_nv4si(tmp0, _count, 0);
-			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+			const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+			const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count);
+			const float4_t result = vreinterpretq_f32_u32(tmp1);
 
 			return result;
 		}
 
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t shift = __builtin_neon_vdup_nv4si( (__builtin_neon_si)_count);
-		const _i32x4_t tmp1 = __builtin_neon_vshlv4si(tmp0, shift, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const int32x4_t shift = vdupq_n_s32(_count);
+		const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
+		const float4_t result = vreinterpretq_f32_u32(tmp1);
 
 		return result;
 	}
@@ -330,17 +329,17 @@ IMPLEMENT_TEST(xyzw , xyzw);
 	{
 		if (__builtin_constant_p(_count) )
 		{
-			const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-			const _i32x4_t tmp1 = __builtin_neon_vshr_nv4si(tmp0, _count, 0);
-			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+			const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+			const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count);
+			const float4_t result = vreinterpretq_f32_u32(tmp1);
 
 			return result;
 		}
 
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t shift = __builtin_neon_vdup_nv4si( (__builtin_neon_si)-_count);
-		const _i32x4_t tmp1 = __builtin_neon_vshlv4si(tmp0, shift, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const int32x4_t shift = vdupq_n_s32(-_count);
+		const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
+		const float4_t result = vreinterpretq_f32_u32(tmp1);
 
 		return result;
 	}
@@ -349,97 +348,97 @@ IMPLEMENT_TEST(xyzw , xyzw);
 	{
 		if (__builtin_constant_p(_count) )
 		{
-			const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-			const _i32x4_t tmp1 = __builtin_neon_vshr_nv4si(tmp0, _count, 1);
-			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+			const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+			const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count);
+			const float4_t result = vreinterpretq_f32_s32(tmp1);
 
 			return result;
 		}
 
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t shift = __builtin_neon_vdup_nv4si( (__builtin_neon_si)-_count);
-		const _i32x4_t tmp1 = __builtin_neon_vshlv4si(tmp0, shift, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t shift = vdupq_n_s32(-_count);
+		const int32x4_t tmp1 = vshlq_s32(tmp0, shift);
+		const float4_t result = vreinterpretq_f32_s32(tmp1);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_madd(float4_t _a, float4_t _b, float4_t _c)
 	{
-		return __builtin_neon_vmlav4sf(_c, _a, _b, 3);
+		return vmlaq_f32(_c, _a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_nmsub(float4_t _a, float4_t _b, float4_t _c)
 	{
-		return __builtin_neon_vmlsv4sf(_c, _a, _b, 3);
+		return vmlsq_f32(_c, _a, _b);
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vceqv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vcgtv4si(tmp1, tmp0, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vcgtv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vminv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vminq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vmaxv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vaddv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
 	{
-		const _i32x4_t tmp0 = __builtin_neon_vreinterpretv4siv4sf(_a);
-		const _i32x4_t tmp1 = __builtin_neon_vreinterpretv4siv4sf(_b);
-		const _i32x4_t tmp2 = __builtin_neon_vsubv4si(tmp0, tmp1, 1);
-		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_s32(tmp2);
 
 		return result;
 	}
-- 
2.11.0
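
For reference only (not part of the patch): a minimal standalone sketch of the public arm_neon.h intrinsics this change switches to, mirroring the builtin-to-intrinsic mapping above. It assumes an ARM toolchain with NEON enabled (e.g. -mfpu=neon on ARMv7); the file and variable names here are illustrative.

// neon_intrinsics_sketch.cpp -- illustrative only, not part of the patch.
// Exercises the public NEON intrinsics that replace the GCC-internal builtins.
#include <arm_neon.h>
#include <cstdio>

int main()
{
	const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
	const float b[4] = { 10.0f, 20.0f, 30.0f, 40.0f };

	const float32x4_t va = vld1q_f32(a);          // was __builtin_neon_vld1v4sf
	const float32x4_t vb = vld1q_f32(b);
	const float32x4_t vc = vdupq_n_f32(0.5f);     // was __builtin_neon_vdup_nv4sf
	const float32x4_t vr = vmlaq_f32(vc, va, vb); // was __builtin_neon_vmlav4sf: vc + va*vb

	float out[4];
	vst1q_f32(out, vr);                           // was __builtin_neon_vst1v4sf
	printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
	return 0;
}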