From 5f1672ca85252d74fb501f582f6e1477700f8a9f Mon Sep 17 00:00:00 2001 From: s1lent Date: Sat, 9 Dec 2017 16:47:08 +0700 Subject: [PATCH] Move math sse to macros REHLDS_SSE --- rehlds/build.gradle | 4 +- rehlds/common/mathlib.h | 24 + rehlds/engine/mathlib.cpp | 986 +++++++++-------------------- rehlds/engine/mathlib_e.h | 61 +- rehlds/engine/mathlib_sse.cpp | 375 +++++++++++ rehlds/engine/mathlib_sse.h | 74 +++ rehlds/engine/sse_mathfun.h | 2 +- rehlds/msvc/ReHLDS.vcxproj | 6 +- rehlds/msvc/ReHLDS.vcxproj.filters | 6 + rehlds/public/rehlds/crc32c.cpp | 9 + rehlds/public/rehlds/crc32c.h | 2 + rehlds/public/rehlds/static_map.h | 7 +- 12 files changed, 812 insertions(+), 744 deletions(-) create mode 100644 rehlds/engine/mathlib_sse.cpp create mode 100644 rehlds/engine/mathlib_sse.h diff --git a/rehlds/build.gradle b/rehlds/build.gradle index b5cabf7..dafa2b9 100644 --- a/rehlds/build.gradle +++ b/rehlds/build.gradle @@ -184,11 +184,11 @@ void setupToolchain(NativeBinarySpec b) { } if (unitTestExecutable) { - cfg.singleDefines 'REHLDS_UNIT_TESTS' + cfg.singleDefines 'REHLDS_UNIT_TESTS', 'REHLDS_SSE' } if (rehldsFixes) { - cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS' + cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_SSE', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS' } ToolchainConfigUtils.apply(project, cfg, b) diff --git a/rehlds/common/mathlib.h b/rehlds/common/mathlib.h index 83affe8..545ecb3 100644 --- a/rehlds/common/mathlib.h +++ b/rehlds/common/mathlib.h @@ -82,6 +82,30 @@ inline T clamp(T a, T min, T max) return (a > max) ? max : (a < min) ? min : a; } +template +inline T M_min(T a, T b) +{ + return min(a, b); +} + +template +inline T M_max(T a, T b) +{ + return max(a, b); +} + +template +inline T M_clamp(T a, T min, T max) +{ + return clamp(a, min, max); +} + +template +inline double M_sqrt(T value) +{ + return sqrt(value); +} + template inline T bswap(T s) { diff --git a/rehlds/engine/mathlib.cpp b/rehlds/engine/mathlib.cpp index 8b81108..4647b97 100644 --- a/rehlds/engine/mathlib.cpp +++ b/rehlds/engine/mathlib.cpp @@ -28,91 +28,15 @@ #include "precompiled.h" -// Intrisics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/ -// Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html - vec3_t vec3_origin; //int nanmask; //short int new_cw; //short int old_cw; //DLONG dlong; -// aligned vec4_t -typedef ALIGN16 vec4_t avec4_t; -typedef ALIGN16 unsigned int aivec4_t[4]; - -// conversion multiplier -const avec4_t deg2rad = -{ - (float)M_PI / 180.f, - (float)M_PI / 180.f, - (float)M_PI / 180.f, - (float)M_PI / 180.f -}; - -const aivec4_t negmask[4] = -{ - 0x80000000, - 0x80000000, - 0x80000000, - 0x80000000 -}; - -const aivec4_t negmask_1001 = -{ - 0x80000000, - 0, - 0, - 0x80000000 -}; - -const aivec4_t negmask_0010 = -{ - 0, - 0, - 0x80000000, - 0 -}; - -// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow. -inline void xmm2vec(vec_t *v, const __m128 m) -{ - _mm_storel_pi((__m64*)v, m); - _mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02)); -} - -FUNC_TARGET("sse4.1") -inline __m128 dotProduct3D(__m128 v1, __m128 v2) -{ - if (cpuinfo.sse4_1) - return _mm_dp_ps(v1, v2, 0x71); - __m128 v = _mm_mul_ps(v1, v2); - return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3 -} - -inline __m128 crossProduct3D(__m128 a, __m128 b) -{ - __m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); - __m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1))); - __m128 m = _mm_sub_ps(tmp1, tmp2); - - return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1)); -} - -inline __m128 length3D(__m128 v) -{ - return _mm_sqrt_ps(dotProduct3D(v, v)); -} - -inline __m128 length2D(__m128 v) -{ - v = _mm_mul_ps(v, v); - return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3 -} - float anglemod(float a) { - return (360.0 / 65536) * ((int)(a*(65536 / 360.0)) & 65535); + return (360.0 / 65536) * ((int)(a * (65536 / 360.0)) & 65535); } void BOPS_Error(void) @@ -120,91 +44,317 @@ void BOPS_Error(void) Sys_Error("%s: Bad signbits", __func__); } -#ifdef REHLDS_OPT_PEDANTIC -int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p) +NOBODY int InvertMatrix(const float *m, float *out); +//{ +// float wtmp; +// float m0; +// float m1; +// float m2; +// float m3; +// float s; +// float *r0; +// float *r1; +// float *r2; +// float *r3; +//} + +void EXT_FUNC AngleVectors_ext(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) { - double dist1, dist2; - int sides = 0; + AngleVectors(angles, forward, right, up); +} - __m128 emin = _mm_loadu_ps(emins); - __m128 emax = _mm_loadu_ps(emaxs); - avec4_t d1, d2; +NOBODY void AngleIMatrix(const vec_t *angles, float *matrix); +//{ +// float angle; +// float sr; +// float sp; +// float sy; +// float cr; +// float cp; +// float cy; +//} - // general case - switch (p->signbits) +NOBODY void NormalizeAngles(float *angles); +//{ +// int i; +//} + +NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac); +//{ +// int i; +// float ang1; +// float ang2; +// float d; +// NormalizeAngles(float *angles); +// NormalizeAngles(float *angles); +// NormalizeAngles(float *angles); +//} + +void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out) +{ + out[0] = _DotProduct(in1, in2[0]) + in2[0][3]; + out[1] = _DotProduct(in1, in2[1]) + in2[1][3]; + out[2] = _DotProduct(in1, in2[2]) + in2[2][3]; +} + +NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out); +//{ +//} + +void _VectorAdd(vec_t *veca, vec_t *vecb, vec_t *out) +{ + out[0] = veca[0] + vecb[0]; + out[1] = veca[1] + vecb[1]; + out[2] = veca[2] + vecb[2]; +} + +NOBODY void _VectorCopy(vec_t *in, vec_t *out); +//{ +//} + +float VectorNormalize(vec3_t v) +{ + float length, ilength; + length = Length(v); + + const int res = clamp(0.0, 1.0, 2.0); + + if (length) { - case 0: - _mm_store_ps(d1, emax); - _mm_store_ps(d2, emin); - break; - case 1: - _mm_store_ps(d1, emax); - _mm_store_ps(d2, emin); - d1[0] = emins[0]; - d2[0] = emaxs[0]; - break; - case 2: - _mm_store_ps(d1, emax); - _mm_store_ps(d2, emin); - d1[1] = emins[1]; - d2[1] = emaxs[1]; - break; - case 3: - _mm_store_ps(d1, emin); - _mm_store_ps(d2, emax); - d1[2] = emaxs[2]; - d2[2] = emins[2]; - break; - case 4: - _mm_store_ps(d1, emax); - _mm_store_ps(d2, emin); - d1[2] = emins[2]; - d2[2] = emaxs[2]; - break; - case 5: - _mm_store_ps(d1, emin); - _mm_store_ps(d2, emax); - d1[1] = emaxs[1]; - d2[1] = emins[1]; - break; - case 6: - _mm_store_ps(d1, emin); - _mm_store_ps(d2, emax); - d1[0] = emaxs[0]; - d2[0] = emins[0]; - break; - case 7: - _mm_store_ps(d1, emin); - _mm_store_ps(d2, emax); - break; - default: - BOPS_Error(); - dist1 = dist2 = 0.0; - break; + ilength = 1 / length; + v[0] *= ilength; + v[1] *= ilength; + v[2] *= ilength; } - dist1 = _DotProduct(p->normal, d1); - dist2 = _DotProduct(p->normal, d2); - - if (dist1 >= p->dist) - sides = 1; - if (dist2 < p->dist) - sides |= 2; - - return sides; + return length; } + +NOBODY void VectorInverse(vec_t *v); +//{ +//} + +NOBODY int Q_log2(int val); +//{ +// int answer; +//} + +NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up); +//{ +// vec3_t tmp; +// CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) +// VectorNormalize(vec_t *v); +// CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross); +// VectorNormalize(vec_t *v); +//} + +void VectorAngles(const vec_t *forward, vec_t *angles) +{ + float length, yaw, pitch; + + if (forward[1] == 0 && forward[0] == 0) + { + yaw = 0; + if (forward[2] > 0) + pitch = 90; + else + pitch = 270; + } + else + { + yaw = (atan2((double)forward[1], (double)forward[0]) * 180.0 / M_PI); + if (yaw < 0) + yaw += 360; + +#ifdef REHLDS_SSE + length = Length2D(forward); #else + length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1])); +#endif // REHLDS_SSE + + pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI; + if (pitch < 0) + pitch += 360; + } + + angles[0] = pitch; + angles[1] = yaw; + angles[2] = 0; +} + +NOBODY void R_ConcatRotations(float *in1, float *in2, float *out); +//{ +// +//} + +NOBODY void FloorDivMod(double numer, double denom, int *quotient, int *rem); +//{ +// int q; +// int r; +// double x; +// floor(double __x); +// floor(double __x); +//} + +NOBODY int GreatestCommonDivisor(int i1, int i2); +//{ +//} + +NOBODY fixed16_t Invert24To16(fixed16_t val); +//{ +//} + +#if !defined(REHLDS_SSE) +void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) +{ + float sr, sp, sy, cr, cp, cy; + +#ifndef SWDS + g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up); +#endif // SWDS + + float angle; + angle = (float)(angles[YAW] * (M_PI * 2 / 360)); + sy = sin(angle); + cy = cos(angle); + angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); + sp = sin(angle); + cp = cos(angle); + angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); + sr = sin(angle); + cr = cos(angle); + + if (forward) + { + forward[0] = cp*cy; + forward[1] = cp*sy; + forward[2] = -sp; + } + if (right) + { + right[0] = (-1 * sr*sp*cy + -1 * cr*-sy); + right[1] = (-1 * sr*sp*sy + -1 * cr*cy); + right[2] = -1 * sr*cp; + } + if (up) + { + up[0] = (cr*sp*cy + -sr*-sy); + up[1] = (cr*sp*sy + -sr*cy); + up[2] = cr*cp; + } +} + +void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) +{ + float sr, sp, sy, cr, cp, cy; + + float angle; + angle = (float)(angles[YAW] * (M_PI * 2 / 360)); + sy = sin(angle); + cy = cos(angle); + angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); + sp = sin(angle); + cp = cos(angle); + angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); + sr = sin(angle); + cr = cos(angle); + + if (forward) + { + forward[0] = cp*cy; + forward[1] = (sr*sp*cy + cr*-sy); + forward[2] = (cr*sp*cy + -sr*-sy); + } + if (right) + { + right[0] = cp*sy; + right[1] = (sr*sp*sy + cr*cy); + right[2] = (cr*sp*sy + -sr*cy); + } + if (up) + { + up[0] = -sp; + up[1] = sr*cp; + up[2] = cr*cp; + } +} + +void AngleMatrix(const vec_t *angles, float(*matrix)[4]) +{ + float sr, sp, sy, cr, cp, cy; + + float angle; + angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); + sy = sin(angle); + cy = cos(angle); + angle = (float)(angles[YAW] * (M_PI * 2 / 360)); + sp = sin(angle); + cp = cos(angle); + angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); + sr = sin(angle); + cr = cos(angle); + + matrix[0][0] = cr * cp; + matrix[0][1] = sy * sr * cp - cy * sp; + matrix[0][2] = cy * sr * cp + sy * sp; + matrix[0][3] = 0.0f; + + matrix[1][0] = cr * sp; + matrix[1][1] = sy * sr * sp + cy * cp; + matrix[1][2] = cy * sr * sp - sy * cp; + matrix[1][3] = 0.0f; + + matrix[2][0] = -sr; + matrix[2][1] = sy * cr; + matrix[2][2] = cy * cr; + matrix[2][3] = 0.0f; +} + +void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out) +{ + out[0] = scale * vecm[0] + veca[0]; + out[1] = scale * vecm[1] + veca[1]; + out[2] = scale * vecm[2] + veca[2]; +} + +real_t _DotProduct(const vec_t *v1, const vec_t *v2) +{ + return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; +} + +void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) +{ + cross[0] = v1[1] * v2[2] - v1[2] * v2[1]; + cross[1] = v1[2] * v2[0] - v1[0] * v2[2]; + cross[2] = v1[0] * v2[1] - v1[1] * v2[0]; +} + +void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4]) +{ + out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + in1[0][2] * in2[2][0]; + out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] + in1[0][2] * in2[2][1]; + out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] + in1[0][2] * in2[2][2]; + out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] + in1[0][2] * in2[2][3] + in1[0][3]; + out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] + in1[1][2] * in2[2][0]; + out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] + in1[1][2] * in2[2][1]; + out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] + in1[1][2] * in2[2][2]; + out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] + in1[1][2] * in2[2][3] + in1[1][3]; + out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] + in1[2][2] * in2[2][0]; + out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] + in1[2][2] * in2[2][1]; + out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2]; + out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] + in1[2][2] * in2[2][3] + in1[2][3]; +} + +float Length(const vec_t *v) +{ + return Q_sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); +} int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p) { -#if (1) // Engine actual types - double dist1, dist2; -#else - // From sources - float dist1, dist2; -#endif - int sides = 0; + real_t dist1, dist2; + int sides = 0; // general case switch (p->signbits) @@ -249,546 +399,22 @@ int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p) if (dist1 >= p->dist) sides = 1; + if (dist2 < p->dist) sides |= 2; return sides; } -#endif -NOBODY int InvertMatrix(const float *m, float *out); -//{ -// float wtmp; // 159 -// float m0; // 160 -// float m1; // 160 -// float m2; // 160 -// float m3; // 160 -// float s; // 160 -// float *r0; // 161 -// float *r1; // 161 -// float *r2; // 161 -// float *r3; // 161 -//} - -void EXT_FUNC AngleVectors_ext(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) { - AngleVectors(angles, forward, right, up); -} - -#ifdef REHLDS_FIXES -// parallel SSE version -void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) +qboolean VectorCompare(const vec_t *v1, const vec_t *v2) { -#ifndef SWDS - g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up); -#endif // SWDS - - __m128 s, c; - sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); - - __m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr] - __m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp] - __m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr]; - - m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp] - m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr] - m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp] - - __m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy]; - m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy] - - m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy] - m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr] - m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy] - m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr] - m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy] - - m3 = _mm_add_ps(m3, m1); - - if (forward) - { - _mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08)); - forward[2] = -_mm_cvtss_f32(s); - } - if (right) - { - __m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)] - xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask))); - } - if (up) - { - _mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E)); - up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01)); - } -} -#else // REHLDS_FIXES -void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) -{ - float sr, sp, sy, cr, cp, cy; - -#ifndef SWDS - g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up); -#endif // SWDS - - float angle; - angle = (float)(angles[YAW] * (M_PI * 2 / 360)); - sy = sin(angle); - cy = cos(angle); - angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); - sp = sin(angle); - cp = cos(angle); - angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); - sr = sin(angle); - cr = cos(angle); - - if (forward) - { - forward[0] = cp*cy; - forward[1] = cp*sy; - forward[2] = -sp; - } - if (right) - { - right[0] = (-1 * sr*sp*cy + -1 * cr*-sy); - right[1] = (-1 * sr*sp*sy + -1 * cr*cy); - right[2] = -1 * sr*cp; - } - if (up) - { - up[0] = (cr*sp*cy + -sr*-sy); - up[1] = (cr*sp*sy + -sr*cy); - up[2] = cr*cp; - } -} -#endif // REHLDS_FIXES - -#ifdef REHLDS_FIXES -// parallel SSE version -void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) -{ - __m128 s, c; - sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); - - __m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr] - __m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp] - __m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr]; - - m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy] - m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp] - - __m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr] - m1 = _mm_mul_ps(m1, m2); - m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr] - m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr] - - m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr] - m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy] - m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr] - m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr] - m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy] - - m3 = _mm_add_ps(m3, m1); - - if (forward) - { - forward[0] = _mm_cvtss_f32(cp_mults); - _mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy); - } - if (right) - { - right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02)); - _mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E)); - } - if (up) - { - up[0] = -_mm_cvtss_f32(s); - _mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07)); - } -} -#else // REHLDS_FIXES -void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) -{ - float sr, sp, sy, cr, cp, cy; - - float angle; - angle = (float)(angles[YAW] * (M_PI * 2 / 360)); - sy = sin(angle); - cy = cos(angle); - angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); - sp = sin(angle); - cp = cos(angle); - angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); - sr = sin(angle); - cr = cos(angle); - - if (forward) - { - forward[0] = cp*cy; - forward[1] = (sr*sp*cy + cr*-sy); - forward[2] = (cr*sp*cy + -sr*-sy); - } - if (right) - { - right[0] = cp*sy; - right[1] = (sr*sp*sy + cr*cy); - right[2] = (cr*sp*sy + -sr*cy); - } - if (up) - { - up[0] = -sp; - up[1] = sr*cp; - up[2] = cr*cp; - } -} -#endif - -#ifdef REHLDS_FIXES -// parallel SSE version -void AngleMatrix(const vec_t *angles, float(*matrix)[4]) -{ - __m128 s, c; - sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); - - /* - matrix[0][1] = sr * sp * cy - cr * sy; - matrix[1][1] = sr * sp * sy + cr * cy; - matrix[0][2] = cr * sp * cy + sr * sy; - matrix[1][2] = cr * sp * sy - sr * cy; - */ - __m128 m1; - __m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp] - __m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy] - - m1 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr] - m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp] - m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy] - - m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3 - - m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr] - m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy] - m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001)); - m3 = _mm_mul_ps(m3, m1); - - m2 = _mm_add_ps(m2, m3); - - /* - matrix[0][0] = cp * cy; - matrix[1][0] = cp * sy; - matrix[2][1] = sr * cp; - matrix[2][2] = cr * cp; - */ - m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp] - c = _mm_shuffle_ps(c, c, 0x40); // [cp][cp][cp][cy] - m1 = _mm_mul_ps(m1, c); - - // matrix[0] - m3 = _mm_shuffle_ps(m2, m2, 0xE1); - _mm_storeu_ps(&matrix[0][0], m3); - matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03)); - *(int *)&matrix[0][3] = 0; - - // matrix[1] - m2 = _mm_shuffle_ps(m2, m2, 0xB4); - _mm_storeu_ps(&matrix[1][0], m2); - matrix[1][0] = _mm_cvtss_f32(m1); - *(int *)&matrix[1][3] = 0; - - // matrix[2] - _mm_storeu_ps(&matrix[2][0], m1); - matrix[2][0] = -_mm_cvtss_f32(s); - *(int *)&matrix[2][3] = 0; -} -#else // REHLDS_FIXES -void AngleMatrix(const vec_t *angles, float(*matrix)[4]) -{ - float sr, sp, sy, cr, cp, cy; - - float angle; - angle = (float)(angles[ROLL] * (M_PI * 2 / 360)); - sy = sin(angle); - cy = cos(angle); - angle = (float)(angles[YAW] * (M_PI * 2 / 360)); - sp = sin(angle); - cp = cos(angle); - angle = (float)(angles[PITCH] * (M_PI * 2 / 360)); - sr = sin(angle); - cr = cos(angle); - - matrix[0][0] = cr * cp; - matrix[0][1] = sy * sr * cp - cy * sp; - matrix[0][2] = cy * sr * cp + sy * sp; - matrix[0][3] = 0.0f; - - matrix[1][0] = cr * sp; - matrix[1][1] = sy * sr * sp + cy * cp; - matrix[1][2] = cy * sr * sp - sy * cp; - matrix[1][3] = 0.0f; - - matrix[2][0] = -sr; - matrix[2][1] = sy * cr; - matrix[2][2] = cy * cr; - matrix[2][3] = 0.0f; -} -#endif // REHLDS_FIXES - -NOBODY void AngleIMatrix(const vec_t *angles, float *matrix); -//{ -// float angle; // 372 -// float sr; // 373 -// float sp; // 373 -// float sy; // 373 -// float cr; // 373 -// float cp; // 373 -// float cy; // 373 -//} - -NOBODY void NormalizeAngles(float *angles); -//{ -// int i; // 402 -//} - -NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac); -//{ -// int i; // 428 -// float ang1; // 429 -// float ang2; // 429 -// float d; // 430 -// NormalizeAngles(float *angles); // 432 -// NormalizeAngles(float *angles); // 433 -// NormalizeAngles(float *angles); // 453 -//} - -void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out) -{ - out[0] = _DotProduct(in1, in2[0]) + in2[0][3]; - out[1] = _DotProduct(in1, in2[1]) + in2[1][3]; - out[2] = _DotProduct(in1, in2[2]) + in2[2][3]; -} - -int VectorCompare(const vec_t *v1, const vec_t *v2) -{ -#ifdef REHLDS_OPT_PEDANTIC - __m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2)); - return !(_mm_movemask_ps(cmp) & (1|2|4)); -#else // REHLDS_OPT_PEDANTIC for (int i = 0; i < 3; i++) { - if (v1[i] != v2[i]) return 0; + if (v1[i] != v2[i]) + return FALSE; } - return 1; -#endif // REHLDS_OPT_PEDANTIC + return TRUE; } -#ifdef REHLDS_FIXES -void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out) -{ - xmm2vec(out, _mm_add_ps(_mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm)), _mm_loadu_ps(veca))); -} -#else -void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out) -{ - out[0] = scale * vecm[0] + veca[0]; - out[1] = scale * vecm[1] + veca[1]; - out[2] = scale * vecm[2] + veca[2]; -} -#endif - -#ifndef REHLDS_FIXES -real_t _DotProduct(const vec_t *v1, const vec_t *v2) -{ - return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; -} -#else // REHLDS_FIXES -float _DotProduct(const vec_t *v1, const vec_t *v2) -{ - // _mm_loadu_ps - load xmm from unaligned address - // _mm_cvtss_f32 - return low float value of xmm - // _mm_dp_ps - dot product - // 0x71 = 0b01110001 - mask for multiplying operands and result - // dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f) - - return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); -} -#endif // REHLDS_FIXES - -NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out); -//{ -//} - -void _VectorAdd(vec_t *veca, vec_t *vecb, vec_t *out) -{ - out[0] = veca[0] + vecb[0]; - out[1] = veca[1] + vecb[1]; - out[2] = veca[2] + vecb[2]; -} - -NOBODY void _VectorCopy(vec_t *in, vec_t *out); -//{ -//} - -void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) -{ -#ifdef REHLDS_FIXES - xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); -#else // REHLDS_FIXES - cross[0] = v1[1] * v2[2] - v1[2] * v2[1]; - cross[1] = v1[2] * v2[0] - v1[0] * v2[2]; - cross[2] = v1[0] * v2[1] - v1[1] * v2[0]; -#endif // REHLDS_FIXES -} - -float Length(const vec_t *v) -{ -#ifdef REHLDS_FIXES - return _mm_cvtss_f32(length3D(_mm_loadu_ps(v))); -#endif // REHLDS_FIXES - - float length; - - length = 0.0f; - for (int i = 0; i < 3; i++) - { - length = v[i] * v[i] + length; - } - return Q_sqrt(length); -} - -float Length2D(const vec_t *v) -{ -#ifdef REHLDS_FIXES - return _mm_cvtss_f32(length2D(_mm_loadu_ps(v))); -#endif // REHLDS_FIXES - - return Q_sqrt(v[0] * v[0] + v[1] * v[1]); -} - -float VectorNormalize(vec3_t v) -{ - float length, ilength; - -#ifdef REHLDS_FIXES - length = Length(v); // rsqrt is very inaccurate :( -#else // REHLDS_FIXES - length = v[0] * v[0] + v[1] * v[1] + v[2] * v[2]; - length = Q_sqrt(length); -#endif // REHLDS_FIXES - - if (length) - { - ilength = 1 / length; - v[0] *= ilength; - v[1] *= ilength; - v[2] *= ilength; - } - - return length; -} - -NOBODY void VectorInverse(vec_t *v); -//{ -//} - -NOBODY int Q_log2(int val); -//{ -// int answer; // 568 -//} - -NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up); -//{ -// vec3_t tmp; // 576 -// CrossProduct(const vec_t *v1, -// const vec_t *v2, -// vec_t *cross); /* size=0, low_pc=0 */ // 590 -// VectorNormalize(vec_t *v); /* size=0, low_pc=0 */ // 591 -// CrossProduct(const vec_t *v1, -// const vec_t *v2, -// vec_t *cross); /* size=0, low_pc=0 */ // 592 -// VectorNormalize(vec_t *v); /* size=0, low_pc=0 */ // 593 -//} - -void VectorAngles(const vec_t *forward, vec_t *angles) -{ - float length, yaw, pitch; - - if (forward[1] == 0 && forward[0] == 0) - { - yaw = 0; - if (forward[2] > 0) - pitch = 90; - else - pitch = 270; - } - else - { - yaw = (atan2((double)forward[1], (double)forward[0]) * 180.0 / M_PI); - if (yaw < 0) - yaw += 360; - -#ifdef REHLDS_FIXES - length = Length2D(forward); -#else // REHLDS_FIXES - length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1])); -#endif // REHLDS_FIXES - - pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI; - if (pitch < 0) - pitch += 360; - } - - angles[0] = pitch; - angles[1] = yaw; - angles[2] = 0; -} - -NOBODY void R_ConcatRotations(float *in1, float *in2, float *out); -//{ -// -//} - -#ifdef REHLDS_FIXES -void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4]) -{ - for (size_t i = 0; i < 3; i++) - { - __m128 a1 = _mm_mul_ps(_mm_set_ps1(in1[i][0]), _mm_loadu_ps(in2[0])); - __m128 a2 = _mm_mul_ps(_mm_set_ps1(in1[i][1]), _mm_loadu_ps(in2[1])); - __m128 a3 = _mm_mul_ps(_mm_set_ps1(in1[i][2]), _mm_loadu_ps(in2[2])); - _mm_storeu_ps(out[i], _mm_add_ps(a1, _mm_add_ps(a2, a3))); - out[i][3] += in1[i][3]; - } -} -#else // REHLDS_FIXES -void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4]) -{ - out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + in1[0][2] * in2[2][0]; - out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] + in1[0][2] * in2[2][1]; - out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] + in1[0][2] * in2[2][2]; - out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] + in1[0][2] * in2[2][3] + in1[0][3]; - out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] + in1[1][2] * in2[2][0]; - out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] + in1[1][2] * in2[2][1]; - out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] + in1[1][2] * in2[2][2]; - out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] + in1[1][2] * in2[2][3] + in1[1][3]; - out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] + in1[2][2] * in2[2][0]; - out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] + in1[2][2] * in2[2][1]; - out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2]; - out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] + in1[2][2] * in2[2][3] + in1[2][3]; -} -#endif // REHLDS_FIXES - -NOBODY void FloorDivMod(double numer, double denom, int *quotient, int *rem); -//{ -// int q; // 702 -// int r; // 702 -// double x; // 703 -// floor(double __x); /* size=0, low_pc=0 */ // 726 -// floor(double __x); /* size=0, low_pc=0 */ // 717 -//} - -NOBODY int GreatestCommonDivisor(int i1, int i2); -//{ -//} - -NOBODY fixed16_t Invert24To16(fixed16_t val); -//{ -//} +#endif // #if !defined(REHLDS_SSE) diff --git a/rehlds/engine/mathlib_e.h b/rehlds/engine/mathlib_e.h index 7bfcb2a..77f69ee 100644 --- a/rehlds/engine/mathlib_e.h +++ b/rehlds/engine/mathlib_e.h @@ -30,9 +30,10 @@ #include "maintypes.h" #include "model.h" +#include "mathlib_sse.h" -#ifndef REHLDS_FIXES -// NOTE: In some cases we need high precision of floating-point, +#if !defined(REHLDS_FIXES) && !defined(REHLDS_SSE) +// NOTE: In some cases we need high precision of floating-point, // so use double instead of float, otherwise unittest will fail typedef double real_t; #else @@ -77,59 +78,6 @@ static const int nanmask = 0x7F800000; #define IS_NAN(fvar) ((*reinterpret_cast(&(fvar)) & nanmask) == nanmask) -inline float M_sqrt(float value) { - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value))); -} - -inline double M_sqrt(double value) { - auto v = _mm_load_sd(&value); - return _mm_cvtsd_f64(_mm_sqrt_sd(v, v)); -} - -template -inline double M_sqrt(T value) { - return sqrt(value); -} - -inline float M_min(float a, float b) { - return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b))); -} - -inline double M_min(double a, double b) { - return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b))); -} - -template -inline T M_min(T a, T b) { - return min(a, b); -} - -inline float M_max(float a, float b) { - return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b))); -} - -inline double M_max(double a, double b) { - return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b))); -} - -template -inline T M_max(T a, T b) { - return max(a, b); -} - -inline float M_clamp(float a, float min, float max) { - return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max))); -} - -inline double M_clamp(double a, double min, double max) { - return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max))); -} - -template -inline T M_clamp(T a, T min, T max) { - return clamp(a, min, max); -} - inline void VectorAdd(const vec_t *veca, const vec_t *vecb, vec_t *out) { out[0] = veca[0] + vecb[0]; @@ -208,7 +156,7 @@ NOBODY void AngleIMatrix(const vec_t *angles, float *matrix); NOBODY void NormalizeAngles(float *angles); NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac); void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out); -int VectorCompare(const vec_t *v1, const vec_t *v2); +qboolean VectorCompare(const vec_t *v1, const vec_t *v2); void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out); real_t _DotProduct(const vec_t *v1, const vec_t *v2); NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out); @@ -219,7 +167,6 @@ float Length(const vec_t *v); float Length2D(const vec_t *v); float VectorNormalize(vec_t *v); NOBODY void VectorInverse(vec_t *v); -void VectorScale(const vec_t *in, float scale, vec_t *out); NOBODY int Q_log2(int val); NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up); void VectorAngles(const vec_t *forward, vec_t *angles); diff --git a/rehlds/engine/mathlib_sse.cpp b/rehlds/engine/mathlib_sse.cpp new file mode 100644 index 0000000..e99d85a --- /dev/null +++ b/rehlds/engine/mathlib_sse.cpp @@ -0,0 +1,375 @@ +/* +* +* This program is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License as published by the +* Free Software Foundation; either version 2 of the License, or (at +* your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +* +* In addition, as a special exception, the author gives permission to +* link the code of this program with the Half-Life Game Engine ("HL +* Engine") and Modified Game Libraries ("MODs") developed by Valve, +* L.L.C ("Valve"). You must obey the GNU General Public License in all +* respects for all of the code used other than the HL Engine and MODs +* from Valve. If you modify this file, you may extend this exception +* to your version of the file, but you are not obligated to do so. If +* you do not wish to do so, delete this exception statement from your +* version. +* +*/ + +#include "precompiled.h" + +#if defined(REHLDS_SSE) + +// Intrisics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +// Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html + +// aligned vec4_t +typedef ALIGN16 vec4_t avec4_t; +typedef ALIGN16 unsigned int aivec4_t[4]; + +// conversion multiplier +const avec4_t deg2rad = +{ + (float)M_PI / 180.f, + (float)M_PI / 180.f, + (float)M_PI / 180.f, + (float)M_PI / 180.f +}; + +const aivec4_t negmask[4] = +{ + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000 +}; + +const aivec4_t negmask_1001 = +{ + 0x80000000, + 0, + 0, + 0x80000000 +}; + +const aivec4_t negmask_0010 = +{ + 0, + 0, + 0x80000000, + 0 +}; + +// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow. +inline void xmm2vec(vec_t *v, const __m128 m) +{ + _mm_storel_pi((__m64*)v, m); + _mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02)); +} + +FUNC_TARGET("sse4.1") +inline __m128 dotProduct3D(__m128 v1, __m128 v2) +{ + if (cpuinfo.sse4_1) + return _mm_dp_ps(v1, v2, 0x71); + __m128 v = _mm_mul_ps(v1, v2); + return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3 +} + +inline __m128 crossProduct3D(__m128 a, __m128 b) +{ + __m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); + __m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1))); + __m128 m = _mm_sub_ps(tmp1, tmp2); + + return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1)); +} + +inline __m128 length3D(__m128 v) +{ + return _mm_sqrt_ps(dotProduct3D(v, v)); +} + +inline __m128 length2D(__m128 v) +{ + v = _mm_mul_ps(v, v); + return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3 +} + +int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p) +{ + double dist1, dist2; + int sides = 0; + + __m128 emin = _mm_loadu_ps(emins); + __m128 emax = _mm_loadu_ps(emaxs); + avec4_t d1, d2; + + // general case + switch (p->signbits) + { + case 0: + _mm_store_ps(d1, emax); + _mm_store_ps(d2, emin); + break; + case 1: + _mm_store_ps(d1, emax); + _mm_store_ps(d2, emin); + d1[0] = emins[0]; + d2[0] = emaxs[0]; + break; + case 2: + _mm_store_ps(d1, emax); + _mm_store_ps(d2, emin); + d1[1] = emins[1]; + d2[1] = emaxs[1]; + break; + case 3: + _mm_store_ps(d1, emin); + _mm_store_ps(d2, emax); + d1[2] = emaxs[2]; + d2[2] = emins[2]; + break; + case 4: + _mm_store_ps(d1, emax); + _mm_store_ps(d2, emin); + d1[2] = emins[2]; + d2[2] = emaxs[2]; + break; + case 5: + _mm_store_ps(d1, emin); + _mm_store_ps(d2, emax); + d1[1] = emaxs[1]; + d2[1] = emins[1]; + break; + case 6: + _mm_store_ps(d1, emin); + _mm_store_ps(d2, emax); + d1[0] = emaxs[0]; + d2[0] = emins[0]; + break; + case 7: + _mm_store_ps(d1, emin); + _mm_store_ps(d2, emax); + break; + default: + BOPS_Error(); + dist1 = dist2 = 0.0; + break; + } + + dist1 = _DotProduct(p->normal, d1); + dist2 = _DotProduct(p->normal, d2); + + if (dist1 >= p->dist) + sides = 1; + + if (dist2 < p->dist) + sides |= 2; + + return sides; +} + +qboolean VectorCompare(const vec_t *v1, const vec_t *v2) +{ + __m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2)); + return !(_mm_movemask_ps(cmp) & (1|2|4)); +} + +void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) +{ +#ifndef SWDS + g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up); +#endif // SWDS + + __m128 s, c; + sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); + + __m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr] + __m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp] + __m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr]; + + m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp] + m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr] + m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp] + + __m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy]; + m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy] + + m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy] + m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr] + m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy] + m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr] + m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy] + + m3 = _mm_add_ps(m3, m1); + + if (forward) + { + _mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08)); + forward[2] = -_mm_cvtss_f32(s); + } + if (right) + { + __m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)] + xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask))); + } + if (up) + { + _mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E)); + up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01)); + } +} + +void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) +{ + __m128 s, c; + sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); + + __m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr] + __m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp] + __m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr]; + + m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy] + m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp] + + __m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr] + m1 = _mm_mul_ps(m1, m2); + m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr] + m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr] + + m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr] + m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy] + m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr] + m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr] + m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy] + + m3 = _mm_add_ps(m3, m1); + + if (forward) + { + forward[0] = _mm_cvtss_f32(cp_mults); + _mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy); + } + if (right) + { + right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02)); + _mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E)); + } + if (up) + { + up[0] = -_mm_cvtss_f32(s); + _mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07)); + } +} + +void AngleMatrix(const vec_t *angles, float(*matrix)[4]) +{ + __m128 s, c; + sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c); + + /* + matrix[0][1] = sr * sp * cy - cr * sy; + matrix[1][1] = sr * sp * sy + cr * cy; + matrix[0][2] = cr * sp * cy + sr * sy; + matrix[1][2] = cr * sp * sy - sr * cy; + */ + __m128 m1; + __m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp] + __m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy] + + m1 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr] + m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp] + m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy] + + m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3 + + m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr] + m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy] + m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001)); + m3 = _mm_mul_ps(m3, m1); + + m2 = _mm_add_ps(m2, m3); + + /* + matrix[0][0] = cp * cy; + matrix[1][0] = cp * sy; + matrix[2][1] = sr * cp; + matrix[2][2] = cr * cp; + */ + m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp] + c = _mm_shuffle_ps(c, c, 0x40); // [cp][cp][cp][cy] + m1 = _mm_mul_ps(m1, c); + + // matrix[0] + m3 = _mm_shuffle_ps(m2, m2, 0xE1); + _mm_storeu_ps(&matrix[0][0], m3); + matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03)); + *(int *)&matrix[0][3] = 0; + + // matrix[1] + m2 = _mm_shuffle_ps(m2, m2, 0xB4); + _mm_storeu_ps(&matrix[1][0], m2); + matrix[1][0] = _mm_cvtss_f32(m1); + *(int *)&matrix[1][3] = 0; + + // matrix[2] + _mm_storeu_ps(&matrix[2][0], m1); + matrix[2][0] = -_mm_cvtss_f32(s); + *(int *)&matrix[2][3] = 0; +} + +void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out) +{ + xmm2vec(out, _mm_add_ps(_mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm)), _mm_loadu_ps(veca))); +} + +float _DotProduct(const vec_t *v1, const vec_t *v2) +{ + // _mm_loadu_ps - load xmm from unaligned address + // _mm_cvtss_f32 - return low float value of xmm + // _mm_dp_ps - dot product + // 0x71 = 0b01110001 - mask for multiplying operands and result + // dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f) + + return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); +} + +float Length(const vec_t *v) +{ + return _mm_cvtss_f32(length3D(_mm_loadu_ps(v))); // rsqrt is very inaccurate :( +} + +float Length2D(const vec_t *v) +{ + return _mm_cvtss_f32(length2D(_mm_loadu_ps(v))); +} + +void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) +{ + xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); +} + +void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4]) +{ + for (size_t i = 0; i < 3; i++) + { + __m128 a1 = _mm_mul_ps(_mm_set_ps1(in1[i][0]), _mm_loadu_ps(in2[0])); + __m128 a2 = _mm_mul_ps(_mm_set_ps1(in1[i][1]), _mm_loadu_ps(in2[1])); + __m128 a3 = _mm_mul_ps(_mm_set_ps1(in1[i][2]), _mm_loadu_ps(in2[2])); + _mm_storeu_ps(out[i], _mm_add_ps(a1, _mm_add_ps(a2, a3))); + out[i][3] += in1[i][3]; + } +} + +#endif // #if defined(REHLDS_SSE) diff --git a/rehlds/engine/mathlib_sse.h b/rehlds/engine/mathlib_sse.h new file mode 100644 index 0000000..0e9550f --- /dev/null +++ b/rehlds/engine/mathlib_sse.h @@ -0,0 +1,74 @@ +/* +* +* This program is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License as published by the +* Free Software Foundation; either version 2 of the License, or (at +* your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +* +* In addition, as a special exception, the author gives permission to +* link the code of this program with the Half-Life Game Engine ("HL +* Engine") and Modified Game Libraries ("MODs") developed by Valve, +* L.L.C ("Valve"). You must obey the GNU General Public License in all +* respects for all of the code used other than the HL Engine and MODs +* from Valve. If you modify this file, you may extend this exception +* to your version of the file, but you are not obligated to do so. If +* you do not wish to do so, delete this exception statement from your +* version. +* +*/ + +#pragma once + +#if defined(REHLDS_SSE) + +inline float M_min(float a, float b) +{ + return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b))); +} + +inline double M_min(double a, double b) +{ + return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b))); +} + +inline float M_max(float a, float b) +{ + return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b))); +} + +inline double M_max(double a, double b) +{ + return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b))); +} + +inline float M_sqrt(float value) +{ + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value))); +} + +inline double M_sqrt(double value) +{ + auto v = _mm_load_sd(&value); + return _mm_cvtsd_f64(_mm_sqrt_sd(v, v)); +} + +inline float M_clamp(float a, float min, float max) +{ + return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max))); +} + +inline double M_clamp(double a, double min, double max) +{ + return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max))); +} + +#endif // #if defined(REHLDS_SSE) diff --git a/rehlds/engine/sse_mathfun.h b/rehlds/engine/sse_mathfun.h index 24fd11c..2407327 100644 --- a/rehlds/engine/sse_mathfun.h +++ b/rehlds/engine/sse_mathfun.h @@ -36,7 +36,7 @@ misrepresented as being the original software. #ifdef _MSC_VER /* visual c++ */ # define ALIGN16_BEG __declspec(align(16)) -# define ALIGN16_END +# define ALIGN16_END #else /* gcc or icc */ # define ALIGN16_BEG # define ALIGN16_END __attribute__((aligned(16))) diff --git a/rehlds/msvc/ReHLDS.vcxproj b/rehlds/msvc/ReHLDS.vcxproj index 4d5147d..b587290 100644 --- a/rehlds/msvc/ReHLDS.vcxproj +++ b/rehlds/msvc/ReHLDS.vcxproj @@ -72,6 +72,7 @@ + @@ -432,6 +433,7 @@ + @@ -805,7 +807,7 @@ Level3 Disabled true - REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions) + REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SSE;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions) Precise /arch:IA32 %(AdditionalOptions) MultiThreadedDebug @@ -1149,7 +1151,7 @@ true true true - REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_SSE;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) MultiThreaded /arch:IA32 %(AdditionalOptions) Use diff --git a/rehlds/msvc/ReHLDS.vcxproj.filters b/rehlds/msvc/ReHLDS.vcxproj.filters index e143343..fddae72 100644 --- a/rehlds/msvc/ReHLDS.vcxproj.filters +++ b/rehlds/msvc/ReHLDS.vcxproj.filters @@ -353,6 +353,9 @@ engine\common + + engine + @@ -1081,6 +1084,9 @@ public + + engine + diff --git a/rehlds/public/rehlds/crc32c.cpp b/rehlds/public/rehlds/crc32c.cpp index 669fc81..4fcde71 100644 --- a/rehlds/public/rehlds/crc32c.cpp +++ b/rehlds/public/rehlds/crc32c.cpp @@ -115,6 +115,7 @@ uint32 crc32c_t_nosse(uint32 iCRC, const uint8 *buf, int len) { return crc; } +#ifdef REHLDS_SSE FUNC_TARGET("sse4.2") uint32 crc32c_t8_sse(uint32 iCRC, uint8 u8) { return _mm_crc32_u8(iCRC, u8); @@ -140,6 +141,14 @@ uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) { return cpuinfo.sse4_2 ? crc32c_t_sse(iCRC, buf, len) : crc32c_t_nosse(iCRC, buf, len); } +#else + +uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) { + return crc32c_t_nosse(iCRC, buf, len); +} + +#endif // REHLDS_SSE + uint32 crc32c(const uint8 *buf, int len) { return crc32c_t(0xffffffff, buf, len); } diff --git a/rehlds/public/rehlds/crc32c.h b/rehlds/public/rehlds/crc32c.h index 55bed92..7ee27fc 100644 --- a/rehlds/public/rehlds/crc32c.h +++ b/rehlds/public/rehlds/crc32c.h @@ -11,7 +11,9 @@ GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program; if not, see . */ + #pragma once + #include "archtypes.h" extern uint32 crc32c_t8_nosse(uint32 iCRC, uint8 u8); diff --git a/rehlds/public/rehlds/static_map.h b/rehlds/public/rehlds/static_map.h index 11235d7..b1da16f 100644 --- a/rehlds/public/rehlds/static_map.h +++ b/rehlds/public/rehlds/static_map.h @@ -169,7 +169,6 @@ public: } } - Iterator(CStaticMap* m) { m_Map = m; m_RootNodes = m_Map->m_RootNodes; @@ -226,6 +225,7 @@ protected: virtual uint32 hash(const char* const &val) { uint32 cksum = 0; const char* pcc = val; +#ifdef REHLDS_SSE if (cpuinfo.sse4_2) { while (*pcc) { char cc = *(pcc++); @@ -234,7 +234,10 @@ protected: } cksum = crc32c_t8_sse(cksum, cc); } - } else { + } + else +#endif // REHLDS_SSE + { while (*pcc) { char cc = *(pcc++); if (cc >= 'A' || cc <= 'Z') {