diff --git a/rehlds/build.gradle b/rehlds/build.gradle index e059cda..63fed9a 100644 --- a/rehlds/build.gradle +++ b/rehlds/build.gradle @@ -135,7 +135,7 @@ void setupToolchain(NativeBinarySpec b) { cfg.compilerOptions.floatingPointModel = FloatingPointModel.PRECISE cfg.compilerOptions.enhancedInstructionsSet = EnhancedInstructionsSet.DISABLED } else { - cfg.compilerOptions.args '/Oi', '/GF', '/GR-' + cfg.compilerOptions.args '/Oi', '/GF', '/GR-', '/GS-' } if (swdsLib) { cfg.linkerOptions.randomizedBaseAddress = false diff --git a/rehlds/engine/common.h b/rehlds/engine/common.h index 49650a2..300dbd5 100644 --- a/rehlds/engine/common.h +++ b/rehlds/engine/common.h @@ -141,6 +141,7 @@ extern int loadsize; #define Q_snprintf _snprintf #define Q_atoi atoi #define Q_atof atof +#define Q_sqrt M_sqrt //#define Q_strtoull strtoull //#define Q_FileNameCmp FileNameCmp #define Q_vsnprintf _vsnprintf @@ -168,6 +169,7 @@ extern int loadsize; #define Q_snprintf _snprintf #define Q_atoi atoi #define Q_atof atof +#define Q_sqrt sqrt //#define Q_strtoull strtoull //#define Q_FileNameCmp FileNameCmp #define Q_vsnprintf _vsnprintf diff --git a/rehlds/engine/mathlib.cpp b/rehlds/engine/mathlib.cpp index f33b702..8bd0615 100644 --- a/rehlds/engine/mathlib.cpp +++ b/rehlds/engine/mathlib.cpp @@ -75,12 +75,40 @@ const aivec4_t negmask_0010 = }; // save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow. -static inline void xmm2vec(vec_t *v, const __m128 m) +inline void xmm2vec(vec_t *v, const __m128 m) { _mm_storel_pi((__m64*)v, m); _mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02)); } +inline __m128 dotProduct3D(__m128 v1, __m128 v2) +{ + if (cpuinfo.sse4_1) + return _mm_dp_ps(v1, v2, 0x71); + __m128 v = _mm_mul_ps(v1, v2); + return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3 +} + +inline __m128 crossProduct3D(__m128 a, __m128 b) +{ + __m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); + __m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1))); + __m128 m = _mm_sub_ps(tmp1, tmp2); + + return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1)); +} + +inline __m128 length3D(__m128 v) +{ + return _mm_sqrt_ps(dotProduct3D(v, v)); +} + +inline __m128 length2D(__m128 v) +{ + v = _mm_mul_ps(v, v); + return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3 +} + /* <46ebf> ../engine/mathlib.c:14 */ float anglemod(float a) { @@ -551,7 +579,7 @@ int VectorCompare(const vec_t *v1, const vec_t *v2) { #ifdef REHLDS_OPT_PEDANTIC __m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2)); - return !(_mm_movemask_epi8(*(__m128i *)&cmp) & 0xFFF); + return !(_mm_movemask_ps(cmp) & (1|2|4)); #else // REHLDS_OPT_PEDANTIC for (int i = 0; i < 3; i++) { @@ -591,10 +619,8 @@ float _DotProduct(const vec_t *v1, const vec_t *v2) // _mm_dp_ps - dot product // 0x71 = 0b01110001 - mask for multiplying operands and result // dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f) - if (cpuinfo.sse4_1) - return _mm_cvtss_f32(_mm_dp_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2), 0x71)); - return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; + return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); } #endif // REHLDS_FIXES @@ -620,12 +646,7 @@ NOBODY void _VectorCopy(vec_t *in, vec_t *out); void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) { #ifdef REHLDS_FIXES - __m128 a = _mm_loadu_ps(v1), b = _mm_loadu_ps(v2); - __m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); - __m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1))); - __m128 m = _mm_sub_ps(tmp1, tmp2); - - xmm2vec(cross, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1))); + xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2))); #else // REHLDS_FIXES cross[0] = v1[1] * v2[2] - v1[2] * v2[1]; cross[1] = v1[2] * v2[0] - v1[0] * v2[2]; @@ -637,10 +658,7 @@ void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross) float Length(const vec_t *v) { #ifdef REHLDS_FIXES - // based on dot product - if (cpuinfo.sse4_1) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(_mm_loadu_ps(v), _mm_loadu_ps(v), 0x71))); - return sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + return _mm_cvtss_f32(length3D(_mm_loadu_ps(v))); #endif // REHLDS_FIXES float length; @@ -650,18 +668,16 @@ float Length(const vec_t *v) { length = v[i] * v[i] + length; } - return sqrt(length); + return Q_sqrt(length); } float Length2D(const vec_t *v) { #ifdef REHLDS_FIXES - // based on dot product - if (cpuinfo.sse4_1) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(_mm_loadu_ps(v), _mm_loadu_ps(v), 0x31))); // 0b00110001 + return _mm_cvtss_f32(length2D(_mm_loadu_ps(v))); #endif // REHLDS_FIXES - return sqrt(v[0] * v[0] + v[1] * v[1]); + return Q_sqrt(v[0] * v[0] + v[1] * v[1]); } /* <47722> ../engine/mathlib.c:532 */ @@ -673,7 +689,7 @@ float VectorNormalize(vec3_t v) length = Length(v); // rsqrt is very inaccurate :( #else // REHLDS_FIXES length = v[0] * v[0] + v[1] * v[1] + v[2] * v[2]; - length = sqrt(length); + length = Q_sqrt(length); #endif // REHLDS_FIXES if (length) @@ -742,7 +758,7 @@ void VectorAngles(const vec_t *forward, vec_t *angles) #ifdef REHLDS_FIXES length = Length2D(forward); #else // REHLDS_FIXES - length = sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1])); + length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1])); #endif // REHLDS_FIXES pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI; diff --git a/rehlds/engine/mathlib_e.h b/rehlds/engine/mathlib_e.h index d42a58c..9f44f18 100644 --- a/rehlds/engine/mathlib_e.h +++ b/rehlds/engine/mathlib_e.h @@ -56,6 +56,28 @@ static const int nanmask = 0x7F800000; #define IS_NAN(fvar) ((*reinterpret_cast(&(fvar)) & nanmask) == nanmask) +inline double M_sqrt(int value) { + return sqrt(value); +} + +inline float M_sqrt(float value) { + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value))); +} + +inline double M_sqrt(double value) { + double ret; + auto v = _mm_load_sd(&value); + _mm_store_pd(&ret, _mm_sqrt_sd(v, v)); + return ret; +} + +inline double M_sqrt(long double value) +{ + double ret; + auto v = _mm_load_sd((double *)&value); + _mm_store_pd(&ret, _mm_sqrt_sd(v, v)); + return ret; +} float anglemod(float a); void BOPS_Error(void); diff --git a/rehlds/engine/sv_phys.cpp b/rehlds/engine/sv_phys.cpp index d5fa7c4..688c1cb 100644 --- a/rehlds/engine/sv_phys.cpp +++ b/rehlds/engine/sv_phys.cpp @@ -1305,7 +1305,7 @@ void SV_Physics_Step(edict_t *ent) ent->v.flags &= ~FL_ONGROUND; if (wasonground && (ent->v.health > 0.0 || SV_CheckBottom(ent))) { - float speed = (float)sqrt((long double)(ent->v.velocity[0] * ent->v.velocity[0] + ent->v.velocity[1] * ent->v.velocity[1])); + float speed = (float)Q_sqrt((long double)(ent->v.velocity[0] * ent->v.velocity[0] + ent->v.velocity[1] * ent->v.velocity[1])); if (speed != 0.0) { float friction = sv_friction.value * ent->v.friction;