mirror of
https://github.com/rehlds/rehlds.git
synced 2025-02-06 02:30:44 +03:00
Merge pull request #200 from theAsmodai/master
Small update in sse math
This commit is contained in:
commit
10b0f2a10d
@ -135,7 +135,7 @@ void setupToolchain(NativeBinarySpec b) {
|
||||
cfg.compilerOptions.floatingPointModel = FloatingPointModel.PRECISE
|
||||
cfg.compilerOptions.enhancedInstructionsSet = EnhancedInstructionsSet.DISABLED
|
||||
} else {
|
||||
cfg.compilerOptions.args '/Oi', '/GF', '/GR-'
|
||||
cfg.compilerOptions.args '/Oi', '/GF', '/GR-', '/GS-'
|
||||
}
|
||||
if (swdsLib) {
|
||||
cfg.linkerOptions.randomizedBaseAddress = false
|
||||
|
@ -141,6 +141,7 @@ extern int loadsize;
|
||||
#define Q_snprintf _snprintf
|
||||
#define Q_atoi atoi
|
||||
#define Q_atof atof
|
||||
#define Q_sqrt M_sqrt
|
||||
//#define Q_strtoull strtoull
|
||||
//#define Q_FileNameCmp FileNameCmp
|
||||
#define Q_vsnprintf _vsnprintf
|
||||
@ -168,6 +169,7 @@ extern int loadsize;
|
||||
#define Q_snprintf _snprintf
|
||||
#define Q_atoi atoi
|
||||
#define Q_atof atof
|
||||
#define Q_sqrt sqrt
|
||||
//#define Q_strtoull strtoull
|
||||
//#define Q_FileNameCmp FileNameCmp
|
||||
#define Q_vsnprintf _vsnprintf
|
||||
|
@ -75,12 +75,40 @@ const aivec4_t negmask_0010 =
|
||||
};
|
||||
|
||||
// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow.
|
||||
static inline void xmm2vec(vec_t *v, const __m128 m)
|
||||
inline void xmm2vec(vec_t *v, const __m128 m)
|
||||
{
|
||||
_mm_storel_pi((__m64*)v, m);
|
||||
_mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02));
|
||||
}
|
||||
|
||||
inline __m128 dotProduct3D(__m128 v1, __m128 v2)
|
||||
{
|
||||
if (cpuinfo.sse4_1)
|
||||
return _mm_dp_ps(v1, v2, 0x71);
|
||||
__m128 v = _mm_mul_ps(v1, v2);
|
||||
return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3
|
||||
}
|
||||
|
||||
inline __m128 crossProduct3D(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
|
||||
__m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)));
|
||||
__m128 m = _mm_sub_ps(tmp1, tmp2);
|
||||
|
||||
return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1));
|
||||
}
|
||||
|
||||
inline __m128 length3D(__m128 v)
|
||||
{
|
||||
return _mm_sqrt_ps(dotProduct3D(v, v));
|
||||
}
|
||||
|
||||
inline __m128 length2D(__m128 v)
|
||||
{
|
||||
v = _mm_mul_ps(v, v);
|
||||
return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3
|
||||
}
|
||||
|
||||
/* <46ebf> ../engine/mathlib.c:14 */
|
||||
float anglemod(float a)
|
||||
{
|
||||
@ -551,7 +579,7 @@ int VectorCompare(const vec_t *v1, const vec_t *v2)
|
||||
{
|
||||
#ifdef REHLDS_OPT_PEDANTIC
|
||||
__m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
|
||||
return !(_mm_movemask_epi8(*(__m128i *)&cmp) & 0xFFF);
|
||||
return !(_mm_movemask_ps(cmp) & (1|2|4));
|
||||
#else // REHLDS_OPT_PEDANTIC
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
@ -591,10 +619,8 @@ float _DotProduct(const vec_t *v1, const vec_t *v2)
|
||||
// _mm_dp_ps - dot product
|
||||
// 0x71 = 0b01110001 - mask for multiplying operands and result
|
||||
// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
|
||||
if (cpuinfo.sse4_1)
|
||||
return _mm_cvtss_f32(_mm_dp_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2), 0x71));
|
||||
|
||||
return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
|
||||
return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
|
||||
}
|
||||
#endif // REHLDS_FIXES
|
||||
|
||||
@ -620,12 +646,7 @@ NOBODY void _VectorCopy(vec_t *in, vec_t *out);
|
||||
void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
|
||||
{
|
||||
#ifdef REHLDS_FIXES
|
||||
__m128 a = _mm_loadu_ps(v1), b = _mm_loadu_ps(v2);
|
||||
__m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
|
||||
__m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)));
|
||||
__m128 m = _mm_sub_ps(tmp1, tmp2);
|
||||
|
||||
xmm2vec(cross, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1)));
|
||||
xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
|
||||
#else // REHLDS_FIXES
|
||||
cross[0] = v1[1] * v2[2] - v1[2] * v2[1];
|
||||
cross[1] = v1[2] * v2[0] - v1[0] * v2[2];
|
||||
@ -637,10 +658,7 @@ void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
|
||||
float Length(const vec_t *v)
|
||||
{
|
||||
#ifdef REHLDS_FIXES
|
||||
// based on dot product
|
||||
if (cpuinfo.sse4_1)
|
||||
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(_mm_loadu_ps(v), _mm_loadu_ps(v), 0x71)));
|
||||
return sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
|
||||
return _mm_cvtss_f32(length3D(_mm_loadu_ps(v)));
|
||||
#endif // REHLDS_FIXES
|
||||
|
||||
float length;
|
||||
@ -650,18 +668,16 @@ float Length(const vec_t *v)
|
||||
{
|
||||
length = v[i] * v[i] + length;
|
||||
}
|
||||
return sqrt(length);
|
||||
return Q_sqrt(length);
|
||||
}
|
||||
|
||||
float Length2D(const vec_t *v)
|
||||
{
|
||||
#ifdef REHLDS_FIXES
|
||||
// based on dot product
|
||||
if (cpuinfo.sse4_1)
|
||||
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(_mm_loadu_ps(v), _mm_loadu_ps(v), 0x31))); // 0b00110001
|
||||
return _mm_cvtss_f32(length2D(_mm_loadu_ps(v)));
|
||||
#endif // REHLDS_FIXES
|
||||
|
||||
return sqrt(v[0] * v[0] + v[1] * v[1]);
|
||||
return Q_sqrt(v[0] * v[0] + v[1] * v[1]);
|
||||
}
|
||||
|
||||
/* <47722> ../engine/mathlib.c:532 */
|
||||
@ -673,7 +689,7 @@ float VectorNormalize(vec3_t v)
|
||||
length = Length(v); // rsqrt is very inaccurate :(
|
||||
#else // REHLDS_FIXES
|
||||
length = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
|
||||
length = sqrt(length);
|
||||
length = Q_sqrt(length);
|
||||
#endif // REHLDS_FIXES
|
||||
|
||||
if (length)
|
||||
@ -742,7 +758,7 @@ void VectorAngles(const vec_t *forward, vec_t *angles)
|
||||
#ifdef REHLDS_FIXES
|
||||
length = Length2D(forward);
|
||||
#else // REHLDS_FIXES
|
||||
length = sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1]));
|
||||
length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1]));
|
||||
#endif // REHLDS_FIXES
|
||||
|
||||
pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI;
|
||||
|
@ -56,6 +56,28 @@ static const int nanmask = 0x7F800000;
|
||||
|
||||
#define IS_NAN(fvar) ((*reinterpret_cast<int*>(&(fvar)) & nanmask) == nanmask)
|
||||
|
||||
inline double M_sqrt(int value) {
|
||||
return sqrt(value);
|
||||
}
|
||||
|
||||
inline float M_sqrt(float value) {
|
||||
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value)));
|
||||
}
|
||||
|
||||
inline double M_sqrt(double value) {
|
||||
double ret;
|
||||
auto v = _mm_load_sd(&value);
|
||||
_mm_store_pd(&ret, _mm_sqrt_sd(v, v));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline double M_sqrt(long double value)
|
||||
{
|
||||
double ret;
|
||||
auto v = _mm_load_sd((double *)&value);
|
||||
_mm_store_pd(&ret, _mm_sqrt_sd(v, v));
|
||||
return ret;
|
||||
}
|
||||
|
||||
float anglemod(float a);
|
||||
void BOPS_Error(void);
|
||||
|
@ -1305,7 +1305,7 @@ void SV_Physics_Step(edict_t *ent)
|
||||
ent->v.flags &= ~FL_ONGROUND;
|
||||
if (wasonground && (ent->v.health > 0.0 || SV_CheckBottom(ent)))
|
||||
{
|
||||
float speed = (float)sqrt((long double)(ent->v.velocity[0] * ent->v.velocity[0] + ent->v.velocity[1] * ent->v.velocity[1]));
|
||||
float speed = (float)Q_sqrt((long double)(ent->v.velocity[0] * ent->v.velocity[0] + ent->v.velocity[1] * ent->v.velocity[1]));
|
||||
if (speed != 0.0)
|
||||
{
|
||||
float friction = sv_friction.value * ent->v.friction;
|
||||
|
Loading…
x
Reference in New Issue
Block a user