Move math sse to macros REHLDS_SSE

2025-06-03 09:17:32 +03:00 · 2017-12-09 16:47:08 +07:00 · 2017-12-09 16:47:08 +07:00 · 5f1672ca85
commit 5f1672ca85
parent 4fad5255ba
12 changed files with 812 additions and 744 deletions
--- a/rehlds/build.gradle
+++ b/rehlds/build.gradle
@ -184,11 +184,11 @@ void setupToolchain(NativeBinarySpec b) {
 	}

 	if (unitTestExecutable) {
-		cfg.singleDefines 'REHLDS_UNIT_TESTS'
+		cfg.singleDefines 'REHLDS_UNIT_TESTS', 'REHLDS_SSE'
 	}

 	if (rehldsFixes) {
-		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
+		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_SSE', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
 	}

 	ToolchainConfigUtils.apply(project, cfg, b)
--- a/rehlds/common/mathlib.h
+++ b/rehlds/common/mathlib.h
@ -82,6 +82,30 @@ inline T clamp(T a, T min, T max)
 	return (a > max) ? max : (a < min) ? min : a;
 }

+template<typename T>
+inline T M_min(T a, T b)
+{
+	return min(a, b);
+}
+
+template<typename T>
+inline T M_max(T a, T b)
+{
+	return max(a, b);
+}
+
+template<typename T>
+inline T M_clamp(T a, T min, T max)
+{
+	return clamp(a, min, max);
+}
+
+template<typename T>
+inline double M_sqrt(T value)
+{
+	return sqrt(value);
+}
+
 template <typename T>
 inline T bswap(T s)
 {
--- a/rehlds/engine/mathlib.cpp
+++ b/rehlds/engine/mathlib.cpp
--- a/rehlds/engine/mathlib_e.h
+++ b/rehlds/engine/mathlib_e.h
@ -30,9 +30,10 @@

 #include "maintypes.h"
 #include "model.h"
+#include "mathlib_sse.h"

-#ifndef REHLDS_FIXES
-// NOTE: In some cases we need high precision of floating-point,
+#if !defined(REHLDS_FIXES) && !defined(REHLDS_SSE)
+// NOTE: In some cases we need high precision of floating-point,
 // so use double instead of float, otherwise unittest will fail
 typedef double real_t;
 #else
@ -77,59 +78,6 @@ static const int nanmask = 0x7F800000;

 #define IS_NAN(fvar) ((*reinterpret_cast<int*>(&(fvar)) & nanmask) == nanmask)

-inline float M_sqrt(float value) {
-	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value)));
-}
-
-inline double M_sqrt(double value) {
-	auto v = _mm_load_sd(&value);
-	return _mm_cvtsd_f64(_mm_sqrt_sd(v, v));
-}
-
-template<typename T>
-inline double M_sqrt(T value) {
-	return sqrt(value);
-}
-
-inline float M_min(float a, float b) {
-	return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
-}
-
-inline double M_min(double a, double b) {
-	return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
-}
-
-template<typename T>
-inline T M_min(T a, T b) {
-	return min(a, b);
-}
-
-inline float M_max(float a, float b) {
-	return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
-}
-
-inline double M_max(double a, double b) {
-	return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
-}
-
-template<typename T>
-inline T M_max(T a, T b) {
-	return max(a, b);
-}
-
-inline float M_clamp(float a, float min, float max) {
-	return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max)));
-}
-
-inline double M_clamp(double a, double min, double max) {
-	return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max)));
-}
-
-template<typename T>
-inline T M_clamp(T a, T min, T max) {
-	return clamp(a, min, max);
-}
-
 inline void VectorAdd(const vec_t *veca, const vec_t *vecb, vec_t *out)
 {
 	out[0] = veca[0] + vecb[0];
@ -208,7 +156,7 @@ NOBODY void AngleIMatrix(const vec_t *angles, float *matrix);
 NOBODY void NormalizeAngles(float *angles);
 NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac);
 void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out);
-int VectorCompare(const vec_t *v1, const vec_t *v2);
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2);
 void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out);
 real_t _DotProduct(const vec_t *v1, const vec_t *v2);
 NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
@ -219,7 +167,6 @@ float Length(const vec_t *v);
 float Length2D(const vec_t *v);
 float VectorNormalize(vec_t *v);
 NOBODY void VectorInverse(vec_t *v);
-void VectorScale(const vec_t *in, float scale, vec_t *out);
 NOBODY int Q_log2(int val);
 NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up);
 void VectorAngles(const vec_t *forward, vec_t *angles);
--- a/rehlds/engine/mathlib_sse.cpp
+++ b/rehlds/engine/mathlib_sse.cpp
@ -0,0 +1,375 @@
+/*
+*
+*    This program is free software; you can redistribute it and/or modify it
+*    under the terms of the GNU General Public License as published by the
+*    Free Software Foundation; either version 2 of the License, or (at
+*    your option) any later version.
+*
+*    This program is distributed in the hope that it will be useful, but
+*    WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*    You should have received a copy of the GNU General Public License
+*    along with this program; if not, write to the Free Software Foundation,
+*    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*
+*    In addition, as a special exception, the author gives permission to
+*    link the code of this program with the Half-Life Game Engine ("HL
+*    Engine") and Modified Game Libraries ("MODs") developed by Valve,
+*    L.L.C ("Valve").  You must obey the GNU General Public License in all
+*    respects for all of the code used other than the HL Engine and MODs
+*    from Valve.  If you modify this file, you may extend this exception
+*    to your version of the file, but you are not obligated to do so.  If
+*    you do not wish to do so, delete this exception statement from your
+*    version.
+*
+*/
+
+#include "precompiled.h"
+
+#if defined(REHLDS_SSE)
+
+// Intrisics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+// Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html
+
+// aligned vec4_t
+typedef ALIGN16 vec4_t avec4_t;
+typedef ALIGN16 unsigned int aivec4_t[4];
+
+// conversion multiplier
+const avec4_t deg2rad =
+{
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f
+};
+
+const aivec4_t negmask[4] =
+{
+	0x80000000,
+	0x80000000,
+	0x80000000,
+	0x80000000
+};
+
+const aivec4_t negmask_1001 =
+{
+	0x80000000,
+	0,
+	0,
+	0x80000000
+};
+
+const aivec4_t negmask_0010 =
+{
+	0,
+	0,
+	0x80000000,
+	0
+};
+
+// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow.
+inline void xmm2vec(vec_t *v, const __m128 m)
+{
+	_mm_storel_pi((__m64*)v, m);
+	_mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02));
+}
+
+FUNC_TARGET("sse4.1")
+inline __m128 dotProduct3D(__m128 v1, __m128 v2)
+{
+	if (cpuinfo.sse4_1)
+		return _mm_dp_ps(v1, v2, 0x71);
+	__m128 v = _mm_mul_ps(v1, v2);
+	return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3
+}
+
+inline __m128 crossProduct3D(__m128 a, __m128 b)
+{
+	__m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
+	__m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)));
+	__m128 m = _mm_sub_ps(tmp1, tmp2);
+
+	return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1));
+}
+
+inline __m128 length3D(__m128 v)
+{
+	return _mm_sqrt_ps(dotProduct3D(v, v));
+}
+
+inline __m128 length2D(__m128 v)
+{
+	v = _mm_mul_ps(v, v);
+	return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3
+}
+
+int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
+{
+	double dist1, dist2;
+	int sides = 0;
+
+	__m128 emin = _mm_loadu_ps(emins);
+	__m128 emax = _mm_loadu_ps(emaxs);
+	avec4_t d1, d2;
+
+	// general case
+	switch (p->signbits)
+	{
+	case 0:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		break;
+	case 1:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[0] = emins[0];
+		d2[0] = emaxs[0];
+		break;
+	case 2:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[1] = emins[1];
+		d2[1] = emaxs[1];
+		break;
+	case 3:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[2] = emaxs[2];
+		d2[2] = emins[2];
+		break;
+	case 4:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[2] = emins[2];
+		d2[2] = emaxs[2];
+		break;
+	case 5:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[1] = emaxs[1];
+		d2[1] = emins[1];
+		break;
+	case 6:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[0] = emaxs[0];
+		d2[0] = emins[0];
+		break;
+	case 7:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		break;
+	default:
+		BOPS_Error();
+		dist1 = dist2 = 0.0;
+		break;
+	}
+
+	dist1 = _DotProduct(p->normal, d1);
+	dist2 = _DotProduct(p->normal, d2);
+
+	if (dist1 >= p->dist)
+		sides = 1;
+
+	if (dist2 < p->dist)
+		sides |= 2;
+
+	return sides;
+}
+
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2)
+{
+	__m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
+	return !(_mm_movemask_ps(cmp) & (1|2|4));
+}
+
+void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+#ifndef SWDS
+	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
+#endif // SWDS
+
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
+	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
+	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
+
+	m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp]
+	m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr]
+	m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp]
+
+	__m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy];
+	m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy]
+
+	m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
+	m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
+	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy]
+	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
+	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy]
+
+	m3 = _mm_add_ps(m3, m1);
+
+	if (forward)
+	{
+		_mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08));
+		forward[2] = -_mm_cvtss_f32(s);
+	}
+	if (right)
+	{
+		__m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)]
+		xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask)));
+	}
+	if (up)
+	{
+		_mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E));
+		up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01));
+	}
+}
+
+void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
+	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
+	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
+
+	m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy]
+	m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp]
+
+	__m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr]
+	m1 = _mm_mul_ps(m1, m2);
+	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr]
+	m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr]
+
+	m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
+	m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
+	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr]
+	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
+	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy]
+
+	m3 = _mm_add_ps(m3, m1);
+
+	if (forward)
+	{
+		forward[0] = _mm_cvtss_f32(cp_mults);
+		_mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy);
+	}
+	if (right)
+	{
+		right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02));
+		_mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E));
+	}
+	if (up)
+	{
+		up[0] = -_mm_cvtss_f32(s);
+		_mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07));
+	}
+}
+
+void AngleMatrix(const vec_t *angles, float(*matrix)[4])
+{
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	/*
+	matrix[0][1] = sr * sp * cy - cr * sy;
+	matrix[1][1] = sr * sp * sy + cr * cy;
+	matrix[0][2] = cr * sp * cy + sr * sy;
+	matrix[1][2] = cr * sp * sy - sr * cy;
+	*/
+	__m128 m1;
+	__m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp]
+	__m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy]
+
+	m1 = _mm_shuffle_ps(s, c, 0xAA);   // [sr][sr][cr][cr]
+	m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp]
+	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy]
+
+	m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3
+
+	m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr]
+	m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy]
+	m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001));
+	m3 = _mm_mul_ps(m3, m1);
+
+	m2 = _mm_add_ps(m2, m3);
+
+	/*
+	matrix[0][0] = cp * cy;
+	matrix[1][0] = cp * sy;
+	matrix[2][1] = sr * cp;
+	matrix[2][2] = cr * cp;
+	*/
+	m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp]
+	c = _mm_shuffle_ps(c, c, 0x40);  // [cp][cp][cp][cy]
+	m1 = _mm_mul_ps(m1, c);
+
+	// matrix[0]
+	m3 = _mm_shuffle_ps(m2, m2, 0xE1);
+	_mm_storeu_ps(&matrix[0][0], m3);
+	matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03));
+	*(int *)&matrix[0][3] = 0;
+
+	// matrix[1]
+	m2 = _mm_shuffle_ps(m2, m2, 0xB4);
+	_mm_storeu_ps(&matrix[1][0], m2);
+	matrix[1][0] = _mm_cvtss_f32(m1);
+	*(int *)&matrix[1][3] = 0;
+
+	// matrix[2]
+	_mm_storeu_ps(&matrix[2][0], m1);
+	matrix[2][0] = -_mm_cvtss_f32(s);
+	*(int *)&matrix[2][3] = 0;
+}
+
+void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
+{
+	xmm2vec(out, _mm_add_ps(_mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm)), _mm_loadu_ps(veca)));
+}
+
+float _DotProduct(const vec_t *v1, const vec_t *v2)
+{
+	// _mm_loadu_ps - load xmm from unaligned address
+	// _mm_cvtss_f32 - return low float value of xmm
+	// _mm_dp_ps - dot product
+	// 0x71 = 0b01110001 - mask for multiplying operands and result
+	// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
+
+	return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
+}
+
+float Length(const vec_t *v)
+{
+	return _mm_cvtss_f32(length3D(_mm_loadu_ps(v))); // rsqrt is very inaccurate :(
+}
+
+float Length2D(const vec_t *v)
+{
+	return _mm_cvtss_f32(length2D(_mm_loadu_ps(v)));
+}
+
+void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
+{
+	xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
+}
+
+void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
+{
+	for (size_t i = 0; i < 3; i++)
+	{
+		__m128 a1 = _mm_mul_ps(_mm_set_ps1(in1[i][0]), _mm_loadu_ps(in2[0]));
+		__m128 a2 = _mm_mul_ps(_mm_set_ps1(in1[i][1]), _mm_loadu_ps(in2[1]));
+		__m128 a3 = _mm_mul_ps(_mm_set_ps1(in1[i][2]), _mm_loadu_ps(in2[2]));
+		_mm_storeu_ps(out[i], _mm_add_ps(a1, _mm_add_ps(a2, a3)));
+		out[i][3] += in1[i][3];
+	}
+}
+
+#endif // #if defined(REHLDS_SSE)
--- a/rehlds/engine/mathlib_sse.h
+++ b/rehlds/engine/mathlib_sse.h
@ -0,0 +1,74 @@
+/*
+*
+*    This program is free software; you can redistribute it and/or modify it
+*    under the terms of the GNU General Public License as published by the
+*    Free Software Foundation; either version 2 of the License, or (at
+*    your option) any later version.
+*
+*    This program is distributed in the hope that it will be useful, but
+*    WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*    You should have received a copy of the GNU General Public License
+*    along with this program; if not, write to the Free Software Foundation,
+*    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*
+*    In addition, as a special exception, the author gives permission to
+*    link the code of this program with the Half-Life Game Engine ("HL
+*    Engine") and Modified Game Libraries ("MODs") developed by Valve,
+*    L.L.C ("Valve").  You must obey the GNU General Public License in all
+*    respects for all of the code used other than the HL Engine and MODs
+*    from Valve.  If you modify this file, you may extend this exception
+*    to your version of the file, but you are not obligated to do so.  If
+*    you do not wish to do so, delete this exception statement from your
+*    version.
+*
+*/
+
+#pragma once
+
+#if defined(REHLDS_SSE)
+
+inline float M_min(float a, float b)
+{
+	return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
+}
+
+inline double M_min(double a, double b)
+{
+	return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
+}
+
+inline float M_max(float a, float b)
+{
+	return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
+}
+
+inline double M_max(double a, double b)
+{
+	return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
+}
+
+inline float M_sqrt(float value)
+{
+	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value)));
+}
+
+inline double M_sqrt(double value)
+{
+	auto v = _mm_load_sd(&value);
+	return _mm_cvtsd_f64(_mm_sqrt_sd(v, v));
+}
+
+inline float M_clamp(float a, float min, float max)
+{
+	return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max)));
+}
+
+inline double M_clamp(double a, double min, double max)
+{
+	return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max)));
+}
+
+#endif // #if defined(REHLDS_SSE)
--- a/rehlds/engine/sse_mathfun.h
+++ b/rehlds/engine/sse_mathfun.h
@ -36,7 +36,7 @@ misrepresented as being the original software.

 #ifdef _MSC_VER /* visual c++ */
 # define ALIGN16_BEG __declspec(align(16))
-# define ALIGN16_END 
+# define ALIGN16_END
 #else /* gcc or icc */
 # define ALIGN16_BEG
 # define ALIGN16_END __attribute__((aligned(16)))
--- a/rehlds/msvc/ReHLDS.vcxproj
+++ b/rehlds/msvc/ReHLDS.vcxproj
@ -72,6 +72,7 @@
    <ClCompile Include="..\engine\ipratelimitWrapper.cpp" />
    <ClCompile Include="..\engine\l_studio.cpp" />
    <ClCompile Include="..\engine\mathlib.cpp" />
+    <ClCompile Include="..\engine\mathlib_sse.cpp" />
    <ClCompile Include="..\engine\md5.cpp" />
    <ClCompile Include="..\engine\mem.cpp" />
    <ClCompile Include="..\engine\model.cpp" />
@ -432,6 +433,7 @@
    <ClInclude Include="..\engine\keys.h" />
    <ClInclude Include="..\engine\l_studio.h" />
    <ClInclude Include="..\engine\mathlib_e.h" />
+    <ClInclude Include="..\engine\mathlib_sse.h" />
    <ClInclude Include="..\engine\mem.h" />
    <ClInclude Include="..\engine\model_rehlds.h" />
    <ClInclude Include="..\engine\modinfo.h" />
@ -805,7 +807,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SSE;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <FloatingPointModel>Precise</FloatingPointModel>
      <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
@ -1149,7 +1151,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_SSE;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
      <PrecompiledHeader>Use</PrecompiledHeader>
--- a/rehlds/msvc/ReHLDS.vcxproj.filters
+++ b/rehlds/msvc/ReHLDS.vcxproj.filters
@ -353,6 +353,9 @@
    <ClCompile Include="..\engine\SystemWrapper.cpp">
      <Filter>engine\common</Filter>
    </ClCompile>
+    <ClCompile Include="..\engine\mathlib_sse.cpp">
+      <Filter>engine</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\version\version.h">
@ -1081,6 +1084,9 @@
    <ClInclude Include="..\public\strtools.h">
      <Filter>public</Filter>
    </ClInclude>
+    <ClInclude Include="..\engine\mathlib_sse.h">
+      <Filter>engine</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\linux\appversion.sh">
--- a/rehlds/public/rehlds/crc32c.cpp
+++ b/rehlds/public/rehlds/crc32c.cpp
@ -115,6 +115,7 @@ uint32 crc32c_t_nosse(uint32 iCRC, const uint8 *buf, int len) {
 	return crc;
 }

+#ifdef REHLDS_SSE
 FUNC_TARGET("sse4.2")
 uint32 crc32c_t8_sse(uint32 iCRC, uint8 u8) {
 	return _mm_crc32_u8(iCRC, u8);
@ -140,6 +141,14 @@ uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
 	return cpuinfo.sse4_2 ? crc32c_t_sse(iCRC, buf, len) : crc32c_t_nosse(iCRC, buf, len);
 }

+#else
+
+uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
+	return crc32c_t_nosse(iCRC, buf, len);
+}
+
+#endif // REHLDS_SSE
+
 uint32 crc32c(const uint8 *buf, int len) {
 	return crc32c_t(0xffffffff, buf, len);
 }
--- a/rehlds/public/rehlds/crc32c.h
+++ b/rehlds/public/rehlds/crc32c.h
@ -11,7 +11,9 @@ GNU Lesser General Public License for more details.
 You should have received a copy of the GNU Lesser General Public License
 along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
+
 #pragma once
+
 #include "archtypes.h"

 extern uint32 crc32c_t8_nosse(uint32 iCRC, uint8 u8);
--- a/rehlds/public/rehlds/static_map.h
+++ b/rehlds/public/rehlds/static_map.h
@ -169,7 +169,6 @@ public:
 			}
 		}

-	
 		Iterator(CStaticMap* m) {
 			m_Map = m;
 			m_RootNodes = m_Map->m_RootNodes;
@ -226,6 +225,7 @@ protected:
 	virtual uint32 hash(const char* const &val) {
 		uint32 cksum = 0;
 		const char* pcc = val;
+#ifdef REHLDS_SSE
 		if (cpuinfo.sse4_2) {
 			while (*pcc) {
 				char cc = *(pcc++);
@ -234,7 +234,10 @@ protected:
 				}
 				cksum = crc32c_t8_sse(cksum, cc);
 			}
-		} else {
+		}
+		else
+#endif // REHLDS_SSE
+		{
 			while (*pcc) {
 				char cc = *(pcc++);
 				if (cc >= 'A' || cc <= 'Z') {