From 5f1672ca85252d74fb501f582f6e1477700f8a9f Mon Sep 17 00:00:00 2001
From: s1lent <s1lentsk@yandex.ru>
Date: Sat, 9 Dec 2017 16:47:08 +0700
Subject: [PATCH] Move math sse to macros REHLDS_SSE

---
 rehlds/build.gradle                |   4 +-
 rehlds/common/mathlib.h            |  24 +
 rehlds/engine/mathlib.cpp          | 986 +++++++++--------------------
 rehlds/engine/mathlib_e.h          |  61 +-
 rehlds/engine/mathlib_sse.cpp      | 375 +++++++++++
 rehlds/engine/mathlib_sse.h        |  74 +++
 rehlds/engine/sse_mathfun.h        |   2 +-
 rehlds/msvc/ReHLDS.vcxproj         |   6 +-
 rehlds/msvc/ReHLDS.vcxproj.filters |   6 +
 rehlds/public/rehlds/crc32c.cpp    |   9 +
 rehlds/public/rehlds/crc32c.h      |   2 +
 rehlds/public/rehlds/static_map.h  |   7 +-
 12 files changed, 812 insertions(+), 744 deletions(-)
 create mode 100644 rehlds/engine/mathlib_sse.cpp
 create mode 100644 rehlds/engine/mathlib_sse.h
diff --git a/rehlds/build.gradle b/rehlds/build.gradle
index b5cabf7..dafa2b9 100644
--- a/rehlds/build.gradle
+++ b/rehlds/build.gradle
@@ -184,11 +184,11 @@ void setupToolchain(NativeBinarySpec b) {
 	}
 
 	if (unitTestExecutable) {
-		cfg.singleDefines 'REHLDS_UNIT_TESTS'
+		cfg.singleDefines 'REHLDS_UNIT_TESTS', 'REHLDS_SSE'
 	}
 
 	if (rehldsFixes) {
-		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
+		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_SSE', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
 	}
 
 	ToolchainConfigUtils.apply(project, cfg, b)
diff --git a/rehlds/common/mathlib.h b/rehlds/common/mathlib.h
index 83affe8..545ecb3 100644
--- a/rehlds/common/mathlib.h
+++ b/rehlds/common/mathlib.h
@@ -82,6 +82,30 @@ inline T clamp(T a, T min, T max)
 	return (a > max) ? max : (a < min) ? min : a;
 }
 
+template<typename T>
+inline T M_min(T a, T b)
+{
+	return min(a, b);
+}
+
+template<typename T>
+inline T M_max(T a, T b)
+{
+	return max(a, b);
+}
+
+template<typename T>
+inline T M_clamp(T a, T min, T max)
+{
+	return clamp(a, min, max);
+}
+
+template<typename T>
+inline double M_sqrt(T value)
+{
+	return sqrt(value);
+}
+
 template <typename T>
 inline T bswap(T s)
 {
diff --git a/rehlds/engine/mathlib.cpp b/rehlds/engine/mathlib.cpp
index 8b81108..4647b97 100644
--- a/rehlds/engine/mathlib.cpp
+++ b/rehlds/engine/mathlib.cpp
@@ -28,91 +28,15 @@
 
 #include "precompiled.h"
 
-// Intrisics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-// Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html
-
 vec3_t vec3_origin;
 //int nanmask;
 //short int new_cw;
 //short int old_cw;
 //DLONG dlong;
 
-// aligned vec4_t
-typedef ALIGN16 vec4_t avec4_t;
-typedef ALIGN16 unsigned int aivec4_t[4];
-
-// conversion multiplier
-const avec4_t deg2rad =
-{
-	(float)M_PI / 180.f,
-	(float)M_PI / 180.f,
-	(float)M_PI / 180.f,
-	(float)M_PI / 180.f
-};
-
-const aivec4_t negmask[4] =
-{
-	0x80000000,
-	0x80000000,
-	0x80000000,
-	0x80000000
-};
-
-const aivec4_t negmask_1001 =
-{
-	0x80000000,
-	0,
-	0,
-	0x80000000
-};
-
-const aivec4_t negmask_0010 =
-{
-	0,
-	0,
-	0x80000000,
-	0
-};
-
-// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow.
-inline void xmm2vec(vec_t *v, const __m128 m)
-{
-	_mm_storel_pi((__m64*)v, m);
-	_mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02));
-}
-
-FUNC_TARGET("sse4.1")
-inline __m128 dotProduct3D(__m128 v1, __m128 v2)
-{
-	if (cpuinfo.sse4_1)
-		return _mm_dp_ps(v1, v2, 0x71);
-	__m128 v = _mm_mul_ps(v1, v2);
-	return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3
-}
-
-inline __m128 crossProduct3D(__m128 a, __m128 b)
-{
-	__m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
-	__m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)));
-	__m128 m = _mm_sub_ps(tmp1, tmp2);
-
-	return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1));
-}
-
-inline __m128 length3D(__m128 v)
-{
-	return _mm_sqrt_ps(dotProduct3D(v, v));
-}
-
-inline __m128 length2D(__m128 v)
-{
-	v = _mm_mul_ps(v, v);
-	return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3
-}
-
 float anglemod(float a)
 {
-	return (360.0 / 65536) * ((int)(a*(65536 / 360.0)) & 65535);
+	return (360.0 / 65536) * ((int)(a * (65536 / 360.0)) & 65535);
 }
 
 void BOPS_Error(void)
@@ -120,91 +44,317 @@ void BOPS_Error(void)
 	Sys_Error("%s: Bad signbits", __func__);
 }
 
-#ifdef REHLDS_OPT_PEDANTIC
-int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
+NOBODY int InvertMatrix(const float *m, float *out);
+//{
+//	float wtmp;
+//	float m0;
+//	float m1;
+//	float m2;
+//	float m3;
+//	float s;
+//	float *r0;
+//	float *r1;
+//	float *r2;
+//	float *r3;
+//}
+
+void EXT_FUNC AngleVectors_ext(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
 {
-	double	dist1, dist2;
-	int		sides = 0;
+	AngleVectors(angles, forward, right, up);
+}
 
-	__m128 emin = _mm_loadu_ps(emins);
-	__m128 emax = _mm_loadu_ps(emaxs);
-	avec4_t d1, d2;
+NOBODY void AngleIMatrix(const vec_t *angles, float *matrix);
+//{
+//	float angle;
+//	float sr;
+//	float sp;
+//	float sy;
+//	float cr;
+//	float cp;
+//	float cy;
+//}
 
-	// general case
-	switch (p->signbits)
+NOBODY void NormalizeAngles(float *angles);
+//{
+//	int i;
+//}
+
+NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac);
+//{
+//	int i;
+//	float ang1;
+//	float ang2;
+//	float d;
+//	NormalizeAngles(float *angles);
+//	NormalizeAngles(float *angles);
+//	NormalizeAngles(float *angles);
+//}
+
+void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out)
+{
+	out[0] = _DotProduct(in1, in2[0]) + in2[0][3];
+	out[1] = _DotProduct(in1, in2[1]) + in2[1][3];
+	out[2] = _DotProduct(in1, in2[2]) + in2[2][3];
+}
+
+NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
+//{
+//}
+
+void _VectorAdd(vec_t *veca, vec_t *vecb, vec_t *out)
+{
+	out[0] = veca[0] + vecb[0];
+	out[1] = veca[1] + vecb[1];
+	out[2] = veca[2] + vecb[2];
+}
+
+NOBODY void _VectorCopy(vec_t *in, vec_t *out);
+//{
+//}
+
+float VectorNormalize(vec3_t v)
+{
+	float length, ilength;
+	length = Length(v);
+
+	const int res = clamp(0.0, 1.0, 2.0);
+
+	if (length)
 	{
-	case 0:
-		_mm_store_ps(d1, emax);
-		_mm_store_ps(d2, emin);
-		break;
-	case 1:
-		_mm_store_ps(d1, emax);
-		_mm_store_ps(d2, emin);
-		d1[0] = emins[0];
-		d2[0] = emaxs[0];
-		break;
-	case 2:
-		_mm_store_ps(d1, emax);
-		_mm_store_ps(d2, emin);
-		d1[1] = emins[1];
-		d2[1] = emaxs[1];
-		break;
-	case 3:
-		_mm_store_ps(d1, emin);
-		_mm_store_ps(d2, emax);
-		d1[2] = emaxs[2];
-		d2[2] = emins[2];
-		break;
-	case 4:
-		_mm_store_ps(d1, emax);
-		_mm_store_ps(d2, emin);
-		d1[2] = emins[2];
-		d2[2] = emaxs[2];
-		break;
-	case 5:
-		_mm_store_ps(d1, emin);
-		_mm_store_ps(d2, emax);
-		d1[1] = emaxs[1];
-		d2[1] = emins[1];
-		break;
-	case 6:
-		_mm_store_ps(d1, emin);
-		_mm_store_ps(d2, emax);
-		d1[0] = emaxs[0];
-		d2[0] = emins[0];
-		break;
-	case 7:
-		_mm_store_ps(d1, emin);
-		_mm_store_ps(d2, emax);
-		break;
-	default:
-		BOPS_Error();
-		dist1 = dist2 = 0.0;
-		break;
+		ilength = 1 / length;
+		v[0] *= ilength;
+		v[1] *= ilength;
+		v[2] *= ilength;
 	}
 
-	dist1 = _DotProduct(p->normal, d1);
-	dist2 = _DotProduct(p->normal, d2);
-
-	if (dist1 >= p->dist)
-		sides = 1;
-	if (dist2 < p->dist)
-		sides |= 2;
-
-	return sides;
+	return length;
 }
+
+NOBODY void VectorInverse(vec_t *v);
+//{
+//}
+
+NOBODY int Q_log2(int val);
+//{
+//	int answer;
+//}
+
+NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up);
+//{
+//	vec3_t tmp;
+//	CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
+//	VectorNormalize(vec_t *v);
+//	CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross);
+//	VectorNormalize(vec_t *v);
+//}
+
+void VectorAngles(const vec_t *forward, vec_t *angles)
+{
+	float length, yaw, pitch;
+
+	if (forward[1] == 0 && forward[0] == 0)
+	{
+		yaw = 0;
+		if (forward[2] > 0)
+			pitch = 90;
+		else
+			pitch = 270;
+	}
+	else
+	{
+		yaw = (atan2((double)forward[1], (double)forward[0]) * 180.0 / M_PI);
+		if (yaw < 0)
+			yaw += 360;
+
+#ifdef REHLDS_SSE
+		length = Length2D(forward);
 #else
+		length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1]));
+#endif // REHLDS_SSE
+
+		pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI;
+		if (pitch < 0)
+			pitch += 360;
+	}
+
+	angles[0] = pitch;
+	angles[1] = yaw;
+	angles[2] = 0;
+}
+
+NOBODY void R_ConcatRotations(float *in1, float *in2, float *out);
+//{
+//
+//}
+
+NOBODY void FloorDivMod(double numer, double denom, int *quotient, int *rem);
+//{
+//	int q;
+//	int r;
+//	double x;
+//	floor(double __x);
+//	floor(double __x);
+//}
+
+NOBODY int GreatestCommonDivisor(int i1, int i2);
+//{
+//}
+
+NOBODY fixed16_t Invert24To16(fixed16_t val);
+//{
+//}
+
+#if !defined(REHLDS_SSE)
+void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+	float sr, sp, sy, cr, cp, cy;
+
+#ifndef SWDS
+	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
+#endif // SWDS
+
+	float angle;
+	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
+	sy = sin(angle);
+	cy = cos(angle);
+	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
+	sp = sin(angle);
+	cp = cos(angle);
+	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
+	sr = sin(angle);
+	cr = cos(angle);
+
+	if (forward)
+	{
+		forward[0] = cp*cy;
+		forward[1] = cp*sy;
+		forward[2] = -sp;
+	}
+	if (right)
+	{
+		right[0] = (-1 * sr*sp*cy + -1 * cr*-sy);
+		right[1] = (-1 * sr*sp*sy + -1 * cr*cy);
+		right[2] = -1 * sr*cp;
+	}
+	if (up)
+	{
+		up[0] = (cr*sp*cy + -sr*-sy);
+		up[1] = (cr*sp*sy + -sr*cy);
+		up[2] = cr*cp;
+	}
+}
+
+void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+	float sr, sp, sy, cr, cp, cy;
+
+	float angle;
+	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
+	sy = sin(angle);
+	cy = cos(angle);
+	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
+	sp = sin(angle);
+	cp = cos(angle);
+	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
+	sr = sin(angle);
+	cr = cos(angle);
+
+	if (forward)
+	{
+		forward[0] = cp*cy;
+		forward[1] = (sr*sp*cy + cr*-sy);
+		forward[2] = (cr*sp*cy + -sr*-sy);
+	}
+	if (right)
+	{
+		right[0] = cp*sy;
+		right[1] = (sr*sp*sy + cr*cy);
+		right[2] = (cr*sp*sy + -sr*cy);
+	}
+	if (up)
+	{
+		up[0] = -sp;
+		up[1] = sr*cp;
+		up[2] = cr*cp;
+	}
+}
+
+void AngleMatrix(const vec_t *angles, float(*matrix)[4])
+{
+	float sr, sp, sy, cr, cp, cy;
+
+	float angle;
+	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
+	sy = sin(angle);
+	cy = cos(angle);
+	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
+	sp = sin(angle);
+	cp = cos(angle);
+	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
+	sr = sin(angle);
+	cr = cos(angle);
+
+	matrix[0][0] = cr * cp;
+	matrix[0][1] = sy * sr * cp - cy * sp;
+	matrix[0][2] = cy * sr * cp + sy * sp;
+	matrix[0][3] = 0.0f;
+
+	matrix[1][0] = cr * sp;
+	matrix[1][1] = sy * sr * sp + cy * cp;
+	matrix[1][2] = cy * sr * sp - sy * cp;
+	matrix[1][3] = 0.0f;
+
+	matrix[2][0] = -sr;
+	matrix[2][1] = sy * cr;
+	matrix[2][2] = cy * cr;
+	matrix[2][3] = 0.0f;
+}
+
+void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
+{
+	out[0] = scale * vecm[0] + veca[0];
+	out[1] = scale * vecm[1] + veca[1];
+	out[2] = scale * vecm[2] + veca[2];
+}
+
+real_t _DotProduct(const vec_t *v1, const vec_t *v2)
+{
+	return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
+}
+
+void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
+{
+	cross[0] = v1[1] * v2[2] - v1[2] * v2[1];
+	cross[1] = v1[2] * v2[0] - v1[0] * v2[2];
+	cross[2] = v1[0] * v2[1] - v1[1] * v2[0];
+}
+
+void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
+{
+	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + in1[0][2] * in2[2][0];
+	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] + in1[0][2] * in2[2][1];
+	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] + in1[0][2] * in2[2][2];
+	out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] + in1[0][2] * in2[2][3] + in1[0][3];
+	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] + in1[1][2] * in2[2][0];
+	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] + in1[1][2] * in2[2][1];
+	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] + in1[1][2] * in2[2][2];
+	out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] + in1[1][2] * in2[2][3] + in1[1][3];
+	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] + in1[2][2] * in2[2][0];
+	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] + in1[2][2] * in2[2][1];
+	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2];
+	out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] + in1[2][2] * in2[2][3] + in1[2][3];
+}
+
+float Length(const vec_t *v)
+{
+	return Q_sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+}
 
 int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
 {
-#if (1)
 	// Engine actual types
-	double	dist1, dist2;
-#else
-	// From sources
-	float	dist1, dist2;
-#endif
-	int		sides = 0;
+	real_t dist1, dist2;
+	int sides = 0;
 
 	// general case
 	switch (p->signbits)
@@ -249,546 +399,22 @@ int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
 
 	if (dist1 >= p->dist)
 		sides = 1;
+
 	if (dist2 < p->dist)
 		sides |= 2;
 
 	return sides;
 }
-#endif
 
-NOBODY int InvertMatrix(const float *m, float *out);
-//{
-//	float wtmp;                                                   //   159
-//	float m0;                                                     //   160
-//	float m1;                                                     //   160
-//	float m2;                                                     //   160
-//	float m3;                                                     //   160
-//	float s;                                                      //   160
-//	float *r0;                                                   //   161
-//	float *r1;                                                   //   161
-//	float *r2;                                                   //   161
-//	float *r3;                                                   //   161
-//}
-
-void EXT_FUNC AngleVectors_ext(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up) {
-	AngleVectors(angles, forward, right, up);
-}
-
-#ifdef REHLDS_FIXES
-// parallel SSE version
-void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2)
 {
-#ifndef SWDS
-	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
-#endif // SWDS
-
-	__m128 s, c;
-	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
-
-	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
-	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
-	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
-
-	m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp]
-	m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr]
-	m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp]
-
-	__m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy];
-	m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy]
-
-	m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
-	m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
-	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy]
-	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
-	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy]
-
-	m3 = _mm_add_ps(m3, m1);
-
-	if (forward)
-	{
-		_mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08));
-		forward[2] = -_mm_cvtss_f32(s);
-	}
-	if (right)
-	{
-		__m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)]
-		xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask)));
-	}
-	if (up)
-	{
-		_mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E));
-		up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01));
-	}
-}
-#else // REHLDS_FIXES
-void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
-{
-	float		sr, sp, sy, cr, cp, cy;
-
-#ifndef SWDS
-	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
-#endif // SWDS
-
-	float angle;
-	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
-	sy = sin(angle);
-	cy = cos(angle);
-	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
-	sp = sin(angle);
-	cp = cos(angle);
-	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
-	sr = sin(angle);
-	cr = cos(angle);
-
-	if (forward)
-	{
-		forward[0] = cp*cy;
-		forward[1] = cp*sy;
-		forward[2] = -sp;
-	}
-	if (right)
-	{
-		right[0] = (-1 * sr*sp*cy + -1 * cr*-sy);
-		right[1] = (-1 * sr*sp*sy + -1 * cr*cy);
-		right[2] = -1 * sr*cp;
-	}
-	if (up)
-	{
-		up[0] = (cr*sp*cy + -sr*-sy);
-		up[1] = (cr*sp*sy + -sr*cy);
-		up[2] = cr*cp;
-	}
-}
-#endif // REHLDS_FIXES
-
-#ifdef REHLDS_FIXES
-// parallel SSE version
-void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
-{
-	__m128 s, c;
-	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
-
-	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
-	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
-	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
-
-	m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy]
-	m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp]
-
-	__m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr]
-	m1 = _mm_mul_ps(m1, m2);
-	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr]
-	m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr]
-
-	m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
-	m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
-	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr]
-	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
-	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy]
-
-	m3 = _mm_add_ps(m3, m1);
-
-	if (forward)
-	{
-		forward[0] = _mm_cvtss_f32(cp_mults);
-		_mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy);
-	}
-	if (right)
-	{
-		right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02));
-		_mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E));
-	}
-	if (up)
-	{
-		up[0] = -_mm_cvtss_f32(s);
-		_mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07));
-	}
-}
-#else // REHLDS_FIXES
-void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
-{
-	float		sr, sp, sy, cr, cp, cy;
-
-	float angle;
-	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
-	sy = sin(angle);
-	cy = cos(angle);
-	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
-	sp = sin(angle);
-	cp = cos(angle);
-	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
-	sr = sin(angle);
-	cr = cos(angle);
-
-	if (forward)
-	{
-		forward[0] = cp*cy;
-		forward[1] = (sr*sp*cy + cr*-sy);
-		forward[2] = (cr*sp*cy + -sr*-sy);
-	}
-	if (right)
-	{
-		right[0] = cp*sy;
-		right[1] = (sr*sp*sy + cr*cy);
-		right[2] = (cr*sp*sy + -sr*cy);
-	}
-	if (up)
-	{
-		up[0] = -sp;
-		up[1] = sr*cp;
-		up[2] = cr*cp;
-	}
-}
-#endif
-
-#ifdef REHLDS_FIXES
-// parallel SSE version
-void AngleMatrix(const vec_t *angles, float(*matrix)[4])
-{
-	__m128 s, c;
-	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
-
-	/*
-	matrix[0][1] = sr * sp * cy - cr * sy;
-	matrix[1][1] = sr * sp * sy + cr * cy;
-	matrix[0][2] = cr * sp * cy + sr * sy;
-	matrix[1][2] = cr * sp * sy - sr * cy;
-	*/
-	__m128 m1;
-	__m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp]
-	__m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy]
-
-	m1 = _mm_shuffle_ps(s, c, 0xAA);   // [sr][sr][cr][cr]
-	m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp]
-	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy]
-
-	m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3
-
-	m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr]
-	m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy]
-	m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001));
-	m3 = _mm_mul_ps(m3, m1);
-
-	m2 = _mm_add_ps(m2, m3);
-
-	/*
-	matrix[0][0] = cp * cy;
-	matrix[1][0] = cp * sy;
-	matrix[2][1] = sr * cp;
-	matrix[2][2] = cr * cp;
-	*/
-	m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp]
-	c = _mm_shuffle_ps(c, c, 0x40);  // [cp][cp][cp][cy]
-	m1 = _mm_mul_ps(m1, c);
-
-	// matrix[0]
-	m3 = _mm_shuffle_ps(m2, m2, 0xE1);
-	_mm_storeu_ps(&matrix[0][0], m3);
-	matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03));
-	*(int *)&matrix[0][3] = 0;
-
-	// matrix[1]
-	m2 = _mm_shuffle_ps(m2, m2, 0xB4);
-	_mm_storeu_ps(&matrix[1][0], m2);
-	matrix[1][0] = _mm_cvtss_f32(m1);
-	*(int *)&matrix[1][3] = 0;
-
-	// matrix[2]
-	_mm_storeu_ps(&matrix[2][0], m1);
-	matrix[2][0] = -_mm_cvtss_f32(s);
-	*(int *)&matrix[2][3] = 0;
-}
-#else // REHLDS_FIXES
-void AngleMatrix(const vec_t *angles, float(*matrix)[4])
-{
-	float		sr, sp, sy, cr, cp, cy;
-
-	float angle;
-	angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
-	sy = sin(angle);
-	cy = cos(angle);
-	angle = (float)(angles[YAW] * (M_PI * 2 / 360));
-	sp = sin(angle);
-	cp = cos(angle);
-	angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
-	sr = sin(angle);
-	cr = cos(angle);
-
-	matrix[0][0] = cr * cp;
-	matrix[0][1] = sy * sr * cp - cy * sp;
-	matrix[0][2] = cy * sr * cp + sy * sp;
-	matrix[0][3] = 0.0f;
-
-	matrix[1][0] = cr * sp;
-	matrix[1][1] = sy * sr * sp + cy * cp;
-	matrix[1][2] = cy * sr * sp - sy * cp;
-	matrix[1][3] = 0.0f;
-
-	matrix[2][0] = -sr;
-	matrix[2][1] = sy * cr;
-	matrix[2][2] = cy * cr;
-	matrix[2][3] = 0.0f;
-}
-#endif // REHLDS_FIXES
-
-NOBODY void AngleIMatrix(const vec_t *angles, float *matrix);
-//{
-//	float angle;                                                  //   372
-//	float sr;                                                     //   373
-//	float sp;                                                     //   373
-//	float sy;                                                     //   373
-//	float cr;                                                     //   373
-//	float cp;                                                     //   373
-//	float cy;                                                     //   373
-//}
-
-NOBODY void NormalizeAngles(float *angles);
-//{
-//	int i;                                                        //   402
-//}
-
-NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac);
-//{
-//	int i;                                                        //   428
-//	float ang1;                                                   //   429
-//	float ang2;                                                   //   429
-//	float d;                                                      //   430
-//	NormalizeAngles(float *angles);  //   432
-//	NormalizeAngles(float *angles);  //   433
-//	NormalizeAngles(float *angles);  //   453
-//}
-
-void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out)
-{
-	out[0] = _DotProduct(in1, in2[0]) + in2[0][3];
-	out[1] = _DotProduct(in1, in2[1]) + in2[1][3];
-	out[2] = _DotProduct(in1, in2[2]) + in2[2][3];
-}
-
-int VectorCompare(const vec_t *v1, const vec_t *v2)
-{
-#ifdef REHLDS_OPT_PEDANTIC
-	__m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
-	return !(_mm_movemask_ps(cmp) & (1|2|4));
-#else // REHLDS_OPT_PEDANTIC
 	for (int i = 0; i < 3; i++)
 	{
-		if (v1[i] != v2[i]) return 0;
+		if (v1[i] != v2[i])
+			return FALSE;
 	}
 
-	return 1;
-#endif // REHLDS_OPT_PEDANTIC
+	return TRUE;
 }
 
-#ifdef REHLDS_FIXES
-void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
-{
-	xmm2vec(out, _mm_add_ps(_mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm)), _mm_loadu_ps(veca)));
-}
-#else
-void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
-{
-	out[0] = scale * vecm[0] + veca[0];
-	out[1] = scale * vecm[1] + veca[1];
-	out[2] = scale * vecm[2] + veca[2];
-}
-#endif
-
-#ifndef REHLDS_FIXES
-real_t _DotProduct(const vec_t *v1, const vec_t *v2)
-{
-	return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
-}
-#else // REHLDS_FIXES
-float _DotProduct(const vec_t *v1, const vec_t *v2)
-{
-	// _mm_loadu_ps - load xmm from unaligned address
-	// _mm_cvtss_f32 - return low float value of xmm
-	// _mm_dp_ps - dot product
-	// 0x71 = 0b01110001 - mask for multiplying operands and result
-	// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
-
-	return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
-}
-#endif // REHLDS_FIXES
-
-NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
-//{
-//}
-
-void _VectorAdd(vec_t *veca, vec_t *vecb, vec_t *out)
-{
-	out[0] = veca[0] + vecb[0];
-	out[1] = veca[1] + vecb[1];
-	out[2] = veca[2] + vecb[2];
-}
-
-NOBODY void _VectorCopy(vec_t *in, vec_t *out);
-//{
-//}
-
-void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
-{
-#ifdef REHLDS_FIXES
-	xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
-#else // REHLDS_FIXES
-	cross[0] = v1[1] * v2[2] - v1[2] * v2[1];
-	cross[1] = v1[2] * v2[0] - v1[0] * v2[2];
-	cross[2] = v1[0] * v2[1] - v1[1] * v2[0];
-#endif // REHLDS_FIXES
-}
-
-float Length(const vec_t *v)
-{
-#ifdef REHLDS_FIXES
-	return _mm_cvtss_f32(length3D(_mm_loadu_ps(v)));
-#endif // REHLDS_FIXES
-
-	float length;
-
-	length = 0.0f;
-	for (int i = 0; i < 3; i++)
-	{
-		length = v[i] * v[i] + length;
-	}
-	return Q_sqrt(length);
-}
-
-float Length2D(const vec_t *v)
-{
-#ifdef REHLDS_FIXES
-	return _mm_cvtss_f32(length2D(_mm_loadu_ps(v)));
-#endif // REHLDS_FIXES
-
-	return Q_sqrt(v[0] * v[0] + v[1] * v[1]);
-}
-
-float VectorNormalize(vec3_t v)
-{
-	float	length, ilength;
-
-#ifdef REHLDS_FIXES
-	length = Length(v); // rsqrt is very inaccurate :(
-#else // REHLDS_FIXES
-	length = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
-	length = Q_sqrt(length);
-#endif // REHLDS_FIXES
-
-	if (length)
-	{
-		ilength = 1 / length;
-		v[0] *= ilength;
-		v[1] *= ilength;
-		v[2] *= ilength;
-	}
-
-	return length;
-}
-
-NOBODY void VectorInverse(vec_t *v);
-//{
-//}
-
-NOBODY int Q_log2(int val);
-//{
-//	int answer;                                                   //   568
-//}
-
-NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up);
-//{
-//	vec3_t tmp;                                                   //   576
-//	CrossProduct(const vec_t *v1,
-//		const vec_t *v2,
-//		vec_t *cross); /* size=0, low_pc=0 */ //   590
-//	VectorNormalize(vec_t *v); /* size=0, low_pc=0 */ //   591
-//	CrossProduct(const vec_t *v1,
-//		const vec_t *v2,
-//		vec_t *cross); /* size=0, low_pc=0 */ //   592
-//	VectorNormalize(vec_t *v); /* size=0, low_pc=0 */ //   593
-//}
-
-void VectorAngles(const vec_t *forward, vec_t *angles)
-{
-	float	length, yaw, pitch;
-
-	if (forward[1] == 0 && forward[0] == 0)
-	{
-		yaw = 0;
-		if (forward[2] > 0)
-			pitch = 90;
-		else
-			pitch = 270;
-	}
-	else
-	{
-		yaw = (atan2((double)forward[1], (double)forward[0]) * 180.0 / M_PI);
-		if (yaw < 0)
-			yaw += 360;
-
-#ifdef REHLDS_FIXES
-		length = Length2D(forward);
-#else // REHLDS_FIXES
-		length = Q_sqrt((double)(forward[0] * forward[0] + forward[1] * forward[1]));
-#endif // REHLDS_FIXES
-
-		pitch = atan2((double)forward[2], (double)length) * 180.0 / M_PI;
-		if (pitch < 0)
-			pitch += 360;
-	}
-
-	angles[0] = pitch;
-	angles[1] = yaw;
-	angles[2] = 0;
-}
-
-NOBODY void R_ConcatRotations(float *in1, float *in2, float *out);
-//{
-//
-//}
-
-#ifdef REHLDS_FIXES
-void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
-{
-	for (size_t i = 0; i < 3; i++)
-	{
-		__m128 a1 = _mm_mul_ps(_mm_set_ps1(in1[i][0]), _mm_loadu_ps(in2[0]));
-		__m128 a2 = _mm_mul_ps(_mm_set_ps1(in1[i][1]), _mm_loadu_ps(in2[1]));
-		__m128 a3 = _mm_mul_ps(_mm_set_ps1(in1[i][2]), _mm_loadu_ps(in2[2]));
-		_mm_storeu_ps(out[i], _mm_add_ps(a1, _mm_add_ps(a2, a3)));
-		out[i][3] += in1[i][3];
-	}
-}
-#else // REHLDS_FIXES
-void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
-{
-	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + in1[0][2] * in2[2][0];
-	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] + in1[0][2] * in2[2][1];
-	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] + in1[0][2] * in2[2][2];
-	out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] + in1[0][2] * in2[2][3] + in1[0][3];
-	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] + in1[1][2] * in2[2][0];
-	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] + in1[1][2] * in2[2][1];
-	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] + in1[1][2] * in2[2][2];
-	out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] + in1[1][2] * in2[2][3] + in1[1][3];
-	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] + in1[2][2] * in2[2][0];
-	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] + in1[2][2] * in2[2][1];
-	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + in1[2][2] * in2[2][2];
-	out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] + in1[2][2] * in2[2][3] + in1[2][3];
-}
-#endif // REHLDS_FIXES
-
-NOBODY void FloorDivMod(double numer, double denom, int *quotient, int *rem);
-//{
-//	int q;                                                        //   702
-//	int r;                                                        //   702
-//	double x;                                                     //   703
-//	floor(double __x); /* size=0, low_pc=0 */ //   726
-//	floor(double __x); /* size=0, low_pc=0 */ //   717
-//}
-
-NOBODY int GreatestCommonDivisor(int i1, int i2);
-//{
-//}
-
-NOBODY fixed16_t Invert24To16(fixed16_t val);
-//{
-//}
+#endif // #if !defined(REHLDS_SSE)
diff --git a/rehlds/engine/mathlib_e.h b/rehlds/engine/mathlib_e.h
index 7bfcb2a..77f69ee 100644
--- a/rehlds/engine/mathlib_e.h
+++ b/rehlds/engine/mathlib_e.h
@@ -30,9 +30,10 @@
 
 #include "maintypes.h"
 #include "model.h"
+#include "mathlib_sse.h"
 
-#ifndef REHLDS_FIXES
-// NOTE: In some cases we need high precision of floating-point,
+#if !defined(REHLDS_FIXES) && !defined(REHLDS_SSE)
+// NOTE: In some cases we need high precision of floating-point,
 // so use double instead of float, otherwise unittest will fail
 typedef double real_t;
 #else
@@ -77,59 +78,6 @@ static const int nanmask = 0x7F800000;
 
 #define IS_NAN(fvar) ((*reinterpret_cast<int*>(&(fvar)) & nanmask) == nanmask)
 
-inline float M_sqrt(float value) {
-	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value)));
-}
-
-inline double M_sqrt(double value) {
-	auto v = _mm_load_sd(&value);
-	return _mm_cvtsd_f64(_mm_sqrt_sd(v, v));
-}
-
-template<typename T>
-inline double M_sqrt(T value) {
-	return sqrt(value);
-}
-
-inline float M_min(float a, float b) {
-	return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
-}
-
-inline double M_min(double a, double b) {
-	return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
-}
-
-template<typename T>
-inline T M_min(T a, T b) {
-	return min(a, b);
-}
-
-inline float M_max(float a, float b) {
-	return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
-}
-
-inline double M_max(double a, double b) {
-	return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
-}
-
-template<typename T>
-inline T M_max(T a, T b) {
-	return max(a, b);
-}
-
-inline float M_clamp(float a, float min, float max) {
-	return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max)));
-}
-
-inline double M_clamp(double a, double min, double max) {
-	return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max)));
-}
-
-template<typename T>
-inline T M_clamp(T a, T min, T max) {
-	return clamp(a, min, max);
-}
-
 inline void VectorAdd(const vec_t *veca, const vec_t *vecb, vec_t *out)
 {
 	out[0] = veca[0] + vecb[0];
@@ -208,7 +156,7 @@ NOBODY void AngleIMatrix(const vec_t *angles, float *matrix);
 NOBODY void NormalizeAngles(float *angles);
 NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac);
 void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out);
-int VectorCompare(const vec_t *v1, const vec_t *v2);
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2);
 void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out);
 real_t _DotProduct(const vec_t *v1, const vec_t *v2);
 NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
@@ -219,7 +167,6 @@ float Length(const vec_t *v);
 float Length2D(const vec_t *v);
 float VectorNormalize(vec_t *v);
 NOBODY void VectorInverse(vec_t *v);
-void VectorScale(const vec_t *in, float scale, vec_t *out);
 NOBODY int Q_log2(int val);
 NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up);
 void VectorAngles(const vec_t *forward, vec_t *angles);
diff --git a/rehlds/engine/mathlib_sse.cpp b/rehlds/engine/mathlib_sse.cpp
new file mode 100644
index 0000000..e99d85a
--- /dev/null
+++ b/rehlds/engine/mathlib_sse.cpp
@@ -0,0 +1,375 @@
+/*
+*
+*    This program is free software; you can redistribute it and/or modify it
+*    under the terms of the GNU General Public License as published by the
+*    Free Software Foundation; either version 2 of the License, or (at
+*    your option) any later version.
+*
+*    This program is distributed in the hope that it will be useful, but
+*    WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*    You should have received a copy of the GNU General Public License
+*    along with this program; if not, write to the Free Software Foundation,
+*    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*
+*    In addition, as a special exception, the author gives permission to
+*    link the code of this program with the Half-Life Game Engine ("HL
+*    Engine") and Modified Game Libraries ("MODs") developed by Valve,
+*    L.L.C ("Valve").  You must obey the GNU General Public License in all
+*    respects for all of the code used other than the HL Engine and MODs
+*    from Valve.  If you modify this file, you may extend this exception
+*    to your version of the file, but you are not obligated to do so.  If
+*    you do not wish to do so, delete this exception statement from your
+*    version.
+*
+*/
+
+#include "precompiled.h"
+
+#if defined(REHLDS_SSE)
+
+// Intrisics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+// Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html
+
+// aligned vec4_t
+typedef ALIGN16 vec4_t avec4_t;
+typedef ALIGN16 unsigned int aivec4_t[4];
+
+// conversion multiplier
+const avec4_t deg2rad =
+{
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f,
+	(float)M_PI / 180.f
+};
+
+const aivec4_t negmask[4] =
+{
+	0x80000000,
+	0x80000000,
+	0x80000000,
+	0x80000000
+};
+
+const aivec4_t negmask_1001 =
+{
+	0x80000000,
+	0,
+	0,
+	0x80000000
+};
+
+const aivec4_t negmask_0010 =
+{
+	0,
+	0,
+	0x80000000,
+	0
+};
+
+// save 4d xmm to 3d vector. we can't optimize many simple vector3 functions because saving back to 3d is slow.
+inline void xmm2vec(vec_t *v, const __m128 m)
+{
+	_mm_storel_pi((__m64*)v, m);
+	_mm_store_ss(v + 2, _mm_shuffle_ps(m, m, 0x02));
+}
+
+FUNC_TARGET("sse4.1")
+inline __m128 dotProduct3D(__m128 v1, __m128 v2)
+{
+	if (cpuinfo.sse4_1)
+		return _mm_dp_ps(v1, v2, 0x71);
+	__m128 v = _mm_mul_ps(v1, v2);
+	return _mm_add_ps(_mm_movehl_ps(v, v), _mm_hadd_ps(v, v)); // SSE3
+}
+
+inline __m128 crossProduct3D(__m128 a, __m128 b)
+{
+	__m128 tmp1 = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
+	__m128 tmp2 = _mm_mul_ps(b, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)));
+	__m128 m = _mm_sub_ps(tmp1, tmp2);
+
+	return _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 0, 2, 1));
+}
+
+inline __m128 length3D(__m128 v)
+{
+	return _mm_sqrt_ps(dotProduct3D(v, v));
+}
+
+inline __m128 length2D(__m128 v)
+{
+	v = _mm_mul_ps(v, v);
+	return _mm_sqrt_ps(_mm_hadd_ps(v, v)); // hadd = SSE3
+}
+
+int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
+{
+	double dist1, dist2;
+	int sides = 0;
+
+	__m128 emin = _mm_loadu_ps(emins);
+	__m128 emax = _mm_loadu_ps(emaxs);
+	avec4_t d1, d2;
+
+	// general case
+	switch (p->signbits)
+	{
+	case 0:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		break;
+	case 1:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[0] = emins[0];
+		d2[0] = emaxs[0];
+		break;
+	case 2:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[1] = emins[1];
+		d2[1] = emaxs[1];
+		break;
+	case 3:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[2] = emaxs[2];
+		d2[2] = emins[2];
+		break;
+	case 4:
+		_mm_store_ps(d1, emax);
+		_mm_store_ps(d2, emin);
+		d1[2] = emins[2];
+		d2[2] = emaxs[2];
+		break;
+	case 5:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[1] = emaxs[1];
+		d2[1] = emins[1];
+		break;
+	case 6:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		d1[0] = emaxs[0];
+		d2[0] = emins[0];
+		break;
+	case 7:
+		_mm_store_ps(d1, emin);
+		_mm_store_ps(d2, emax);
+		break;
+	default:
+		BOPS_Error();
+		dist1 = dist2 = 0.0;
+		break;
+	}
+
+	dist1 = _DotProduct(p->normal, d1);
+	dist2 = _DotProduct(p->normal, d2);
+
+	if (dist1 >= p->dist)
+		sides = 1;
+
+	if (dist2 < p->dist)
+		sides |= 2;
+
+	return sides;
+}
+
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2)
+{
+	__m128 cmp = _mm_cmpneq_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2));
+	return !(_mm_movemask_ps(cmp) & (1|2|4));
+}
+
+void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+#ifndef SWDS
+	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
+#endif // SWDS
+
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
+	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
+	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
+
+	m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp]
+	m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr]
+	m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp]
+
+	__m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy];
+	m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy]
+
+	m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
+	m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
+	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy]
+	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
+	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy]
+
+	m3 = _mm_add_ps(m3, m1);
+
+	if (forward)
+	{
+		_mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08));
+		forward[2] = -_mm_cvtss_f32(s);
+	}
+	if (right)
+	{
+		__m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)]
+		xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask)));
+	}
+	if (up)
+	{
+		_mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E));
+		up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01));
+	}
+}
+
+void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
+{
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
+	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
+	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
+
+	m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy]
+	m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp]
+
+	__m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr]
+	m1 = _mm_mul_ps(m1, m2);
+	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr]
+	m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr]
+
+	m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
+	m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
+	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr]
+	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
+	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy]
+
+	m3 = _mm_add_ps(m3, m1);
+
+	if (forward)
+	{
+		forward[0] = _mm_cvtss_f32(cp_mults);
+		_mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy);
+	}
+	if (right)
+	{
+		right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02));
+		_mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E));
+	}
+	if (up)
+	{
+		up[0] = -_mm_cvtss_f32(s);
+		_mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07));
+	}
+}
+
+void AngleMatrix(const vec_t *angles, float(*matrix)[4])
+{
+	__m128 s, c;
+	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
+
+	/*
+	matrix[0][1] = sr * sp * cy - cr * sy;
+	matrix[1][1] = sr * sp * sy + cr * cy;
+	matrix[0][2] = cr * sp * cy + sr * sy;
+	matrix[1][2] = cr * sp * sy - sr * cy;
+	*/
+	__m128 m1;
+	__m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp]
+	__m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy]
+
+	m1 = _mm_shuffle_ps(s, c, 0xAA);   // [sr][sr][cr][cr]
+	m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp]
+	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy]
+
+	m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3
+
+	m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr]
+	m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy]
+	m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001));
+	m3 = _mm_mul_ps(m3, m1);
+
+	m2 = _mm_add_ps(m2, m3);
+
+	/*
+	matrix[0][0] = cp * cy;
+	matrix[1][0] = cp * sy;
+	matrix[2][1] = sr * cp;
+	matrix[2][2] = cr * cp;
+	*/
+	m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp]
+	c = _mm_shuffle_ps(c, c, 0x40);  // [cp][cp][cp][cy]
+	m1 = _mm_mul_ps(m1, c);
+
+	// matrix[0]
+	m3 = _mm_shuffle_ps(m2, m2, 0xE1);
+	_mm_storeu_ps(&matrix[0][0], m3);
+	matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03));
+	*(int *)&matrix[0][3] = 0;
+
+	// matrix[1]
+	m2 = _mm_shuffle_ps(m2, m2, 0xB4);
+	_mm_storeu_ps(&matrix[1][0], m2);
+	matrix[1][0] = _mm_cvtss_f32(m1);
+	*(int *)&matrix[1][3] = 0;
+
+	// matrix[2]
+	_mm_storeu_ps(&matrix[2][0], m1);
+	matrix[2][0] = -_mm_cvtss_f32(s);
+	*(int *)&matrix[2][3] = 0;
+}
+
+void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
+{
+	xmm2vec(out, _mm_add_ps(_mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm)), _mm_loadu_ps(veca)));
+}
+
+float _DotProduct(const vec_t *v1, const vec_t *v2)
+{
+	// _mm_loadu_ps - load xmm from unaligned address
+	// _mm_cvtss_f32 - return low float value of xmm
+	// _mm_dp_ps - dot product
+	// 0x71 = 0b01110001 - mask for multiplying operands and result
+	// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
+
+	return _mm_cvtss_f32(dotProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
+}
+
+float Length(const vec_t *v)
+{
+	return _mm_cvtss_f32(length3D(_mm_loadu_ps(v))); // rsqrt is very inaccurate :(
+}
+
+float Length2D(const vec_t *v)
+{
+	return _mm_cvtss_f32(length2D(_mm_loadu_ps(v)));
+}
+
+void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
+{
+	xmm2vec(cross, crossProduct3D(_mm_loadu_ps(v1), _mm_loadu_ps(v2)));
+}
+
+void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
+{
+	for (size_t i = 0; i < 3; i++)
+	{
+		__m128 a1 = _mm_mul_ps(_mm_set_ps1(in1[i][0]), _mm_loadu_ps(in2[0]));
+		__m128 a2 = _mm_mul_ps(_mm_set_ps1(in1[i][1]), _mm_loadu_ps(in2[1]));
+		__m128 a3 = _mm_mul_ps(_mm_set_ps1(in1[i][2]), _mm_loadu_ps(in2[2]));
+		_mm_storeu_ps(out[i], _mm_add_ps(a1, _mm_add_ps(a2, a3)));
+		out[i][3] += in1[i][3];
+	}
+}
+
+#endif // #if defined(REHLDS_SSE)
diff --git a/rehlds/engine/mathlib_sse.h b/rehlds/engine/mathlib_sse.h
new file mode 100644
index 0000000..0e9550f
--- /dev/null
+++ b/rehlds/engine/mathlib_sse.h
@@ -0,0 +1,74 @@
+/*
+*
+*    This program is free software; you can redistribute it and/or modify it
+*    under the terms of the GNU General Public License as published by the
+*    Free Software Foundation; either version 2 of the License, or (at
+*    your option) any later version.
+*
+*    This program is distributed in the hope that it will be useful, but
+*    WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*    General Public License for more details.
+*
+*    You should have received a copy of the GNU General Public License
+*    along with this program; if not, write to the Free Software Foundation,
+*    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*
+*    In addition, as a special exception, the author gives permission to
+*    link the code of this program with the Half-Life Game Engine ("HL
+*    Engine") and Modified Game Libraries ("MODs") developed by Valve,
+*    L.L.C ("Valve").  You must obey the GNU General Public License in all
+*    respects for all of the code used other than the HL Engine and MODs
+*    from Valve.  If you modify this file, you may extend this exception
+*    to your version of the file, but you are not obligated to do so.  If
+*    you do not wish to do so, delete this exception statement from your
+*    version.
+*
+*/
+
+#pragma once
+
+#if defined(REHLDS_SSE)
+
+inline float M_min(float a, float b)
+{
+	return _mm_cvtss_f32(_mm_min_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
+}
+
+inline double M_min(double a, double b)
+{
+	return _mm_cvtsd_f64(_mm_min_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
+}
+
+inline float M_max(float a, float b)
+{
+	return _mm_cvtss_f32(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&b)));
+}
+
+inline double M_max(double a, double b)
+{
+	return _mm_cvtsd_f64(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&b)));
+}
+
+inline float M_sqrt(float value)
+{
+	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_load_ss(&value)));
+}
+
+inline double M_sqrt(double value)
+{
+	auto v = _mm_load_sd(&value);
+	return _mm_cvtsd_f64(_mm_sqrt_sd(v, v));
+}
+
+inline float M_clamp(float a, float min, float max)
+{
+	return _mm_cvtss_f32(_mm_min_ss(_mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min)), _mm_load_ss(&max)));
+}
+
+inline double M_clamp(double a, double min, double max)
+{
+	return _mm_cvtsd_f64(_mm_min_sd(_mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min)), _mm_load_sd(&max)));
+}
+
+#endif // #if defined(REHLDS_SSE)
diff --git a/rehlds/engine/sse_mathfun.h b/rehlds/engine/sse_mathfun.h
index 24fd11c..2407327 100644
--- a/rehlds/engine/sse_mathfun.h
+++ b/rehlds/engine/sse_mathfun.h
@@ -36,7 +36,7 @@ misrepresented as being the original software.
 
 #ifdef _MSC_VER /* visual c++ */
 # define ALIGN16_BEG __declspec(align(16))
-# define ALIGN16_END 
+# define ALIGN16_END
 #else /* gcc or icc */
 # define ALIGN16_BEG
 # define ALIGN16_END __attribute__((aligned(16)))
diff --git a/rehlds/msvc/ReHLDS.vcxproj b/rehlds/msvc/ReHLDS.vcxproj
index 4d5147d..b587290 100644
--- a/rehlds/msvc/ReHLDS.vcxproj
+++ b/rehlds/msvc/ReHLDS.vcxproj
@@ -72,6 +72,7 @@
     <ClCompile Include="..\engine\ipratelimitWrapper.cpp" />
     <ClCompile Include="..\engine\l_studio.cpp" />
     <ClCompile Include="..\engine\mathlib.cpp" />
+    <ClCompile Include="..\engine\mathlib_sse.cpp" />
     <ClCompile Include="..\engine\md5.cpp" />
     <ClCompile Include="..\engine\mem.cpp" />
     <ClCompile Include="..\engine\model.cpp" />
@@ -432,6 +433,7 @@
     <ClInclude Include="..\engine\keys.h" />
     <ClInclude Include="..\engine\l_studio.h" />
     <ClInclude Include="..\engine\mathlib_e.h" />
+    <ClInclude Include="..\engine\mathlib_sse.h" />
     <ClInclude Include="..\engine\mem.h" />
     <ClInclude Include="..\engine\model_rehlds.h" />
     <ClInclude Include="..\engine\modinfo.h" />
@@ -805,7 +807,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SSE;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <FloatingPointModel>Precise</FloatingPointModel>
       <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
@@ -1149,7 +1151,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_SSE;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
       <PrecompiledHeader>Use</PrecompiledHeader>
diff --git a/rehlds/msvc/ReHLDS.vcxproj.filters b/rehlds/msvc/ReHLDS.vcxproj.filters
index e143343..fddae72 100644
--- a/rehlds/msvc/ReHLDS.vcxproj.filters
+++ b/rehlds/msvc/ReHLDS.vcxproj.filters
@@ -353,6 +353,9 @@
     <ClCompile Include="..\engine\SystemWrapper.cpp">
       <Filter>engine\common</Filter>
     </ClCompile>
+    <ClCompile Include="..\engine\mathlib_sse.cpp">
+      <Filter>engine</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\version\version.h">
@@ -1081,6 +1084,9 @@
     <ClInclude Include="..\public\strtools.h">
       <Filter>public</Filter>
     </ClInclude>
+    <ClInclude Include="..\engine\mathlib_sse.h">
+      <Filter>engine</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="..\linux\appversion.sh">
diff --git a/rehlds/public/rehlds/crc32c.cpp b/rehlds/public/rehlds/crc32c.cpp
index 669fc81..4fcde71 100644
--- a/rehlds/public/rehlds/crc32c.cpp
+++ b/rehlds/public/rehlds/crc32c.cpp
@@ -115,6 +115,7 @@ uint32 crc32c_t_nosse(uint32 iCRC, const uint8 *buf, int len) {
 	return crc;
 }
 
+#ifdef REHLDS_SSE
 FUNC_TARGET("sse4.2")
 uint32 crc32c_t8_sse(uint32 iCRC, uint8 u8) {
 	return _mm_crc32_u8(iCRC, u8);
@@ -140,6 +141,14 @@ uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
 	return cpuinfo.sse4_2 ? crc32c_t_sse(iCRC, buf, len) : crc32c_t_nosse(iCRC, buf, len);
 }
 
+#else
+
+uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
+	return crc32c_t_nosse(iCRC, buf, len);
+}
+
+#endif // REHLDS_SSE
+
 uint32 crc32c(const uint8 *buf, int len) {
 	return crc32c_t(0xffffffff, buf, len);
 }
diff --git a/rehlds/public/rehlds/crc32c.h b/rehlds/public/rehlds/crc32c.h
index 55bed92..7ee27fc 100644
--- a/rehlds/public/rehlds/crc32c.h
+++ b/rehlds/public/rehlds/crc32c.h
@@ -11,7 +11,9 @@ GNU Lesser General Public License for more details.
 You should have received a copy of the GNU Lesser General Public License
 along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
+
 #pragma once
+
 #include "archtypes.h"
 
 extern uint32 crc32c_t8_nosse(uint32 iCRC, uint8 u8);
diff --git a/rehlds/public/rehlds/static_map.h b/rehlds/public/rehlds/static_map.h
index 11235d7..b1da16f 100644
--- a/rehlds/public/rehlds/static_map.h
+++ b/rehlds/public/rehlds/static_map.h
@@ -169,7 +169,6 @@ public:
 			}
 		}
 
-	
 		Iterator(CStaticMap* m) {
 			m_Map = m;
 			m_RootNodes = m_Map->m_RootNodes;
@@ -226,6 +225,7 @@ protected:
 	virtual uint32 hash(const char* const &val) {
 		uint32 cksum = 0;
 		const char* pcc = val;
+#ifdef REHLDS_SSE
 		if (cpuinfo.sse4_2) {
 			while (*pcc) {
 				char cc = *(pcc++);
@@ -234,7 +234,10 @@ protected:
 				}
 				cksum = crc32c_t8_sse(cksum, cc);
 			}
-		} else {
+		}
+		else
+#endif // REHLDS_SSE
+		{
 			while (*pcc) {
 				char cc = *(pcc++);
 				if (cc >= 'A' || cc <= 'Z') {