Move SSE math routines under the REHLDS_SSE macro

2025-07-16 02:06:15 +03:00 · 2017-12-09 16:47:08 +07:00 · 2017-12-09 16:47:08 +07:00 · 5f1672ca85
commit 5f1672ca85
parent 4fad5255ba
12 changed files with 812 additions and 744 deletions
--- a/rehlds/build.gradle
+++ b/rehlds/build.gradle
@ -184,11 +184,11 @@ void setupToolchain(NativeBinarySpec b) {
 	}
 	if (unitTestExecutable) {
-		cfg.singleDefines 'REHLDS_UNIT_TESTS'
+		cfg.singleDefines 'REHLDS_UNIT_TESTS', 'REHLDS_SSE'
 	}
 	if (rehldsFixes) {
-		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
+		cfg.singleDefines 'REHLDS_FIXES', 'REHLDS_SSE', 'REHLDS_CHECKS', 'HAVE_OPT_STRTOOLS'
 	}
 	ToolchainConfigUtils.apply(project, cfg, b)
--- a/rehlds/common/mathlib.h
+++ b/rehlds/common/mathlib.h
@ -82,6 +82,30 @@ inline T clamp(T a, T min, T max)
 	return (a > max) ? max : (a < min) ? min : a;
 }
 // Type-generic minimum of two values; delegates to min().
 template<typename T>
 inline T M_min(T a, T b)
 {
 	const T lesser = min(a, b);
 	return lesser;
 }
 // Type-generic maximum of two values; delegates to max().
 template<typename T>
 inline T M_max(T a, T b)
 {
 	const T greater = max(a, b);
 	return greater;
 }
 // Type-generic clamp of a into [min, max]; delegates to clamp().
 template<typename T>
 inline T M_clamp(T a, T min, T max)
 {
 	const T bounded = clamp(a, min, max);
 	return bounded;
 }
 // Type-generic square root; the result of sqrt() is always returned as double.
 template<typename T>
 inline double M_sqrt(T value)
 {
 	const double root = sqrt(value);
 	return root;
 }
 template <typename T>
 inline T bswap(T s)
 {
--- a/rehlds/engine/mathlib.cpp
+++ b/rehlds/engine/mathlib.cpp
--- a/rehlds/engine/mathlib_e.h
+++ b/rehlds/engine/mathlib_e.h
@ -30,8 +30,9 @@
 #include "maintypes.h"
 #include "model.h"
 #include "mathlib_sse.h"
-#ifndef REHLDS_FIXES
+#if !defined(REHLDS_FIXES) && !defined(REHLDS_SSE)
 // NOTE: Some computations need extra floating-point precision,
 // so real_t is double here instead of float; otherwise the unit tests fail
 typedef double real_t;
@ -77,59 +78,6 @@ static const int nanmask = 0x7F800000;
 // True when every bit of nanmask is set in the raw 32-bit pattern of fvar. With nanmask == 0x7F800000 that means
 // an all-ones exponent field, which covers NaN and +/-infinity (assumes fvar is a 32-bit IEEE-754 float - TODO confirm).
 // fvar must be an lvalue because its address is taken.
 #define IS_NAN(fvar) ((*reinterpret_cast<int*>(&(fvar)) & nanmask) == nanmask)
 // Single-precision square root computed with the scalar SSE instruction.
 inline float M_sqrt(float value) {
 	const __m128 in = _mm_load_ss(&value);
 	return _mm_cvtss_f32(_mm_sqrt_ss(in));
 }
 // Double-precision square root computed with the scalar SSE2 instruction.
 inline double M_sqrt(double value) {
 	const __m128d in = _mm_load_sd(&value);
 	const __m128d root = _mm_sqrt_sd(in, in);
 	return _mm_cvtsd_f64(root);
 }
 // Any other argument type goes through sqrt(); the result is returned as double.
 template<typename T>
 inline double M_sqrt(T value) {
 	const double root = sqrt(value);
 	return root;
 }
 // Single-precision minimum via the scalar SSE instruction.
 inline float M_min(float a, float b) {
 	const __m128 lhs = _mm_load_ss(&a);
 	const __m128 rhs = _mm_load_ss(&b);
 	return _mm_cvtss_f32(_mm_min_ss(lhs, rhs));
 }
 // Double-precision minimum via the scalar SSE2 instruction.
 inline double M_min(double a, double b) {
 	const __m128d lhs = _mm_load_sd(&a);
 	const __m128d rhs = _mm_load_sd(&b);
 	return _mm_cvtsd_f64(_mm_min_sd(lhs, rhs));
 }
 // Any other type falls back to min().
 template<typename T>
 inline T M_min(T a, T b) {
 	const T lesser = min(a, b);
 	return lesser;
 }
 // Single-precision maximum via the scalar SSE instruction.
 inline float M_max(float a, float b) {
 	const __m128 lhs = _mm_load_ss(&a);
 	const __m128 rhs = _mm_load_ss(&b);
 	return _mm_cvtss_f32(_mm_max_ss(lhs, rhs));
 }
 // Double-precision maximum via the scalar SSE2 instruction.
 inline double M_max(double a, double b) {
 	const __m128d lhs = _mm_load_sd(&a);
 	const __m128d rhs = _mm_load_sd(&b);
 	return _mm_cvtsd_f64(_mm_max_sd(lhs, rhs));
 }
 // Any other type falls back to max().
 template<typename T>
 inline T M_max(T a, T b) {
 	const T greater = max(a, b);
 	return greater;
 }
 // Single-precision clamp: raise a to at least min, then cap it at max.
 inline float M_clamp(float a, float min, float max) {
 	const __m128 raised = _mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min));
 	const __m128 capped = _mm_min_ss(raised, _mm_load_ss(&max));
 	return _mm_cvtss_f32(capped);
 }
 // Double-precision clamp: raise a to at least min, then cap it at max.
 inline double M_clamp(double a, double min, double max) {
 	const __m128d raised = _mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min));
 	const __m128d capped = _mm_min_sd(raised, _mm_load_sd(&max));
 	return _mm_cvtsd_f64(capped);
 }
 // Any other type falls back to clamp().
 template<typename T>
 inline T M_clamp(T a, T min, T max) {
 	const T bounded = clamp(a, min, max);
 	return bounded;
 }
 inline void VectorAdd(const vec_t *veca, const vec_t *vecb, vec_t *out)
 {
 	out[0] = veca[0] + vecb[0];
@ -208,7 +156,7 @@ NOBODY void AngleIMatrix(const vec_t *angles, float *matrix);
 NOBODY void NormalizeAngles(float *angles);
 NOBODY void InterpolateAngles(float *start, float *end, float *output, float frac);
 void VectorTransform(const vec_t *in1, float (*in2)[4], vec_t *out);
-int VectorCompare(const vec_t *v1, const vec_t *v2);
+qboolean VectorCompare(const vec_t *v1, const vec_t *v2);
 void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out);
 real_t _DotProduct(const vec_t *v1, const vec_t *v2);
 NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
@ -219,7 +167,6 @@ float Length(const vec_t *v);
 float Length2D(const vec_t *v);
 float VectorNormalize(vec_t *v);
 NOBODY void VectorInverse(vec_t *v);
 void VectorScale(const vec_t *in, float scale, vec_t *out);
 NOBODY int Q_log2(int val);
 NOBODY void VectorMatrix(vec_t *forward, vec_t *right, vec_t *up);
 void VectorAngles(const vec_t *forward, vec_t *angles);
--- a/rehlds/engine/mathlib_sse.cpp
+++ b/rehlds/engine/mathlib_sse.cpp
@ -0,0 +1,375 @@
 /*
 *
 *    This program is free software; you can redistribute it and/or modify it
 *    under the terms of the GNU General Public License as published by the
 *    Free Software Foundation; either version 2 of the License, or (at
 *    your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful, but
 *    WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software Foundation,
 *    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *    In addition, as a special exception, the author gives permission to
 *    link the code of this program with the Half-Life Game Engine ("HL
 *    Engine") and Modified Game Libraries ("MODs") developed by Valve,
 *    L.L.C ("Valve").  You must obey the GNU General Public License in all
 *    respects for all of the code used other than the HL Engine and MODs
 *    from Valve.  If you modify this file, you may extend this exception
 *    to your version of the file, but you are not obligated to do so.  If
 *    you do not wish to do so, delete this exception statement from your
 *    version.
 *
 */
 #include "precompiled.h"
 #if defined(REHLDS_SSE)
 // Intrinsics guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 // Shufps calculator: http://wurstcaptures.untergrund.net/assembler_tricks.html
 // 16-byte aligned 4-lane vectors, suitable for _mm_load_ps / _mm_store_ps
 typedef ALIGN16 vec4_t avec4_t;
 typedef ALIGN16 unsigned int aivec4_t[4];
 // conversion multiplier: degrees -> radians, replicated in every lane
 const avec4_t deg2rad =
 {
 	(float)M_PI / 180.f,
 	(float)M_PI / 180.f,
 	(float)M_PI / 180.f,
 	(float)M_PI / 180.f
 };
 // Sign bit set in all four lanes: XOR with this negates every lane.
 // This is a single vector; it used to be declared as an array of four vectors of which only the first was the mask.
 const aivec4_t negmask =
 {
 	0x80000000,
 	0x80000000,
 	0x80000000,
 	0x80000000
 };
 // Sign bit set in lanes 0 and 3 only
 const aivec4_t negmask_1001 =
 {
 	0x80000000,
 	0,
 	0,
 	0x80000000
 };
 // Sign bit set in lane 2 only
 const aivec4_t negmask_0010 =
 {
 	0,
 	0,
 	0x80000000,
 	0
 };
 // Write lanes 0..2 of a 4-lane register into a 3-float vector. Saving back to 3D is slow, which is why
 // many simple vector3 functions are not worth optimising this way.
 inline void xmm2vec(vec_t *v, const __m128 m)
 {
 	// lanes 0 and 1 go out as one 64-bit store
 	_mm_storel_pi(reinterpret_cast<__m64 *>(v), m);
 	// lane 2 is moved down to lane 0 and stored on its own, so nothing is written past v[2]
 	const __m128 lane2 = _mm_shuffle_ps(m, m, 0x02);
 	_mm_store_ss(&v[2], lane2);
 }
 // Dot product of lanes 0..2 of v1 and v2; the result is delivered in lane 0.
 FUNC_TARGET("sse4.1")
 inline __m128 dotProduct3D(__m128 v1, __m128 v2)
 {
 	if (!cpuinfo.sse4_1)
 	{
 		// no dpps available: multiply lane-wise, then add (x + y) from hadd to z from movehl
 		const __m128 prod = _mm_mul_ps(v1, v2);
 		const __m128 xy = _mm_hadd_ps(prod, prod); // SSE3
 		const __m128 z = _mm_movehl_ps(prod, prod);
 		return _mm_add_ps(z, xy);
 	}
 	// 0x71: multiply lanes 0..2 and write their sum to lane 0
 	return _mm_dp_ps(v1, v2, 0x71);
 }
 // Cross product a x b computed on lanes 0..2; the result is in lanes 0..2.
 inline __m128 crossProduct3D(__m128 a, __m128 b)
 {
 	// rotate (x, y, z) -> (y, z, x); lane 3 stays where it is
 	const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
 	const __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
 	// a * b.yzx - b * a.yzx holds the result rotated as (z, x, y)
 	const __m128 rotated = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(b, a_yzx));
 	// one more rotation puts it back in (x, y, z) order
 	return _mm_shuffle_ps(rotated, rotated, _MM_SHUFFLE(3, 0, 2, 1));
 }
 // Euclidean length of lanes 0..2 of v, taken as the square root of v dot v.
 inline __m128 length3D(__m128 v)
 {
 	const __m128 lengthSquared = dotProduct3D(v, v);
 	return _mm_sqrt_ps(lengthSquared);
 }
 // Length of the (x, y) part of v; the result is in lane 0.
 inline __m128 length2D(__m128 v)
 {
 	const __m128 squares = _mm_mul_ps(v, v);
 	// hadd puts x*x + y*y into lane 0 (hadd = SSE3)
 	const __m128 sums = _mm_hadd_ps(squares, squares);
 	return _mm_sqrt_ps(sums);
 }
 // Classifies the axis-aligned box [emins, emaxs] against plane p.
 // Returns a bit mask: bit 0 (value 1) is set when dist1 >= p->dist, bit 1 (value 2) when dist2 < p->dist,
 // where dist1/dist2 are the dot products of p->normal with two opposite box corners d1/d2.
 // For every bit i set in p->signbits, d1 takes emins[i] and d2 takes emaxs[i]; for a clear bit it is the reverse.
 // NOTE: emins/emaxs are read with 4-float loads, so one float past the 3D data must be readable.
 int BoxOnPlaneSide(vec_t *emins, vec_t *emaxs, mplane_t *p)
 {
 	int sides = 0;
 	__m128 emin = _mm_loadu_ps(emins);
 	__m128 emax = _mm_loadu_ps(emaxs);
 	avec4_t d1, d2;
 	// general case
 	switch (p->signbits)
 	{
 	case 0:
 		_mm_store_ps(d1, emax);
 		_mm_store_ps(d2, emin);
 		break;
 	case 1:
 		_mm_store_ps(d1, emax);
 		_mm_store_ps(d2, emin);
 		d1[0] = emins[0];
 		d2[0] = emaxs[0];
 		break;
 	case 2:
 		_mm_store_ps(d1, emax);
 		_mm_store_ps(d2, emin);
 		d1[1] = emins[1];
 		d2[1] = emaxs[1];
 		break;
 	case 3:
 		_mm_store_ps(d1, emin);
 		_mm_store_ps(d2, emax);
 		d1[2] = emaxs[2];
 		d2[2] = emins[2];
 		break;
 	case 4:
 		_mm_store_ps(d1, emax);
 		_mm_store_ps(d2, emin);
 		d1[2] = emins[2];
 		d2[2] = emaxs[2];
 		break;
 	case 5:
 		_mm_store_ps(d1, emin);
 		_mm_store_ps(d2, emax);
 		d1[1] = emaxs[1];
 		d2[1] = emins[1];
 		break;
 	case 6:
 		_mm_store_ps(d1, emin);
 		_mm_store_ps(d2, emax);
 		d1[0] = emaxs[0];
 		d2[0] = emins[0];
 		break;
 	case 7:
 		_mm_store_ps(d1, emin);
 		_mm_store_ps(d2, emax);
 		break;
 	default:
 		BOPS_Error();
 		// Invalid signbits: zero both corners so the distances computed below are a well-defined 0.
 		// Previously dist1/dist2 were set to 0 here and then overwritten from uninitialised d1/d2.
 		_mm_store_ps(d1, _mm_setzero_ps());
 		_mm_store_ps(d2, _mm_setzero_ps());
 		break;
 	}
 	const double dist1 = _DotProduct(p->normal, d1);
 	const double dist2 = _DotProduct(p->normal, d2);
 	if (dist1 >= p->dist)
 		sides = 1;
 	if (dist2 < p->dist)
 		sides |= 2;
 	return sides;
 }
 // Returns non-zero when the x, y and z components of v1 and v2 all compare equal.
 qboolean VectorCompare(const vec_t *v1, const vec_t *v2)
 {
 	const __m128 lhs = _mm_loadu_ps(v1);
 	const __m128 rhs = _mm_loadu_ps(v2);
 	// one mask bit per lane that differs; lane 3 lies past the 3D data and is masked out
 	const int differing = _mm_movemask_ps(_mm_cmpneq_ps(lhs, rhs)) & 0x7;
 	return differing == 0;
 }
 // Builds the forward/right/up vectors for Euler angles given in degrees. In the lane comments below
 // s/c are sine/cosine and the suffixes p, y, r refer to angles[0], angles[1], angles[2]
 // (presumably pitch, yaw, roll). Each output pointer may be NULL, in which case it is skipped.
 // Results:
 //   forward = (cp*cy,                cp*sy,                -sp)
 //   right   = (-(sr*sp*cy - cr*sy),  -(sr*sp*sy + cr*cy),  -sr*cp)
 //   up      = (cr*sp*cy + sr*sy,     cr*sp*sy - sr*cy,      cr*cp)
 // NOTE: angles is read with a 4-float load, so one float past the three used must be readable.
 void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
 {
 #ifndef SWDS
 	// non-dedicated builds first pass the argument addresses to the engine destination hook
 	g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
 #endif // SWDS
 	__m128 s, c;
 	// degrees -> radians, then sincos_ps fills s and c (presumably lane-wise sine and cosine; defined elsewhere)
 	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
 	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
 	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
 	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
 	m1 = _mm_shuffle_ps(c, s, 0x15); // [cy][cy][sy][sp]
 	m2 = _mm_shuffle_ps(s, c, 0xA0); // [sp][sp][cr][cr]
 	m1 = _mm_shuffle_ps(m1, m1, 0xC8); // [cy][sy][cy][sp]
 	__m128 m3 = _mm_shuffle_ps(s, s, 0x4A); // [sr][sr][sp][sy];
 	m3 = _mm_mul_ps(m3, _mm_mul_ps(m1, m2)); // [sp*cy*sr][sp*sy*sr][cr*cy*sp][cr*sp*sy]
 	m2 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
 	m1 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
 	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [sy][cr][sy][cy]
 	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-cr][cy][sr][-sr]
 	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][cy*cr][sr*sy][-sr*cy]
 	// m3 = [sr*sp*cy - cr*sy][sr*sp*sy + cr*cy][cr*sp*cy + sr*sy][cr*sp*sy - sr*cy]
 	m3 = _mm_add_ps(m3, m1);
 	if (forward)
 	{
 		// lanes 0 and 2 of cp_mults are cp*cy and cp*sy
 		_mm_storel_pi((__m64 *)forward, _mm_shuffle_ps(cp_mults, cp_mults, 0x08));
 		forward[2] = -_mm_cvtss_f32(s);
 	}
 	if (right)
 	{
 		__m128 r = _mm_shuffle_ps(m3, cp_mults, 0xF4); // [m3(0)][m3(1)][cp(3)][cp(3)]
 		// XOR with the all-lanes sign mask negates the whole vector
 		xmm2vec(right, _mm_xor_ps(r, _mm_load_ps((float *)&negmask)));
 	}
 	if (up)
 	{
 		// lanes 2 and 3 of m3, then cp*cr from lane 1 of cp_mults
 		_mm_storel_pi((__m64 *)up, _mm_shuffle_ps(m3, m3, 0x0E));
 		up[2] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x01));
 	}
 }
 // Transposed variant of AngleVectors: each output receives one row of the transposed rotation.
 // Lane comments use s/c for sine/cosine and p, y, r for angles[0], angles[1], angles[2] (degrees).
 // Each output pointer may be NULL, in which case it is skipped.
 // Results:
 //   forward = (cp*cy,  sr*sp*cy - cr*sy,  cr*sp*cy + sr*sy)
 //   right   = (cp*sy,  sr*sp*sy + cr*cy,  cr*sp*sy - sr*cy)
 //   up      = (-sp,    sr*cp,             cr*cp)
 // NOTE: angles is read with a 4-float load, so one float past the three used must be readable.
 void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
 {
 	__m128 s, c;
 	// degrees -> radians, then sincos_ps fills s and c (presumably lane-wise sine and cosine; defined elsewhere)
 	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
 	__m128 m1 = _mm_shuffle_ps(c, s, 0x90); // [cp][cp][sy][sr]
 	__m128 m2 = _mm_shuffle_ps(c, c, 0x09); // [cy][cr][cp][cp]
 	__m128 cp_mults = _mm_mul_ps(m1, m2); // [cp * cy][cp * cr][cp * sy][cp * sr];
 	m1 = _mm_shuffle_ps(s, s, 0x50); // [sp][sp][sy][sy]
 	m2 = _mm_shuffle_ps(c, s, 0x05); // [cy][cy][sp][sp]
 	__m128 m3 = _mm_shuffle_ps(s, c, 0xAA); // [sr][sr][cr][cr]
 	m1 = _mm_mul_ps(m1, m2); // [sp*cy][sp*cy][sy*sp][sy*sp]
 	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [sr][cr][sr][cr]
 	m3 = _mm_mul_ps(m3, m1); // [sp*cy*sr][sp*cy*cr][sy*sp*sr][sy*sp*cr]
 	m2 = _mm_shuffle_ps(c, s, 0xA6); // [cr][cy][sr][sr]
 	m1 = _mm_shuffle_ps(s, c, 0x65); // [sy][sy][cr][cy]
 	m2 = _mm_shuffle_ps(m2, m2, 0xD8); // [cr][sr][cy][sr]
 	m1 = _mm_xor_ps(m1, _mm_load_ps((float *)&negmask_1001)); // [-sy][sy][cr][-cy]
 	m1 = _mm_mul_ps(m1, m2); // [-cr*sy][sr*sy][cy*cr][-sr*cy]
 	// m3 = [sr*sp*cy - cr*sy][cr*sp*cy + sr*sy][sr*sp*sy + cr*cy][cr*sp*sy - sr*cy]
 	m3 = _mm_add_ps(m3, m1);
 	if (forward)
 	{
 		forward[0] = _mm_cvtss_f32(cp_mults);
 		_mm_storel_pi((__m64*)(forward + 1), m3); // (sr*sp*cy + cr*-sy);
 	}
 	if (right)
 	{
 		// lane 2 of cp_mults is cp*sy; lanes 2 and 3 of m3 follow
 		right[0] = _mm_cvtss_f32(_mm_shuffle_ps(cp_mults, cp_mults, 0x02));
 		_mm_storel_pi((__m64*)(right + 1), _mm_shuffle_ps(m3, m3, 0x0E));
 	}
 	if (up)
 	{
 		up[0] = -_mm_cvtss_f32(s);
 		// lanes 3 and 1 of cp_mults: cp*sr, cp*cr
 		_mm_storel_pi((__m64 *)&up[1], _mm_shuffle_ps(cp_mults, cp_mults, 0x07));
 	}
 }
 // Fills the 3x4 matrix with the rotation for Euler angles given in degrees; column 3 of every row is set to 0.
 // Lane comments use s/c for sine/cosine and p, y, r for angles[0], angles[1], angles[2].
 // Resulting rows:
 //   matrix[0] = (cp*cy,  sr*sp*cy - cr*sy,  cr*sp*cy + sr*sy,  0)
 //   matrix[1] = (cp*sy,  sr*sp*sy + cr*cy,  cr*sp*sy - sr*cy,  0)
 //   matrix[2] = (-sp,    sr*cp,             cr*cp,             0)
 // NOTE: angles is read with a 4-float load, so one float past the three used must be readable.
 void AngleMatrix(const vec_t *angles, float(*matrix)[4])
 {
 	__m128 s, c;
 	// degrees -> radians, then sincos_ps fills s and c (presumably lane-wise sine and cosine; defined elsewhere)
 	sincos_ps(_mm_mul_ps(_mm_loadu_ps(angles), _mm_load_ps(deg2rad)), &s, &c);
 	/*
 	matrix[0][1] = sr * sp * cy - cr * sy;
 	matrix[1][1] = sr * sp * sy + cr * cy;
 	matrix[0][2] = cr * sp * cy + sr * sy;
 	matrix[1][2] = cr * sp * sy - sr * cy;
 	*/
 	__m128 m1;
 	__m128 m2 = _mm_shuffle_ps(s, c, 0x00); // [sp][sp][cp][cp]
 	__m128 m3 = _mm_shuffle_ps(c, s, 0x55); // [cy][cy][sy][sy]
 	m1 = _mm_shuffle_ps(s, c, 0xAA);   // [sr][sr][cr][cr]
 	m2 = _mm_shuffle_ps(m2, m2, 0x00); // [sp][sp][sp][sp]
 	m3 = _mm_shuffle_ps(m3, m3, 0xD8); // [cy][sy][cy][sy]
 	m2 = _mm_mul_ps(m2, _mm_mul_ps(m1, m3)); // m1*m2*m3
 	m1 = _mm_shuffle_ps(m1, m1, 0x1B); // [cr][cr][sr][sr]
 	m3 = _mm_shuffle_ps(m3, m3, 0xB1); // [sy][cy][sy][cy]
 	m3 = _mm_xor_ps(m3, _mm_load_ps((float *)&negmask_1001)); // [-sy][cy][sy][-cy]
 	m3 = _mm_mul_ps(m3, m1);
 	// m2 = [matrix[0][1]][matrix[1][1]][matrix[0][2]][matrix[1][2]]
 	m2 = _mm_add_ps(m2, m3);
 	/*
 	matrix[0][0] = cp * cy;
 	matrix[1][0] = cp * sy;
 	matrix[2][1] = sr * cp;
 	matrix[2][2] = cr * cp;
 	*/
 	m1 = _mm_shuffle_ps(s, c, 0x29); // [sy][sr][cr][cp]
 	c = _mm_shuffle_ps(c, c, 0x40);  // [cp][cp][cp][cy]
 	// m1 = [matrix[1][0]][matrix[2][1]][matrix[2][2]][matrix[0][0]]
 	m1 = _mm_mul_ps(m1, c);
 	// Each row is written with one 4-float store whose lanes 1 and 2 are already final;
 	// elements 0 and 3 are then overwritten individually.
 	// matrix[0]
 	m3 = _mm_shuffle_ps(m2, m2, 0xE1);
 	_mm_storeu_ps(&matrix[0][0], m3);
 	matrix[0][0] = _mm_cvtss_f32(_mm_shuffle_ps(m1, m1, 0x03));
 	*(int *)&matrix[0][3] = 0;
 	// matrix[1]
 	m2 = _mm_shuffle_ps(m2, m2, 0xB4);
 	_mm_storeu_ps(&matrix[1][0], m2);
 	matrix[1][0] = _mm_cvtss_f32(m1);
 	*(int *)&matrix[1][3] = 0;
 	// matrix[2]
 	_mm_storeu_ps(&matrix[2][0], m1);
 	matrix[2][0] = -_mm_cvtss_f32(s);
 	*(int *)&matrix[2][3] = 0;
 }
 // out = veca + scale * vecm, written for the x/y/z components only.
 void VectorMA(const vec_t *veca, float scale, const vec_t *vecm, vec_t *out)
 {
 	const __m128 scaled = _mm_mul_ps(_mm_set_ps1(scale), _mm_loadu_ps(vecm));
 	const __m128 sum = _mm_add_ps(scaled, _mm_loadu_ps(veca));
 	xmm2vec(out, sum);
 }
 // Dot product of the x/y/z components of v1 and v2.
 float _DotProduct(const vec_t *v1, const vec_t *v2)
 {
 	// _mm_loadu_ps loads an xmm register from an unaligned address
 	const __m128 lhs = _mm_loadu_ps(v1);
 	const __m128 rhs = _mm_loadu_ps(v2);
 	// dotProduct3D uses _mm_dp_ps with mask 0x71 = 0b01110001 (which operands to multiply, where the result goes).
 	// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
 	const __m128 dot = dotProduct3D(lhs, rhs);
 	// _mm_cvtss_f32 returns the low float of the xmm register
 	return _mm_cvtss_f32(dot);
 }
 // Euclidean length of the 3D vector v.
 float Length(const vec_t *v)
 {
 	// a full-precision sqrt is used here: rsqrt is very inaccurate :(
 	const __m128 len = length3D(_mm_loadu_ps(v));
 	return _mm_cvtss_f32(len);
 }
 // Length of the (x, y) part of v.
 float Length2D(const vec_t *v)
 {
 	const __m128 len = length2D(_mm_loadu_ps(v));
 	return _mm_cvtss_f32(len);
 }
 // cross = v1 x v2, written for the x/y/z components only.
 void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
 {
 	const __m128 lhs = _mm_loadu_ps(v1);
 	const __m128 rhs = _mm_loadu_ps(v2);
 	xmm2vec(cross, crossProduct3D(lhs, rhs));
 }
 // out = in1 * in2 for 3x4 transforms: each output row is the in1-weighted sum of the rows of in2,
 // and column 3 additionally receives in1's own column 3.
 void R_ConcatTransforms(float in1[3][4], float in2[3][4], float out[3][4])
 {
 	for (size_t row = 0; row < 3; ++row)
 	{
 		const float *coeff = in1[row];
 		// weight every row of in2 by the matching coefficient of this in1 row
 		const __m128 term0 = _mm_mul_ps(_mm_set_ps1(coeff[0]), _mm_loadu_ps(in2[0]));
 		const __m128 term1 = _mm_mul_ps(_mm_set_ps1(coeff[1]), _mm_loadu_ps(in2[1]));
 		const __m128 term2 = _mm_mul_ps(_mm_set_ps1(coeff[2]), _mm_loadu_ps(in2[2]));
 		const __m128 rowSum = _mm_add_ps(term0, _mm_add_ps(term1, term2));
 		_mm_storeu_ps(out[row], rowSum);
 		out[row][3] += in1[row][3];
 	}
 }
 #endif // #if defined(REHLDS_SSE)
--- a/rehlds/engine/mathlib_sse.h
+++ b/rehlds/engine/mathlib_sse.h
@ -0,0 +1,74 @@
 /*
 *
 *    This program is free software; you can redistribute it and/or modify it
 *    under the terms of the GNU General Public License as published by the
 *    Free Software Foundation; either version 2 of the License, or (at
 *    your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful, but
 *    WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software Foundation,
 *    Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *    In addition, as a special exception, the author gives permission to
 *    link the code of this program with the Half-Life Game Engine ("HL
 *    Engine") and Modified Game Libraries ("MODs") developed by Valve,
 *    L.L.C ("Valve").  You must obey the GNU General Public License in all
 *    respects for all of the code used other than the HL Engine and MODs
 *    from Valve.  If you modify this file, you may extend this exception
 *    to your version of the file, but you are not obligated to do so.  If
 *    you do not wish to do so, delete this exception statement from your
 *    version.
 *
 */
 #pragma once
 #if defined(REHLDS_SSE)
 // Single-precision minimum via the scalar SSE instruction.
 inline float M_min(float a, float b)
 {
 	const __m128 lhs = _mm_load_ss(&a);
 	const __m128 rhs = _mm_load_ss(&b);
 	return _mm_cvtss_f32(_mm_min_ss(lhs, rhs));
 }
 // Double-precision minimum via the scalar SSE2 instruction.
 inline double M_min(double a, double b)
 {
 	const __m128d lhs = _mm_load_sd(&a);
 	const __m128d rhs = _mm_load_sd(&b);
 	return _mm_cvtsd_f64(_mm_min_sd(lhs, rhs));
 }
 // Single-precision maximum via the scalar SSE instruction.
 inline float M_max(float a, float b)
 {
 	const __m128 lhs = _mm_load_ss(&a);
 	const __m128 rhs = _mm_load_ss(&b);
 	return _mm_cvtss_f32(_mm_max_ss(lhs, rhs));
 }
 // Double-precision maximum via the scalar SSE2 instruction.
 inline double M_max(double a, double b)
 {
 	const __m128d lhs = _mm_load_sd(&a);
 	const __m128d rhs = _mm_load_sd(&b);
 	return _mm_cvtsd_f64(_mm_max_sd(lhs, rhs));
 }
 // Single-precision square root via the scalar SSE instruction.
 inline float M_sqrt(float value)
 {
 	const __m128 in = _mm_load_ss(&value);
 	return _mm_cvtss_f32(_mm_sqrt_ss(in));
 }
 // Double-precision square root via the scalar SSE2 instruction.
 inline double M_sqrt(double value)
 {
 	const __m128d in = _mm_load_sd(&value);
 	const __m128d root = _mm_sqrt_sd(in, in);
 	return _mm_cvtsd_f64(root);
 }
 // Single-precision clamp: raise a to at least min, then cap it at max.
 inline float M_clamp(float a, float min, float max)
 {
 	const __m128 raised = _mm_max_ss(_mm_load_ss(&a), _mm_load_ss(&min));
 	const __m128 capped = _mm_min_ss(raised, _mm_load_ss(&max));
 	return _mm_cvtss_f32(capped);
 }
 // Double-precision clamp: raise a to at least min, then cap it at max.
 inline double M_clamp(double a, double min, double max)
 {
 	const __m128d raised = _mm_max_sd(_mm_load_sd(&a), _mm_load_sd(&min));
 	const __m128d capped = _mm_min_sd(raised, _mm_load_sd(&max));
 	return _mm_cvtsd_f64(capped);
 }
 #endif // #if defined(REHLDS_SSE)
--- a/rehlds/msvc/ReHLDS.vcxproj
+++ b/rehlds/msvc/ReHLDS.vcxproj
@ -72,6 +72,7 @@
    <ClCompile Include="..\engine\ipratelimitWrapper.cpp" />
    <ClCompile Include="..\engine\l_studio.cpp" />
    <ClCompile Include="..\engine\mathlib.cpp" />
    <ClCompile Include="..\engine\mathlib_sse.cpp" />
    <ClCompile Include="..\engine\md5.cpp" />
    <ClCompile Include="..\engine\mem.cpp" />
    <ClCompile Include="..\engine\model.cpp" />
@ -432,6 +433,7 @@
    <ClInclude Include="..\engine\keys.h" />
    <ClInclude Include="..\engine\l_studio.h" />
    <ClInclude Include="..\engine\mathlib_e.h" />
    <ClInclude Include="..\engine\mathlib_sse.h" />
    <ClInclude Include="..\engine\mem.h" />
    <ClInclude Include="..\engine\model_rehlds.h" />
    <ClInclude Include="..\engine\modinfo.h" />
@ -805,7 +807,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_OPT_PEDANTIC;REHLDS_FIXES;REHLDS_SSE;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <FloatingPointModel>Precise</FloatingPointModel>
      <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
@ -1149,7 +1151,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>REHLDS_API;REHLDS_FLIGHT_REC;REHLDS_FIXES;REHLDS_SSE;REHLDS_OPT_PEDANTIC;REHLDS_SELF;REHLDS_CHECKS;HAVE_OPT_STRTOOLS;USE_BREAKPAD_HANDLER;DEDICATED;SWDS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <AdditionalOptions>/arch:IA32 %(AdditionalOptions)</AdditionalOptions>
      <PrecompiledHeader>Use</PrecompiledHeader>
--- a/rehlds/msvc/ReHLDS.vcxproj.filters
+++ b/rehlds/msvc/ReHLDS.vcxproj.filters
@ -353,6 +353,9 @@
    <ClCompile Include="..\engine\SystemWrapper.cpp">
      <Filter>engine\common</Filter>
    </ClCompile>
    <ClCompile Include="..\engine\mathlib_sse.cpp">
      <Filter>engine</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\version\version.h">
@ -1081,6 +1084,9 @@
    <ClInclude Include="..\public\strtools.h">
      <Filter>public</Filter>
    </ClInclude>
    <ClInclude Include="..\engine\mathlib_sse.h">
      <Filter>engine</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\linux\appversion.sh">
--- a/rehlds/public/rehlds/crc32c.cpp
+++ b/rehlds/public/rehlds/crc32c.cpp
@ -115,6 +115,7 @@ uint32 crc32c_t_nosse(uint32 iCRC, const uint8 *buf, int len) {
 	return crc;
 }
 #ifdef REHLDS_SSE
 FUNC_TARGET("sse4.2")
 uint32 crc32c_t8_sse(uint32 iCRC, uint8 u8) {
 	return _mm_crc32_u8(iCRC, u8);
@ -140,6 +141,14 @@ uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
 	return cpuinfo.sse4_2 ? crc32c_t_sse(iCRC, buf, len) : crc32c_t_nosse(iCRC, buf, len);
 }
 #else
 // Build without REHLDS_SSE: always use the non-SSE CRC32C implementation.
 uint32 crc32c_t(uint32 iCRC, const uint8 *buf, unsigned int len) {
 	const uint32 crc = crc32c_t_nosse(iCRC, buf, len);
 	return crc;
 }
 #endif // REHLDS_SSE
 // CRC32C of len bytes at buf, starting from the seed 0xffffffff.
 uint32 crc32c(const uint8 *buf, int len) {
 	const uint32 seed = 0xffffffff;
 	return crc32c_t(seed, buf, len);
 }
--- a/rehlds/public/rehlds/crc32c.h
+++ b/rehlds/public/rehlds/crc32c.h
@ -11,7 +11,9 @@ GNU Lesser General Public License for more details.
 You should have received a copy of the GNU Lesser General Public License
 along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #pragma once
 #include "archtypes.h"
 extern uint32 crc32c_t8_nosse(uint32 iCRC, uint8 u8);
--- a/rehlds/public/rehlds/static_map.h
+++ b/rehlds/public/rehlds/static_map.h
@ -169,7 +169,6 @@ public:
 			}
 		}
 		Iterator(CStaticMap* m) {
 			m_Map = m;
 			m_RootNodes = m_Map->m_RootNodes;
@ -226,6 +225,7 @@ protected:
 	virtual uint32 hash(const char* const &val) {
 		uint32 cksum = 0;
 		const char* pcc = val;
 #ifdef REHLDS_SSE
 		if (cpuinfo.sse4_2) {
 			while (*pcc) {
 				char cc = *(pcc++);
@ -234,7 +234,10 @@ protected:
 				}
 				cksum = crc32c_t8_sse(cksum, cc);
 			}
-		} else {
+		}
 		else
 #endif // REHLDS_SSE
 		{
 			while (*pcc) {
 				char cc = *(pcc++);
 				if (cc >= 'A' || cc <= 'Z') {