2
0
mirror of https://github.com/rehlds/rehlds.git synced 2025-01-01 01:25:38 +03:00

Added an SSE 4.1 implementation of the dot product, added a SinCos function, made a fast version of SV_Move for the point hull (trace line), and added a small copying optimization in SV_RunCmd.

This commit is contained in:
Andrey 2015-05-11 20:09:51 +03:00
parent 7ae1eebac8
commit 904394b6ea
10 changed files with 245 additions and 45 deletions

View File

@ -28,7 +28,8 @@
#include "precompiled.h"
#include <smmintrin.h>
#include <xmmintrin.h>
vec3_t vec3_origin;
//int nanmask;
@ -125,16 +126,34 @@ NOBODY int InvertMatrix(const float *m, float *out);
// float *r3; // 161
//}
#ifdef REHLDS_FIXES
/*
 * Computes the sine and cosine of an angle in a single call.
 *
 * radians - angle, in radians
 * sine    - receives sin(radians); must not be NULL
 * cosine  - receives cos(radians); must not be NULL
 */
void SinCos(float radians, float *sine, float *cosine)
{
#if defined(_MSC_VER) && defined(_M_IX86)
	__asm
	{
		fld dword ptr [radians]
		fsincos
		// Inside an __asm block a C identifier names the variable's own storage,
		// so [sine]/[cosine] would address the pointer parameters themselves.
		// Load the pointer values into registers and store through those instead.
		mov edx, dword ptr [cosine]
		mov eax, dword ptr [sine]
		fstp dword ptr [edx]	// fsincos leaves the cosine on top of the FPU stack
		fstp dword ptr [eax]	// ...and the sine right below it
	}
#else
	// Portable path for compilers/targets without MSVC x86 inline assembly
	*sine = (float)sin(radians);
	*cosine = (float)cos(radians);
#endif
}
#endif // REHLDS_FIXES
/* <47067> ../engine/mathlib.c:267 */
void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
{
float angle;
float sr, sp, sy, cr, cp, cy;
#ifndef SWDS
g_engdstAddrs.pfnAngleVectors(&angles, &forward, &right, &up);
#endif // SWDS
#ifdef REHLDS_FIXES
SinCos(DEG2RAD(angles[YAW]), &sy, &cy);
SinCos(DEG2RAD(angles[PITCH]), &sp, &cp);
SinCos(DEG2RAD(angles[ROLL]), &sr, &cr);
#else
float angle;
angle = (float)(angles[YAW] * (M_PI * 2 / 360));
sy = sin(angle);
cy = cos(angle);
@ -144,6 +163,7 @@ void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
sr = sin(angle);
cr = cos(angle);
#endif
if (forward)
{
@ -168,9 +188,14 @@ void AngleVectors(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
/* <4712e> ../engine/mathlib.c:304 */
void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, vec_t *up)
{
float angle;
float sr, sp, sy, cr, cp, cy;
#ifdef REHLDS_FIXES
SinCos(DEG2RAD(angles[YAW]), &sy, &cy);
SinCos(DEG2RAD(angles[PITCH]), &sp, &cp);
SinCos(DEG2RAD(angles[ROLL]), &sr, &cr);
#else
float angle;
angle = (float)(angles[YAW] * (M_PI * 2 / 360));
sy = sin(angle);
cy = cos(angle);
@ -180,6 +205,7 @@ void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, ve
angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
sr = sin(angle);
cr = cos(angle);
#endif
if (forward)
{
@ -204,18 +230,24 @@ void AngleVectorsTranspose(const vec_t *angles, vec_t *forward, vec_t *right, ve
/* <471e9> ../engine/mathlib.c:340 */
void AngleMatrix(const vec_t *angles, float(*matrix)[4])
{
float angle;
float sr, sp, sy, cr, cp, cy;
angle = (float)(angles[2] * (M_PI * 2 / 360));
#ifdef REHLDS_FIXES
SinCos(DEG2RAD(angles[ROLL]), &sy, &cy);
SinCos(DEG2RAD(angles[YAW]), &sp, &cp);
SinCos(DEG2RAD(angles[PITCH]), &sr, &cr);
#else
float angle;
angle = (float)(angles[ROLL] * (M_PI * 2 / 360));
sy = sin(angle);
cy = cos(angle);
angle = (float)(angles[1] * (M_PI * 2 / 360));
angle = (float)(angles[YAW] * (M_PI * 2 / 360));
sp = sin(angle);
cp = cos(angle);
angle = (float)(angles[0] * (M_PI * 2 / 360));
angle = (float)(angles[PITCH] * (M_PI * 2 / 360));
sr = sin(angle);
cr = cos(angle);
#endif
float tmp1, tmp2;
@ -296,11 +328,28 @@ void VectorMA(const vec_t *veca, float scale, const vec_t *vecb, vec_t *vecc)
vecc[2] = scale * vecb[2] + veca[2];
}
#ifndef REHLDS_FIXES
/* <4757a> ../engine/mathlib.c:484 */
float _DotProduct(vec_t *v1, vec_t *v2)
long double _DotProduct(const vec_t *v1, const vec_t *v2)
{
return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
}
#else // REHLDS_FIXES
float _DotProduct(const vec_t *v1, const vec_t *v2)
{
#ifdef REHLDS_FIXES
// _mm_loadu_ps - load xmm from unaligned address
// _mm_cvtss_f32 - return low float value of xmm
// _mm_dp_ps - dot product
// 0x71 = 0b01110001 - mask for multiplying operands and result
// dpps isn't binary compatible with separate sse2 instructions (max difference is about 0.0002f, but usually < 0.00001f)
if (cpuinfo.sse4_1)
return _mm_cvtss_f32(_mm_dp_ps(_mm_loadu_ps(v1), _mm_loadu_ps(v2), 0x71));
#endif // REHLDS_FIXES
return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
}
#endif // REHLDS_FIXES
/* <475b4> ../engine/mathlib.c:489 */
NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
@ -331,6 +380,14 @@ void CrossProduct(const vec_t *v1, const vec_t *v2, vec_t *cross)
/* <476d8> ../engine/mathlib.c:519 */
float Length(const vec_t *v)
{
#ifdef REHLDS_FIXES
// based on dot product
if (cpuinfo.sse4_1)
{
return _mm_cvtss_f32(_mm_sqrt_ps(_mm_dp_ps(_mm_loadu_ps(v), _mm_loadu_ps(v), 0x71)));
}
#endif // REHLDS_FIXES
float length;
length = 0.0f;
@ -346,8 +403,12 @@ float VectorNormalize(vec3_t v)
{
float length, ilength;
#ifdef REHLDS_FIXES
length = Length(v);
#else // REHLDS_FIXES
length = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
length = sqrt(length); // FIXME
#endif // REHLDS_FIXES
if (length)
{

View File

@ -42,9 +42,11 @@
// fall over
#define ROLL 2
// Angle unit conversions. The argument is cast to float and multiplied by a
// float conversion factor derived from M_PI, so the result is a float.
#define RAD2DEG(x) ((float)(x) * (float)(180.f / M_PI))
#define DEG2RAD(x) ((float)(x) * (float)(M_PI / 180.f))
#ifdef HOOK_ENGINE
#define vec3_origin (*pvec3_origin)
#endif // HOOK_ENGINE
@ -69,7 +71,11 @@ NOBODY void InterpolateAngles(float *start, float *end, float *output, float fra
void VectorTransform(const vec_t *in1, float *in2, vec_t *out);
int VectorCompare(const vec_t *v1, const vec_t *v2);
void VectorMA(const vec_t *veca, float scale, const vec_t *vecb, vec_t *vecc);
NOBODY float _DotProduct(vec_t *v1, vec_t *v2);
#ifdef REHLDS_FIXES
float _DotProduct(const vec_t *v1, const vec_t *v2); // with sse support
#else // REHLDS_FIXES
long double _DotProduct(const vec_t *v1, const vec_t *v2); // precise
#endif // REHLDS_FIXES
NOBODY void _VectorSubtract(vec_t *veca, vec_t *vecb, vec_t *out);
void _VectorAdd(vec_t *veca, vec_t *vecb, vec_t *out);
NOBODY void _VectorCopy(vec_t *in, vec_t *out);

View File

@ -645,7 +645,6 @@ struct pmtrace_s *PM_TraceLineEx(float *start, float *end, int flags, int usehul
/* <6ef4a> ../engine/pmovetst.c:844 */
qboolean PM_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, const vec_t *p1, const vec_t *p2, pmtrace_t *trace)
{
qboolean retval;
dclipnode_t *node;
mplane_t *plane;
vec3_t mid;
@ -687,8 +686,8 @@ qboolean PM_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, cons
plane = &hull->planes[node->planenum];
if (plane->type >= 3u)
{
t1 = p1[2] * plane->normal[2] + p1[1] * plane->normal[1] + p1[0] * plane->normal[0] - plane->dist;
t2 = p2[2] * plane->normal[2] + p2[1] * plane->normal[1] + p2[0] * plane->normal[0] - plane->dist;
t1 = _DotProduct(p1, plane->normal) - plane->dist;
t2 = _DotProduct(p2, plane->normal) - plane->dist;
}
else
{
@ -700,14 +699,14 @@ qboolean PM_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, cons
if (t1 >= 0.0)
{
midf = t1 - 0.03125f;
midf = t1 - DIST_EPSILON;
}
else
{
if (t2 < 0.0)
return PM_RecursiveHullCheck(hull, node->children[1], p1f, p2f, p1, p2, trace);
midf = t1 + 0.03125f;
midf = t1 + DIST_EPSILON;
}
midf = midf / (t1 - t2);
if (midf >= 0.0)
@ -837,8 +836,8 @@ qboolean PM_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, cons
plane = &hull->planes[node->planenum];
if (plane->type >= 3u)
{
t1 = p1[1] * plane->normal[1] + p1[2] * plane->normal[2] + p1[0] * plane->normal[0] - plane->dist;
t2 = p2[1] * plane->normal[1] + p2[2] * plane->normal[2] + plane->normal[0] * p2[0] - plane->dist;
t1 = _DotProduct(p1, plane->normal) - plane->dist;
t2 = _DotProduct(p2, plane->normal) - plane->dist;
}
else
{

View File

@ -341,7 +341,12 @@ void PF_sound_I(edict_t *entity, int channel, const char *sample, float volume,
/* <78cdd> ../engine/pr_cmds.c:491 */
void PF_traceline_Shared(const float *v1, const float *v2, int nomonsters, edict_t *ent)
{
#ifdef REHLDS_OPT_PEDANTIC
trace_t trace = SV_Move_Point(v1, v2, nomonsters, ent);
#else // REHLDS_OPT_PEDANTIC
trace_t trace = SV_Move(v1, vec3_origin, vec3_origin, v2, nomonsters, ent, 0);
#endif // REHLDS_OPT_PEDANTIC
gGlobalVariables.trace_flags = 0;
SV_SetGlobalTrace(&trace);
}

View File

@ -514,6 +514,8 @@ int Sys_InitGame(char *lpOrgCmdLine, char *pBaseDir, void *pwnd, int bIsDedicate
Sys_InitHardwareTimer();
#endif // _WIN32
Sys_CheckCpuInstructionsSupport();
#ifndef SWDS
Sys_InitFloatTime();
#endif // SWDS

View File

@ -529,7 +529,7 @@ int SV_HullPointContents(hull_t *hull, int num, const vec_t *p)
node = &hull->clipnodes[i];
plane = &hull->planes[node->planenum];
if (plane->type > 2)
d = plane->normal[0] * *p + plane->normal[1] * p[1] + plane->normal[2] * p[2] - plane->dist;
d = _DotProduct(plane->normal, p) - plane->dist;
else
d = p[plane->type] - plane->dist;
i = node->children[(d >= 0.0f) ? 0 : 1];
@ -644,7 +644,7 @@ edict_t *SV_TestEntityPosition(edict_t *ent)
#ifndef REHLDS_OPT_PEDANTIC
/* <cacbc> ../engine/world.c:804 */
qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, vec_t *p1, vec_t *p2, trace_t *trace)
qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, const vec_t *p1, const vec_t *p2, trace_t *trace)
{
dclipnode_t *node;
mplane_t *plane;
@ -665,8 +665,8 @@ qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, vec_
plane = &hull->planes[hull->clipnodes[num].planenum];
if (plane->type >= 3)
{
t1 = p1[1] * plane->normal[1] + p1[2] * plane->normal[2] + p1[0] * plane->normal[0] - plane->dist;
t2 = p2[1] * plane->normal[1] + p2[2] * plane->normal[2] + plane->normal[0] * p2[0] - plane->dist;
t1 = _DotProduct(p1, plane->normal) - plane->dist;
t2 = _DotProduct(p2, plane->normal) - plane->dist;
}
else
{
@ -678,14 +678,14 @@ qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, vec_
if (t1 >= 0.0f)
{
midf = t1 - 0.03125f;
midf = t1 - DIST_EPSILON;
}
else
{
if (t2 < 0.0f)
return SV_RecursiveHullCheck(hull, node->children[1], p1f, p2f, p1, p2, trace);
midf = t1 + 0.03125f;
midf = t1 + DIST_EPSILON;
}
midf = midf / (t1 - t2);
@ -698,7 +698,7 @@ qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, vec_
{
midf = 0.0f;
}
if (((*reinterpret_cast<int*>(&midf)) & nanmask) != nanmask)
if (!IS_NAN(midf)) // not a number
{
frac = pdif * midf + p1f;
mid[0] = (p2[0] - p1[0]) * midf + p1[0];
@ -803,8 +803,8 @@ qboolean SV_RecursiveHullCheck(hull_t *hull, int num, float p1f, float p2f, cons
plane = &hull->planes[hull->clipnodes[num].planenum];
if (plane->type >= 3)
{
t1 = p1[1] * plane->normal[1] + p1[2] * plane->normal[2] + p1[0] * plane->normal[0] - plane->dist;
t2 = p2[1] * plane->normal[1] + p2[2] * plane->normal[2] + plane->normal[0] * p2[0] - plane->dist;
t1 = _DotProduct(p1, plane->normal) - plane->dist;
t2 = _DotProduct(p2, plane->normal) - plane->dist;
}
else
{
@ -974,14 +974,14 @@ void SV_SingleClipMoveToEntity(edict_t *ent, const vec_t *start, const vec_t *mi
AngleVectors(ent->v.angles, forward, right, up);
temp[0] = start_l[0]; temp[1] = start_l[1]; temp[2] = start_l[2];
start_l[0] = forward[2] * temp[2] + forward[1] * temp[1] + forward[0] * temp[0];
start_l[1] = -(right[0] * temp[0] + right[2] * temp[2] + right[1] * temp[1]);
start_l[2] = up[1] * temp[1] + up[0] * temp[0] + up[2] * temp[2];
start_l[0] = _DotProduct(forward, temp);
start_l[1] = -_DotProduct(right, temp);
start_l[2] = _DotProduct(up, temp);
temp[0] = end_l[0]; temp[1] = end_l[1]; temp[2] = end_l[2];
end_l[0] = forward[2] * temp[2] + forward[1] * temp[1] + forward[0] * temp[0];
end_l[1] = -(right[0] * temp[0] + right[2] * temp[2] + right[1] * temp[1]);
end_l[2] = up[1] * temp[1] + up[0] * temp[0] + up[2] * temp[2];
end_l[0] = _DotProduct(forward, temp);
end_l[1] = -_DotProduct(right, temp);
end_l[2] = _DotProduct(up, temp);
rotated = 1;
}
@ -1033,9 +1033,9 @@ void SV_SingleClipMoveToEntity(edict_t *ent, const vec_t *start, const vec_t *mi
temp[1] = trace->plane.normal[1];
temp[2] = trace->plane.normal[2];
trace->plane.normal[0] = up[2] * temp[2] + up[1] * temp[1] + up[0] * temp[0];
trace->plane.normal[1] = right[2] * temp[2] + right[1] * temp[1] + right[0] * temp[0];
trace->plane.normal[2] = forward[2] * temp[2] + forward[1] * temp[1] + forward[0] * temp[0];
trace->plane.normal[0] = _DotProduct(up, temp);
trace->plane.normal[1] = _DotProduct(right, temp);
trace->plane.normal[2] = _DotProduct(forward, temp);
}
trace->endpos[0] = (end[0] - start[0]) * trace->fraction + start[0];
@ -1274,7 +1274,7 @@ trace_t SV_Move(const vec_t *start, const vec_t *mins, const vec_t *maxs, const
clip.end = worldEndPoint;
worldFraction = clip.trace.fraction;
clip.type = (unsigned char)type;
clip.type = type & 0xFF;
clip.ignoretrans = type >> 8;
clip.trace.fraction = 1.0f;
clip.start = start;
@ -1307,3 +1307,100 @@ trace_t SV_Move(const vec_t *start, const vec_t *mins, const vec_t *maxs, const
return clip.trace;
}
#ifdef REHLDS_OPT_PEDANTIC
// Optimized versions of the SV_Move routines for moving a point hull through the world
void SV_SingleClipMoveToPoint(const vec_t *start, const vec_t *end, trace_t *trace)
{
hull_t *hull;
Q_memset(trace, 0, sizeof(trace_t));
trace->fraction = 1.0f;
trace->allsolid = TRUE;
trace->endpos[0] = end[0];
trace->endpos[1] = end[1];
trace->endpos[2] = end[2];
hull = &g_psv.models[1]->hulls[0]; // world point hull
SV_RecursiveHullCheck(hull, hull->firstclipnode, 0.0f, 1.0f, start, end, trace);
if (trace->fraction != 1.0f)
{
trace->endpos[0] = ( end[0] - start[0] ) * trace->fraction + start[0];
trace->endpos[1] = ( end[1] - start[1] ) * trace->fraction + start[1];
trace->endpos[2] = ( end[2] - start[2] ) * trace->fraction + start[2];
}
if (trace->fraction < 1.0f || trace->startsolid)
trace->ent = &g_psv.edicts[0];
}
void SV_MoveBounds_Point(const vec_t *start, const vec_t *end, vec_t *boxmins, vec_t *boxmaxs)
{
for (int i = 0; i < 3; i++)
{
if (end[i] > start[i])
{
boxmins[i] = start[i] - 1.0f;
boxmaxs[i] = end[i] + 1.0f;
}
else
{
boxmins[i] = end[i] - 1.0f;
boxmaxs[i] = start[i] + 1.0f;
}
}
}
// Point-hull (trace line) variant of SV_Move.
//
// start, end - segment to trace
// type       - low byte is stored as the clip type, the bits above it as ignoretrans
// passedict  - stored in the clip as passedict (presumably the edict to skip - confirm)
//
// Returns the resulting trace. The world is clipped first; unless the world
// trace stopped at fraction 0, entities are clipped via SV_ClipToLinks and
// the two fractions are multiplied together.
trace_t SV_Move_Point(const vec_t *start, const vec_t *end, int type, edict_t *passedict)
{
	moveclip_t clip;
	vec3_t worldEndPoint;
	float worldFraction;

	Q_memset(&clip, 0, sizeof(clip));
	SV_SingleClipMoveToPoint(start, end, &clip.trace);

	if (clip.trace.fraction != 0.0f)
	{
		// Entities only need to be tested up to where the world stopped the move
		worldEndPoint[0] = clip.trace.endpos[0];
		worldEndPoint[1] = clip.trace.endpos[1];
		worldEndPoint[2] = clip.trace.endpos[2];

		clip.end = worldEndPoint;
		worldFraction = clip.trace.fraction;
		clip.type = type & 0xFF;
		clip.ignoretrans = type >> 8;
		clip.trace.fraction = 1.0f;
		clip.start = start;
		clip.mins = vec3_origin;
		clip.maxs = vec3_origin;
		clip.passedict = passedict;
		clip.monsterClipBrush = 0;

		if (type == 2)
		{
			for (int i = 0; i < 3; i++)
			{
				clip.mins2[i] = -15.0f;
				clip.maxs2[i] = +15.0f;
			}

			// The secondary extents are not zero here, so the move box has to be
			// built from them; the point-only helper would leave it 15 units too
			// small on every side.
			// NOTE(review): assumes SV_MoveBounds takes
			// (start, mins, maxs, end, boxmins, boxmaxs) - confirm against world.h
			SV_MoveBounds(start, clip.mins2, clip.maxs2, worldEndPoint, clip.boxmins, clip.boxmaxs);
		}
		else
		{
			clip.mins2[0] = 0.0;
			clip.mins2[1] = 0.0;
			clip.mins2[2] = 0.0;
			clip.maxs2[0] = 0.0;
			clip.maxs2[1] = 0.0;
			clip.maxs2[2] = 0.0;

			SV_MoveBounds_Point(start, worldEndPoint, clip.boxmins, clip.boxmaxs);
		}

		SV_ClipToLinks(sv_areanodes, &clip);
		gGlobalVariables.trace_ent = clip.trace.ent;

		// Convert the entity fraction (relative to the shortened segment)
		// back to a fraction of the full start -> end move
		clip.trace.fraction = worldFraction * clip.trace.fraction;
	}

	return clip.trace;
}
#endif // REHLDS_OPT_PEDANTIC

View File

@ -124,4 +124,8 @@ void SV_MoveBounds(const vec_t *start, const vec_t *mins, const vec_t *maxs, con
trace_t SV_MoveNoEnts(const vec_t *start, vec_t *mins, vec_t *maxs, const vec_t *end, int type, edict_t *passedict);
trace_t SV_Move(const vec_t *start, const vec_t *mins, const vec_t *maxs, const vec_t *end, int type, edict_t *passedict, qboolean monsterClipBrush);
#ifdef REHLDS_OPT_PEDANTIC
trace_t SV_Move_Point(const vec_t *start, const vec_t *end, int type, edict_t *passedict);
#endif // REHLDS_OPT_PEDANTIC
#endif // WORLD_H

View File

@ -110,7 +110,7 @@ uint32 crc32_t_nosse(uint32 iCRC, const uint8 *s, unsigned int len) {
}
uint32 crc32_t(uint32 iCRC, const uint8 *s, unsigned int len) {
if (!g_HasSSE42) {
if (!cpuinfo.sse4_2) {
return crc32_t_nosse(iCRC, s, len);
}

View File

@ -27,19 +27,38 @@
*/
#include "sys_shared.h"
bool Sys_ChechSSE42Support();
// CPUID feature bits tested by Sys_CheckCpuInstructionsSupport().
// Leaf eax = 1, reported in ecx:
#define SSE3_FLAG (1<<0)
#define SSSE3_FLAG (1<<9)
#define SSE4_1_FLAG (1<<19)
#define SSE4_2_FLAG (1<<20)
#define AVX_FLAG (1<<28)
// Leaf eax = 7 (ecx = 0), reported in ebx:
#define AVX2_FLAG (1<<5)
bool g_HasSSE42 = Sys_ChechSSE42Support();
cpuinfo_t cpuinfo;
bool Sys_ChechSSE42Support() {
void Sys_CheckCpuInstructionsSupport(void)
{
unsigned int cpuid_data[4];
// eax = 1, ecx = 0
#if defined(__GNUC__)
__get_cpuid(0x1, &cpuid_data[0], &cpuid_data[1], &cpuid_data[2], &cpuid_data[3]);
#else //__GNUC__
__cpuidex((int*)cpuid_data, 1, 0);
#endif //__GNUC__
return (0 != (cpuid_data[2] & (1 << 20)));
}
cpuinfo.sse3 = (cpuid_data[2] & SSE3_FLAG) ? 1 : 0; // ecx
cpuinfo.ssse3 = (cpuid_data[2] & SSSE3_FLAG) ? 1 : 0;
cpuinfo.sse4_1 = (cpuid_data[2] & SSE4_1_FLAG) ? 1 : 0;
cpuinfo.sse4_2 = (cpuid_data[2] & SSE4_2_FLAG) ? 1 : 0;
cpuinfo.avx = (cpuid_data[2] & AVX_FLAG) ? 1 : 0;
// eax = 7, ecx = 0
#if defined(__GNUC__)
__get_cpuid(0x7, &cpuid_data[0], &cpuid_data[1], &cpuid_data[2], &cpuid_data[3]);
#else //__GNUC__
__cpuidex((int*)cpuid_data, 7, 0);
#endif //__GNUC__
cpuinfo.avx2 = (cpuid_data[1] & AVX2_FLAG) ? 1 : 0; // ebx
}

View File

@ -27,4 +27,11 @@
*/
#pragma once
extern bool g_HasSSE42;
// Per-instruction-set availability flags.
// Sys_CheckCpuInstructionsSupport() sets each member to 1 or 0.
typedef struct cpuinfo_s
{
	uint8 sse3;
	uint8 ssse3;
	uint8 sse4_1;
	uint8 sse4_2;
	uint8 avx;
	uint8 avx2;
} cpuinfo_t;
extern cpuinfo_t cpuinfo;
void Sys_CheckCpuInstructionsSupport(void);