541 lines
No EOL
12 KiB
C++
541 lines
No EOL
12 KiB
C++
/*
|
|
** Command & Conquer Renegade(tm)
|
|
** Copyright 2025 Electronic Arts Inc.
|
|
**
|
|
** This program is free software: you can redistribute it and/or modify
|
|
** it under the terms of the GNU General Public License as published by
|
|
** the Free Software Foundation, either version 3 of the License, or
|
|
** (at your option) any later version.
|
|
**
|
|
** This program is distributed in the hope that it will be useful,
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
** GNU General Public License for more details.
|
|
**
|
|
** You should have received a copy of the GNU General Public License
|
|
** along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/***********************************************************************************************
|
|
*** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
|
|
***********************************************************************************************
|
|
* *
|
|
* Project Name : wwmath *
|
|
* *
|
|
* $Archive:: /Commando/Code/WWMath/vp.cpp $*
|
|
* *
|
|
* Author:: Hector Yee *
|
|
* *
|
|
* $Modtime:: 6/27/01 4:16p $*
|
|
* *
|
|
* $Revision:: 11 $*
|
|
* *
|
|
*---------------------------------------------------------------------------------------------*/
|
|
|
|
#include "vp.h"
|
|
#include "vector2.h"
|
|
#include "vector3.h"
|
|
#include "vector4.h"
|
|
#include "matrix3d.h"
|
|
#include "matrix4.h"
|
|
#include "wwdebug.h"
|
|
#include "cpudetect.h"
|
|
#include <memory.h>
|
|
|
|
#define SHUFFLE(x, y, z, w) (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
|
|
#define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))
|
|
|
|
#define TRANSPOSE(BX, BY, BZ, BW, TV) \
|
|
__asm movaps TV,BZ \
|
|
__asm unpcklps BZ,BW \
|
|
__asm unpckhps TV,BW \
|
|
__asm movaps BW,BX \
|
|
__asm unpcklps BX,BY \
|
|
__asm unpckhps BW,BY \
|
|
__asm movaps BY,BX \
|
|
__asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \
|
|
__asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \
|
|
__asm movaps BZ,BW \
|
|
__asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \
|
|
__asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)
|
|
|
|
|
|
void VectorProcessorClass::Prefetch(void* address)
|
|
{
|
|
#if defined (__ICL) // Detect Intel compiler
|
|
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
|
|
__asm {
|
|
// mov edx,address
|
|
// mov eax,[edx]
|
|
// prefetchT1 address
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
|
|
void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
|
|
#if defined (__ICL) // Detect Intel compiler
|
|
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
|
|
|
|
__asm {
|
|
mov edx,dst
|
|
mov eax,src
|
|
mov ebx,mtx
|
|
mov edi,count
|
|
|
|
movups xmm4,[ebx+0]
|
|
movups xmm5,[ebx+16]
|
|
movups xmm6,[ebx+32]
|
|
movups xmm7,lastrow //[ebx+48]
|
|
|
|
TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
|
|
|
|
shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
|
|
shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
|
|
shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
|
|
shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
|
|
|
|
mov esi,edx
|
|
_lp:
|
|
test edi,edi
|
|
jz _ulos
|
|
test esi,0xf
|
|
jz _aligned
|
|
movss xmm0,[eax]
|
|
movss xmm1,[eax+4]
|
|
movss xmm2,[eax+8]
|
|
BROADCAST(xmm0,0)
|
|
BROADCAST(xmm1,0)
|
|
BROADCAST(xmm2,0)
|
|
mulps xmm0,xmm4
|
|
mulps xmm1,xmm5
|
|
mulps xmm2,xmm6
|
|
addps xmm0,xmm1
|
|
addps xmm0,xmm2
|
|
addps xmm0,xmm7
|
|
movss [edx],xmm0
|
|
movhps [edx+4],xmm0
|
|
add eax,12
|
|
add edx,12
|
|
add esi,12
|
|
dec edi
|
|
jmp _lp
|
|
_aligned:
|
|
|
|
mov esi,1
|
|
|
|
mov ecx,edi
|
|
and edi,3
|
|
and ecx,~3
|
|
jz _lp
|
|
|
|
lea ecx,[ecx+ecx*2]
|
|
shl ecx,2
|
|
add eax,ecx
|
|
add edx,ecx
|
|
neg ecx
|
|
|
|
cmp dword ptr [ebx+12],0
|
|
jne _xlatelp
|
|
cmp dword ptr [ebx+28],0
|
|
jne _xlatelp
|
|
cmp dword ptr [ebx+44],0
|
|
jne _xlatelp
|
|
jmp _noxlatelp
|
|
|
|
align 16
|
|
|
|
_noxlatelp:
|
|
prefetchnta [eax+ecx+48]
|
|
prefetchnta [eax+ecx+48+32]
|
|
|
|
movss xmm0,[eax+ecx]
|
|
BROADCAST(xmm0,0)
|
|
movss xmm1,[eax+ecx+4]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+8]
|
|
BROADCAST(xmm2,0)
|
|
mulps xmm0,xmm4
|
|
mulps xmm1,xmm5
|
|
mulps xmm2,xmm6
|
|
addps xmm0,xmm1
|
|
addps xmm0,xmm2
|
|
|
|
movss xmm1,[eax+ecx+12]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+16]
|
|
BROADCAST(xmm2,0)
|
|
movss xmm3,[eax+ecx+20]
|
|
BROADCAST(xmm3,0)
|
|
mulps xmm1,xmm4
|
|
mulps xmm2,xmm5
|
|
mulps xmm3,xmm6
|
|
addps xmm1,xmm2
|
|
addps xmm3,xmm1
|
|
|
|
movss xmm0,xmm3
|
|
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
|
|
|
|
movaps [edx+ecx],xmm0
|
|
|
|
prefetcht0 [edx+ecx+48]
|
|
prefetcht0 [edx+ecx+48+32]
|
|
|
|
movss xmm0,[eax+ecx+24]
|
|
BROADCAST(xmm0,0)
|
|
movss xmm1,[eax+ecx+24+4]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+24+8]
|
|
BROADCAST(xmm2,0)
|
|
mulps xmm0,xmm4
|
|
mulps xmm1,xmm5
|
|
mulps xmm2,xmm6
|
|
addps xmm0,xmm1
|
|
addps xmm0,xmm2
|
|
|
|
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
|
|
movaps [edx+ecx+16],xmm3
|
|
|
|
movss xmm1,[eax+ecx+24+12]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+24+16]
|
|
BROADCAST(xmm2,0)
|
|
movss xmm3,[eax+ecx+24+20]
|
|
BROADCAST(xmm3,0)
|
|
mulps xmm1,xmm4
|
|
mulps xmm2,xmm5
|
|
mulps xmm3,xmm6
|
|
addps xmm1,xmm2
|
|
addps xmm1,xmm3
|
|
|
|
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
|
|
movss xmm1,xmm0
|
|
movaps [edx+ecx+32],xmm1
|
|
|
|
add ecx,48
|
|
js _noxlatelp
|
|
|
|
jmp _lp
|
|
|
|
align 16
|
|
|
|
_xlatelp:
|
|
prefetchnta [eax+ecx+48]
|
|
prefetchnta [eax+ecx+48+32]
|
|
|
|
movss xmm0,[eax+ecx]
|
|
BROADCAST(xmm0,0)
|
|
movss xmm1,[eax+ecx+4]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+8]
|
|
BROADCAST(xmm2,0)
|
|
mulps xmm0,xmm4
|
|
mulps xmm1,xmm5
|
|
mulps xmm2,xmm6
|
|
addps xmm0,xmm1
|
|
addps xmm0,xmm2
|
|
addps xmm0,xmm7
|
|
|
|
movss xmm1,[eax+ecx+12]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+16]
|
|
BROADCAST(xmm2,0)
|
|
movss xmm3,[eax+ecx+20]
|
|
BROADCAST(xmm3,0)
|
|
mulps xmm1,xmm4
|
|
mulps xmm2,xmm5
|
|
mulps xmm3,xmm6
|
|
addps xmm1,xmm2
|
|
addps xmm3,xmm1
|
|
addps xmm3,xmm7
|
|
|
|
movss xmm0,xmm3
|
|
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
|
|
movaps [edx+ecx],xmm0
|
|
|
|
prefetcht0 [edx+ecx+48]
|
|
prefetcht0 [edx+ecx+48+32]
|
|
|
|
movss xmm0,[eax+ecx+24]
|
|
BROADCAST(xmm0,0)
|
|
movss xmm1,[eax+ecx+24+4]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+24+8]
|
|
BROADCAST(xmm2,0)
|
|
mulps xmm0,xmm4
|
|
mulps xmm1,xmm5
|
|
mulps xmm2,xmm6
|
|
addps xmm0,xmm1
|
|
addps xmm0,xmm2
|
|
addps xmm0,xmm7
|
|
|
|
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
|
|
movaps [edx+ecx+16],xmm3
|
|
|
|
movss xmm1,[eax+ecx+24+12]
|
|
BROADCAST(xmm1,0)
|
|
movss xmm2,[eax+ecx+24+16]
|
|
BROADCAST(xmm2,0)
|
|
movss xmm3,[eax+ecx+24+20]
|
|
BROADCAST(xmm3,0)
|
|
mulps xmm1,xmm4
|
|
mulps xmm2,xmm5
|
|
mulps xmm3,xmm6
|
|
addps xmm1,xmm2
|
|
addps xmm1,xmm3
|
|
addps xmm1,xmm7
|
|
|
|
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
|
|
movss xmm1,xmm0
|
|
|
|
movaps [edx+ecx+32],xmm1
|
|
|
|
add ecx,48
|
|
js _xlatelp
|
|
|
|
jmp _lp
|
|
_ulos:
|
|
}
|
|
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=mtx*src[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=matrix*src[i];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
|
|
{
|
|
if (count<=0) return;
|
|
memcpy(dst,src,sizeof(Vector2)*count);
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
|
|
{
|
|
if (count<=0) return;
|
|
memcpy(dst,src,sizeof(unsigned)*count);
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
|
|
{
|
|
if (count<=0) return;
|
|
memcpy(dst,src,sizeof(Vector3)*count);
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
|
|
{
|
|
if (count<=0) return;
|
|
memcpy(dst,src,sizeof(Vector4)*count);
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i].X=src[i].X;
|
|
dst[i].Y=src[i].Y;
|
|
dst[i].Z=src[i].Z;
|
|
dst[i].W=srca[i];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i].X=src[i].X;
|
|
dst[i].Y=src[i].Y;
|
|
dst[i].Z=src[i].Z;
|
|
dst[i].W=srca;
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i].X=src.X;
|
|
dst[i].Y=src.Y;
|
|
dst[i].Z=src.Z;
|
|
dst[i].W=srca[i];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i]=src[index[i]];
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
{
|
|
dst[i].X=(src[i].X<min)?min:src[i].X;
|
|
dst[i].X=(src[i].X>max)?max:src[i].X;
|
|
|
|
dst[i].Y=(src[i].Y<min)?min:src[i].Y;
|
|
dst[i].Y=(src[i].Y>max)?max:src[i].Y;
|
|
|
|
dst[i].Z=(src[i].Z<min)?min:src[i].Z;
|
|
dst[i].Z=(src[i].Z>max)?max:src[i].Z;
|
|
|
|
dst[i].W=(src[i].W<min)?min:src[i].W;
|
|
dst[i].W=(src[i].W>max)?max:src[i].W;
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::Clear(Vector3*dst, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
memset(dst,0,sizeof(Vector3)*count);
|
|
}
|
|
|
|
|
|
void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
int i;
|
|
|
|
for (i=0; i<count; i++)
|
|
dst[i].Normalize();
|
|
}
|
|
|
|
void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
|
|
{
|
|
if (count<=0) return;
|
|
min=*src;
|
|
max=*src;
|
|
|
|
int i;
|
|
|
|
for (i=1; i<count; i++)
|
|
{
|
|
min.X=MIN(min.X,src[i].X);
|
|
min.Y=MIN(min.Y,src[i].Y);
|
|
min.Z=MIN(min.Z,src[i].Z);
|
|
|
|
max.X=MAX(max.X,src[i].X);
|
|
max.Y=MAX(max.Y,src[i].Y);
|
|
max.Z=MAX(max.Z,src[i].Z);
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
|
|
{
|
|
for (int i=0; i<count; i++) {
|
|
dest[i] = dest[i] * multiplier + add;
|
|
}
|
|
}
|
|
|
|
void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
|
|
{
|
|
for (int i=0; i<count; i++)
|
|
dst[i]=Vector3::Dot_Product(a,b[i]);
|
|
}
|
|
|
|
void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
|
|
{
|
|
for (int i=0; i<count; i++)
|
|
dst[i]=(src[i]>min?src[i]:min);
|
|
}
|
|
|
|
void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
|
|
{
|
|
for (int i=0; i<count; i++)
|
|
dst[i]=powf(src[i],pow);
|
|
} |