This repository has been archived on 2025-02-27. You can view files and clone it, but cannot push or open issues or pull requests.
CnC_Renegade/Code/WWMath/vp.cpp

541 lines
12 KiB
C++
Raw Permalink Normal View History

/*
** Command & Conquer Renegade(tm)
** Copyright 2025 Electronic Arts Inc.
**
** This program is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/***********************************************************************************************
*** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
***********************************************************************************************
* *
* Project Name : wwmath *
* *
* $Archive:: /Commando/Code/WWMath/vp.cpp $*
* *
* Author:: Hector Yee *
* *
* $Modtime:: 6/27/01 4:16p $*
* *
* $Revision:: 11 $*
* *
*---------------------------------------------------------------------------------------------*/
#include "vp.h"
#include "vector2.h"
#include "vector3.h"
#include "vector4.h"
#include "matrix3d.h"
#include "matrix4.h"
#include "wwdebug.h"
#include "cpudetect.h"
#include <memory.h>
// Builds the 8-bit immediate operand used with shufps below: every argument is masked to
// two bits and packed with x in bits 7..6, y in bits 5..4, z in bits 3..2 and w in bits 1..0.
#define SHUFFLE(x, y, z, w) (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
// Emits a shufps of XMM with itself in which all four selectors equal INDEX, i.e. element
// INDEX of the register is replicated into all four elements. The immediate is spelled out
// instead of using SHUFFLE but follows the same bit layout.
#define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))
// In-register transpose of the 4x4 float matrix whose rows are held in BX, BY, BZ and BW.
// On exit BX..BW hold what were columns 0..3 of the input. TV is a scratch register and is
// overwritten. The unpcklps/unpckhps pairs interleave the rows two at a time and the
// shufps instructions then gather the matching halves.
#define TRANSPOSE(BX, BY, BZ, BW, TV) \
__asm movaps TV,BZ \
__asm unpcklps BZ,BW \
__asm unpckhps TV,BW \
__asm movaps BW,BX \
__asm unpcklps BX,BY \
__asm unpckhps BW,BY \
__asm movaps BY,BX \
__asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \
__asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \
__asm movaps BZ,BW \
__asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \
__asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)
// Intended to issue a cache prefetch for 'address'.
//
// As written the function has no effect on memory or cache: the body is only compiled for
// the Intel compiler (__ICL), and even there the asm block contains nothing but
// commented-out instructions. The only work done in that build is the call to
// CPUDetectClass::_Has_SSE_Instruction_Set(). 'address' is currently unused.
void VectorProcessorClass::Prefetch(void* address)
{
#if defined (__ICL) // Detect Intel compiler
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
__asm {
// The prefetch sequence below is disabled; the block is intentionally left empty.
// mov edx,address
// mov eax,[edx]
// prefetchT1 address
}
}
#endif
}
// Constant (0,0,0,1) vector. The SSE path of Transform(Vector3*, ..., Matrix3D) loads it as
// the fourth matrix row instead of reading memory at offset 48 of the matrix (see the
// commented-out "[ebx+48]" operand there).
// NOTE(review): presumably Matrix3D stores only three rows -- confirm in matrix3d.h.
static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
// Transforms 'count' Vector3 elements read from 'src' by 'mtx' and stores the results in
// 'dst'. Returns immediately when count <= 0.
//
// When built with the Intel compiler (__ICL) and CPUDetectClass reports SSE support, the
// work is done by the inline-assembly path; in every other case by the plain loop
// dst[i] = mtx * src[i] at the end of the function.
//
// Outline of the SSE path:
//   1. Load the first 48 bytes of the matrix as three 4-float rows, append 'lastrow', and
//      transpose so that xmm4..xmm7 each hold one column. Each column is then rearranged
//      from (c0,c1,c2,c3) to (c0,c0,c1,c2).
//   2. _lp: transform one vector at a time until the destination address is 16-byte
//      aligned or no elements remain.
//   3. _aligned: transform the largest multiple of four vectors, four (48 bytes) per
//      iteration, writing the results with three aligned 16-byte stores. Two copies of
//      the loop exist: _xlatelp adds the fourth column, _noxlatelp skips that add.
//   4. The remaining (count & 3) vectors go back through the one-at-a-time loop.
//
// NOTE(review): the assembly assumes the matrix starts with three contiguous rows of four
// floats and treats the fourth float of each row as the translation -- confirm against
// matrix3d.h.
void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
{
if (count<=0) return;
#if defined (__ICL) // Detect Intel compiler
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
__asm {
// edx = destination cursor, eax = source cursor, ebx = base address for the matrix loads,
// edi = number of vectors still to process.
mov edx,dst
mov eax,src
mov ebx,mtx
mov edi,count
// Three rows come from the matrix (unaligned loads); the fourth row is the constant 'lastrow'.
movups xmm4,[ebx+0]
movups xmm5,[ebx+16]
movups xmm6,[ebx+32]
movups xmm7,lastrow //[ebx+48]
// After the transpose xmm4..xmm7 hold columns 0..3 (xmm0 is scratch).
TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
// Rearrange each column from (c0,c1,c2,c3) to (c0,c0,c1,c2). A transformed vector therefore
// has X in elements 0 and 1, Y in element 2 and Z in element 3.
shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
// esi shadows the destination address and is used only for the alignment test.
mov esi,edx
_lp:
// One-vector-at-a-time loop. Leave when nothing remains; switch to the four-at-a-time
// path as soon as the destination address is a multiple of 16.
test edi,edi
jz _ulos
test esi,0xf
jz _aligned
// Broadcast the source x, y and z and form x*col0 + y*col1 + z*col2 + col3.
movss xmm0,[eax]
movss xmm1,[eax+4]
movss xmm2,[eax+8]
BROADCAST(xmm0,0)
BROADCAST(xmm1,0)
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
// Store X from element 0, then Y and Z from the high half (elements 2 and 3).
movss [edx],xmm0
movhps [edx+4],xmm0
// Advance by one Vector3 (12 bytes).
add eax,12
add edx,12
add esi,12
dec edi
jmp _lp
_aligned:
// Setting esi to 1 keeps the "test esi,0xf" above from succeeding again (1 + 12*k is always
// odd), so the leftovers after the block loop are finished by the one-at-a-time loop.
mov esi,1
// ecx = count rounded down to a multiple of four (handled here); edi = count & 3 (left for _lp).
mov ecx,edi
and edi,3
and ecx,~3
// Fewer than four vectors left: nothing to do in the block loop.
jz _lp
// ecx = ecx * 12 = size of the block in bytes. Both cursors are moved past the block and the
// block is addressed with a negative offset in ecx that counts up to zero.
lea ecx,[ecx+ecx*2]
shl ecx,2
add eax,ecx
add edx,ecx
neg ecx
// Choose the loop variant: the translation add is skipped only when the fourth float of
// all three matrix rows has an all-zero bit pattern.
cmp dword ptr [ebx+12],0
jne _xlatelp
cmp dword ptr [ebx+28],0
jne _xlatelp
cmp dword ptr [ebx+44],0
jne _xlatelp
jmp _noxlatelp
align 16
_noxlatelp:
// Four vectors per iteration, no translation add.
// Prefetch the source data of the next iteration.
prefetchnta [eax+ecx+48]
prefetchnta [eax+ecx+48+32]
// Vector 0 -> xmm0 = (x0,x0,y0,z0)
movss xmm0,[eax+ecx]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
// Vector 1 -> xmm3 = (x1,x1,y1,z1)
movss xmm1,[eax+ecx+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm3,xmm1
// Pack and store the first 16 bytes: (x0,y0,z0,x1).
movss xmm0,xmm3
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
movaps [edx+ecx],xmm0
// Prefetch the destination of the next iteration.
prefetcht0 [edx+ecx+48]
prefetcht0 [edx+ecx+48+32]
// Vector 2 -> xmm0 = (x2,x2,y2,z2)
movss xmm0,[eax+ecx+24]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+24+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
// Pack and store the second 16 bytes: (y1,z1,x2,y2).
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
movaps [edx+ecx+16],xmm3
// Vector 3 -> xmm1 = (x3,x3,y3,z3)
movss xmm1,[eax+ecx+24+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+24+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm1,xmm3
// Pack and store the third 16 bytes: (z2,x3,y3,z3).
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
movss xmm1,xmm0
movaps [edx+ecx+32],xmm1
// Next group of four; loop while the offset is still negative.
add ecx,48
js _noxlatelp
jmp _lp
align 16
_xlatelp:
// Same as _noxlatelp, except that xmm7 (the fourth column) is added to every result.
// Prefetch the source data of the next iteration.
prefetchnta [eax+ecx+48]
prefetchnta [eax+ecx+48+32]
// Vector 0 -> xmm0 = (x0,x0,y0,z0)
movss xmm0,[eax+ecx]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
// Vector 1 -> xmm3 = (x1,x1,y1,z1)
movss xmm1,[eax+ecx+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm3,xmm1
addps xmm3,xmm7
// Pack and store the first 16 bytes: (x0,y0,z0,x1).
movss xmm0,xmm3
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
movaps [edx+ecx],xmm0
// Prefetch the destination of the next iteration.
prefetcht0 [edx+ecx+48]
prefetcht0 [edx+ecx+48+32]
// Vector 2 -> xmm0 = (x2,x2,y2,z2)
movss xmm0,[eax+ecx+24]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+24+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
// Pack and store the second 16 bytes: (y1,z1,x2,y2).
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
movaps [edx+ecx+16],xmm3
// Vector 3 -> xmm1 = (x3,x3,y3,z3)
movss xmm1,[eax+ecx+24+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+24+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm1,xmm3
addps xmm1,xmm7
// Pack and store the third 16 bytes: (z2,x3,y3,z3).
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
movss xmm1,xmm0
movaps [edx+ecx+32],xmm1
// Next group of four; loop while the offset is still negative.
add ecx,48
js _xlatelp
jmp _lp
// Exit label of the assembly path.
_ulos:
}
}
else
#endif
{
// Portable fallback: rely on the Matrix3D * Vector3 operator.
int i;
for (i=0; i<count; i++)
{
dst[i]=mtx*src[i];
}
}
}
// Multiplies each of the 'count' source points by 'matrix' and writes the resulting
// Vector4 values to 'dst'. Nothing is written when count is zero or negative.
void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)
{
	int idx = 0;
	while (idx < count)
	{
		dst[idx] = matrix * src[idx];
		++idx;
	}
}
// Block-copies 'count' Vector2 elements from 'src' to 'dst' with memcpy.
// A non-positive count is a no-op.
void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
{
	if (count > 0)
	{
		memcpy(dst, src, count * sizeof(Vector2));
	}
}
// Block-copies 'count' unsigned values from 'src' to 'dst' with memcpy.
// A non-positive count is a no-op.
void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
{
	if (count > 0)
	{
		memcpy(dst, src, count * sizeof(unsigned));
	}
}
// Block-copies 'count' Vector3 elements from 'src' to 'dst' with memcpy.
// A non-positive count is a no-op.
void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
{
	if (count > 0)
	{
		memcpy(dst, src, count * sizeof(Vector3));
	}
}
// Block-copies 'count' Vector4 elements from 'src' to 'dst' with memcpy.
// A non-positive count is a no-op.
void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
{
	if (count > 0)
	{
		memcpy(dst, src, count * sizeof(Vector4));
	}
}
// Builds 'count' Vector4 values: X, Y and Z are taken from src[n] and W from srca[n].
// A non-positive count writes nothing.
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
{
	for (int n = 0; n < count; ++n)
	{
		const Vector3 &from = src[n];
		Vector4 &to = dst[n];

		to.X = from.X;
		to.Y = from.Y;
		to.Z = from.Z;
		to.W = srca[n];
	}
}
// Builds 'count' Vector4 values: X, Y and Z are taken from src[n] and W is the single
// value 'srca' for every element. A non-positive count writes nothing.
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
{
	for (int n = 0; n < count; ++n)
	{
		const Vector3 &from = src[n];
		Vector4 &to = dst[n];

		to.X = from.X;
		to.Y = from.Y;
		to.Z = from.Z;
		to.W = srca;
	}
}
// Fills 'count' Vector4 values whose X, Y and Z all come from the one vector 'src',
// while W is taken per element from srca[n]. A non-positive count writes nothing.
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
{
	for (int n = 0; n < count; ++n)
	{
		Vector4 &to = dst[n];

		// 'src' is deliberately re-read on every pass rather than cached up front.
		to.X = src.X;
		to.Y = src.Y;
		to.Z = src.Z;
		to.W = srca[n];
	}
}
// Gather copy: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Gather copy of Vector2 elements: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Gather copy of Vector3 elements: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Gather copy of Vector4 elements: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Gather copy of bytes: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Gather copy of floats: dst[n] = src[index[n]] for n in [0, count).
// A non-positive count writes nothing.
void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
{
	int n = 0;
	while (n < count)
	{
		const unsigned int pick = index[n];
		dst[n] = src[pick];
		++n;
	}
}
// Clamps every component of 'count' Vector4 elements of 'src' to the range [min, max]
// and writes the result to 'dst'. 'dst' may be the same array as 'src'.
// A non-positive count writes nothing.
//
// Each component is first raised to 'min' and then lowered to 'max', so if min > max
// the result is 'max'. Components for which both comparisons are false (e.g. NaN) are
// copied unchanged.
//
// The previous implementation assigned each destination component twice, both times
// from 'src'; the second (max) assignment discarded the min clamp, so the lower bound
// only took effect when clamping in place (dst == src). Each component is now clamped
// in a local before being stored once.
void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
{
	for (int i = 0; i < count; i++)
	{
		float x = src[i].X;
		if (x < min) x = min;
		if (x > max) x = max;
		dst[i].X = x;

		float y = src[i].Y;
		if (y < min) y = min;
		if (y > max) y = max;
		dst[i].Y = y;

		float z = src[i].Z;
		if (z < min) z = min;
		if (z > max) z = max;
		dst[i].Z = z;

		float w = src[i].W;
		if (w < min) w = min;
		if (w > max) w = max;
		dst[i].W = w;
	}
}
// Sets the bytes of 'count' Vector3 elements starting at 'dst' to zero with memset.
// A non-positive count is a no-op.
void VectorProcessorClass::Clear(Vector3*dst, const int count)
{
	if (count > 0)
	{
		memset(dst, 0, count * sizeof(Vector3));
	}
}
// Calls Normalize() on each of the 'count' vectors in 'dst', in place.
// A non-positive count is a no-op.
void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
{
	if (count <= 0)
	{
		return;
	}

	Vector3 * const stop = dst + count;
	for (Vector3 *vec = dst; vec != stop; ++vec)
	{
		vec->Normalize();
	}
}
// Computes the per-component minimum and maximum over the 'count' vectors in 'src'
// and returns them through 'min' and 'max'.
// When count <= 0 the function returns without touching 'min' or 'max'.
void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
{
	if (count <= 0)
	{
		return;
	}

	// Seed both bounds with the first element, then fold in the rest.
	min = src[0];
	max = src[0];

	int n = 1;
	while (n < count)
	{
		min.X = MIN(min.X, src[n].X);
		min.Y = MIN(min.Y, src[n].Y);
		min.Z = MIN(min.Z, src[n].Z);
		max.X = MAX(max.X, src[n].X);
		max.Y = MAX(max.Y, src[n].Y);
		max.Z = MAX(max.Z, src[n].Z);
		++n;
	}
}
// In-place scale and bias: dest[n] = dest[n] * multiplier + add for n in [0, count).
// A non-positive count leaves 'dest' untouched.
void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
{
	float *value = dest;
	for (int left = count; left > 0; --left, ++value)
	{
		*value = *value * multiplier + add;
	}
}
// Writes the dot product of the fixed vector 'a' with each b[n] into dst[n],
// for n in [0, count). A non-positive count writes nothing.
void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
{
	int n = 0;
	while (n < count)
	{
		dst[n] = Vector3::Dot_Product(a, b[n]);
		++n;
	}
}
// Lower-bound clamp: dst[n] receives src[n] when src[n] > min, otherwise 'min',
// for n in [0, count). A non-positive count writes nothing.
void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
{
	int n = 0;
	while (n < count)
	{
		const float value = src[n];
		if (value > min)
		{
			dst[n] = value;
		}
		else
		{
			dst[n] = min;
		}
		++n;
	}
}
// Raises each src[n] to the exponent 'pow' with powf and stores the result in dst[n],
// for n in [0, count). A non-positive count writes nothing.
void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
{
	int n = 0;
	while (n < count)
	{
		const float base = src[n];
		dst[n] = powf(base, pow);
		++n;
	}
}