/*
** Command & Conquer Renegade(tm)
** Copyright 2025 Electronic Arts Inc.
**
** This program is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/***********************************************************************************************
*** C O N F I D E N T I A L --- W E S T W O O D S T U D I O S ***
***********************************************************************************************
* *
* Project Name : wwmath *
* *
* $Archive:: /Commando/Code/WWMath/vp.cpp $*
* *
* Author:: Hector Yee *
* *
* $Modtime:: 6/27/01 4:16p $*
* *
* $Revision:: 11 $*
* *
*---------------------------------------------------------------------------------------------*/
#include "vp.h"
#include "vector2.h"
#include "vector3.h"
#include "vector4.h"
#include "matrix3d.h"
#include "matrix4.h"
#include "wwdebug.h"
#include "cpudetect.h"
#include
#define SHUFFLE(x, y, z, w) (((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
#define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))
#define TRANSPOSE(BX, BY, BZ, BW, TV) \
__asm movaps TV,BZ \
__asm unpcklps BZ,BW \
__asm unpckhps TV,BW \
__asm movaps BW,BX \
__asm unpcklps BX,BY \
__asm unpckhps BW,BY \
__asm movaps BY,BX \
__asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \
__asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \
__asm movaps BZ,BW \
__asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \
__asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)
void VectorProcessorClass::Prefetch(void* address)
{
#if defined (__ICL) // Detect Intel compiler
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
__asm {
// mov edx,address
// mov eax,[edx]
// prefetchT1 address
}
}
#endif
}
static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);
void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
{
if (count<=0) return;
#if defined (__ICL) // Detect Intel compiler
if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
__asm {
mov edx,dst
mov eax,src
mov ebx,mtx
mov edi,count
movups xmm4,[ebx+0]
movups xmm5,[ebx+16]
movups xmm6,[ebx+32]
movups xmm7,lastrow //[ebx+48]
TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);
shufps xmm4,xmm4,SHUFFLE(2,1,0,0)
shufps xmm5,xmm5,SHUFFLE(2,1,0,0)
shufps xmm6,xmm6,SHUFFLE(2,1,0,0)
shufps xmm7,xmm7,SHUFFLE(2,1,0,0)
mov esi,edx
_lp:
test edi,edi
jz _ulos
test esi,0xf
jz _aligned
movss xmm0,[eax]
movss xmm1,[eax+4]
movss xmm2,[eax+8]
BROADCAST(xmm0,0)
BROADCAST(xmm1,0)
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
movss [edx],xmm0
movhps [edx+4],xmm0
add eax,12
add edx,12
add esi,12
dec edi
jmp _lp
_aligned:
mov esi,1
mov ecx,edi
and edi,3
and ecx,~3
jz _lp
lea ecx,[ecx+ecx*2]
shl ecx,2
add eax,ecx
add edx,ecx
neg ecx
cmp dword ptr [ebx+12],0
jne _xlatelp
cmp dword ptr [ebx+28],0
jne _xlatelp
cmp dword ptr [ebx+44],0
jne _xlatelp
jmp _noxlatelp
align 16
_noxlatelp:
prefetchnta [eax+ecx+48]
prefetchnta [eax+ecx+48+32]
movss xmm0,[eax+ecx]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
movss xmm1,[eax+ecx+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm3,xmm1
movss xmm0,xmm3
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
movaps [edx+ecx],xmm0
prefetcht0 [edx+ecx+48]
prefetcht0 [edx+ecx+48+32]
movss xmm0,[eax+ecx+24]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+24+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
movaps [edx+ecx+16],xmm3
movss xmm1,[eax+ecx+24+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+24+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm1,xmm3
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
movss xmm1,xmm0
movaps [edx+ecx+32],xmm1
add ecx,48
js _noxlatelp
jmp _lp
align 16
_xlatelp:
prefetchnta [eax+ecx+48]
prefetchnta [eax+ecx+48+32]
movss xmm0,[eax+ecx]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
movss xmm1,[eax+ecx+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm3,xmm1
addps xmm3,xmm7
movss xmm0,xmm3
shufps xmm0,xmm0,SHUFFLE(0,3,2,1)
movaps [edx+ecx],xmm0
prefetcht0 [edx+ecx+48]
prefetcht0 [edx+ecx+48+32]
movss xmm0,[eax+ecx+24]
BROADCAST(xmm0,0)
movss xmm1,[eax+ecx+24+4]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+8]
BROADCAST(xmm2,0)
mulps xmm0,xmm4
mulps xmm1,xmm5
mulps xmm2,xmm6
addps xmm0,xmm1
addps xmm0,xmm2
addps xmm0,xmm7
shufps xmm3,xmm0,SHUFFLE(2,1,3,2)
movaps [edx+ecx+16],xmm3
movss xmm1,[eax+ecx+24+12]
BROADCAST(xmm1,0)
movss xmm2,[eax+ecx+24+16]
BROADCAST(xmm2,0)
movss xmm3,[eax+ecx+24+20]
BROADCAST(xmm3,0)
mulps xmm1,xmm4
mulps xmm2,xmm5
mulps xmm3,xmm6
addps xmm1,xmm2
addps xmm1,xmm3
addps xmm1,xmm7
shufps xmm0,xmm0,SHUFFLE(2,1,0,3)
movss xmm1,xmm0
movaps [edx+ecx+32],xmm1
add ecx,48
js _xlatelp
jmp _lp
_ulos:
}
}
else
#endif
{
int i;
for (i=0; imax)?max:src[i].X;
dst[i].Y=(src[i].Ymax)?max:src[i].Y;
dst[i].Z=(src[i].Zmax)?max:src[i].Z;
dst[i].W=(src[i].Wmax)?max:src[i].W;
}
}
// Zero-fills the first `count` elements of dst by writing zero bytes over
// them. A count of zero or less leaves dst untouched.
void VectorProcessorClass::Clear(Vector3*dst, const int count)
{
	if (count > 0) {
		// Byte length = element count times the size of one Vector3.
		memset(dst, 0, count * sizeof(Vector3));
	}
}
void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
{
if (count<=0) return;
int i;
for (i=0; imin?src[i]:min);
}
void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
{
for (int i=0; i