/*
**	Command & Conquer Renegade(tm)
**	Copyright 2025 Electronic Arts Inc.
**
**	This program is free software: you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation, either version 3 of the License, or
**	(at your option) any later version.
**
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
**
**	You should have received a copy of the GNU General Public License
**	along with this program.  If not, see https://www.gnu.org/licenses/.
*/

/***********************************************************************************************
 ***              C O N F I D E N T I A L  ---  W E S T W O O D  S T U D I O S               ***
 ***********************************************************************************************
 *                                                                                             *
 *                 Project Name : wwmath                                                       *
 *                                                                                             *
 *                     $Archive:: /Commando/Code/WWMath/vp.cpp                                $*
 *                                                                                             *
 *                       Author:: Hector Yee                                                   *
 *                                                                                             *
 *                     $Modtime:: 6/27/01 4:16p                                               $*
 *                                                                                             *
 *                    $Revision:: 11                                                          $*
 *                                                                                             *
 *---------------------------------------------------------------------------------------------*/

#include "vp.h"
#include "vector2.h"
#include "vector3.h"
#include "vector4.h"
#include "matrix3d.h"
#include "matrix4.h"
#include "wwdebug.h"
#include "cpudetect.h"
// NOTE(review): the header name of the next #include is missing in this copy of the
// file (it looks like an angle-bracket include whose name was stripped). memset() is
// used further down, so presumably it named a C memory/string header -- restore it
// from the original file.
#include

// Builds the 8-bit immediate for shufps from four 2-bit element selectors:
// x goes to bits 7:6, y to bits 5:4, z to bits 3:2 and w to bits 1:0.
// Each argument is masked to its low two bits.
#define SHUFFLE(x, y, z, w)	(((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))

// Emits one shufps of XMM with itself whose four selector fields are all INDEX,
// i.e. copies element INDEX of XMM into all four elements of XMM.
#define BROADCAST(XMM, INDEX)	__asm	shufps	XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))

// In-register 4x4 float transpose. BX, BY, BZ, BW hold the four rows on entry and
// the four columns on exit; TV is a scratch register that is overwritten.
// The unpcklps/unpckhps pairs interleave rows (BX,BY) and (BZ,BW); the four shufps
// then combine the low/high halves of those interleaved pairs into the columns.
#define TRANSPOSE(BX, BY, BZ, BW, TV)			\
	__asm	movaps		TV,BZ			\
	__asm	unpcklps	BZ,BW			\
	__asm	unpckhps	TV,BW			\
	__asm	movaps		BW,BX			\
	__asm	unpcklps	BX,BY			\
	__asm	unpckhps	BW,BY			\
	__asm	movaps		BY,BX			\
	__asm	shufps		BX,BZ,SHUFFLE(1, 0, 1, 0)	\
	__asm	shufps		BY,BZ,SHUFFLE(3, 2, 3, 2)	\
	__asm	movaps		BZ,BW			\
	__asm	shufps		BZ,TV,SHUFFLE(1, 0, 1, 0)	\
	__asm	shufps		BW,TV,SHUFFLE(3, 2, 3, 2)

void
VectorProcessorClass::Prefetch(void* address)
{
	// Intended as a cache prefetch hint for `address`. As written it has no effect:
	// the only instructions (inside the Intel-compiler/SSE branch) are commented out,
	// and nothing at all is compiled when __ICL is not defined.
#if defined (__ICL)    // Detect Intel compiler
	if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
		__asm
		{
//			mov	edx,address
//			mov	eax,[edx]
//			prefetchT1	address
		}
	}
#endif
}

// Constant (0,0,0,1) row. Transform() loads it into xmm7 as the fourth matrix row
// instead of reading [ebx+48] from the matrix (see the commented-out operand there).
static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);

// Transforms `count` Vector3 elements from `src` by `mtx` and writes them to `dst`.
// Returns immediately when count <= 0.
//
// Intel compiler + SSE path (inline assembly below):
//   * The three matrix rows plus `lastrow` are transposed so that xmm4..xmm6 hold the
//     matrix columns and xmm7 holds the fourth (translation) column, each re-ordered
//     to (e0, e0, e1, e2) so a result register is laid out as (x', x', y', z').
//   * Vectors are processed one at a time until dst is 16-byte aligned, then four at a
//     time (48 bytes, three aligned 16-byte stores), then one at a time for the
//     remaining count & 3 vectors.
//   * The four-at-a-time loop exists in two versions: _noxlatelp omits the add of
//     xmm7, and is chosen when the dwords at mtx+12, mtx+28 and mtx+44 are all zero
//     (presumably the translation terms of the three rows -- confirm against
//     Matrix3D's layout); _xlatelp adds xmm7.
//
// Otherwise a scalar C++ loop is used (see the review note on it below).
void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
{
	if (count<=0) return;

#if defined (__ICL)    // Detect Intel compiler
	if (CPUDetectClass::_Has_SSE_Instruction_Set()) {

		__asm {
			// edx = destination cursor, eax = source cursor, ebx = matrix, edi = vectors left
			mov		edx,dst
			mov		eax,src
			mov		ebx,mtx
			mov		edi,count

			// Load the three matrix rows (unaligned) and the constant fourth row.
			movups	xmm4,[ebx+0]
			movups	xmm5,[ebx+16]
			movups	xmm6,[ebx+32]
			movups	xmm7,lastrow		//[ebx+48]

			// Rows -> columns; xmm0 is only scratch here.
			TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);

			// Re-order each column from (e0,e1,e2,e3) to (e0,e0,e1,e2): element 0 can then
			// be stored with movss and elements 2..3 with movhps as a packed x,y,z.
			shufps	xmm4,xmm4,SHUFFLE(2,1,0,0)
			shufps	xmm5,xmm5,SHUFFLE(2,1,0,0)
			shufps	xmm6,xmm6,SHUFFLE(2,1,0,0)
			shufps	xmm7,xmm7,SHUFFLE(2,1,0,0)

			// esi tracks the destination address for the alignment test below.
			mov		esi,edx

			// ---- one-vector-at-a-time loop (unaligned head and leftover tail) ----
		_lp:
			test		edi,edi
			jz		_ulos			// nothing left: leave the asm block
			test		esi,0xf
			jz		_aligned		// dst is 16-byte aligned: try the bulk loop

			// Broadcast x, y, z of the source vector and form
			// x*col0 + y*col1 + z*col2 + col3 in xmm0.
			movss		xmm0,[eax]
			movss		xmm1,[eax+4]
			movss		xmm2,[eax+8]
			BROADCAST(xmm0,0)
			BROADCAST(xmm1,0)
			BROADCAST(xmm2,0)
			mulps		xmm0,xmm4
			mulps		xmm1,xmm5
			mulps		xmm2,xmm6
			addps		xmm0,xmm1
			addps		xmm0,xmm2
			addps		xmm0,xmm7
			// Store 12 bytes: element 0 (x'), then the upper two elements (y', z').
			movss		[edx],xmm0
			movhps	[edx+4],xmm0
			add		eax,12
			add		edx,12
			add		esi,12
			dec		edi
			jmp		_lp

			// ---- set up the four-vectors-per-iteration loop ----
		_aligned:
			// esi no longer needs to be an address; a low nibble that stays non-zero
			// forces every later pass through _lp onto the one-at-a-time path.
			mov		esi,1

			mov		ecx,edi
			and		edi,3			// edi = leftover vectors handled by _lp afterwards
			and		ecx,~3			// ecx = vectors handled in groups of four
			jz		_lp				// fewer than four left: no bulk loop

			// ecx = ecx * 12 = byte size of the bulk region.
			lea		ecx,[ecx+ecx*2]
			shl		ecx,2

			// Point eax/edx at the END of the bulk region and index with a negative
			// offset that counts up to zero, so the loop can end on the sign flag.
			add		eax,ecx
			add		edx,ecx
			neg		ecx

			// Pick the loop variant: any non-zero dword (compared as raw bits) at
			// mtx+12 / +28 / +44 selects the version that adds xmm7.
			cmp		dword ptr [ebx+12],0
			jne		_xlatelp
			cmp		dword ptr [ebx+28],0
			jne		_xlatelp
			cmp		dword ptr [ebx+44],0
			jne		_xlatelp
			jmp		_noxlatelp

			// ---- bulk loop without the xmm7 add ----
			align 16

		_noxlatelp:
			prefetchnta	[eax+ecx+48]
			prefetchnta	[eax+ecx+48+32]

			// Vector 0 -> xmm0 = (x0,x0,y0,z0)
			movss		xmm0,[eax+ecx]
			BROADCAST(xmm0,0)
			movss		xmm1,[eax+ecx+4]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+8]
			BROADCAST(xmm2,0)
			mulps		xmm0,xmm4
			mulps		xmm1,xmm5
			mulps		xmm2,xmm6
			addps		xmm0,xmm1
			addps		xmm0,xmm2

			// Vector 1 -> xmm3 = (x1,x1,y1,z1)
			movss		xmm1,[eax+ecx+12]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+16]
			BROADCAST(xmm2,0)
			movss		xmm3,[eax+ecx+20]
			BROADCAST(xmm3,0)
			mulps		xmm1,xmm4
			mulps		xmm2,xmm5
			mulps		xmm3,xmm6
			addps		xmm1,xmm2
			addps		xmm3,xmm1

			// Pack and store (x0,y0,z0,x1).
			movss		xmm0,xmm3
			shufps	xmm0,xmm0,SHUFFLE(0,3,2,1)
			movaps	[edx+ecx],xmm0

			prefetcht0	[edx+ecx+48]
			prefetcht0	[edx+ecx+48+32]

			// Vector 2 -> xmm0 = (x2,x2,y2,z2)
			movss		xmm0,[eax+ecx+24]
			BROADCAST(xmm0,0)
			movss		xmm1,[eax+ecx+24+4]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+24+8]
			BROADCAST(xmm2,0)
			mulps		xmm0,xmm4
			mulps		xmm1,xmm5
			mulps		xmm2,xmm6
			addps		xmm0,xmm1
			addps		xmm0,xmm2

			// Pack and store (y1,z1,x2,y2).
			shufps	xmm3,xmm0,SHUFFLE(2,1,3,2)
			movaps	[edx+ecx+16],xmm3

			// Vector 3 -> xmm1 = (x3,x3,y3,z3)
			movss		xmm1,[eax+ecx+24+12]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+24+16]
			BROADCAST(xmm2,0)
			movss		xmm3,[eax+ecx+24+20]
			BROADCAST(xmm3,0)
			mulps		xmm1,xmm4
			mulps		xmm2,xmm5
			mulps		xmm3,xmm6
			addps		xmm1,xmm2
			addps		xmm1,xmm3

			// Pack and store (z2,x3,y3,z3).
			shufps	xmm0,xmm0,SHUFFLE(2,1,0,3)
			movss		xmm1,xmm0
			movaps	[edx+ecx+32],xmm1

			add		ecx,48
			js		_noxlatelp		// offset still negative: more groups of four

			jmp		_lp				// handle the leftover vectors (edi)

			// ---- bulk loop with the xmm7 add; same packing as above ----
			align 16

		_xlatelp:
			prefetchnta	[eax+ecx+48]
			prefetchnta	[eax+ecx+48+32]

			// Vector 0 -> xmm0
			movss		xmm0,[eax+ecx]
			BROADCAST(xmm0,0)
			movss		xmm1,[eax+ecx+4]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+8]
			BROADCAST(xmm2,0)
			mulps		xmm0,xmm4
			mulps		xmm1,xmm5
			mulps		xmm2,xmm6
			addps		xmm0,xmm1
			addps		xmm0,xmm2
			addps		xmm0,xmm7

			// Vector 1 -> xmm3
			movss		xmm1,[eax+ecx+12]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+16]
			BROADCAST(xmm2,0)
			movss		xmm3,[eax+ecx+20]
			BROADCAST(xmm3,0)
			mulps		xmm1,xmm4
			mulps		xmm2,xmm5
			mulps		xmm3,xmm6
			addps		xmm1,xmm2
			addps		xmm3,xmm1
			addps		xmm3,xmm7

			// Pack and store (x0,y0,z0,x1).
			movss		xmm0,xmm3
			shufps	xmm0,xmm0,SHUFFLE(0,3,2,1)
			movaps	[edx+ecx],xmm0

			prefetcht0	[edx+ecx+48]
			prefetcht0	[edx+ecx+48+32]

			// Vector 2 -> xmm0
			movss		xmm0,[eax+ecx+24]
			BROADCAST(xmm0,0)
			movss		xmm1,[eax+ecx+24+4]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+24+8]
			BROADCAST(xmm2,0)
			mulps		xmm0,xmm4
			mulps		xmm1,xmm5
			mulps		xmm2,xmm6
			addps		xmm0,xmm1
			addps		xmm0,xmm2
			addps		xmm0,xmm7

			// Pack and store (y1,z1,x2,y2).
			shufps	xmm3,xmm0,SHUFFLE(2,1,3,2)
			movaps	[edx+ecx+16],xmm3

			// Vector 3 -> xmm1
			movss		xmm1,[eax+ecx+24+12]
			BROADCAST(xmm1,0)
			movss		xmm2,[eax+ecx+24+16]
			BROADCAST(xmm2,0)
			movss		xmm3,[eax+ecx+24+20]
			BROADCAST(xmm3,0)
			mulps		xmm1,xmm4
			mulps		xmm2,xmm5
			mulps		xmm3,xmm6
			addps		xmm1,xmm2
			addps		xmm1,xmm3
			addps		xmm1,xmm7

			// Pack and store (z2,x3,y3,z3).
			shufps	xmm0,xmm0,SHUFFLE(2,1,0,3)
			movss		xmm1,xmm0
			movaps	[edx+ecx+32],xmm1

			add		ecx,48
			js		_xlatelp		// offset still negative: more groups of four

			jmp		_lp				// handle the leftover vectors (edi)

			// Exit label for the asm block.
		_ulos:
		}
	}
	else
#endif
	{
		// NOTE(review): from the loop header below onward this copy of the file is
		// damaged -- it looks like everything between a '<' and the following '>' was
		// dropped, so the scalar fallback loop of Transform() is cut off and runs
		// straight into what appears to be the tail of a per-component clamp on
		// X/Y/Z/W against `max`. The text is kept exactly as found; restore it from
		// the original file before building.
		int i;
		for (i=0; imax)?max:src[i].X;
		dst[i].Y=(src[i].Ymax)?max:src[i].Y;
// NOTE(review): the next two statements and the two closing braces are the remainder
// of a damaged span that starts earlier in the file (text between a '<' and the
// following '>' appears to have been dropped); kept exactly as found.
		dst[i].Z=(src[i].Zmax)?max:src[i].Z;
		dst[i].W=(src[i].Wmax)?max:src[i].W;
	}
}

// Sets `count` consecutive Vector3 elements starting at `dst` to all-zero bytes
// with a single memset. Does nothing when count <= 0.
void VectorProcessorClass::Clear(Vector3*dst, const int count)
{
	if (count<=0) return;
	memset(dst,0,sizeof(Vector3)*count);
}

// Only the early-out for count <= 0 is intact here.
// NOTE(review): the loop below is damaged in this copy -- the for-header is cut off
// and continues into the tail of an expression using `src` and `min`, neither of
// which is a parameter of this function, so it presumably belongs to a different
// function whose beginning was lost. Kept exactly as found; restore from the
// original file.
void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
{
	if (count<=0) return;
	int i;
	for (i=0; imin?src[i]:min);
}

// NOTE(review): only the signature and the start of the loop header are visible;
// the rest of this function lies beyond the end of the visible text.
void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
{
	for (int i=0; i