CnC_Renegade/Code/WWMath/vp.cpp

/*
**	Command & Conquer Renegade(tm)
**	Copyright 2025 Electronic Arts Inc.
**
**	This program is free software: you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation, either version 3 of the License, or
**	(at your option) any later version.
**
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
**
**	You should have received a copy of the GNU General Public License
**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/***********************************************************************************************
 ***              C O N F I D E N T I A L  ---  W E S T W O O D  S T U D I O S               ***
 ***********************************************************************************************
 *                                                                                             *
 *                 Project Name : wwmath                                                       *
 *                                                                                             *
 *                     $Archive:: /Commando/Code/WWMath/vp.cpp                                $*
 *                                                                                             *
 *                        Author:: Hector Yee                                                  *
 *                                                                                             *
 *                     $Modtime:: 6/27/01 4:16p                                               $*
 *                                                                                             *
 *                    $Revision:: 11                                                          $*
 *                                                                                             *
 *---------------------------------------------------------------------------------------------*/

#include "vp.h"
#include "vector2.h"
#include "vector3.h"
#include "vector4.h"
#include "matrix3d.h"
#include "matrix4.h"
#include "wwdebug.h"
#include "cpudetect.h"
#include <memory.h>

// Packs four 2-bit element selectors into the 8-bit immediate used by shufps.
// x lands in bits 7..6, y in bits 5..4, z in bits 3..2 and w in bits 1..0.
// Per the shufps encoding, w and z pick result elements 0 and 1 from the
// destination operand, y and x pick result elements 2 and 3 from the source.
#define SHUFFLE(x, y, z, w)	(((x)&3)<< 6|((y)&3)<<4|((z)&3)<< 2|((w)&3))
// Emits a shufps of XMM with itself using the same selector in all four
// fields, i.e. replicates element INDEX of XMM into every element of XMM.
#define	BROADCAST(XMM, INDEX)	__asm	shufps	XMM,XMM,(((INDEX)&3)<< 6|((INDEX)&3)<<4|((INDEX)&3)<< 2|((INDEX)&3))

// In-register 4x4 transpose. On entry BX,BY,BZ,BW each hold one row of four
// floats; on exit BX holds element 0 of every row, BY element 1, BZ element 2
// and BW element 3 (ordered BX-row, BY-row, BZ-row, BW-row). TV is a scratch
// register and is clobbered.
#define TRANSPOSE(BX, BY, BZ, BW, TV)					\
		__asm	movaps		TV,BZ						\
		__asm	unpcklps	BZ,BW						\
		__asm	unpckhps	TV,BW						\
		__asm	movaps		BW,BX						\
		__asm	unpcklps	BX,BY						\
		__asm	unpckhps	BW,BY						\
		__asm	movaps		BY,BX						\
		__asm	shufps		BX,BZ,SHUFFLE(1, 0, 1, 0)	\
		__asm	shufps		BY,BZ,SHUFFLE(3, 2, 3, 2)	\
		__asm	movaps		BZ,BW						\
		__asm	shufps		BZ,TV,SHUFFLE(1, 0, 1, 0)	\
		__asm	shufps		BW,TV,SHUFFLE(3, 2, 3, 2)


//
// Prefetch hint for the memory at 'address'.
//
// NOTE: this is currently a no-op on every compiler. The only code path is
// compiled for the Intel compiler (__ICL) and, even there, every instruction
// inside the __asm block is commented out, so 'address' is never touched.
//
void VectorProcessorClass::Prefetch(void* address)
{
#if defined (__ICL)    // Detect Intel compiler
	if (CPUDetectClass::_Has_SSE_Instruction_Set()) {
		__asm {
			// Disabled prefetch sequence, kept for reference:
//			mov edx,address
//			mov eax,[edx]
//			prefetchT1 address
		}
	}
#endif
}

// Constant (0,0,0,1) row loaded by the SSE path below in place of a fourth
// matrix row (only three 16-byte rows are read from 'mtx').
static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);

//
// Transforms 'count' Vector3s from 'src' by 'mtx' and writes them to 'dst'.
// Does nothing when count <= 0.
//
// On the Intel compiler with SSE available a hand written SSE path is used,
// otherwise the portable loop at the bottom (dst[i] = mtx * src[i]) runs.
//
// SSE path outline:
//   1. Load the three rows of 'mtx' plus 'lastrow', transpose them so that
//      xmm4/xmm5/xmm6 hold the factors applied to src X/Y/Z and xmm7 holds the
//      per-row fourth elements (added as a constant term).
//   2. Process vectors one at a time until 'dst' is 16-byte aligned.
//   3. Process blocks of four vectors (48 bytes) with aligned stores, using a
//      variant without the xmm7 add when the three fourth elements are zero.
//   4. Process the remaining (count & 3) vectors one at a time.
//
void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)
{
	if (count<=0) return;

#if defined (__ICL)    // Detect Intel compiler
	if (CPUDetectClass::_Has_SSE_Instruction_Set()) {

		__asm	{
			// edx = dst, eax = src, ebx = address of mtx, edi = vectors left
			mov		edx,dst
			mov		eax,src
			mov		ebx,mtx
			mov		edi,count

			// Three 16-byte rows from the matrix, fourth row is the constant
			movups	xmm4,[ebx+0]
			movups	xmm5,[ebx+16]
			movups	xmm6,[ebx+32]
			movups	xmm7,lastrow	//[ebx+48]

			// After this xmm4..xmm7 each hold one element index of all rows
			TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);

			// Rearrange (e0,e1,e2,e3) -> (e0,e0,e1,e2) so that a result
			// register is laid out as (X,X,Y,Z); the duplicated slot is what
			// gets overwritten/skipped when packing 12-byte vectors below.
			shufps	xmm4,xmm4,SHUFFLE(2,1,0,0)
			shufps	xmm5,xmm5,SHUFFLE(2,1,0,0)
			shufps	xmm6,xmm6,SHUFFLE(2,1,0,0)
			shufps	xmm7,xmm7,SHUFFLE(2,1,0,0)

			// esi tracks the alignment of dst for the scalar loop
			mov		esi,edx
		_lp:
			// One-vector-at-a-time loop: runs until dst is 16-byte aligned,
			// and again for the tail after the blocked loop.
			test	edi,edi
			jz		_ulos
			test	esi,0xf
			jz		_aligned
			movss	xmm0,[eax]
			movss	xmm1,[eax+4]
			movss	xmm2,[eax+8]
			BROADCAST(xmm0,0)
			BROADCAST(xmm1,0)
			BROADCAST(xmm2,0)
			mulps	xmm0,xmm4
			mulps	xmm1,xmm5
			mulps	xmm2,xmm6
			addps	xmm0,xmm1
			addps	xmm0,xmm2
			addps	xmm0,xmm7
			// xmm0 = (X,X,Y,Z): store element 0, then the upper two elements
			movss	[edx],xmm0
			movhps	[edx+4],xmm0
			add		eax,12
			add		edx,12
			add		esi,12
			dec		edi
			jmp		_lp
		_aligned:

			// Force the alignment test in _lp to fail from now on so the
			// leftover vectors are handled by the scalar loop.
			mov		esi,1
		
			// ecx = number of vectors in whole blocks of four,
			// edi = leftover vectors (count & 3)
			mov		ecx,edi
			and		edi,3
			and		ecx,~3
			jz		_lp
			
			// ecx = block bytes (vectors * 12). Advance eax/edx to the end of
			// the blocked region and index it with a negative ecx that counts
			// up towards zero.
			lea		ecx,[ecx+ecx*2]
			shl		ecx,2
			add		eax,ecx
			add		edx,ecx
			neg		ecx

			// If the fourth element of any of the three rows has a non-zero
			// bit pattern, the xmm7 term must be added; otherwise skip it.
			cmp		dword ptr [ebx+12],0
			jne		_xlatelp
			cmp		dword ptr [ebx+28],0
			jne		_xlatelp
			cmp		dword ptr [ebx+44],0
			jne		_xlatelp
			jmp		_noxlatelp

			align	16

			// Four vectors per iteration, no xmm7 add.
			_noxlatelp:
				prefetchnta	[eax+ecx+48]
				prefetchnta	[eax+ecx+48+32]

				// vector 0 -> xmm0 = (X0,X0,Y0,Z0)
				movss	xmm0,[eax+ecx]
				BROADCAST(xmm0,0)
				movss	xmm1,[eax+ecx+4]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+8]
				BROADCAST(xmm2,0)
				mulps	xmm0,xmm4
				mulps	xmm1,xmm5
				mulps	xmm2,xmm6
				addps	xmm0,xmm1
				addps	xmm0,xmm2

				// vector 1 -> xmm3 = (X1,X1,Y1,Z1)
				movss	xmm1,[eax+ecx+12]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+16]
				BROADCAST(xmm2,0)
				movss	xmm3,[eax+ecx+20]
				BROADCAST(xmm3,0)
				mulps	xmm1,xmm4
				mulps	xmm2,xmm5
				mulps	xmm3,xmm6
				addps	xmm1,xmm2
				addps	xmm3,xmm1

				// xmm0 = (X1,X0,Y0,Z0) -> rotate to (X0,Y0,Z0,X1)
				movss	xmm0,xmm3
				shufps	xmm0,xmm0,SHUFFLE(0,3,2,1)

				movaps	[edx+ecx],xmm0

				prefetcht0	[edx+ecx+48]
				prefetcht0	[edx+ecx+48+32]

				// vector 2 -> xmm0 = (X2,X2,Y2,Z2)
				movss	xmm0,[eax+ecx+24]
				BROADCAST(xmm0,0)
				movss	xmm1,[eax+ecx+24+4]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+24+8]
				BROADCAST(xmm2,0)
				mulps	xmm0,xmm4
				mulps	xmm1,xmm5
				mulps	xmm2,xmm6
				addps	xmm0,xmm1
				addps	xmm0,xmm2

				// xmm3 = (Y1,Z1,X2,Y2)
				shufps	xmm3,xmm0,SHUFFLE(2,1,3,2)
				movaps	[edx+ecx+16],xmm3

				// vector 3 -> xmm1 = (X3,X3,Y3,Z3)
				movss	xmm1,[eax+ecx+24+12]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+24+16]
				BROADCAST(xmm2,0)
				movss	xmm3,[eax+ecx+24+20]
				BROADCAST(xmm3,0)
				mulps	xmm1,xmm4
				mulps	xmm2,xmm5
				mulps	xmm3,xmm6
				addps	xmm1,xmm2
				addps	xmm1,xmm3

				// Move Z2 into element 0, then xmm1 = (Z2,X3,Y3,Z3)
				shufps	xmm0,xmm0,SHUFFLE(2,1,0,3)
				movss	xmm1,xmm0
				movaps	[edx+ecx+32],xmm1

				// Next block; loop while the index is still negative
				add		ecx,48
				js		_noxlatelp

			jmp	_lp

			align	16

			// Same as _noxlatelp but adds xmm7 to every transformed vector.
			_xlatelp:
				prefetchnta	[eax+ecx+48]
				prefetchnta	[eax+ecx+48+32]

				// vector 0 -> xmm0 = (X0,X0,Y0,Z0)
				movss	xmm0,[eax+ecx]
				BROADCAST(xmm0,0)
				movss	xmm1,[eax+ecx+4]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+8]
				BROADCAST(xmm2,0)
				mulps	xmm0,xmm4
				mulps	xmm1,xmm5
				mulps	xmm2,xmm6
				addps	xmm0,xmm1
				addps	xmm0,xmm2
				addps	xmm0,xmm7

				// vector 1 -> xmm3 = (X1,X1,Y1,Z1)
				movss	xmm1,[eax+ecx+12]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+16]
				BROADCAST(xmm2,0)
				movss	xmm3,[eax+ecx+20]
				BROADCAST(xmm3,0)
				mulps	xmm1,xmm4
				mulps	xmm2,xmm5
				mulps	xmm3,xmm6
				addps	xmm1,xmm2
				addps	xmm3,xmm1
				addps	xmm3,xmm7

				// xmm0 = (X1,X0,Y0,Z0) -> rotate to (X0,Y0,Z0,X1)
				movss	xmm0,xmm3
				shufps	xmm0,xmm0,SHUFFLE(0,3,2,1)
				movaps	[edx+ecx],xmm0

				prefetcht0	[edx+ecx+48]
				prefetcht0	[edx+ecx+48+32]

				// vector 2 -> xmm0 = (X2,X2,Y2,Z2)
				movss	xmm0,[eax+ecx+24]
				BROADCAST(xmm0,0)
				movss	xmm1,[eax+ecx+24+4]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+24+8]
				BROADCAST(xmm2,0)
				mulps	xmm0,xmm4
				mulps	xmm1,xmm5
				mulps	xmm2,xmm6
				addps	xmm0,xmm1
				addps	xmm0,xmm2
				addps	xmm0,xmm7

				// xmm3 = (Y1,Z1,X2,Y2)
				shufps	xmm3,xmm0,SHUFFLE(2,1,3,2)
				movaps	[edx+ecx+16],xmm3

				// vector 3 -> xmm1 = (X3,X3,Y3,Z3)
				movss	xmm1,[eax+ecx+24+12]
				BROADCAST(xmm1,0)
				movss	xmm2,[eax+ecx+24+16]
				BROADCAST(xmm2,0)
				movss	xmm3,[eax+ecx+24+20]
				BROADCAST(xmm3,0)
				mulps	xmm1,xmm4
				mulps	xmm2,xmm5
				mulps	xmm3,xmm6
				addps	xmm1,xmm2
				addps	xmm1,xmm3
				addps	xmm1,xmm7

				// Move Z2 into element 0, then xmm1 = (Z2,X3,Y3,Z3)
				shufps	xmm0,xmm0,SHUFFLE(2,1,0,3)
				movss	xmm1,xmm0

				movaps	[edx+ecx+32],xmm1
				
				// Next block; loop while the index is still negative
				add		ecx,48
				js		_xlatelp

			jmp	_lp
		_ulos:
		}

	}
	else
#endif
	{
		// Portable fallback: one matrix * vector product per element
		int i;

		for (i=0; i<count; i++)
		{
			dst[i]=mtx*src[i];
		}
	}
}

//
// Multiplies each of the first 'count' Vector3s in 'src' by 'matrix' and
// stores the products, in order, into 'dst'. A non-positive count is a no-op.
//
void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)
{
	if (count > 0) {
		Vector4 *out = dst;
		const Vector3 *in = src;

		for (int remaining = count; remaining > 0; --remaining) {
			*out = matrix * (*in);
			++out;
			++in;
		}
	}
}

//
// Raw byte copy of 'count' Vector2 elements from 'src' to 'dst'.
// Nothing is copied when count is zero or negative.
//
void VectorProcessorClass::Copy(Vector2 *dst, const Vector2 *src, int count)
{
	if (count > 0) {
		memcpy(dst, src, sizeof(Vector2) * count);
	}
}

//
// Raw byte copy of 'count' unsigned values from 'src' to 'dst'.
// Nothing is copied when count is zero or negative.
//
void VectorProcessorClass::Copy(unsigned *dst, const unsigned *src, int count)
{
	if (count > 0) {
		memcpy(dst, src, sizeof(unsigned) * count);
	}
}

//
// Raw byte copy of 'count' Vector3 elements from 'src' to 'dst'.
// Nothing is copied when count is zero or negative.
//
void VectorProcessorClass::Copy(Vector3 *dst, const Vector3 *src, int count)
{
	if (count > 0) {
		memcpy(dst, src, sizeof(Vector3) * count);
	}
}

//
// Raw byte copy of 'count' Vector4 elements from 'src' to 'dst'.
// Nothing is copied when count is zero or negative.
//
void VectorProcessorClass::Copy(Vector4 *dst, const Vector4 *src, int count)
{
	if (count > 0) {
		memcpy(dst, src, sizeof(Vector4) * count);
	}
}

//
// Builds 'count' Vector4s in 'dst': X/Y/Z come from the matching element of
// 'src' and W comes from the matching element of 'srca'.
// A non-positive count is a no-op.
//
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float * srca, const int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			Vector4 &out = dst[n];
			const Vector3 &in = src[n];

			out.X = in.X;
			out.Y = in.Y;
			out.Z = in.Z;
			out.W = srca[n];
		}
	}
}

//
// Builds 'count' Vector4s in 'dst': X/Y/Z come from the matching element of
// 'src' and W is the single value 'srca' for every element.
// A non-positive count is a no-op.
//
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 *src, const float srca, const int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			Vector4 &out = dst[n];
			const Vector3 &in = src[n];

			out.X = in.X;
			out.Y = in.Y;
			out.Z = in.Z;
			out.W = srca;
		}
	}
}

//
// Builds 'count' Vector4s in 'dst': X/Y/Z are taken from the single vector
// 'src' for every element and W comes from the matching element of 'srca'.
// A non-positive count is a no-op.
//
void VectorProcessorClass::Copy(Vector4 *dst,const Vector3 &src, const float * srca, const int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			Vector4 &out = dst[n];

			out.X = src.X;
			out.Y = src.Y;
			out.Z = src.Z;
			out.W = srca[n];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed (unsigned *dst,const unsigned *src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed (Vector2 *dst,const Vector2 *src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed (Vector3 *dst,const Vector3 *src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed (Vector4 *dst,const Vector4 *src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Gather: for each n in [0,count) writes src[index[n]] into dst[n].
// A non-positive count is a no-op.
//
void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)
{
	if (count > 0) {
		for (int n = 0; n < count; ++n) {
			const unsigned int from = index[n];
			dst[n] = src[from];
		}
	}
}

//
// Clamps a single float to [lo,hi]. The lower bound is applied first and the
// upper bound second, so if lo > hi the upper bound wins. A NaN input fails
// both comparisons and is returned unchanged.
//
static inline float VP_Clamp_Float(float value, const float lo, const float hi)
{
	if (value < lo) value = lo;
	if (value > hi) value = hi;
	return value;
}

//
// Writes each component (X,Y,Z,W) of the first 'count' Vector4s in 'src',
// clamped to the range [min,max], into the matching element of 'dst'.
// 'dst' may be the same array as 'src'. A non-positive count is a no-op.
//
// The previous implementation assigned each destination component twice from
// 'src' (once for min, once for max); the second assignment overwrote the
// first, so the lower bound was lost whenever dst and src were different
// arrays. Each component is now clamped in a local before a single store.
//
void VectorProcessorClass::Clamp(Vector4 *dst,const Vector4 *src, const float min, const float max, const int count)
{
	if (count<=0) return;
	int i;

	for (i=0; i<count; i++)
	{
		dst[i].X=VP_Clamp_Float(src[i].X,min,max);
		dst[i].Y=VP_Clamp_Float(src[i].Y,min,max);
		dst[i].Z=VP_Clamp_Float(src[i].Z,min,max);
		dst[i].W=VP_Clamp_Float(src[i].W,min,max);
	}
}

//
// Zero-fills the bytes of the first 'count' Vector3s in 'dst'.
// A non-positive count is a no-op.
//
void VectorProcessorClass::Clear(Vector3*dst, const int count)
{
	if (count > 0) {
		memset(dst, 0, sizeof(Vector3) * count);
	}
}


//
// Calls Normalize() on each of the first 'count' Vector3s in 'dst', in order.
// A non-positive count is a no-op.
//
void VectorProcessorClass::Normalize(Vector3 *dst, const int count)
{
	if (count > 0) {
		Vector3 *cursor = dst;

		for (int remaining = count; remaining > 0; --remaining) {
			cursor->Normalize();
			++cursor;
		}
	}
}

//
// Computes the per-component minimum and maximum over the first 'count'
// Vector3s in 'src'. 'min' and 'max' are both seeded with src[0] and then
// refined with the remaining elements. When count is not positive, 'min' and
// 'max' are left untouched.
//
void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)
{
	if (count > 0) {
		min = *src;
		max = *src;

		for (int n = 1; n < count; ++n) {
			Vector3 &v = src[n];

			min.X = MIN(min.X, v.X);
			min.Y = MIN(min.Y, v.Y);
			min.Z = MIN(min.Z, v.Z);

			max.X = MAX(max.X, v.X);
			max.Y = MAX(max.Y, v.Y);
			max.Z = MAX(max.Z, v.Z);
		}
	}
}

//
// In place: dest[n] = dest[n] * multiplier + add for the first 'count'
// floats of 'dest'. The loop simply does not execute for count <= 0.
//
void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)
{
	float *cursor = dest;

	for (int remaining = count; remaining > 0; --remaining) {
		*cursor = *cursor * multiplier + add;
		++cursor;
	}
}

//
// Stores Vector3::Dot_Product(a, b[n]) into dst[n] for the first 'count'
// elements of 'b'. The loop simply does not execute for count <= 0.
//
void VectorProcessorClass::DotProduct(float *dst, const Vector3 &a, const Vector3 *b,const int count)
{
	int n = 0;

	while (n < count) {
		dst[n] = Vector3::Dot_Product(a, b[n]);
		++n;
	}
}

//
// For the first 'count' floats: dst[n] receives src[n] when src[n] is
// greater than 'min', otherwise it receives 'min'.
// The loop simply does not execute for count <= 0.
//
void VectorProcessorClass::ClampMin(float *dst, float *src, const float min, const int count)
{
	int n = 0;

	while (n < count) {
		if (src[n] > min) {
			dst[n] = src[n];
		} else {
			dst[n] = min;
		}
		++n;
	}
}

//
// Stores powf(src[n], pow) into dst[n] for the first 'count' floats.
// The loop simply does not execute for count <= 0.
//
void VectorProcessorClass::Power(float *dst, float *src, const float pow, const int count)
{
	int n = 0;

	while (n < count) {
		dst[n] = powf(src[n], pow);
		++n;
	}
}
Initial commit of Command & Conquer Renegade source code. 2025-02-27 16:39:46 +00:00			`/*`
			`** Command & Conquer Renegade(tm)`
			`** Copyright 2025 Electronic Arts Inc.`
			`**`
			`** This program is free software: you can redistribute it and/or modify`
			`** it under the terms of the GNU General Public License as published by`
			`** the Free Software Foundation, either version 3 of the License, or`
			`** (at your option) any later version.`
			`**`
			`** This program is distributed in the hope that it will be useful,`
			`** but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`** GNU General Public License for more details.`
			`**`
			`** You should have received a copy of the GNU General Public License`
			`** along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`/***********************************************************************************************`
			`* C O N F I D E N T I A L --- W E S T W O O D S T U D I O S *`
			`***********************************************************************************************`
			`* *`
			`* Project Name : wwmath *`
			`* *`
			`* $Archive:: /Commando/Code/WWMath/vp.cpp $*`
			`* *`
			`* Author:: Hector Yee *`
			`* *`
			`* $Modtime:: 6/27/01 4:16p $*`
			`* *`
			`* $Revision:: 11 $*`
			`* *`
			`---------------------------------------------------------------------------------------------/`

			`#include "vp.h"`
			`#include "vector2.h"`
			`#include "vector3.h"`
			`#include "vector4.h"`
			`#include "matrix3d.h"`
			`#include "matrix4.h"`
			`#include "wwdebug.h"`
			`#include "cpudetect.h"`
			`#include <memory.h>`

			`#define SHUFFLE(x, y, z, w) (((x)&3)<< 6\|((y)&3)<<4\|((z)&3)<< 2\|((w)&3))`
			`#define BROADCAST(XMM, INDEX) __asm shufps XMM,XMM,(((INDEX)&3)<< 6\|((INDEX)&3)<<4\|((INDEX)&3)<< 2\|((INDEX)&3))`

			`#define TRANSPOSE(BX, BY, BZ, BW, TV) \`
			`__asm movaps TV,BZ \`
			`__asm unpcklps BZ,BW \`
			`__asm unpckhps TV,BW \`
			`__asm movaps BW,BX \`
			`__asm unpcklps BX,BY \`
			`__asm unpckhps BW,BY \`
			`__asm movaps BY,BX \`
			`__asm shufps BX,BZ,SHUFFLE(1, 0, 1, 0) \`
			`__asm shufps BY,BZ,SHUFFLE(3, 2, 3, 2) \`
			`__asm movaps BZ,BW \`
			`__asm shufps BZ,TV,SHUFFLE(1, 0, 1, 0) \`
			`__asm shufps BW,TV,SHUFFLE(3, 2, 3, 2)`


			`void VectorProcessorClass::Prefetch(void* address)`
			`{`
			`#if defined (__ICL) // Detect Intel compiler`
			`if (CPUDetectClass::_Has_SSE_Instruction_Set()) {`
			`__asm {`
			`// mov edx,address`
			`// mov eax,[edx]`
			`// prefetchT1 address`
			`}`
			`}`
			`#endif`
			`}`

			`static Vector4 lastrow(0.0f,0.0f,0.0f,1.0f);`
			`void VectorProcessorClass::Transform (Vector3* dst,const Vector3 *src, const Matrix3D& mtx, const int count)`
			`{`
			`if (count<=0) return;`

			`#if defined (__ICL) // Detect Intel compiler`
			`if (CPUDetectClass::_Has_SSE_Instruction_Set()) {`

			`__asm {`
			`mov edx,dst`
			`mov eax,src`
			`mov ebx,mtx`
			`mov edi,count`

			`movups xmm4,[ebx+0]`
			`movups xmm5,[ebx+16]`
			`movups xmm6,[ebx+32]`
			`movups xmm7,lastrow //[ebx+48]`

			`TRANSPOSE(xmm4, xmm5, xmm6, xmm7, xmm0);`

			`shufps xmm4,xmm4,SHUFFLE(2,1,0,0)`
			`shufps xmm5,xmm5,SHUFFLE(2,1,0,0)`
			`shufps xmm6,xmm6,SHUFFLE(2,1,0,0)`
			`shufps xmm7,xmm7,SHUFFLE(2,1,0,0)`

			`mov esi,edx`
			`_lp:`
			`test edi,edi`
			`jz _ulos`
			`test esi,0xf`
			`jz _aligned`
			`movss xmm0,[eax]`
			`movss xmm1,[eax+4]`
			`movss xmm2,[eax+8]`
			`BROADCAST(xmm0,0)`
			`BROADCAST(xmm1,0)`
			`BROADCAST(xmm2,0)`
			`mulps xmm0,xmm4`
			`mulps xmm1,xmm5`
			`mulps xmm2,xmm6`
			`addps xmm0,xmm1`
			`addps xmm0,xmm2`
			`addps xmm0,xmm7`
			`movss [edx],xmm0`
			`movhps [edx+4],xmm0`
			`add eax,12`
			`add edx,12`
			`add esi,12`
			`dec edi`
			`jmp _lp`
			`_aligned:`

			`mov esi,1`

			`mov ecx,edi`
			`and edi,3`
			`and ecx,~3`
			`jz _lp`

			`lea ecx,[ecx+ecx*2]`
			`shl ecx,2`
			`add eax,ecx`
			`add edx,ecx`
			`neg ecx`

			`cmp dword ptr [ebx+12],0`
			`jne _xlatelp`
			`cmp dword ptr [ebx+28],0`
			`jne _xlatelp`
			`cmp dword ptr [ebx+44],0`
			`jne _xlatelp`
			`jmp _noxlatelp`

			`align 16`

			`_noxlatelp:`
			`prefetchnta [eax+ecx+48]`
			`prefetchnta [eax+ecx+48+32]`

			`movss xmm0,[eax+ecx]`
			`BROADCAST(xmm0,0)`
			`movss xmm1,[eax+ecx+4]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+8]`
			`BROADCAST(xmm2,0)`
			`mulps xmm0,xmm4`
			`mulps xmm1,xmm5`
			`mulps xmm2,xmm6`
			`addps xmm0,xmm1`
			`addps xmm0,xmm2`

			`movss xmm1,[eax+ecx+12]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+16]`
			`BROADCAST(xmm2,0)`
			`movss xmm3,[eax+ecx+20]`
			`BROADCAST(xmm3,0)`
			`mulps xmm1,xmm4`
			`mulps xmm2,xmm5`
			`mulps xmm3,xmm6`
			`addps xmm1,xmm2`
			`addps xmm3,xmm1`

			`movss xmm0,xmm3`
			`shufps xmm0,xmm0,SHUFFLE(0,3,2,1)`

			`movaps [edx+ecx],xmm0`

			`prefetcht0 [edx+ecx+48]`
			`prefetcht0 [edx+ecx+48+32]`

			`movss xmm0,[eax+ecx+24]`
			`BROADCAST(xmm0,0)`
			`movss xmm1,[eax+ecx+24+4]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+24+8]`
			`BROADCAST(xmm2,0)`
			`mulps xmm0,xmm4`
			`mulps xmm1,xmm5`
			`mulps xmm2,xmm6`
			`addps xmm0,xmm1`
			`addps xmm0,xmm2`

			`shufps xmm3,xmm0,SHUFFLE(2,1,3,2)`
			`movaps [edx+ecx+16],xmm3`

			`movss xmm1,[eax+ecx+24+12]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+24+16]`
			`BROADCAST(xmm2,0)`
			`movss xmm3,[eax+ecx+24+20]`
			`BROADCAST(xmm3,0)`
			`mulps xmm1,xmm4`
			`mulps xmm2,xmm5`
			`mulps xmm3,xmm6`
			`addps xmm1,xmm2`
			`addps xmm1,xmm3`

			`shufps xmm0,xmm0,SHUFFLE(2,1,0,3)`
			`movss xmm1,xmm0`
			`movaps [edx+ecx+32],xmm1`

			`add ecx,48`
			`js _noxlatelp`

			`jmp _lp`

			`align 16`

			`_xlatelp:`
			`prefetchnta [eax+ecx+48]`
			`prefetchnta [eax+ecx+48+32]`

			`movss xmm0,[eax+ecx]`
			`BROADCAST(xmm0,0)`
			`movss xmm1,[eax+ecx+4]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+8]`
			`BROADCAST(xmm2,0)`
			`mulps xmm0,xmm4`
			`mulps xmm1,xmm5`
			`mulps xmm2,xmm6`
			`addps xmm0,xmm1`
			`addps xmm0,xmm2`
			`addps xmm0,xmm7`

			`movss xmm1,[eax+ecx+12]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+16]`
			`BROADCAST(xmm2,0)`
			`movss xmm3,[eax+ecx+20]`
			`BROADCAST(xmm3,0)`
			`mulps xmm1,xmm4`
			`mulps xmm2,xmm5`
			`mulps xmm3,xmm6`
			`addps xmm1,xmm2`
			`addps xmm3,xmm1`
			`addps xmm3,xmm7`

			`movss xmm0,xmm3`
			`shufps xmm0,xmm0,SHUFFLE(0,3,2,1)`
			`movaps [edx+ecx],xmm0`

			`prefetcht0 [edx+ecx+48]`
			`prefetcht0 [edx+ecx+48+32]`

			`movss xmm0,[eax+ecx+24]`
			`BROADCAST(xmm0,0)`
			`movss xmm1,[eax+ecx+24+4]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+24+8]`
			`BROADCAST(xmm2,0)`
			`mulps xmm0,xmm4`
			`mulps xmm1,xmm5`
			`mulps xmm2,xmm6`
			`addps xmm0,xmm1`
			`addps xmm0,xmm2`
			`addps xmm0,xmm7`

			`shufps xmm3,xmm0,SHUFFLE(2,1,3,2)`
			`movaps [edx+ecx+16],xmm3`

			`movss xmm1,[eax+ecx+24+12]`
			`BROADCAST(xmm1,0)`
			`movss xmm2,[eax+ecx+24+16]`
			`BROADCAST(xmm2,0)`
			`movss xmm3,[eax+ecx+24+20]`
			`BROADCAST(xmm3,0)`
			`mulps xmm1,xmm4`
			`mulps xmm2,xmm5`
			`mulps xmm3,xmm6`
			`addps xmm1,xmm2`
			`addps xmm1,xmm3`
			`addps xmm1,xmm7`

			`shufps xmm0,xmm0,SHUFFLE(2,1,0,3)`
			`movss xmm1,xmm0`

			`movaps [edx+ecx+32],xmm1`

			`add ecx,48`
			`js _xlatelp`

			`jmp _lp`
			`_ulos:`
			`}`

			`}`
			`else`
			`#endif`
			`{`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=mtx*src[i];`
			`}`
			`}`
			`}`

			`void VectorProcessorClass::Transform(Vector4* dst,const Vector3 *src, const Matrix4& matrix, const int count)`
			`{`
			`if (count<=0) return;`

			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=matrix*src[i];`
			`}`
			`}`

			`void VectorProcessorClass::Copy(Vector2 dst, const Vector2 src, int count)`
			`{`
			`if (count<=0) return;`
			`memcpy(dst,src,sizeof(Vector2)*count);`
			`}`

			`void VectorProcessorClass::Copy(unsigned dst, const unsigned src, int count)`
			`{`
			`if (count<=0) return;`
			`memcpy(dst,src,sizeof(unsigned)*count);`
			`}`

			`void VectorProcessorClass::Copy(Vector3 dst, const Vector3 src, int count)`
			`{`
			`if (count<=0) return;`
			`memcpy(dst,src,sizeof(Vector3)*count);`
			`}`

			`void VectorProcessorClass::Copy(Vector4 dst, const Vector4 src, int count)`
			`{`
			`if (count<=0) return;`
			`memcpy(dst,src,sizeof(Vector4)*count);`
			`}`

			`void VectorProcessorClass::Copy(Vector4 dst,const Vector3 src, const float * srca, const int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i].X=src[i].X;`
			`dst[i].Y=src[i].Y;`
			`dst[i].Z=src[i].Z;`
			`dst[i].W=srca[i];`
			`}`
			`}`

			`void VectorProcessorClass::Copy(Vector4 dst,const Vector3 src, const float srca, const int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i].X=src[i].X;`
			`dst[i].Y=src[i].Y;`
			`dst[i].Z=src[i].Z;`
			`dst[i].W=srca;`
			`}`
			`}`

			`void VectorProcessorClass::Copy(Vector4 dst,const Vector3 &src, const float srca, const int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i].X=src.X;`
			`dst[i].Y=src.Y;`
			`dst[i].Z=src.Z;`
			`dst[i].W=srca[i];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed (unsigned dst,const unsigned src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed (Vector2 dst,const Vector2 src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed (Vector3 dst,const Vector3 src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed (Vector4 dst,const Vector4 src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed(unsigned char* dst, const unsigned char* src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::CopyIndexed(float* dst, float* src, const unsigned int *index, int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i]=src[index[i]];`
			`}`
			`}`

			`void VectorProcessorClass::Clamp(Vector4 dst,const Vector4 src, const float min, const float max, const int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`{`
			`dst[i].X=(src[i].X<min)?min:src[i].X;`
			`dst[i].X=(src[i].X>max)?max:src[i].X;`

			`dst[i].Y=(src[i].Y<min)?min:src[i].Y;`
			`dst[i].Y=(src[i].Y>max)?max:src[i].Y;`

			`dst[i].Z=(src[i].Z<min)?min:src[i].Z;`
			`dst[i].Z=(src[i].Z>max)?max:src[i].Z;`

			`dst[i].W=(src[i].W<min)?min:src[i].W;`
			`dst[i].W=(src[i].W>max)?max:src[i].W;`
			`}`
			`}`

			`void VectorProcessorClass::Clear(Vector3*dst, const int count)`
			`{`
			`if (count<=0) return;`
			`memset(dst,0,sizeof(Vector3)*count);`
			`}`


			`void VectorProcessorClass::Normalize(Vector3 *dst, const int count)`
			`{`
			`if (count<=0) return;`
			`int i;`

			`for (i=0; i<count; i++)`
			`dst[i].Normalize();`
			`}`

			`void VectorProcessorClass::MinMax(Vector3 *src, Vector3 &min, Vector3 &max, const int count)`
			`{`
			`if (count<=0) return;`
			`min=*src;`
			`max=*src;`

			`int i;`

			`for (i=1; i<count; i++)`
			`{`
			`min.X=MIN(min.X,src[i].X);`
			`min.Y=MIN(min.Y,src[i].Y);`
			`min.Z=MIN(min.Z,src[i].Z);`

			`max.X=MAX(max.X,src[i].X);`
			`max.Y=MAX(max.Y,src[i].Y);`
			`max.Z=MAX(max.Z,src[i].Z);`
			`}`
			`}`

			`void VectorProcessorClass::MulAdd(float * dest,float multiplier,float add,int count)`
			`{`
			`for (int i=0; i<count; i++) {`
			`dest[i] = dest[i] * multiplier + add;`
			`}`
			`}`

			`void VectorProcessorClass::DotProduct(float dst, const Vector3 &a, const Vector3 b,const int count)`
			`{`
			`for (int i=0; i<count; i++)`
			`dst[i]=Vector3::Dot_Product(a,b[i]);`
			`}`

			`void VectorProcessorClass::ClampMin(float dst, float src, const float min, const int count)`
			`{`
			`for (int i=0; i<count; i++)`
			`dst[i]=(src[i]>min?src[i]:min);`
			`}`

			`void VectorProcessorClass::Power(float dst, float src, const float pow, const int count)`
			`{`
			`for (int i=0; i<count; i++)`
			`dst[i]=powf(src[i],pow);`
			`}`