Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/OgreMain/src/OgreOptimisedUtilSSE.cpp @ 1

Last change on this file since 1 was 1, checked in by landauf, 18 years ago

File size: 94.1 KB

Line
1	/*
2	-----------------------------------------------------------------------------
3	This source file is part of OGRE
4	(Object-oriented Graphics Rendering Engine)
5	For the latest info, see http://www.ogre3d.org/
6
7	Copyright (c) 2000-2006 Torus Knot Software Ltd
8	Also see acknowledgements in Readme.html
9
10	This program is free software; you can redistribute it and/or modify it under
11	the terms of the GNU Lesser General Public License as published by the Free Software
12	Foundation; either version 2 of the License, or (at your option) any later
13	version.
14
15	This program is distributed in the hope that it will be useful, but WITHOUT
16	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
18
19	You should have received a copy of the GNU Lesser General Public License along with
20	this program; if not, write to the Free Software Foundation, Inc., 59 Temple
21	Place - Suite 330, Boston, MA 02111-1307, USA, or go to
22	http://www.gnu.org/copyleft/lesser.txt.
23
24	You may alternatively use this source under the terms of a specific version of
25	the OGRE Unrestricted License provided you have obtained such a license from
26	Torus Knot Software Ltd.
27	-----------------------------------------------------------------------------
28	*/
29	#include "OgreStableHeaders.h"
30
31	#include "OgreOptimisedUtil.h"
32	#include "OgrePlatformInformation.h"
33
34	#if __OGRE_HAVE_SSE
35
36	#include "OgreMatrix4.h"
37
38	// Keep this include last, to cope with "xmmintrin.h" possibly being included
39	// by other header files on some platforms.
40	#include "OgreSIMDHelper.h"
41
42	// I'd like to merge this file with OgreOptimisedUtil.cpp, but that is
43	// impossible when compiling with gcc, because SSE instructions can only be
44	// enabled/disabled at file level.
45
46	//-------------------------------------------------------------------------
47	//
48	// The routines implemented in this file are performance oriented,
49	// which means saving every penny as possible. This requirement might
50	// break some C++/STL-rules.
51	//
52	//
53	// Some rules I'd like to respects:
54	//
55	// 1. Had better use unpacklo/hi, movelh/hl instead of shuffle because
56	// it can saving one byte of binary code :)
57	// 2. Use add/sub instead of mul.
58	// 3. Eliminate prolog code of function call.
59	//
60	// The last, anything recommended by Intel Optimization Reference Manual.
61	//
62	//-------------------------------------------------------------------------
63
64	// Use unrolled SSE version when vertices exceeds this limit
65	#define OGRE_SSE_SKINNING_UNROLL_VERTICES 16
66
67	namespace Ogre {
68
69	//-------------------------------------------------------------------------
70	// Local classes
71	//-------------------------------------------------------------------------
72
73	/** SSE implementation of OptimisedUtil.
74	@note
75	Don't use this class directly, use OptimisedUtil instead.
76	*/
77	class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
78	{
79	protected:
80	/// Does we prefer use general SSE version for position/normals shared buffers?
81	bool mPreferGeneralVersionForSharedBuffers;
82
83	public:
84	/// Constructor
85	OptimisedUtilSSE(void);
86
87	/// @copydoc OptimisedUtil::softwareVertexSkinning
88	virtual void softwareVertexSkinning(
89	const float srcPosPtr, float destPosPtr,
90	const float srcNormPtr, float destNormPtr,
91	const float blendWeightPtr, const unsigned char blendIndexPtr,
92	const Matrix4* const* blendMatrices,
93	size_t srcPosStride, size_t destPosStride,
94	size_t srcNormStride, size_t destNormStride,
95	size_t blendWeightStride, size_t blendIndexStride,
96	size_t numWeightsPerVertex,
97	size_t numVertices);
98
99	/// @copydoc OptimisedUtil::softwareVertexMorph
100	virtual void softwareVertexMorph(
101	Real t,
102	const float srcPos1, const float srcPos2,
103	float *dstPos,
104	size_t numVertices);
105
106	/// @copydoc OptimisedUtil::concatenateAffineMatrices
107	virtual void concatenateAffineMatrices(
108	const Matrix4& baseMatrix,
109	const Matrix4* srcMatrices,
110	Matrix4* dstMatrices,
111	size_t numMatrices);
112
113	/// @copydoc OptimisedUtil::calculateFaceNormals
114	virtual void calculateFaceNormals(
115	const float *positions,
116	const EdgeData::Triangle *triangles,
117	Vector4 *faceNormals,
118	size_t numTriangles);
119
120	/// @copydoc OptimisedUtil::calculateLightFacing
121	virtual void calculateLightFacing(
122	const Vector4& lightPos,
123	const Vector4* faceNormals,
124	char* lightFacings,
125	size_t numFaces);
126
127	/// @copydoc OptimisedUtil::extrudeVertices
128	virtual void extrudeVertices(
129	const Vector4& lightPos,
130	Real extrudeDist,
131	const float* srcPositions,
132	float* destPositions,
133	size_t numVertices);
134	};
135
#if defined(__OGRE_SIMD_ALIGN_STACK)
/** Stack-align implementation of OptimisedUtil.
@remarks
    User code compiled by icc and gcc might not align the stack
    properly, so we need to ensure the stack is aligned to a 16-byte
    boundary when executing SSE functions.
@par
    Every method aligns the stack and then forwards to the wrapped
    implementation through a virtual call. The forwarding must stay a
    real call instruction rather than an inlined body of the underlying
    function (inlining might cause problems).
@note
    Don't use this class directly, use OptimisedUtil instead.
*/
class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
{
protected:
    /// The actual implementation every call is forwarded to
    OptimisedUtil* mImpl;

public:
    /// Constructor; stores the implementation pointer it is given.
    OptimisedUtilWithStackAlign(OptimisedUtil* impl)
        : mImpl(impl)
    {
    }

    /// @copydoc OptimisedUtil::softwareVertexSkinning
    virtual void softwareVertexSkinning(
        const float *srcPosPtr, float *destPosPtr,
        const float *srcNormPtr, float *destNormPtr,
        const float *blendWeightPtr, const unsigned char *blendIndexPtr,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->softwareVertexSkinning(
            srcPosPtr, destPosPtr,
            srcNormPtr, destNormPtr,
            blendWeightPtr, blendIndexPtr,
            blendMatrices,
            srcPosStride, destPosStride,
            srcNormStride, destNormStride,
            blendWeightStride, blendIndexStride,
            numWeightsPerVertex,
            numVertices);
    }

    /// @copydoc OptimisedUtil::softwareVertexMorph
    virtual void softwareVertexMorph(
        Real t,
        const float *srcPos1, const float *srcPos2,
        float *dstPos,
        size_t numVertices)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->softwareVertexMorph(
            t,
            srcPos1, srcPos2,
            dstPos,
            numVertices);
    }

    /// @copydoc OptimisedUtil::concatenateAffineMatrices
    virtual void concatenateAffineMatrices(
        const Matrix4& baseMatrix,
        const Matrix4* srcMatrices,
        Matrix4* dstMatrices,
        size_t numMatrices)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->concatenateAffineMatrices(
            baseMatrix,
            srcMatrices,
            dstMatrices,
            numMatrices);
    }

    /// @copydoc OptimisedUtil::calculateFaceNormals
    virtual void calculateFaceNormals(
        const float *positions,
        const EdgeData::Triangle *triangles,
        Vector4 *faceNormals,
        size_t numTriangles)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->calculateFaceNormals(
            positions,
            triangles,
            faceNormals,
            numTriangles);
    }

    /// @copydoc OptimisedUtil::calculateLightFacing
    virtual void calculateLightFacing(
        const Vector4& lightPos,
        const Vector4* faceNormals,
        char* lightFacings,
        size_t numFaces)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->calculateLightFacing(
            lightPos,
            faceNormals,
            lightFacings,
            numFaces);
    }

    /// @copydoc OptimisedUtil::extrudeVertices
    virtual void extrudeVertices(
        const Vector4& lightPos,
        Real extrudeDist,
        const float* srcPositions,
        float* destPositions,
        size_t numVertices)
    {
        __OGRE_SIMD_ALIGN_STACK();

        mImpl->extrudeVertices(
            lightPos,
            extrudeDist,
            srcPositions,
            destPositions,
            numVertices);
    }
};
#endif  // defined(__OGRE_SIMD_ALIGN_STACK)
271
272	//---------------------------------------------------------------------
273	// Some useful macro for collapse matrices.
274	//---------------------------------------------------------------------
275
/* Load the first three rows of *pMatrix into row0..row2 via __MM_LOAD_PS.
   The pMatrix argument is parenthesised so that any pointer-valued
   expression can be passed safely. */
#define __LOAD_MATRIX(row0, row1, row2, pMatrix) \
    { \
        row0 = __MM_LOAD_PS((*(pMatrix))[0]); \
        row1 = __MM_LOAD_PS((*(pMatrix))[1]); \
        row2 = __MM_LOAD_PS((*(pMatrix))[2]); \
    }
282
/* Replace each of row0..row2 with __MM_LERP_PS(weight, rowN, row N of *pMatrix).
   The pMatrix argument is parenthesised so that any pointer-valued
   expression can be passed safely. */
#define __LERP_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*(pMatrix))[0])); \
        row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*(pMatrix))[1])); \
        row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*(pMatrix))[2])); \
    }
289
/* Set row0..row2 to the first three rows of *pMatrix, each multiplied by
   weight. The pMatrix argument is parenthesised so that any pointer-valued
   expression can be passed safely. */
#define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = _mm_mul_ps(__MM_LOAD_PS((*(pMatrix))[0]), weight); \
        row1 = _mm_mul_ps(__MM_LOAD_PS((*(pMatrix))[1]), weight); \
        row2 = _mm_mul_ps(__MM_LOAD_PS((*(pMatrix))[2]), weight); \
    }
296
/* Accumulate into row0..row2 through __MM_MADD_PS(row N of *pMatrix, weight, rowN).
   The pMatrix argument is parenthesised so that any pointer-valued
   expression can be passed safely. */
#define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = __MM_MADD_PS(__MM_LOAD_PS((*(pMatrix))[0]), weight, row0); \
        row1 = __MM_MADD_PS(__MM_LOAD_PS((*(pMatrix))[1]), weight, row1); \
        row2 = __MM_MADD_PS(__MM_LOAD_PS((*(pMatrix))[2]), weight, row2); \
    }
303
304	//---------------------------------------------------------------------
305	// The following macros request variables declared by caller.
306	//
307	// :) Thank row-major matrix used in Ogre, it make we accessing affine matrix easy.
308	//---------------------------------------------------------------------
309
/** Collapse one-weighted matrix.
    The multiply by weight is eliminated since the weight should always be
    equal to one; pWeights is therefore accepted but not read.
@note
    Requires the caller to have declared 'pMatrix0'. The matrix array is
    taken from the ppMatrices argument rather than from whatever
    'blendMatrices' happens to name at the expansion site.
*/
#define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        pMatrix0 = (ppMatrices)[(pIndices)[0]]; \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
    }
318
/** Collapse two-weighted matrix.
    Based on the fact that the accumulated weights are equal to one, a lerp
    is used: two multiplies and one add are replaced with one multiply and
    two adds. Only the second weight (pWeights[1]) is read.
@note
    Requires the caller to have declared 'weight', 'pMatrix0' and 'pMatrix1'.
    Pointer arguments are parenthesised so expressions can be passed safely.
*/
#define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        weight = _mm_load_ps1((pWeights) + 1); \
        pMatrix0 = (ppMatrices)[(pIndices)[0]]; \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
        pMatrix1 = (ppMatrices)[(pIndices)[1]]; \
        __LERP_MATRIX(row0, row1, row2, weight, pMatrix1); \
    }
331
/** Collapse three-weighted matrix.
    Each weight is broadcast with _mm_load_ps1; the first matrix is loaded
    pre-multiplied and the other two are accumulated on top of it.
@note
    Requires the caller to have declared 'weight' and 'pMatrix0'..'pMatrix2'.
    Pointer arguments are parenthesised so expressions can be passed safely.
*/
#define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        weight = _mm_load_ps1((pWeights) + 0); \
        pMatrix0 = (ppMatrices)[(pIndices)[0]]; \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
        weight = _mm_load_ps1((pWeights) + 1); \
        pMatrix1 = (ppMatrices)[(pIndices)[1]]; \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
        weight = _mm_load_ps1((pWeights) + 2); \
        pMatrix2 = (ppMatrices)[(pIndices)[2]]; \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
    }
346
/** Collapse four-weighted matrix.
    All four weights are fetched with one unaligned load and then selected
    one at a time with __MM_SELECT.
@note
    Requires the caller to have declared 'weight', 'weights' and
    'pMatrix0'..'pMatrix3'. Pointer arguments are parenthesised so
    expressions can be passed safely.
*/
#define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        /* Load four blend weights at one time, they will be shuffled later */ \
        weights = _mm_loadu_ps(pWeights); \
        \
        pMatrix0 = (ppMatrices)[(pIndices)[0]]; \
        weight = __MM_SELECT(weights, 0); \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
        pMatrix1 = (ppMatrices)[(pIndices)[1]]; \
        weight = __MM_SELECT(weights, 1); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
        pMatrix2 = (ppMatrices)[(pIndices)[2]]; \
        weight = __MM_SELECT(weights, 2); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
        pMatrix3 = (ppMatrices)[(pIndices)[3]]; \
        weight = __MM_SELECT(weights, 3); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3); \
    }
367
368
369
//---------------------------------------------------------------------
// Collapse one matrix at a time. The collapsed matrix is weighted by the
// blend weights, and can then be used to transform the corresponding
// vertex directly. The result is written to m00, m01, m02 (three rows).
//
// numWeightsPerVertex selects the 1-, 2-, 3- or 4-weight collapse; any
// other value falls through to the one-weight case.
//
// I'd like to use an inline function instead of a macro here, but I also
// want to ensure the compiler integrates this code into its callers
// (release build at least), regardless of specific compile options. An
// inline function works fine for VC, but it looks like gcc (3.4.4 here)
// generates a function call when this is implemented as an inline
// function, even when compiled with "-O3".
//
#define _collapseOneMatrix( \
        m00, m01, m02, \
        pBlendWeight, pBlendIndex, \
        blendMatrices, \
        blendWeightStride, blendIndexStride, \
        numWeightsPerVertex) \
    { \
        /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!! */ \
        /* Every scratch variable below must be a pointer, hence one '*' each. */ \
        const Matrix4 *pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
        __m128 weight, weights; \
        \
        switch (numWeightsPerVertex) \
        { \
        default:    /* Just in case and make compiler happy */ \
        case 1: \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 2: \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 3: \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 4: \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        } \
    }
420
//---------------------------------------------------------------------
// Collapse four matrices at a time, one per consecutive vertex. Each
// collapsed matrix is weighted by the blend weights, and can then be used
// to transform the corresponding vertex directly. Vertex k (k = 0..3)
// reads its indices/weights at k * stride from pBlendIndex/pBlendWeight
// and writes its three rows to mk0, mk1, mk2.
//
// numWeightsPerVertex selects the 1-, 2-, 3- or 4-weight collapse; any
// other value falls through to the one-weight case.
//
// I'd like to use an inline function instead of a macro here, but I also
// want to ensure the compiler integrates this code into its callers
// (release build at least), regardless of specific compile options. An
// inline function works fine for VC, but it looks like gcc (3.4.4 here)
// generates a function call when this is implemented as an inline
// function, even when compiled with "-O3".
//
#define _collapseFourMatrices( \
        m00, m01, m02, \
        m10, m11, m12, \
        m20, m21, m22, \
        m30, m31, m32, \
        pBlendWeight, pBlendIndex, \
        blendMatrices, \
        blendWeightStride, blendIndexStride, \
        numWeightsPerVertex) \
    { \
        /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!! */ \
        /* Every scratch variable below must be a pointer, hence one '*' each. */ \
        const Matrix4 *pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
        __m128 weight, weights; \
        \
        switch (numWeightsPerVertex) \
        { \
        default:    /* Just in case and make compiler happy */ \
        case 1: \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 2: \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 3: \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 4: \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        } \
    }
510
511
512
513	//---------------------------------------------------------------------
514	// General SSE version skinning positions, and optional skinning normals.
515	static void softwareVertexSkinning_SSE_General(
516	const float pSrcPos, float pDestPos,
517	const float pSrcNorm, float pDestNorm,
518	const float pBlendWeight, const unsigned char pBlendIndex,
519	const Matrix4* const* blendMatrices,
520	size_t srcPosStride, size_t destPosStride,
521	size_t srcNormStride, size_t destNormStride,
522	size_t blendWeightStride, size_t blendIndexStride,
523	size_t numWeightsPerVertex,
524	size_t numVertices)
525	{
526	for (size_t i = 0; i < numVertices; ++i)
527	{
528	// Collapse matrices
529	__m128 m00, m01, m02;
530	_collapseOneMatrix(
531	m00, m01, m02,
532	pBlendWeight, pBlendIndex,
533	blendMatrices,
534	blendWeightStride, blendIndexStride,
535	numWeightsPerVertex);
536
537	// Advance blend weight and index pointers
538	advanceRawPointer(pBlendWeight, blendWeightStride);
539	advanceRawPointer(pBlendIndex, blendIndexStride);
540
541	//------------------------------------------------------------------
542
543	// Rearrange to column-major matrix with rows shuffled order to: Z 0 X Y
544	__m128 m03 = _mm_setzero_ps();
545	__MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);
546
547	//------------------------------------------------------------------
548	// Transform position
549	//------------------------------------------------------------------
550
551	__m128 s0, s1, s2;
552
553	// Load source position
554	s0 = _mm_load_ps1(pSrcPos + 0);
555	s1 = _mm_load_ps1(pSrcPos + 1);
556	s2 = _mm_load_ps1(pSrcPos + 2);
557
558	// Transform by collapsed matrix
559	__m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2); // z 0 x y
560
561	// Store blended position, no aligned requirement
562	_mm_storeh_pi((__m64*)pDestPos, accumPos);
563	_mm_store_ss(pDestPos+2, accumPos);
564
565	// Advance source and target position pointers
566	advanceRawPointer(pSrcPos, srcPosStride);
567	advanceRawPointer(pDestPos, destPosStride);
568
569	//------------------------------------------------------------------
570	// Optional blend normal
571	//------------------------------------------------------------------
572
573	if (pSrcNorm)
574	{
575	// Load source normal
576	s0 = _mm_load_ps1(pSrcNorm + 0);
577	s1 = _mm_load_ps1(pSrcNorm + 1);
578	s2 = _mm_load_ps1(pSrcNorm + 2);
579
580	// Transform by collapsed matrix
581	__m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2); // z 0 x y
582
583	// Normalise normal
584	__m128 tmp = _mm_mul_ps(accumNorm, accumNorm); // z^2 0 x^2 y^2
585	tmp = __MM_ACCUM3_PS(tmp,
586	_mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)), // x^2 0 y^2 z^2
587	_mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3))); // y^2 0 z^2 x^2
588	// Note: zero divided here, but neglectable
589	tmp = __MM_RSQRT_PS(tmp);
590	accumNorm = _mm_mul_ps(accumNorm, tmp);
591
592	// Store blended normal, no aligned requirement
593	_mm_storeh_pi((__m64*)pDestNorm, accumNorm);
594	_mm_store_ss(pDestNorm+2, accumNorm);
595
596	// Advance source and target normal pointers
597	advanceRawPointer(pSrcNorm, srcNormStride);
598	advanceRawPointer(pDestNorm, destNormStride);
599	}
600	}
601	}
602	//---------------------------------------------------------------------
603	// Special SSE version skinning shared buffers of position and normal,
604	// and the buffer are packed.
605	template <bool srcAligned, bool destAligned>
606	struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
607	{
608	static void apply(
609	const float* pSrc, float* pDest,
610	const float* pBlendWeight, const unsigned char* pBlendIndex,
611	const Matrix4* const* blendMatrices,
612	size_t blendWeightStride, size_t blendIndexStride,
613	size_t numWeightsPerVertex,
614	size_t numIterations)
615	{
616	typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
617	typedef SSEMemoryAccessor<destAligned> DestAccessor;
618
619	// Blending 4 vertices per-iteration
620	for (size_t i = 0; i < numIterations; ++i)
621	{
622	// Collapse matrices
623	__m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
624	_collapseFourMatrices(
625	m00, m01, m02,
626	m10, m11, m12,
627	m20, m21, m22,
628	m30, m31, m32,
629	pBlendWeight, pBlendIndex,
630	blendMatrices,
631	blendWeightStride, blendIndexStride,
632	numWeightsPerVertex);
633
634	// Advance 4 vertices
635	advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
636	advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
637
638	//------------------------------------------------------------------
639	// Transform position/normals
640	//------------------------------------------------------------------
641
642	__m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
643	__m128 t0, t1, t2, t3, t4, t5;
644
645	// Load source position/normals
646	s0 = SrcAccessor::load(pSrc + 0); // px0 py0 pz0 nx0
647	s1 = SrcAccessor::load(pSrc + 4); // ny0 nz0 px1 py1
648	s2 = SrcAccessor::load(pSrc + 8); // pz1 nx1 ny1 nz1
649	s3 = SrcAccessor::load(pSrc + 12); // px2 py2 pz2 nx2
650	s4 = SrcAccessor::load(pSrc + 16); // ny2 nz2 px3 py3
651	s5 = SrcAccessor::load(pSrc + 20); // pz3 nx3 ny3 nz3
652
653	// Rearrange to component-major for batches calculate.
654
655	t0 = _mm_unpacklo_ps(s0, s3); // px0 px2 py0 py2
656	t1 = _mm_unpackhi_ps(s0, s3); // pz0 pz2 nx0 nx2
657	t2 = _mm_unpacklo_ps(s1, s4); // ny0 ny2 nz0 nz2
658	t3 = _mm_unpackhi_ps(s1, s4); // px1 px3 py1 py3
659	t4 = _mm_unpacklo_ps(s2, s5); // pz1 pz3 nx1 nx3
660	t5 = _mm_unpackhi_ps(s2, s5); // ny1 ny3 nz1 nz3
661
662	s0 = _mm_unpacklo_ps(t0, t3); // px0 px1 px2 px3
663	s1 = _mm_unpackhi_ps(t0, t3); // py0 py1 py2 py3
664	s2 = _mm_unpacklo_ps(t1, t4); // pz0 pz1 pz2 pz3
665	s3 = _mm_unpackhi_ps(t1, t4); // nx0 nx1 nx2 nx3
666	s4 = _mm_unpacklo_ps(t2, t5); // ny0 ny1 ny2 ny3
667	s5 = _mm_unpackhi_ps(t2, t5); // nz0 nz1 nz2 nz3
668
669	// Transform by collapsed matrix
670
671	// Shuffle row 0 of four collapsed matrices for calculate X component
672	__MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
673
674	// Transform X components
675	d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // PX0 PX1 PX2 PX3
676	d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5); // NX0 NX1 NX2 NX3
677
678	// Shuffle row 1 of four collapsed matrices for calculate Y component
679	__MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
680
681	// Transform Y components
682	d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // PY0 PY1 PY2 PY3
683	d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5); // NY0 NY1 NY2 NY3
684
685	// Shuffle row 2 of four collapsed matrices for calculate Z component
686	__MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
687
688	// Transform Z components
689	d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // PZ0 PZ1 PZ2 PZ3
690	d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5); // NZ0 NZ1 NZ2 NZ3
691
692	// Normalise normals
693	__m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
694	tmp = __MM_RSQRT_PS(tmp);
695	d3 = _mm_mul_ps(d3, tmp);
696	d4 = _mm_mul_ps(d4, tmp);
697	d5 = _mm_mul_ps(d5, tmp);
698
699	// Arrange back to continuous format for store results
700
701	t0 = _mm_unpacklo_ps(d0, d1); // PX0 PY0 PX1 PY1
702	t1 = _mm_unpackhi_ps(d0, d1); // PX2 PY2 PX3 PY3
703	t2 = _mm_unpacklo_ps(d2, d3); // PZ0 NX0 PZ1 NX1
704	t3 = _mm_unpackhi_ps(d2, d3); // PZ2 NX2 PZ3 NX3
705	t4 = _mm_unpacklo_ps(d4, d5); // NY0 NZ0 NY1 NZ1
706	t5 = _mm_unpackhi_ps(d4, d5); // NY2 NZ2 NY3 NZ3
707
708	d0 = _mm_movelh_ps(t0, t2); // PX0 PY0 PZ0 NX0
709	d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0)); // NY0 NZ0 PX1 PY1
710	d2 = _mm_movehl_ps(t4, t2); // PZ1 NX1 NY1 NZ1
711	d3 = _mm_movelh_ps(t1, t3); // PX2 PY2 PZ2 NX2
712	d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0)); // NY2 NZ2 PX3 PY3
713	d5 = _mm_movehl_ps(t5, t3); // PZ3 NX3 NY3 NZ3
714
715	// Store blended position/normals
716	DestAccessor::store(pDest + 0, d0);
717	DestAccessor::store(pDest + 4, d1);
718	DestAccessor::store(pDest + 8, d2);
719	DestAccessor::store(pDest + 12, d3);
720	DestAccessor::store(pDest + 16, d4);
721	DestAccessor::store(pDest + 20, d5);
722
723	// Advance 4 vertices
724	pSrc += 4 * (3 + 3);
725	pDest += 4 * (3 + 3);
726	}
727	}
728	};
729	static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
730	const float* pSrcPos, float* pDestPos,
731	const float* pBlendWeight, const unsigned char* pBlendIndex,
732	const Matrix4* const* blendMatrices,
733	size_t blendWeightStride, size_t blendIndexStride,
734	size_t numWeightsPerVertex,
735	size_t numIterations)
736	{
737	// pSrcPos might can't align to 16 bytes because 8 bytes alignment shift per-vertex
738
739	// Instantiating two version only, since other alignement combination not that important.
740	if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
741	{
742	SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
743	pSrcPos, pDestPos,
744	pBlendWeight, pBlendIndex,
745	blendMatrices,
746	blendWeightStride, blendIndexStride,
747	numWeightsPerVertex,
748	numIterations);
749	}
750	else
751	{
752	SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
753	pSrcPos, pDestPos,
754	pBlendWeight, pBlendIndex,
755	blendMatrices,
756	blendWeightStride, blendIndexStride,
757	numWeightsPerVertex,
758	numIterations);
759	}
760	}
761	//---------------------------------------------------------------------
762	// Special SSE version skinning separated buffers of position and normal,
763	// both of position and normal buffer are packed.
764	template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
765	struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
766	{
767	static void apply(
768	const float* pSrcPos, float* pDestPos,
769	const float* pSrcNorm, float* pDestNorm,
770	const float* pBlendWeight, const unsigned char* pBlendIndex,
771	const Matrix4* const* blendMatrices,
772	size_t blendWeightStride, size_t blendIndexStride,
773	size_t numWeightsPerVertex,
774	size_t numIterations)
775	{
776	typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
777	typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
778	typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
779	typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;
780
781	// Blending 4 vertices per-iteration
782	for (size_t i = 0; i < numIterations; ++i)
783	{
784	// Collapse matrices
785	__m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
786	_collapseFourMatrices(
787	m00, m01, m02,
788	m10, m11, m12,
789	m20, m21, m22,
790	m30, m31, m32,
791	pBlendWeight, pBlendIndex,
792	blendMatrices,
793	blendWeightStride, blendIndexStride,
794	numWeightsPerVertex);
795
796	// Advance 4 vertices
797	advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
798	advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
799
800	//------------------------------------------------------------------
801	// Transform positions
802	//------------------------------------------------------------------
803
804	__m128 s0, s1, s2, d0, d1, d2;
805
806	// Load source positions
807	s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
808	s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
809	s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
810
811	// Arrange to 3x4 component-major for batches calculate
812	__MM_TRANSPOSE4x3_PS(s0, s1, s2);
813
814	// Transform by collapsed matrix
815
816	// Shuffle row 0 of four collapsed matrices for calculate X component
817	__MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
818
819	// Transform X components
820	d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3
821
822	// Shuffle row 1 of four collapsed matrices for calculate Y component
823	__MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
824
825	// Transform Y components
826	d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3
827
828	// Shuffle row 2 of four collapsed matrices for calculate Z component
829	__MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
830
831	// Transform Z components
832	d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3
833
834	// Arrange back to 4x3 continuous format for store results
835	__MM_TRANSPOSE3x4_PS(d0, d1, d2);
836
837	// Store blended positions
838	DestPosAccessor::store(pDestPos + 0, d0);
839	DestPosAccessor::store(pDestPos + 4, d1);
840	DestPosAccessor::store(pDestPos + 8, d2);
841
842	// Advance 4 vertices
843	pSrcPos += 4 * 3;
844	pDestPos += 4 * 3;
845
846	//------------------------------------------------------------------
847	// Transform normals
848	//------------------------------------------------------------------
849
850	// Load source normals
851	s0 = SrcNormAccessor::load(pSrcNorm + 0); // x0 y0 z0 x1
852	s1 = SrcNormAccessor::load(pSrcNorm + 4); // y1 z1 x2 y2
853	s2 = SrcNormAccessor::load(pSrcNorm + 8); // z2 x3 y3 z3
854
855	// Arrange to 3x4 component-major for batches calculate
856	__MM_TRANSPOSE4x3_PS(s0, s1, s2);
857
858	// Transform by collapsed and shuffled matrices
859	d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2); // X0 X1 X2 X3
860	d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2); // Y0 Y1 Y2 Y3
861	d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2); // Z0 Z1 Z2 Z3
862
863	// Normalise normals
864	__m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
865	tmp = __MM_RSQRT_PS(tmp);
866	d0 = _mm_mul_ps(d0, tmp);
867	d1 = _mm_mul_ps(d1, tmp);
868	d2 = _mm_mul_ps(d2, tmp);
869
870	// Arrange back to 4x3 continuous format for store results
871	__MM_TRANSPOSE3x4_PS(d0, d1, d2);
872
873	// Store blended normals
874	DestNormAccessor::store(pDestNorm + 0, d0);
875	DestNormAccessor::store(pDestNorm + 4, d1);
876	DestNormAccessor::store(pDestNorm + 8, d2);
877
878	// Advance 4 vertices
879	pSrcNorm += 4 * 3;
880	pDestNorm += 4 * 3;
881	}
882	}
883	};
884	static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
885	const float* pSrcPos, float* pDestPos,
886	const float* pSrcNorm, float* pDestNorm,
887	const float* pBlendWeight, const unsigned char* pBlendIndex,
888	const Matrix4* const* blendMatrices,
889	size_t blendWeightStride, size_t blendIndexStride,
890	size_t numWeightsPerVertex,
891	size_t numIterations)
892	{
893	assert(_isAlignedForSSE(pSrcPos));
894
895	// Instantiating two version only, since other alignement combination not that important.
896	if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
897	{
898	SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
899	pSrcPos, pDestPos,
900	pSrcNorm, pDestNorm,
901	pBlendWeight, pBlendIndex,
902	blendMatrices,
903	blendWeightStride, blendIndexStride,
904	numWeightsPerVertex,
905	numIterations);
906	}
907	else
908	{
909	SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
910	pSrcPos, pDestPos,
911	pSrcNorm, pDestNorm,
912	pBlendWeight, pBlendIndex,
913	blendMatrices,
914	blendWeightStride, blendIndexStride,
915	numWeightsPerVertex,
916	numIterations);
917	}
918	}
919	//---------------------------------------------------------------------
920	// Special SSE version skinning position only, the position buffer are
921	// packed.
922	template <bool srcPosAligned, bool destPosAligned>
923	struct SoftwareVertexSkinning_SSE_PosOnly_Packed
924	{
925	static void apply(
926	const float* pSrcPos, float* pDestPos,
927	const float* pBlendWeight, const unsigned char* pBlendIndex,
928	const Matrix4* const* blendMatrices,
929	size_t blendWeightStride, size_t blendIndexStride,
930	size_t numWeightsPerVertex,
931	size_t numIterations)
932	{
933	typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
934	typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
935
936	// Blending 4 vertices per-iteration
937	for (size_t i = 0; i < numIterations; ++i)
938	{
939	// Collapse matrices
940	__m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
941	_collapseFourMatrices(
942	m00, m01, m02,
943	m10, m11, m12,
944	m20, m21, m22,
945	m30, m31, m32,
946	pBlendWeight, pBlendIndex,
947	blendMatrices,
948	blendWeightStride, blendIndexStride,
949	numWeightsPerVertex);
950
951	// Advance 4 vertices
952	advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
953	advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
954
955	//------------------------------------------------------------------
956	// Transform positions
957	//------------------------------------------------------------------
958
959	__m128 s0, s1, s2, d0, d1, d2;
960
961	// Load source positions
962	s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
963	s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
964	s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
965
966	// Arrange to 3x4 component-major for batches calculate
967	__MM_TRANSPOSE4x3_PS(s0, s1, s2);
968
969	// Transform by collapsed matrix
970
971	// Shuffle row 0 of four collapsed matrices for calculate X component
972	__MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
973
974	// Transform X components
975	d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3
976
977	// Shuffle row 1 of four collapsed matrices for calculate Y component
978	__MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
979
980	// Transform Y components
981	d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3
982
983	// Shuffle row 2 of four collapsed matrices for calculate Z component
984	__MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
985
986	// Transform Z components
987	d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3
988
989	// Arrange back to 4x3 continuous format for store results
990	__MM_TRANSPOSE3x4_PS(d0, d1, d2);
991
992	// Store blended positions
993	DestPosAccessor::store(pDestPos + 0, d0);
994	DestPosAccessor::store(pDestPos + 4, d1);
995	DestPosAccessor::store(pDestPos + 8, d2);
996
997	// Advance 4 vertices
998	pSrcPos += 4 * 3;
999	pDestPos += 4 * 3;
1000	}
1001	}
1002	};
1003	static FORCEINLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
1004	const float* pSrcPos, float* pDestPos,
1005	const float* pBlendWeight, const unsigned char* pBlendIndex,
1006	const Matrix4* const* blendMatrices,
1007	size_t blendWeightStride, size_t blendIndexStride,
1008	size_t numWeightsPerVertex,
1009	size_t numIterations)
1010	{
1011	assert(_isAlignedForSSE(pSrcPos));
1012
1013	// Instantiating two version only, since other alignement combination not that important.
1014	if (_isAlignedForSSE(pDestPos))
1015	{
1016	SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
1017	pSrcPos, pDestPos,
1018	pBlendWeight, pBlendIndex,
1019	blendMatrices,
1020	blendWeightStride, blendIndexStride,
1021	numWeightsPerVertex,
1022	numIterations);
1023	}
1024	else
1025	{
1026	SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
1027	pSrcPos, pDestPos,
1028	pBlendWeight, pBlendIndex,
1029	blendMatrices,
1030	blendWeightStride, blendIndexStride,
1031	numWeightsPerVertex,
1032	numIterations);
1033	}
1034	}
1035	//---------------------------------------------------------------------
1036	//---------------------------------------------------------------------
1037	//---------------------------------------------------------------------
1038	OptimisedUtilSSE::OptimisedUtilSSE(void)
1039	: mPreferGeneralVersionForSharedBuffers(false)
1040	{
1041	// For AMD Athlon XP (but not that for Althon 64), it's prefer to never use
1042	// unrolled version for shared buffers at all, I guess because that version
1043	// run out of usable CPU registers, or L1/L2 cache related problem, causing
1044	// slight performance loss than general version.
1045	//
1046
1047	if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
1048	{
1049	// How can I check it's an Athlon XP but not Althon 64?
1050	// Ok, just test whether supports SSE2/SSE3 or not, if not,
1051	// assume general version faster than unrolled version :)
1052	//
1053	if (!(PlatformInformation::getCpuFeatures() &
1054	(PlatformInformation::CPU_FEATURE_SSE2 \| PlatformInformation::CPU_FEATURE_SSE3)))
1055	{
1056	mPreferGeneralVersionForSharedBuffers = true;
1057	}
1058	}
1059	}
1060	//---------------------------------------------------------------------
1061	void OptimisedUtilSSE::softwareVertexSkinning(
1062	const float pSrcPos, float pDestPos,
1063	const float pSrcNorm, float pDestNorm,
1064	const float pBlendWeight, const unsigned char pBlendIndex,
1065	const Matrix4* const* blendMatrices,
1066	size_t srcPosStride, size_t destPosStride,
1067	size_t srcNormStride, size_t destNormStride,
1068	size_t blendWeightStride, size_t blendIndexStride,
1069	size_t numWeightsPerVertex,
1070	size_t numVertices)
1071	{
1072	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1073
1074	// All position/normal pointers should be perfect aligned, but still check here
1075	// for avoid hardware buffer which allocated by potential buggy driver doesn't
1076	// support alignment properly.
1077	// Because we are used meta-function technique here, the code is easy to maintenance
1078	// and still provides all possible alignment combination.
1079	//
1080
1081	// Use unrolled routines only if there a lot of vertices
1082	if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
1083	{
1084	if (pSrcNorm)
1085	{
1086	// Blend position and normal
1087
1088	if (!mPreferGeneralVersionForSharedBuffers &&
1089	srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
1090	pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
1091	{
1092	// Position and normal are sharing with packed buffer
1093
1094	size_t srcPosAlign = (size_t)pSrcPos & 15;
1095	assert((srcPosAlign & 3) == 0);
1096
1097	// Blend unaligned vertices with general SIMD routine
1098	if (srcPosAlign == 8) // Because 8 bytes alignment shift per-vertex
1099	{
1100	size_t count = srcPosAlign / 8;
1101	numVertices -= count;
1102	softwareVertexSkinning_SSE_General(
1103	pSrcPos, pDestPos,
1104	pSrcNorm, pDestNorm,
1105	pBlendWeight, pBlendIndex,
1106	blendMatrices,
1107	srcPosStride, destPosStride,
1108	srcNormStride, destNormStride,
1109	blendWeightStride, blendIndexStride,
1110	numWeightsPerVertex,
1111	count);
1112
1113	pSrcPos += count * (3 + 3);
1114	pDestPos += count * (3 + 3);
1115	pSrcNorm += count * (3 + 3);
1116	pDestNorm += count * (3 + 3);
1117	advanceRawPointer(pBlendWeight, count * blendWeightStride);
1118	advanceRawPointer(pBlendIndex, count * blendIndexStride);
1119	}
1120
1121	// Blend vertices, four vertices per-iteration
1122	size_t numIterations = numVertices / 4;
1123	softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
1124	pSrcPos, pDestPos,
1125	pBlendWeight, pBlendIndex,
1126	blendMatrices,
1127	blendWeightStride, blendIndexStride,
1128	numWeightsPerVertex,
1129	numIterations);
1130
1131	// Advance pointers for remaining vertices
1132	numVertices &= 3;
1133	if (numVertices)
1134	{
1135	pSrcPos += numIterations * 4 * (3 + 3);
1136	pDestPos += numIterations * 4 * (3 + 3);
1137	pSrcNorm += numIterations * 4 * (3 + 3);
1138	pDestNorm += numIterations * 4 * (3 + 3);
1139	advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1140	advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1141	}
1142	}
1143	else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
1144	srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
1145	{
1146	// Position and normal are separate buffers, and all of them are packed
1147
1148	size_t srcPosAlign = (size_t)pSrcPos & 15;
1149	assert((srcPosAlign & 3) == 0);
1150
1151	// Blend unaligned vertices with general SIMD routine
1152	if (srcPosAlign)
1153	{
1154	size_t count = srcPosAlign / 4;
1155	numVertices -= count;
1156	softwareVertexSkinning_SSE_General(
1157	pSrcPos, pDestPos,
1158	pSrcNorm, pDestNorm,
1159	pBlendWeight, pBlendIndex,
1160	blendMatrices,
1161	srcPosStride, destPosStride,
1162	srcNormStride, destNormStride,
1163	blendWeightStride, blendIndexStride,
1164	numWeightsPerVertex,
1165	count);
1166
1167	pSrcPos += count * 3;
1168	pDestPos += count * 3;
1169	pSrcNorm += count * 3;
1170	pDestNorm += count * 3;
1171	advanceRawPointer(pBlendWeight, count * blendWeightStride);
1172	advanceRawPointer(pBlendIndex, count * blendIndexStride);
1173	}
1174
1175	// Blend vertices, four vertices per-iteration
1176	size_t numIterations = numVertices / 4;
1177	softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
1178	pSrcPos, pDestPos,
1179	pSrcNorm, pDestNorm,
1180	pBlendWeight, pBlendIndex,
1181	blendMatrices,
1182	blendWeightStride, blendIndexStride,
1183	numWeightsPerVertex,
1184	numIterations);
1185
1186	// Advance pointers for remaining vertices
1187	numVertices &= 3;
1188	if (numVertices)
1189	{
1190	pSrcPos += numIterations * 4 * 3;
1191	pDestPos += numIterations * 4 * 3;
1192	pSrcNorm += numIterations * 4 * 3;
1193	pDestNorm += numIterations * 4 * 3;
1194	advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1195	advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1196	}
1197	}
1198	else // Not 'packed' form or wrong order between position and normal
1199	{
1200	// Should never occuring, do nothing here just in case
1201	}
1202	}
1203	else // !pSrcNorm
1204	{
1205	// Blend position only
1206
1207	if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
1208	{
1209	// All buffers are packed
1210
1211	size_t srcPosAlign = (size_t)pSrcPos & 15;
1212	assert((srcPosAlign & 3) == 0);
1213
1214	// Blend unaligned vertices with general SIMD routine
1215	if (srcPosAlign)
1216	{
1217	size_t count = srcPosAlign / 4;
1218	numVertices -= count;
1219	softwareVertexSkinning_SSE_General(
1220	pSrcPos, pDestPos,
1221	pSrcNorm, pDestNorm,
1222	pBlendWeight, pBlendIndex,
1223	blendMatrices,
1224	srcPosStride, destPosStride,
1225	srcNormStride, destNormStride,
1226	blendWeightStride, blendIndexStride,
1227	numWeightsPerVertex,
1228	count);
1229
1230	pSrcPos += count * 3;
1231	pDestPos += count * 3;
1232	advanceRawPointer(pBlendWeight, count * blendWeightStride);
1233	advanceRawPointer(pBlendIndex, count * blendIndexStride);
1234	}
1235
1236	// Blend vertices, four vertices per-iteration
1237	size_t numIterations = numVertices / 4;
1238	softwareVertexSkinning_SSE_PosOnly_Packed(
1239	pSrcPos, pDestPos,
1240	pBlendWeight, pBlendIndex,
1241	blendMatrices,
1242	blendWeightStride, blendIndexStride,
1243	numWeightsPerVertex,
1244	numIterations);
1245
1246	// Advance pointers for remaining vertices
1247	numVertices &= 3;
1248	if (numVertices)
1249	{
1250	pSrcPos += numIterations * 4 * 3;
1251	pDestPos += numIterations * 4 * 3;
1252	advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1253	advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1254	}
1255	}
1256	else // Not 'packed' form
1257	{
1258	// Might occuring only if user forced software blending position only
1259	}
1260	}
1261	}
1262
1263	// Blend remaining vertices, need to do it with SIMD for identical result,
1264	// since mixing general floating-point and SIMD algorithm will causing
1265	// floating-point error.
1266	if (numVertices)
1267	{
1268	softwareVertexSkinning_SSE_General(
1269	pSrcPos, pDestPos,
1270	pSrcNorm, pDestNorm,
1271	pBlendWeight, pBlendIndex,
1272	blendMatrices,
1273	srcPosStride, destPosStride,
1274	srcNormStride, destNormStride,
1275	blendWeightStride, blendIndexStride,
1276	numWeightsPerVertex,
1277	numVertices);
1278	}
1279	}
1280	//---------------------------------------------------------------------
1281	void OptimisedUtilSSE::softwareVertexMorph(
1282	Real t,
1283	const float pSrc1, const float pSrc2,
1284	float *pDst,
1285	size_t numVertices)
1286	{
1287	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1288
1289	__m128 src01, src02, src11, src12, src21, src22;
1290	__m128 dst0, dst1, dst2;
1291
1292	__m128 t4 = _mm_load_ps1(&t);
1293
1294	size_t numIterations = numVertices / 4;
1295	numVertices &= 3;
1296
1297	// Never use meta-function technique to accessing memory because looks like
1298	// VC7.1 generate a bit inefficient binary code when put following code into
1299	// inline function.
1300
1301	if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
1302	{
1303	// All data aligned
1304
1305	// Morph 4 vertices per-iteration. Special designed for use all
1306	// available CPU registers as possible (7 registers used here),
1307	// and avoid temporary values allocated in stack for suppress
1308	// extra memory access.
1309	for (size_t i = 0; i < numIterations; ++i)
1310	{
1311	// 12 floating-point values
1312	src01 = __MM_LOAD_PS(pSrc1 + 0);
1313	src02 = __MM_LOAD_PS(pSrc2 + 0);
1314	src11 = __MM_LOAD_PS(pSrc1 + 4);
1315	src12 = __MM_LOAD_PS(pSrc2 + 4);
1316	src21 = __MM_LOAD_PS(pSrc1 + 8);
1317	src22 = __MM_LOAD_PS(pSrc2 + 8);
1318	pSrc1 += 12; pSrc2 += 12;
1319
1320	dst0 = __MM_LERP_PS(t4, src01, src02);
1321	dst1 = __MM_LERP_PS(t4, src11, src12);
1322	dst2 = __MM_LERP_PS(t4, src21, src22);
1323
1324	__MM_STORE_PS(pDst + 0, dst0);
1325	__MM_STORE_PS(pDst + 4, dst1);
1326	__MM_STORE_PS(pDst + 8, dst2);
1327	pDst += 12;
1328	}
1329
1330	// Morph remaining vertices
1331	switch (numVertices)
1332	{
1333	case 3:
1334	// 9 floating-point values
1335	src01 = __MM_LOAD_PS(pSrc1 + 0);
1336	src02 = __MM_LOAD_PS(pSrc2 + 0);
1337	src11 = __MM_LOAD_PS(pSrc1 + 4);
1338	src12 = __MM_LOAD_PS(pSrc2 + 4);
1339	src21 = _mm_load_ss(pSrc1 + 8);
1340	src22 = _mm_load_ss(pSrc2 + 8);
1341
1342	dst0 = __MM_LERP_PS(t4, src01, src02);
1343	dst1 = __MM_LERP_PS(t4, src11, src12);
1344	dst2 = __MM_LERP_SS(t4, src21, src22);
1345
1346	__MM_STORE_PS(pDst + 0, dst0);
1347	__MM_STORE_PS(pDst + 4, dst1);
1348	_mm_store_ss(pDst + 8, dst2);
1349	break;
1350
1351	case 2:
1352	// 6 floating-point values
1353	src01 = __MM_LOAD_PS(pSrc1 + 0);
1354	src02 = __MM_LOAD_PS(pSrc2 + 0);
1355	src11 = _mm_loadl_pi(t4, (__m64*)(pSrc1 + 4)); // t4 is meaningless here
1356	src12 = _mm_loadl_pi(t4, (__m64*)(pSrc2 + 4)); // t4 is meaningless here
1357
1358	dst0 = __MM_LERP_PS(t4, src01, src02);
1359	dst1 = __MM_LERP_PS(t4, src11, src12);
1360
1361	__MM_STORE_PS(pDst + 0, dst0);
1362	_mm_storel_pi((__m64*)(pDst + 4), dst1);
1363	break;
1364
1365	case 1:
1366	// 3 floating-point values
1367	src01 = _mm_load_ss(pSrc1 + 2);
1368	src02 = _mm_load_ss(pSrc2 + 2);
1369	src01 = _mm_loadh_pi(src01, (__m64*)(pSrc1 + 0));
1370	src02 = _mm_loadh_pi(src02, (__m64*)(pSrc2 + 0));
1371
1372	dst0 = __MM_LERP_PS(t4, src01, src02);
1373
1374	_mm_storeh_pi((__m64*)(pDst + 0), dst0);
1375	_mm_store_ss(pDst + 2, dst0);
1376	break;
1377	}
1378	}
1379	else // Should never occuring, just in case buggy driver
1380	{
1381	// Assume all data unaligned
1382
1383	// Morph 4 vertices per-iteration. Special designed for use all
1384	// available CPU registers as possible (7 registers used here),
1385	// and avoid temporary values allocated in stack for suppress
1386	// extra memory access.
1387	for (size_t i = 0; i < numIterations; ++i)
1388	{
1389	// 12 floating-point values
1390	src01 = _mm_loadu_ps(pSrc1 + 0);
1391	src02 = _mm_loadu_ps(pSrc2 + 0);
1392	src11 = _mm_loadu_ps(pSrc1 + 4);
1393	src12 = _mm_loadu_ps(pSrc2 + 4);
1394	src21 = _mm_loadu_ps(pSrc1 + 8);
1395	src22 = _mm_loadu_ps(pSrc2 + 8);
1396	pSrc1 += 12; pSrc2 += 12;
1397
1398	dst0 = __MM_LERP_PS(t4, src01, src02);
1399	dst1 = __MM_LERP_PS(t4, src11, src12);
1400	dst2 = __MM_LERP_PS(t4, src21, src22);
1401
1402	_mm_storeu_ps(pDst + 0, dst0);
1403	_mm_storeu_ps(pDst + 4, dst1);
1404	_mm_storeu_ps(pDst + 8, dst2);
1405	pDst += 12;
1406	}
1407
1408	// Morph remaining vertices
1409	switch (numVertices)
1410	{
1411	case 3:
1412	// 9 floating-point values
1413	src01 = _mm_loadu_ps(pSrc1 + 0);
1414	src02 = _mm_loadu_ps(pSrc2 + 0);
1415	src11 = _mm_loadu_ps(pSrc1 + 4);
1416	src12 = _mm_loadu_ps(pSrc2 + 4);
1417	src21 = _mm_load_ss(pSrc1 + 8);
1418	src22 = _mm_load_ss(pSrc2 + 8);
1419
1420	dst0 = __MM_LERP_PS(t4, src01, src02);
1421	dst1 = __MM_LERP_PS(t4, src11, src12);
1422	dst2 = __MM_LERP_SS(t4, src21, src22);
1423
1424	_mm_storeu_ps(pDst + 0, dst0);
1425	_mm_storeu_ps(pDst + 4, dst1);
1426	_mm_store_ss(pDst + 8, dst2);
1427	break;
1428
1429	case 2:
1430	// 6 floating-point values
1431	src01 = _mm_loadu_ps(pSrc1 + 0);
1432	src02 = _mm_loadu_ps(pSrc2 + 0);
1433	src11 = _mm_loadl_pi(t4, (__m64*)(pSrc1 + 4)); // t4 is meaningless here
1434	src12 = _mm_loadl_pi(t4, (__m64*)(pSrc2 + 4)); // t4 is meaningless here
1435
1436	dst0 = __MM_LERP_PS(t4, src01, src02);
1437	dst1 = __MM_LERP_PS(t4, src11, src12);
1438
1439	_mm_storeu_ps(pDst + 0, dst0);
1440	_mm_storel_pi((__m64*)(pDst + 4), dst1);
1441	break;
1442
1443	case 1:
1444	// 3 floating-point values
1445	src01 = _mm_load_ss(pSrc1 + 2);
1446	src02 = _mm_load_ss(pSrc2 + 2);
1447	src01 = _mm_loadh_pi(src01, (__m64*)(pSrc1 + 0));
1448	src02 = _mm_loadh_pi(src02, (__m64*)(pSrc2 + 0));
1449
1450	dst0 = __MM_LERP_PS(t4, src01, src02);
1451
1452	_mm_storeh_pi((__m64*)(pDst + 0), dst0);
1453	_mm_store_ss(pDst + 2, dst0);
1454	break;
1455	}
1456	}
1457	}
1458	//---------------------------------------------------------------------
1459	void OptimisedUtilSSE::concatenateAffineMatrices(
1460	const Matrix4& baseMatrix,
1461	const Matrix4* pSrcMat,
1462	Matrix4* pDstMat,
1463	size_t numMatrices)
1464	{
1465	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1466
1467	assert(_isAlignedForSSE(pSrcMat));
1468	assert(_isAlignedForSSE(pDstMat));
1469
1470	// Load base matrix, unaligned
1471	__m128 m0 = _mm_loadu_ps(baseMatrix[0]);
1472	__m128 m1 = _mm_loadu_ps(baseMatrix[1]);
1473	__m128 m2 = _mm_loadu_ps(baseMatrix[2]);
1474	__m128 m3 = _mm_loadu_ps(baseMatrix[3]); // m3 should be equal to (0, 0, 0, 1)
1475
1476	for (size_t i = 0; i < numMatrices; ++i)
1477	{
1478	// Load source matrix, aligned
1479	__m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
1480	__m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
1481	__m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);
1482
1483	++pSrcMat;
1484
1485	__m128 t0, t1, t2, t3;
1486
1487	// Concatenate matrix, and store results
1488
1489	// Row 0
1490	t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
1491	t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
1492	t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
1493	t3 = _mm_mul_ps(m0, m3); // Compiler should optimise this out of the loop
1494	__MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));
1495
1496	// Row 1
1497	t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
1498	t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
1499	t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
1500	t3 = _mm_mul_ps(m1, m3); // Compiler should optimise this out of the loop
1501	__MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));
1502
1503	// Row 2
1504	t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
1505	t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
1506	t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
1507	t3 = _mm_mul_ps(m2, m3); // Compiler should optimise this out of the loop
1508	__MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));
1509
1510	// Row 3
1511	__MM_STORE_PS((*pDstMat)[3], m3);
1512
1513	++pDstMat;
1514	}
1515	}
1516	//---------------------------------------------------------------------
1517	void OptimisedUtilSSE::calculateFaceNormals(
1518	const float *positions,
1519	const EdgeData::Triangle *triangles,
1520	Vector4 *faceNormals,
1521	size_t numTriangles)
1522	{
1523	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1524
1525	assert(_isAlignedForSSE(faceNormals));
1526
1527	// Load Vector3 as: (x, 0, y, z)
1528	#define __LOAD_VECTOR3(p) _mm_loadh_pi(_mm_load_ss(p), (__m64*)((p)+1))
1529
1530	// Mask used to changes sign of single precision floating point values.
1531	OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
1532	{
1533	0x80000000, 0x80000000, 0x80000000, 0x80000000,
1534	};
1535
1536	size_t numIterations = numTriangles / 4;
1537	numTriangles &= 3;
1538
1539	// Four triangles per-iteration
1540	for (size_t i = 0; i < numIterations; ++i)
1541	{
1542
1543	// Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
1544	#define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3) \
1545	{ \
1546	__m128 v0 = __LOAD_VECTOR3(p0); /* x0 -- y0 z0 */ \
1547	__m128 v1 = __LOAD_VECTOR3(p1); /* x1 -- y1 z1 */ \
1548	__m128 v2 = __LOAD_VECTOR3(p2); /* x2 -- y2 z2 */ \
1549	__m128 v3 = __LOAD_VECTOR3(p3); /* x3 -- y3 z3 */ \
1550	__m128 t0, t1; \
1551	\
1552	t0 = _mm_unpacklo_ps(v0, v2); /* x0 x2 -- -- */ \
1553	t1 = _mm_unpacklo_ps(v1, v3); /* x1 x3 -- -- */ \
1554	x = _mm_unpacklo_ps(t0, t1); /* x0 x1 x2 x3 */ \
1555	\
1556	t0 = _mm_unpackhi_ps(v0, v2); /* y0 y2 z0 z2 */ \
1557	t1 = _mm_unpackhi_ps(v1, v3); /* y1 y3 z1 z3 */ \
1558	y = _mm_unpacklo_ps(t0, t1); /* y0 y1 y2 y3 */ \
1559	z = _mm_unpackhi_ps(t0, t1); /* z0 z1 z2 z3 */ \
1560	}
1561
1562	__m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;
1563
1564	// Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
1565	__LOAD_FOUR_VECTOR3(x0, y0, z0,
1566	positions + triangles[0].vertIndex[0] * 3,
1567	positions + triangles[1].vertIndex[0] * 3,
1568	positions + triangles[2].vertIndex[0] * 3,
1569	positions + triangles[3].vertIndex[0] * 3);
1570
1571	// Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
1572	__LOAD_FOUR_VECTOR3(x1, y1, z1,
1573	positions + triangles[0].vertIndex[1] * 3,
1574	positions + triangles[1].vertIndex[1] * 3,
1575	positions + triangles[2].vertIndex[1] * 3,
1576	positions + triangles[3].vertIndex[1] * 3);
1577
1578	// Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
1579	__LOAD_FOUR_VECTOR3(x2, y2, z2,
1580	positions + triangles[0].vertIndex[2] * 3,
1581	positions + triangles[1].vertIndex[2] * 3,
1582	positions + triangles[2].vertIndex[2] * 3,
1583	positions + triangles[3].vertIndex[2] * 3);
1584
1585	triangles += 4;
1586
1587	// Calculate triangle face normals
1588
1589	// a = v1 - v0
1590	__m128 ax = _mm_sub_ps(x1, x0);
1591	__m128 ay = _mm_sub_ps(y1, y0);
1592	__m128 az = _mm_sub_ps(z1, z0);
1593
1594	// b = v2 - v0
1595	__m128 bx = _mm_sub_ps(x2, x0);
1596	__m128 by = _mm_sub_ps(y2, y0);
1597	__m128 bz = _mm_sub_ps(z2, z0);
1598
1599	// n = a cross b
1600	__m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
1601	__m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
1602	__m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));
1603
1604	// w = - (n dot v0)
1605	__m128 nw = _mm_xor_ps(
1606	__MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
1607	(const __m128 )&msSignMask);
1608
1609	// Arrange to per-triangle face normal major format
1610	__MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);
1611
1612	// Store results
1613	__MM_STORE_PS(&faceNormals[0].x, nx);
1614	__MM_STORE_PS(&faceNormals[1].x, ny);
1615	__MM_STORE_PS(&faceNormals[2].x, nz);
1616	__MM_STORE_PS(&faceNormals[3].x, nw);
1617	faceNormals += 4;
1618
1619	#undef __LOAD_FOUR_VECTOR3
1620	}
1621
1622	// Dealing with remaining triangles
1623	for (size_t j = 0; j < numTriangles; ++j)
1624	{
1625	// Load vertices of the triangle
1626	__m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
1627	__m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
1628	__m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
1629	++triangles;
1630
1631	// Calculate face normal
1632
1633	__m128 t0, t1;
1634
1635	__m128 a = _mm_sub_ps(v1, v0); // ax 0 ay az
1636	__m128 b = _mm_sub_ps(v2, v0); // bx 0 by bz
1637	t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3)); // az 0 ax ay
1638	t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3)); // bz 0 bx by
1639	t0 = _mm_mul_ps(t0, b); // azbx 0 axby ay*bz
1640	t1 = _mm_mul_ps(t1, a); // axbz 0 aybx az*by
1641
1642	__m128 n = _mm_sub_ps(t0, t1); // ny 0 nz nx
1643
1644	__m128 d = _mm_mul_ps( // dy 0 dz dx
1645	_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);
1646
1647	n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps( // nx ny nz -(dx+dy+dz)
1648	_mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)), // nx ny nz 0
1649	_mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))), // 0 0 0 dx
1650	_mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))), // 0 0 0 dy
1651	_mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1))); // 0 0 0 dz
1652
1653	// Store result
1654	__MM_STORE_PS(&faceNormals->x, n);
1655	++faceNormals;
1656	}
1657
1658	#undef __LOAD_VECTOR3
1659	}
1660	//---------------------------------------------------------------------
1661	void OptimisedUtilSSE::calculateLightFacing(
1662	const Vector4& lightPos,
1663	const Vector4* faceNormals,
1664	char* lightFacings,
1665	size_t numFaces)
1666	{
1667	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1668
1669	assert(_isAlignedForSSE(faceNormals));
1670
1671	// Map to convert 4-bits mask to 4 byte values
1672	static const char msMaskMapping[16][4] =
1673	{
1674	{0, 0, 0, 0}, {1, 0, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0},
1675	{0, 0, 1, 0}, {1, 0, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0},
1676	{0, 0, 0, 1}, {1, 0, 0, 1}, {0, 1, 0, 1}, {1, 1, 0, 1},
1677	{0, 0, 1, 1}, {1, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1},
1678	};
1679
1680	__m128 n0, n1, n2, n3;
1681	__m128 t0, t1;
1682	__m128 dp;
1683	int bitmask;
1684
1685	// Load light vector, unaligned
1686	__m128 lp = _mm_loadu_ps(&lightPos.x);
1687
1688	// Perload zero to register for compare dot product values
1689	__m128 zero = _mm_setzero_ps();
1690
1691	size_t numIterations = numFaces / 4;
1692	numFaces &= 3;
1693
1694	// Four faces per-iteration
1695	for (size_t i = 0; i < numIterations; ++i)
1696	{
1697	// Load face normals, aligned
1698	n0 = __MM_LOAD_PS(&faceNormals[0].x);
1699	n1 = __MM_LOAD_PS(&faceNormals[1].x);
1700	n2 = __MM_LOAD_PS(&faceNormals[2].x);
1701	n3 = __MM_LOAD_PS(&faceNormals[3].x);
1702	faceNormals += 4;
1703
1704	// Multiply by light vector
1705	n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0
1706	n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1
1707	n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2
1708	n3 = _mm_mul_ps(n3, lp); // x3 y3 z3 w3
1709
1710	// Horizontal add four vector values.
1711	t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1
1712	_mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1
1713	_mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1
1714	t1 = _mm_add_ps( // x2+z2 x3+z3 y2+w2 y3+w3
1715	_mm_unpacklo_ps(n2, n3), // x2 x3 y2 y3
1716	_mm_unpackhi_ps(n2, n3)); // z2 z3 w2 w3
1717	dp = _mm_add_ps( // dp0 dp1 dp2 dp3
1718	_mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x3+z3
1719	_mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y3+w3
1720
1721	// Compare greater than zero and setup 4-bits mask. Use '_mm_cmpnle_ps'
1722	// instead of '_mm_cmpgt_ps' here because we want keep 'zero' untouch,
1723	// i.e. it's 2nd operand of the assembly instruction. And in fact
1724	// '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with operands swapped
1725	// in VC7.1.
1726	bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1727
1728	// Convert 4-bits mask to 4 bytes, and store results.
1729	reinterpret_cast<uint32>(lightFacings) =
1730	reinterpret_cast<const uint32>(msMaskMapping[bitmask]);
1731	lightFacings += 4;
1732	}
1733
1734	// Dealing with remaining faces
1735	switch (numFaces)
1736	{
1737	case 3:
1738	n0 = __MM_LOAD_PS(&faceNormals[0].x);
1739	n1 = __MM_LOAD_PS(&faceNormals[1].x);
1740	n2 = __MM_LOAD_PS(&faceNormals[2].x);
1741
1742	n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0
1743	n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1
1744	n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2
1745
1746	t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1
1747	_mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1
1748	_mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1
1749	t1 = _mm_add_ps( // x2+z2 x2+z2 y2+w2 y2+w2
1750	_mm_unpacklo_ps(n2, n2), // x2 x2 y2 y2
1751	_mm_unpackhi_ps(n2, n2)); // z2 z2 w2 w2
1752	dp = _mm_add_ps( // dp0 dp1 dp2 dp2
1753	_mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x2+z2
1754	_mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y2+w2
1755
1756	bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1757
1758	lightFacings[0] = msMaskMapping[bitmask][0];
1759	lightFacings[1] = msMaskMapping[bitmask][1];
1760	lightFacings[2] = msMaskMapping[bitmask][2];
1761	break;
1762
1763	case 2:
1764	n0 = __MM_LOAD_PS(&faceNormals[0].x);
1765	n1 = __MM_LOAD_PS(&faceNormals[1].x);
1766
1767	n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0
1768	n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1
1769
1770	t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1
1771	_mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1
1772	_mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1
1773	dp = _mm_add_ps( // dp0 dp1 dp0 dp1
1774	_mm_movelh_ps(t0, t0), // x0+z0 x1+z1 x0+z0 x1+z1
1775	_mm_movehl_ps(t0, t0)); // y0+w0 y1+w1 y0+w0 y1+w1
1776
1777	bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1778
1779	lightFacings[0] = msMaskMapping[bitmask][0];
1780	lightFacings[1] = msMaskMapping[bitmask][1];
1781	break;
1782
1783	case 1:
1784	n0 = __MM_LOAD_PS(&faceNormals[0].x);
1785
1786	n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0
1787
1788	t0 = _mm_add_ps( // x0+z0 x0+z0 y0+w0 y0+w0
1789	_mm_unpacklo_ps(n0, n0), // x0 x0 y0 y0
1790	_mm_unpackhi_ps(n0, n0)); // z0 z0 w0 w0
1791	dp = _mm_add_ps( // dp0 dp0 dp0 dp0
1792	_mm_movelh_ps(t0, t0), // x0+z0 x0+z0 x0+z0 x0+z0
1793	_mm_movehl_ps(t0, t0)); // y0+w0 y0+w0 y0+w0 y0+w0
1794
1795	bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1796
1797	lightFacings[0] = msMaskMapping[bitmask][0];
1798	break;
1799	}
1800	}
1801	//---------------------------------------------------------------------
1802	// Template to extrude vertices for directional light.
1803	template <bool srcAligned, bool destAligned>
1804	struct ExtrudeVertices_SSE_DirectionalLight
1805	{
1806	static void apply(
1807	const Vector4& lightPos,
1808	Real extrudeDist,
1809	const float* pSrcPos,
1810	float* pDestPos,
1811	size_t numVertices)
1812	{
1813	typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1814	typedef SSEMemoryAccessor<destAligned> DestAccessor;
1815
1816	// Directional light, extrusion is along light direction
1817
1818	// Load light vector, unaligned
1819	__m128 lp = _mm_loadu_ps(&lightPos.x);
1820
1821	// Calculate extrusion direction, note that we use inverted direction here
1822	// for eliminate an extra negative instruction, we'll compensate for that
1823	// by use subtract instruction instead later.
1824	__m128 tmp = _mm_mul_ps(lp, lp);
1825	tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
1826	// Looks like VC7.1 generate a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
1827	tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
1828	__m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0)); // X Y Z -
1829
1830	// Prepare extrude direction for extruding 4 vertices parallelly
1831	__m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0)); // X Y Z X
1832	__m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1)); // Y Z X Y
1833	__m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2)); // Z X Y Z
1834
1835	__m128 s0, s1, s2;
1836	__m128 d0, d1, d2;
1837
1838	size_t numIterations = numVertices / 4;
1839	numVertices &= 3;
1840
1841	// Extruding 4 vertices per-iteration
1842	for (size_t i = 0; i < numIterations; ++i)
1843	{
1844	s0 = SrcAccessor::load(pSrcPos + 0);
1845	s1 = SrcAccessor::load(pSrcPos + 4);
1846	s2 = SrcAccessor::load(pSrcPos + 8);
1847	pSrcPos += 12;
1848
1849	// The extrusion direction is inverted, use subtract instruction here
1850	d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1
1851	d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2
1852	d2 = _mm_sub_ps(s2, dir2); // Z2 X3 Y3 Z3
1853
1854	DestAccessor::store(pDestPos + 0, d0);
1855	DestAccessor::store(pDestPos + 4, d1);
1856	DestAccessor::store(pDestPos + 8, d2);
1857	pDestPos += 12;
1858	}
1859
1860	// Dealing with remaining vertices
1861	switch (numVertices)
1862	{
1863	case 3:
1864	// 9 floating-point values
1865	s0 = SrcAccessor::load(pSrcPos + 0);
1866	s1 = SrcAccessor::load(pSrcPos + 4);
1867	s2 = _mm_load_ss(pSrcPos + 8);
1868
1869	// The extrusion direction is inverted, use subtract instruction here
1870	d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1
1871	d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2
1872	d2 = _mm_sub_ss(s2, dir2); // Z2 -- -- --
1873
1874	DestAccessor::store(pDestPos + 0, d0);
1875	DestAccessor::store(pDestPos + 4, d1);
1876	_mm_store_ss(pDestPos + 8, d2);
1877	break;
1878
1879	case 2:
1880	// 6 floating-point values
1881	s0 = SrcAccessor::load(pSrcPos + 0);
1882	s1 = _mm_loadl_pi(dir1, (__m64*)(pSrcPos + 4)); // dir1 is meaningless here
1883
1884	// The extrusion direction is inverted, use subtract instruction here
1885	d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1
1886	d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 -- --
1887
1888	DestAccessor::store(pDestPos + 0, d0);
1889	_mm_storel_pi((__m64*)(pDestPos + 4), d1);
1890	break;
1891
1892	case 1:
1893	// 3 floating-point values
1894	s0 = _mm_loadl_pi(dir0, (__m64*)(pSrcPos + 0)); // dir0 is meaningless here
1895	s1 = _mm_load_ss(pSrcPos + 2);
1896
1897	// The extrusion direction is inverted, use subtract instruction here
1898	d0 = _mm_sub_ps(s0, dir0); // X0 Y0 -- --
1899	d1 = _mm_sub_ss(s1, dir2); // Z0 -- -- --
1900
1901	_mm_storel_pi((__m64*)(pDestPos + 0), d0);
1902	_mm_store_ss(pDestPos + 2, d1);
1903	break;
1904	}
1905	}
1906	};
1907	//---------------------------------------------------------------------
1908	// Template to extrude vertices for point light.
1909	template <bool srcAligned, bool destAligned>
1910	struct ExtrudeVertices_SSE_PointLight
1911	{
1912	static void apply(
1913	const Vector4& lightPos,
1914	Real extrudeDist,
1915	const float* pSrcPos,
1916	float* pDestPos,
1917	size_t numVertices)
1918	{
1919	typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1920	typedef SSEMemoryAccessor<destAligned> DestAccessor;
1921
1922	// Point light, will calculate extrusion direction for every vertex
1923
1924	// Load light vector, unaligned
1925	__m128 lp = _mm_loadu_ps(&lightPos.x);
1926
1927	// Load extrude distance
1928	__m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);
1929
1930	size_t numIterations = numVertices / 4;
1931	numVertices &= 3;
1932
1933	// Extruding 4 vertices per-iteration
1934	for (size_t i = 0; i < numIterations; ++i)
1935	{
1936	// Load source positions
1937	__m128 s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
1938	__m128 s1 = SrcAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
1939	__m128 s2 = SrcAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
1940	pSrcPos += 12;
1941
1942	// Arrange to 3x4 component-major for batches calculate
1943	__MM_TRANSPOSE4x3_PS(s0, s1, s2);
1944
1945	// Calculate unnormalised extrusion direction
1946	__m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
1947	__m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
1948	__m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3
1949
1950	// Normalise extrusion direction and multiply by extrude distance
1951	__m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
1952	tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
1953	dx = _mm_mul_ps(dx, tmp);
1954	dy = _mm_mul_ps(dy, tmp);
1955	dz = _mm_mul_ps(dz, tmp);
1956
1957	// Calculate extruded positions
1958	__m128 d0 = _mm_add_ps(dx, s0);
1959	__m128 d1 = _mm_add_ps(dy, s1);
1960	__m128 d2 = _mm_add_ps(dz, s2);
1961
1962	// Arrange back to 4x3 continuous format for store results
1963	__MM_TRANSPOSE3x4_PS(d0, d1, d2);
1964
1965	// Store extruded positions
1966	DestAccessor::store(pDestPos + 0, d0);
1967	DestAccessor::store(pDestPos + 4, d1);
1968	DestAccessor::store(pDestPos + 8, d2);
1969	pDestPos += 12;
1970	}
1971
1972	// Dealing with remaining vertices
1973	for (size_t j = 0; j < numVertices; ++j)
1974	{
1975	// Load source position
1976	__m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (__m64*)(pSrcPos + 1)); // x 0 y z
1977	pSrcPos += 3;
1978
1979	// Calculate unnormalised extrusion direction
1980	__m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z
1981
1982	// Normalise extrusion direction and multiply by extrude distance
1983	__m128 tmp = _mm_mul_ps(dir, dir);
1984	tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
1985	// Looks like VC7.1 generate a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
1986	tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
1987	dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));
1988
1989	// Calculate extruded position
1990	__m128 dst = _mm_add_ps(dir, src);
1991
1992	// Store extruded position
1993	_mm_store_ss(pDestPos + 0, dst);
1994	_mm_storeh_pi((__m64*)(pDestPos + 1), dst);
1995	pDestPos += 3;
1996	}
1997	}
1998	};
1999	//---------------------------------------------------------------------
2000	void OptimisedUtilSSE::extrudeVertices(
2001	const Vector4& lightPos,
2002	Real extrudeDist,
2003	const float* pSrcPos,
2004	float* pDestPos,
2005	size_t numVertices)
2006	{
2007	__OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
2008
2009	// Note: Since pDestPos is following tail of pSrcPos, we can't assume
2010	// it's aligned to SIMD alignment properly, so must check for it here.
2011	//
2012	// TODO: Add extra vertex to the vertex buffer for make sure pDestPos
2013	// aligned same as pSrcPos.
2014	//
2015
2016	// We are use SSE reciprocal square root directly while calculating
2017	// extrusion direction, since precision loss not that important here.
2018	//
2019	if (lightPos.w == 0.0f)
2020	{
2021	if (_isAlignedForSSE(pSrcPos))
2022	{
2023	if (_isAlignedForSSE(pDestPos))
2024	ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
2025	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2026	else
2027	ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
2028	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2029	}
2030	else
2031	{
2032	if (_isAlignedForSSE(pDestPos))
2033	ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
2034	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2035	else
2036	ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
2037	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2038	}
2039	}
2040	else
2041	{
2042	assert(lightPos.w == 1.0f);
2043
2044	if (_isAlignedForSSE(pSrcPos))
2045	{
2046	if (_isAlignedForSSE(pDestPos))
2047	ExtrudeVertices_SSE_PointLight<true, true>::apply(
2048	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2049	else
2050	ExtrudeVertices_SSE_PointLight<true, false>::apply(
2051	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2052	}
2053	else
2054	{
2055	if (_isAlignedForSSE(pDestPos))
2056	ExtrudeVertices_SSE_PointLight<false, true>::apply(
2057	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2058	else
2059	ExtrudeVertices_SSE_PointLight<false, false>::apply(
2060	lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2061	}
2062	}
2063	}
2064	//---------------------------------------------------------------------
2065	//---------------------------------------------------------------------
2066	//---------------------------------------------------------------------
2067	extern OptimisedUtil* _getOptimisedUtilSSE(void)
2068	{
2069	static OptimisedUtilSSE msOptimisedUtilSSE;
2070	#if defined(__OGRE_SIMD_ALIGN_STACK)
2071	static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
2072	return &msOptimisedUtilWithStackAlign;
2073	#else
2074	return &msOptimisedUtilSSE;
2075	#endif
2076	}
2077
2078	}
2079
2080	#endif // __OGRE_HAVE_SSE

Note: See TracBrowser for help on using the repository browser.

Download in other formats: