Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/OgreMain/src/OgreSIMDHelper.h @ 1

Last change on this file since 1 was 1, checked in by landauf, 18 years ago

File size: 19.5 KB

Rev	Line
[1]	1	/*
	2	-----------------------------------------------------------------------------
	3	This source file is part of OGRE
	4	(Object-oriented Graphics Rendering Engine)
	5	For the latest info, see http://www.ogre3d.org/
	6
	7	Copyright (c) 2000-2006 Torus Knot Software Ltd
	8	Also see acknowledgements in Readme.html
	9
	10	This program is free software; you can redistribute it and/or modify it under
	11	the terms of the GNU Lesser General Public License as published by the Free Software
	12	Foundation; either version 2 of the License, or (at your option) any later
	13	version.
	14
	15	This program is distributed in the hope that it will be useful, but WITHOUT
	16	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	17	FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
	18
	19	You should have received a copy of the GNU Lesser General Public License along with
	20	this program; if not, write to the Free Software Foundation, Inc., 59 Temple
	21	Place - Suite 330, Boston, MA 02111-1307, USA, or go to
	22	http://www.gnu.org/copyleft/lesser.txt.
	23
	24	You may alternatively use this source under the terms of a specific version of
	25	the OGRE Unrestricted License provided you have obtained such a license from
	26	Torus Knot Software Ltd.
	27	-----------------------------------------------------------------------------
	28	*/
	29	#ifndef __SIMDHelper_H__
	30	#define __SIMDHelper_H__
	31
	32	#include "OgrePrerequisites.h"
	33	#include "OgrePlatformInformation.h"
	34
	// Stack-alignment support.
	//
	// When the macro __OGRE_SIMD_ALIGN_STACK is defined, the current compiler
	// needs explicit code to bring the stack onto a 16-byte boundary.
	//
	// Note:
	//   The macro can only guarantee that the callee's stack pointer (esp) is
	//   16-byte aligned; it says nothing about the frame pointer (ebp). Most
	//   compilers may address stack variables through the frame pointer, so
	//   any function that needs aligned locals has to be wrapped in an extra
	//   function call.
	//
	#if defined(__INTEL_COMPILER)
	    // With Intel's compiler, simply calling alloca seems to do the right
	    // thing, and the size of the allocated block seems to be irrelevant.
	    #define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

	#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
	    //
	    // Horrible hack that aligns the stack to a 16-byte boundary for gcc.
	    //
	    // gcc >= 2.95 is assumed, so that -mpreferred-stack-boundary works;
	    // otherwise all bets are off. That option does not create stack
	    // alignment, though, it only preserves it. Ogre is meant to be a
	    // flexible library, so users might build their application with the
	    // wrong stack alignment. Even when they take care of it, many
	    // versions of libc on linux call main() with the wrong initial stack
	    // alignment, with the result that the code is pessimally aligned
	    // instead of having a 50% chance of being correct.
	    //
	    #define __OGRE_SIMD_ALIGN_STACK()                                   \
	    {                                                                   \
	        /* Allocate some stack memory with alloca. This alerts gcc */   \
	        /* that something funny is going on, so that it does not   */   \
	        /* omit the frame pointer etc.                             */   \
	        (void)__builtin_alloca(16);                                     \
	        /* Now align the stack pointer */                               \
	        __asm__ __volatile__ ("andl $-16, %esp");                       \
	    }

	#elif defined(_MSC_VER)
	    // Fortunately, MSVC aligns the stack automatically; nothing to define.

	#endif
	81
	82
	83	// Additional platform-dependent header files and declares.
	84	//
	85	// NOTE: Should be sync with __OGRE_HAVE_SSE macro.
	86	//
	87
	88	#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
	89
	90	#if OGRE_COMPILER == OGRE_COMPILER_MSVC \|\| defined(__INTEL_COMPILER)
	91	#include "OgreNoMemoryMacros.h"
	92	#include <xmmintrin.h>
	93	#include "OgreMemoryMacros.h"
	94
	95	#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
	96	// Don't define ourself version SSE intrinsics if "xmmintrin.h" already included.
	97	//
	98	// Note: gcc in some platform already included "xmmintrin.h" for some reason.
	99	// I pick up macro _XMMINTRIN_H_INCLUDED here which based on the "xmmintrin.h"
	100	// comes with cygwin gcc 3.4.4, guess it should be solved duplicate definition
	101	// problem on gcc for x86.
	102	//
	103	#if !defined(_XMMINTRIN_H_INCLUDED)
	104
	// Stand-ins for the VC/ICC intrinsic types. Only what this file actually
	// uses is declared.

	typedef float __m128 __attribute__ ((mode(V4SF), aligned(16)));
	typedef int   __m64  __attribute__ ((mode(V2SI)));

	// Applied to every emulated intrinsic so that it is inlined even in a
	// debug build.
	#define __ALWAYS_INLINE     FORCEINLINE __attribute__ ((__always_inline__))
	112
	113	// Shuffle instruction must be declare as macro
	114
	// Builds the 8-bit selector used by _mm_shuffle_ps: two bits per lane,
	// with fp0 in the lowest two bits and fp3 in the highest two.
	#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
	    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
	117
	// shufps takes its selector as an immediate ("N" constraint), which is why
	// this is a macro built on a statement expression and not a function.
	// The temporary has a prefixed name so that a caller argument that happens
	// to be called `result` is not captured by the macro's own local.
	#define _mm_shuffle_ps(a, b, imm8) __extension__ \
	({ \
	    __m128 _ogre_shuffle_result; \
	    __asm__("shufps %3, %2, %0" : "=x" (_ogre_shuffle_result) : "0" (a), "xm" (b), "N" (imm8)); \
	    _ogre_shuffle_result; \
	})
	124
	125
	126	// Load/store instructions
	127
	128	#define __MM_DECL_LD(name, instruction, type) \
	129	static __ALWAYS_INLINE __m128 _mm_##name(const type *addr) \
	130	{ \
	131	__m128 result; \
	132	__asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr)); \
	133	return result; \
	134	}
	135
	136	#define __MM_DECL_LD2(name, instruction, type) \
	137	static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr) \
	138	{ \
	139	__m128 result; \
	140	__asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr)); \
	141	return result; \
	142	}
	143
	144	#define __MM_DECL_ST(name, instruction, type) \
	145	static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val) \
	146	{ \
	147	__asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val)); \
	148	}
	149
	150	__MM_DECL_LD(loadu_ps, movups, float)
	151	__MM_DECL_ST(storeu_ps, movups, float)
	152
	153	__MM_DECL_LD(load_ss, movss, float)
	154	__MM_DECL_ST(store_ss, movss, float)
	155
	156	__MM_DECL_ST(storel_pi, movlps, __m64)
	157	__MM_DECL_ST(storeh_pi, movhps, __m64)
	158	__MM_DECL_LD2(loadl_pi, movlps, __m64)
	159	__MM_DECL_LD2(loadh_pi, movhps, __m64)
	160
	161	#undef __MM_DECL_LD
	162	#undef __MM_DECL_LD2
	163	#undef __MM_DECL_ST
	164
	165	// Two operand instructions
	166
	167	#define __MM_DECL_OP2(name, instruction, constraint) \
	168	static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b) \
	169	{ \
	170	__m128 result; \
	171	__asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b)); \
	172	return result; \
	173	}
	174
	175	__MM_DECL_OP2(add_ps, addps, xm)
	176	__MM_DECL_OP2(add_ss, addss, xm)
	177	__MM_DECL_OP2(sub_ps, subps, xm)
	178	__MM_DECL_OP2(sub_ss, subss, xm)
	179	__MM_DECL_OP2(mul_ps, mulps, xm)
	180	__MM_DECL_OP2(mul_ss, mulss, xm)
	181
	182	__MM_DECL_OP2(xor_ps, xorps, xm)
	183
	184	__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
	185	__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)
	186
	187	__MM_DECL_OP2(movehl_ps, movhlps, x)
	188	__MM_DECL_OP2(movelh_ps, movlhps, x)
	189
	190	__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)
	191
	192	#undef __MM_DECL_OP2
	193
	194	// Other used instructions
	195
	196	static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
	197	{
	198	__m128 tmp = _mm_load_ss(addr);
	199	return _mm_shuffle_ps(tmp, tmp, 0);
	200	}
	201
	202	static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
	203	{
	204	__m128 result;
	205	__asm__("xorps %0, %0" : "=x" (result));
	206	return result;
	207	}
	208
	209	static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
	210	{
	211	__m128 result;
	212	__asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
	213	//__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
	214	return result;
	215	}
	216
	217	static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
	218	{
	219	int result;
	220	__asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
	221	return result;
	222	}
	223
	224	#endif // !defined(_XMMINTRIN_H_INCLUDED)
	225
	226	#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC
	227
	228	#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
	229
	230
	231
	232	//---------------------------------------------------------------------
	233	// SIMD macros and helpers
	234	//---------------------------------------------------------------------
	235
	236
	237	namespace Ogre {
	238
	239	#if __OGRE_HAVE_SSE
	240
	/** __MM_RSQRT_PS computes a reciprocal square root and is intended for
	    normalising normals only. It maps either to the SSE rsqrt instruction
	    directly or to the Newton-Raphson refined version implemented below;
	    the choice is meant to be made from profiling.
	@note
	    The Newton-Raphson version is preferably never used: speed tests
	    showed a 10% performance loss for the unrolled version and 25% for the
	    general version (P4 3.0G HT). A slight loss of precision is not that
	    important when normalising normals.
	*/
	#if 1
	#   define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
	#else
	#   define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
	#endif
	256
	/** Transposes a 4x4 matrix of single precision floating point values
	    in place.
	    Arguments r0, r1, r2 and r3 are __m128 lvalues whose elements form the
	    corresponding rows of the matrix. On exit r0 holds column 0 of the
	    original matrix, r1 holds column 1, and so on.
	@note
	    The macro expands to a braced block. Its temporaries carry an _ogre_
	    prefix so that caller variables named t0..t3 can be passed safely.
	*/
	#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3) \
	    { \
	        __m128 _ogre_t3, _ogre_t2, _ogre_t1, _ogre_t0; \
	        \
	        /* r00 r01 r02 r03 */ \
	        /* r10 r11 r12 r13 */ \
	        /* r20 r21 r22 r23 */ \
	        /* r30 r31 r32 r33 */ \
	        \
	        _ogre_t0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */ \
	        _ogre_t2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */ \
	        _ogre_t1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */ \
	        _ogre_t3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */ \
	        \
	        r0 = _mm_movelh_ps(_ogre_t0, _ogre_t1); /* r00 r10 r20 r30 */ \
	        r1 = _mm_movehl_ps(_ogre_t1, _ogre_t0); /* r01 r11 r21 r31 */ \
	        r2 = _mm_movelh_ps(_ogre_t2, _ogre_t3); /* r02 r12 r22 r32 */ \
	        r3 = _mm_movehl_ps(_ogre_t3, _ogre_t2); /* r03 r13 r23 r33 */ \
	    }
	284
	/** Transposes the contiguously stored rows of a 4x3 matrix into a 3x4
	    matrix of single precision floating point values, in place.
	    Arguments v0, v1 and v2 are __m128 lvalues that together hold the
	    twelve elements of the 4x3 matrix row after row. On exit v0 holds
	    column 0 of the original matrix, v1 column 1 and v2 column 2.
	@note
	    The macro expands to a braced block. Its temporaries carry an _ogre_
	    prefix so that caller variables named t0..t2 can be passed safely.
	*/
	#define __MM_TRANSPOSE4x3_PS(v0, v1, v2) \
	    { \
	        __m128 _ogre_t0, _ogre_t1, _ogre_t2; \
	        \
	        /* r00 r01 r02 r10 */ \
	        /* r11 r12 r20 r21 */ \
	        /* r22 r30 r31 r32 */ \
	        \
	        _ogre_t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0)); /* r00 r10 r22 r32 */ \
	        _ogre_t1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1)); /* r01 r02 r11 r12 */ \
	        _ogre_t2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2)); /* r20 r21 r30 r31 */ \
	        \
	        v0 = _mm_shuffle_ps(_ogre_t0, _ogre_t2, _MM_SHUFFLE(2,0,1,0)); /* r00 r10 r20 r30 */ \
	        v1 = _mm_shuffle_ps(_ogre_t1, _ogre_t2, _MM_SHUFFLE(3,1,2,0)); /* r01 r11 r21 r31 */ \
	        v2 = _mm_shuffle_ps(_ogre_t1, _ogre_t0, _MM_SHUFFLE(3,2,3,1)); /* r02 r12 r22 r32 */ \
	    }
	309
	/** Transposes a 3x4 matrix into the contiguously stored rows of a 4x3
	    matrix of single precision floating point values, in place.
	    Arguments v0, v1 and v2 are __m128 lvalues whose elements form the
	    corresponding columns of the source matrix. On exit v0, v1 and v2
	    together hold the twelve elements of the 4x3 matrix row after row.
	@note
	    The macro expands to a braced block. Its temporaries carry an _ogre_
	    prefix so that caller variables named t0..t2 can be passed safely.
	*/
	#define __MM_TRANSPOSE3x4_PS(v0, v1, v2) \
	    { \
	        __m128 _ogre_t0, _ogre_t1, _ogre_t2; \
	        \
	        /* r00 r10 r20 r30 */ \
	        /* r01 r11 r21 r31 */ \
	        /* r02 r12 r22 r32 */ \
	        \
	        _ogre_t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1)); /* r10 r30 r02 r22 */ \
	        _ogre_t1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1)); /* r11 r31 r12 r32 */ \
	        _ogre_t2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0)); /* r00 r20 r01 r21 */ \
	        \
	        v0 = _mm_shuffle_ps(_ogre_t2, _ogre_t0, _MM_SHUFFLE(0,2,2,0)); /* r00 r01 r02 r10 */ \
	        v1 = _mm_shuffle_ps(_ogre_t1, _ogre_t2, _MM_SHUFFLE(3,1,2,0)); /* r11 r12 r20 r21 */ \
	        v2 = _mm_shuffle_ps(_ogre_t0, _ogre_t1, _MM_SHUFFLE(3,1,1,3)); /* r22 r30 r31 r32 */ \
	    }
	333
	/** Broadcasts one element of a vector to all four lanes.
	    Argument 'fp' is a digit [0123] naming the element of 'v' to replicate.
	    Both arguments appear more than once in the expansion.
	*/
	#define __MM_SELECT(v, fp) \
	    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp), (fp), (fp), (fp)))
	339
	/// Sums four packed single precision vectors pairwise: (a + b) + (c + d).
	#define __MM_ACCUM4_PS(a, b, c, d) \
	    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

	/** Dot product of two sets of four packed single precision vectors:
	    a0*b0 + a1*b1 + a2*b2 + a3*b3, accumulated with __MM_ACCUM4_PS.
	*/
	#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \
	    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

	/** Dot product of four vectors with three vectors of packed single
	    precision values: r0*v0 + r1*v1 + r2*v2 + r3, where r3 is added
	    without being multiplied.
	*/
	#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \
	    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

	/// Sums three packed single precision vectors: (a + b) + c.
	#define __MM_ACCUM3_PS(a, b, c) \
	    _mm_add_ps(_mm_add_ps(a, b), c)

	/** Dot product of two sets of three packed single precision vectors:
	    r0*v0 + r1*v1 + r2*v2, accumulated with __MM_ACCUM3_PS.
	*/
	#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \
	    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))
	365
	/// Packed multiply-add: a * b + c on all four lanes.
	#define __MM_MADD_PS(a, b, c) \
	    _mm_add_ps(_mm_mul_ps(a, b), c)

	/// Packed linear interpolation: (b - a) * t + a. Note that 'a' is expanded twice.
	#define __MM_LERP_PS(t, a, b) \
	    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

	/// Scalar multiply-add built on the _ss intrinsics: a * b + c.
	#define __MM_MADD_SS(a, b, c) \
	    _mm_add_ss(_mm_mul_ss(a, b), c)

	/// Scalar linear interpolation: (b - a) * t + a. Note that 'a' is expanded twice.
	#define __MM_LERP_SS(t, a, b) \
	    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
	381
	/// Same as _mm_load_ps, but can help VC generate more optimised code.
	/// Reads the __m128 stored at 'p' by dereferencing the pointer directly.
	#define __MM_LOAD_PS(p) \
	    (*(__m128*)(p))

	/// Same as _mm_store_ps, but can help VC generate more optimised code.
	/// Assigns 'v' to the __m128 stored at 'p' by dereferencing the pointer directly.
	#define __MM_STORE_PS(p, v) \
	    (*(__m128*)(p) = (v))
	389
	390
	391	/** Helper to load/store SSE data based on whether or not aligned.
	392	*/
	393	template <bool aligned = false>
	394	struct SSEMemoryAccessor
	395	{
	396	static FORCEINLINE __m128 load(const float *p)
	397	{
	398	return _mm_loadu_ps(p);
	399	}
	400	static FORCEINLINE void store(float *p, const __m128& v)
	401	{
	402	_mm_storeu_ps(p, v);
	403	}
	404	};
	405	// Special aligned accessor
	406	template <>
	407	struct SSEMemoryAccessor<true>
	408	{
	409	static FORCEINLINE const __m128& load(const float *p)
	410	{
	411	return __MM_LOAD_PS(p);
	412	}
	413	static FORCEINLINE void store(float *p, const __m128& v)
	414	{
	415	__MM_STORE_PS(p, v);
	416	}
	417	};
	418
	419	/** Check whether or not the given pointer perfect aligned for SSE.
	420	*/
	421	static FORCEINLINE bool _isAlignedForSSE(const void *p)
	422	{
	423	return (((size_t)p) & 15) == 0;
	424	}
	425
	426	/** Calculate NewtonRaphson Reciprocal Square Root with formula:
	427	0.5 * rsqrt(x) * (3 - x * rsqrt(x)^2)
	428	*/
	429	static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
	430	{
	431	static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
	432	static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
	433	__m128 t = _mm_rsqrt_ps(x);
	434	return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
	435	_mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
	436	}
	437
	// Debug-only check that the stack is aligned for SSE: declares a local
	// __m128 and asserts that its address passes _isAlignedForSSE. Expands to
	// nothing when OGRE_DEBUG_MODE is off.
	#if OGRE_DEBUG_MODE
	#   define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()     \
	    {                                               \
	        __m128 test;                                \
	        assert(_isAlignedForSSE(&test));            \
	    }
	#else   // !OGRE_DEBUG_MODE
	#   define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()
	#endif  // OGRE_DEBUG_MODE
	450
	451
	452	#endif // __OGRE_HAVE_SSE
	453
	454	}
	455
	456	#endif // __SIMDHelper_H__

Note: See TracBrowser for help on using the repository browser.

Download in other formats: