Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: code/branches/ode/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp @ 216

Last change on this file since 216 was 216, checked in by mathiask, 16 years ago
[Physik] add ode-0.9
File size: 21.5 KB

Line
1	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2	/**
3	* Contains source code from the article "Radix Sort Revisited".
4	* \file IceRevisitedRadix.cpp
5	* \author Pierre Terdiman
6	* \date April, 4, 2000
7	*/
8	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
9
10	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11	/**
12	* Revisited Radix Sort.
13	* This is my new radix routine:
14	* - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15	* - it creates all the histograms in one run instead of four
16	* - it sorts words faster than dwords and bytes faster than words
17	* - it correctly sorts negative floating-point values by patching the offsets
18	* - it automatically takes advantage of temporal coherence
19	* - multiple keys support is a side effect of temporal coherence
20	* - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
21	*
22	* History:
23	* - 08.15.98: very first version
24	* - 04.04.00: recoded for the radix article
25	* - 12.xx.00: code lifting
26	* - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27	* - 10.11.01: added local ram support
28	* - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29	* - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
30	* - ranks are not "reset" anymore, but implicit on first calls
31	* - 07.05.02: - offsets rewritten with one less indirection.
32	* - 11.03.02: - "bool" replaced with RadixHint enum
33	*
34	* \class RadixSort
35	* \author Pierre Terdiman
36	* \version 1.4
37	* \date August, 15, 1998
38	*/
39	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
40
41	/*
42	To do:
43	- add an offset parameter between two input values (avoid some data recopy sometimes)
44	- unroll ? asm ?
45	- 11 bits trick & 3 passes as Michael did
46	- prefetch stuff the day I have a P3
47	- make a version with 16-bits indices ?
48	*/
49
50	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
51	// Precompiled Header
52	#include "Stdafx.h"
53
54	using namespace IceCore;
55
56	#define INVALIDATE_RANKS mCurrentSize\|=0x80000000
57	#define VALIDATE_RANKS mCurrentSize&=0x7fffffff
58	#define CURRENT_SIZE (mCurrentSize&0x7fffffff)
59	#define INVALID_RANKS (mCurrentSize&0x80000000)
60
61	#define CHECK_RESIZE(n) \
62	if(n!=mPreviousSize) \
63	{ \
64	if(n>mCurrentSize) Resize(n); \
65	else ResetRanks(); \
66	mPreviousSize = n; \
67	}
68
69	#define CREATE_HISTOGRAMS(type, buffer) \
70	/* Clear counters/histograms */ \
71	ZeroMemory(mHistogram, 2564sizeof(udword)); \
72	\
73	/* Prepare to count */ \
74	ubyte* p = (ubyte*)input; \
75	ubyte* pe = &p[nb*4]; \
76	udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
77	udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
78	udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
79	udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
80	\
81	bool AlreadySorted = true; /* Optimism... */ \
82	\
83	if(INVALID_RANKS) \
84	{ \
85	/* Prepare for temporal coherence */ \
86	type* Running = (type*)buffer; \
87	type PrevVal = *Running; \
88	\
89	while(p!=pe) \
90	{ \
91	/* Read input buffer in previous sorted order */ \
92	type Val = *Running++; \
93	/* Check whether already sorted or not */ \
94	if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
95	/* Update for next iteration */ \
96	PrevVal = Val; \
97	\
98	/* Create histograms */ \
99	h0[p++]++; h1[p++]++; h2[p++]++; h3[p++]++; \
100	} \
101	\
102	/* If all input values are already sorted, we just have to return and leave the */ \
103	/* previous list unchanged. That way the routine may take advantage of temporal */ \
104	/* coherence, for example when used to sort transparent faces. */ \
105	if(AlreadySorted) \
106	{ \
107	mNbHits++; \
108	for(udword i=0;i<nb;i++) mRanks[i] = i; \
109	return *this; \
110	} \
111	} \
112	else \
113	{ \
114	/* Prepare for temporal coherence */ \
115	udword* Indices = mRanks; \
116	type PrevVal = (type)buffer[*Indices]; \
117	\
118	while(p!=pe) \
119	{ \
120	/* Read input buffer in previous sorted order */ \
121	type Val = (type)buffer[*Indices++]; \
122	/* Check whether already sorted or not */ \
123	if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
124	/* Update for next iteration */ \
125	PrevVal = Val; \
126	\
127	/* Create histograms */ \
128	h0[p++]++; h1[p++]++; h2[p++]++; h3[p++]++; \
129	} \
130	\
131	/* If all input values are already sorted, we just have to return and leave the */ \
132	/* previous list unchanged. That way the routine may take advantage of temporal */ \
133	/* coherence, for example when used to sort transparent faces. */ \
134	if(AlreadySorted) { mNbHits++; return *this; } \
135	} \
136	\
137	/* Else there has been an early out and we must finish computing the histograms */ \
138	while(p!=pe) \
139	{ \
140	/* Create histograms without the previous overhead */ \
141	h0[p++]++; h1[p++]++; h2[p++]++; h3[p++]++; \
142	}
143
144	#define CHECK_PASS_VALIDITY(pass) \
145	/* Shortcut to current counters */ \
146	udword* CurCount = &mHistogram[pass<<8]; \
147	\
148	/* Reset flag. The sorting pass is supposed to be performed. (default) */ \
149	bool PerformPass = true; \
150	\
151	/* Check pass validity */ \
152	\
153	/* If all values have the same byte, sorting is useless. */ \
154	/* It may happen when sorting bytes or words instead of dwords. */ \
155	/* This routine actually sorts words faster than dwords, and bytes */ \
156	/* faster than words. Standard running time (O(4n))is reduced to O(2n) */ \
157	/* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
158	\
159	/* Get first byte */ \
160	ubyte UniqueVal = (((ubyte)input)+pass); \
161	\
162	/* Check that byte's counter */ \
163	if(CurCount[UniqueVal]==nb) PerformPass=false;
164
165	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
166	/**
167	* Constructor.
168	*/
169	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
170	RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
171	{
172	#ifndef RADIX_LOCAL_RAM
173	// Allocate input-independent ram
174	mHistogram = new udword[256*4];
175	mOffset = new udword[256];
176	#endif
177	// Initialize indices
178	INVALIDATE_RANKS;
179	}
180
181	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
182	/**
183	* Destructor.
184	*/
185	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
186	RadixSort::~RadixSort()
187	{
188	// Release everything
189	#ifndef RADIX_LOCAL_RAM
190	DELETEARRAY(mOffset);
191	DELETEARRAY(mHistogram);
192	#endif
193	DELETEARRAY(mRanks2);
194	DELETEARRAY(mRanks);
195	}
196
197	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
198	/**
199	* Resizes the inner lists.
200	* \param nb [in] new size (number of dwords)
201	* \return true if success
202	*/
203	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
204	bool RadixSort::Resize(udword nb)
205	{
206	// Free previously used ram
207	DELETEARRAY(mRanks2);
208	DELETEARRAY(mRanks);
209
210	// Get some fresh one
211	mRanks = new udword[nb]; CHECKALLOC(mRanks);
212	mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
213
214	return true;
215	}
216
217	inline_ void RadixSort::CheckResize(udword nb)
218	{
219	udword CurSize = CURRENT_SIZE;
220	if(nb!=CurSize)
221	{
222	if(nb>CurSize) Resize(nb);
223	mCurrentSize = nb;
224	INVALIDATE_RANKS;
225	}
226	}
227
228	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
229	/**
230	* Main sort routine.
231	* This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
232	* \param input [in] a list of integer values to sort
233	* \param nb [in] number of values to sort, must be < 2^31
234	* \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
235	* \return Self-Reference
236	*/
237	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
238	RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
239	{
240	// Checkings
241	if(!input \|\| !nb \|\| nb&0x80000000) return *this;
242
243	// Stats
244	mTotalCalls++;
245
246	// Resize lists if needed
247	CheckResize(nb);
248
249	#ifdef RADIX_LOCAL_RAM
250	// Allocate histograms & offsets on the stack
251	udword mHistogram[256*4];
252	// udword mOffset[256];
253	udword* mLink[256];
254	#endif
255
256	// Create histograms (counters). Counters for all passes are created in one run.
257	// Pros: read input buffer once instead of four times
258	// Cons: mHistogram is 4Kb instead of 1Kb
259	// We must take care of signed/unsigned values for temporal coherence.... I just
260	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
261	if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
262	else { CREATE_HISTOGRAMS(sdword, input); }
263
264	// Compute #negative values involved if needed
265	udword NbNegativeValues = 0;
266	if(hint==RADIX_SIGNED)
267	{
268	// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
269	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
270	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
271	udword* h3= &mHistogram[768];
272	for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
273	}
274
275	// Radix sort, j is the pass number (0=LSB, 3=MSB)
276	for(udword j=0;j<4;j++)
277	{
278	CHECK_PASS_VALIDITY(j);
279
280	// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
281	// not a problem, numbers are correctly sorted anyway.
282	if(PerformPass)
283	{
284	// Should we care about negative values?
285	if(j!=3 \|\| hint==RADIX_UNSIGNED)
286	{
287	// Here we deal with positive values only
288
289	// Create offsets
290	// mOffset[0] = 0;
291	// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
292	mLink[0] = mRanks2;
293	for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
294	}
295	else
296	{
297	// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
298
299	// Create biased offsets, in order for negative numbers to be sorted as well
300	// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
301	mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
302	// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
303	for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
304
305	// Fixing the wrong place for negative values
306	// mOffset[128] = 0;
307	mLink[128] = mRanks2;
308	// for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
309	for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
310	}
311
312	// Perform Radix Sort
313	ubyte* InputBytes = (ubyte*)input;
314	InputBytes += j;
315	if(INVALID_RANKS)
316	{
317	// for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
318	for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
319	VALIDATE_RANKS;
320	}
321	else
322	{
323	udword* Indices = mRanks;
324	udword* IndicesEnd = &mRanks[nb];
325	while(Indices!=IndicesEnd)
326	{
327	udword id = *Indices++;
328	// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
329	*mLink[InputBytes[id<<2]]++ = id;
330	}
331	}
332
333	// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
334	udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
335	}
336	}
337	return *this;
338	}
339
340	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
341	/**
342	* Main sort routine.
343	* This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
344	* \param input [in] a list of floating-point values to sort
345	* \param nb [in] number of values to sort, must be < 2^31
346	* \return Self-Reference
347	* \warning only sorts IEEE floating-point values
348	*/
349	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
350	RadixSort& RadixSort::Sort(const float* input2, udword nb)
351	{
352	// Checkings
353	if(!input2 \|\| !nb \|\| nb&0x80000000) return *this;
354
355	// Stats
356	mTotalCalls++;
357
358	udword* input = (udword*)input2;
359
360	// Resize lists if needed
361	CheckResize(nb);
362
363	#ifdef RADIX_LOCAL_RAM
364	// Allocate histograms & offsets on the stack
365	udword mHistogram[256*4];
366	// udword mOffset[256];
367	udword* mLink[256];
368	#endif
369
370	// Create histograms (counters). Counters for all passes are created in one run.
371	// Pros: read input buffer once instead of four times
372	// Cons: mHistogram is 4Kb instead of 1Kb
373	// Floating-point values are always supposed to be signed values, so there's only one code path there.
374	// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
375	// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
376	// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
377	// wouldn't work with mixed positive/negative values....
378	{ CREATE_HISTOGRAMS(float, input2); }
379
380	// Compute #negative values involved if needed
381	udword NbNegativeValues = 0;
382	// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
383	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
384	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
385	udword* h3= &mHistogram[768];
386	for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
387
388	// Radix sort, j is the pass number (0=LSB, 3=MSB)
389	for(udword j=0;j<4;j++)
390	{
391	// Should we care about negative values?
392	if(j!=3)
393	{
394	// Here we deal with positive values only
395	CHECK_PASS_VALIDITY(j);
396
397	if(PerformPass)
398	{
399	// Create offsets
400	// mOffset[0] = 0;
401	mLink[0] = mRanks2;
402	// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
403	for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
404
405	// Perform Radix Sort
406	ubyte* InputBytes = (ubyte*)input;
407	InputBytes += j;
408	if(INVALID_RANKS)
409	{
410	// for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
411	for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
412	VALIDATE_RANKS;
413	}
414	else
415	{
416	udword* Indices = mRanks;
417	udword* IndicesEnd = &mRanks[nb];
418	while(Indices!=IndicesEnd)
419	{
420	udword id = *Indices++;
421	// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
422	*mLink[InputBytes[id<<2]]++ = id;
423	}
424	}
425
426	// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
427	udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
428	}
429	}
430	else
431	{
432	// This is a special case to correctly handle negative values
433	CHECK_PASS_VALIDITY(j);
434
435	if(PerformPass)
436	{
437	// Create biased offsets, in order for negative numbers to be sorted as well
438	// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
439	mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
440	// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
441	for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
442
443	// We must reverse the sorting order for negative numbers!
444	// mOffset[255] = 0;
445	mLink[255] = mRanks2;
446	// for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
447	for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
448	// for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
449	for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
450
451	// Perform Radix Sort
452	if(INVALID_RANKS)
453	{
454	for(udword i=0;i<nb;i++)
455	{
456	udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
457	// ### cmp to be killed. Not good. Later.
458	// if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
459	// else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
460	if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
461	else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
462	}
463	VALIDATE_RANKS;
464	}
465	else
466	{
467	for(udword i=0;i<nb;i++)
468	{
469	udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
470	// ### cmp to be killed. Not good. Later.
471	// if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
472	// else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
473	if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
474	else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
475	}
476	}
477	// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
478	udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
479	}
480	else
481	{
482	// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
483	if(UniqueVal>=128)
484	{
485	if(INVALID_RANKS)
486	{
487	// ###Possible?
488	for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
489	VALIDATE_RANKS;
490	}
491	else
492	{
493	for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
494	}
495
496	// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
497	udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
498	}
499	}
500	}
501	}
502	return *this;
503	}
504
505	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
506	/**
507	* Gets the ram used.
508	* \return memory used in bytes
509	*/
510	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
511	udword RadixSort::GetUsedRam() const
512	{
513	udword UsedRam = sizeof(RadixSort);
514	#ifndef RADIX_LOCAL_RAM
515	UsedRam += 2564sizeof(udword); // Histograms
516	UsedRam += 256*sizeof(udword); // Offsets
517	#endif
518	UsedRam += 2CURRENT_SIZEsizeof(udword); // 2 lists of indices
519	return UsedRam;
520	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: