| 1 | /******************************************************************** |
|---|
| 2 | * * |
|---|
| 3 | * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. * |
|---|
| 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
|---|
| 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
|---|
| 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
|---|
| 7 | * * |
|---|
| 8 | * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 * |
|---|
| 9 | * by the Xiph.Org Foundation http://www.xiph.org/ * |
|---|
| 10 | * * |
|---|
| 11 | ******************************************************************** |
|---|
| 12 | |
|---|
| 13 | function: build a VQ codebook and the encoding decision 'tree' |
|---|
| 14 | last mod: $Id: vqsplit.c 13293 2007-07-24 00:09:47Z xiphmont $ |
|---|
| 15 | |
|---|
| 16 | ********************************************************************/ |
|---|
| 17 | |
|---|
| 18 | /* This code is *not* part of libvorbis. It is used to generate |
|---|
| 19 | trained codebooks offline and then spit the results into a |
|---|
| 20 | pregenerated codebook that is compiled into libvorbis. It is an |
|---|
| 21 | expensive (but good) algorithm. Run it on big iron. */ |
|---|
| 22 | |
|---|
| 23 | /* There are so many optimizations to explore in *both* stages that |
|---|
| 24 | considering the undertaking is almost withering. For now, we brute |
|---|
| 25 | force it all */ |
|---|
| 26 | |
|---|
| 27 | #include <stdlib.h> |
|---|
| 28 | #include <stdio.h> |
|---|
| 29 | #include <math.h> |
|---|
| 30 | #include <string.h> |
|---|
| 31 | #include <sys/time.h> |
|---|
| 32 | |
|---|
| 33 | #include "vqgen.h" |
|---|
| 34 | #include "vqsplit.h" |
|---|
| 35 | #include "bookutil.h" |
|---|
| 36 | |
|---|
| 37 | /* Codebook generation happens in two steps: |
|---|
| 38 | |
|---|
| 39 | 1) Train the codebook with data collected from the encoder: We use |
|---|
| 40 | one of a few error metrics (which represent the distance between a |
|---|
| 41 | given data point and a candidate point in the training set) to |
|---|
| 42 | divide the training set up into cells representing roughly equal |
|---|
| 43 | probability of occurring. |
|---|
| 44 | |
|---|
| 45 | 2) Generate the codebook and auxiliary data from the trained data set |
|---|
| 46 | */ |
|---|
| 47 | |
|---|
| 48 | /* Building a codebook from trained set ********************************** |
|---|
| 49 | |
|---|
| 50 | The codebook in raw form is technically finished once it's trained. |
|---|
| 51 | However, we want to finalize the representative codebook values for |
|---|
| 52 | each entry and generate auxiliary information to optimize encoding. |
|---|
| 53 | We generate the auxiliary coding tree using collected data, |
|---|
| 54 | probably the same data as in the original training */ |
|---|
| 55 | |
|---|
| 56 | /* At each recursion, the data set is split in half. Cells with data |
|---|
| 57 | points on side A go into set A, same with set B. The sets may |
|---|
| 58 | overlap. If the cell overlaps the deviding line only very slightly |
|---|
| 59 | (provided parameter), we may choose to ignore the overlap in order |
|---|
| 60 | to pare the tree down */ |
|---|
| 61 | |
|---|
| 62 | long *isortvals; |
|---|
| 63 | int iascsort(const void *a,const void *b){ |
|---|
| 64 | long av=isortvals[*((long *)a)]; |
|---|
| 65 | long bv=isortvals[*((long *)b)]; |
|---|
| 66 | return(av-bv); |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | static float _Ndist(int el,float *a, float *b){ |
|---|
| 70 | int i; |
|---|
| 71 | float acc=0.f; |
|---|
| 72 | for(i=0;i<el;i++){ |
|---|
| 73 | float val=(a[i]-b[i]); |
|---|
| 74 | acc+=val*val; |
|---|
| 75 | } |
|---|
| 76 | return sqrt(acc); |
|---|
| 77 | } |
|---|
| 78 | |
|---|
| 79 | #define _Npoint(i) (pointlist+dim*(i)) |
|---|
| 80 | #define _Nnow(i) (entrylist+dim*(i)) |
|---|
| 81 | |
|---|
| 82 | |
|---|
| 83 | /* goes through the split, but just counts it and returns a metric*/ |
|---|
| 84 | int vqsp_count(float *entrylist,float *pointlist,int dim, |
|---|
| 85 | long *membership,long *reventry, |
|---|
| 86 | long *entryindex,long entries, |
|---|
| 87 | long *pointindex,long points,int splitp, |
|---|
| 88 | long *entryA,long *entryB, |
|---|
| 89 | long besti,long bestj, |
|---|
| 90 | long *entriesA,long *entriesB,long *entriesC){ |
|---|
| 91 | long i,j; |
|---|
| 92 | long A=0,B=0,C=0; |
|---|
| 93 | long pointsA=0; |
|---|
| 94 | long pointsB=0; |
|---|
| 95 | long *temppointsA=NULL; |
|---|
| 96 | long *temppointsB=NULL; |
|---|
| 97 | |
|---|
| 98 | if(splitp){ |
|---|
| 99 | temppointsA=_ogg_malloc(points*sizeof(long)); |
|---|
| 100 | temppointsB=_ogg_malloc(points*sizeof(long)); |
|---|
| 101 | } |
|---|
| 102 | |
|---|
| 103 | memset(entryA,0,sizeof(long)*entries); |
|---|
| 104 | memset(entryB,0,sizeof(long)*entries); |
|---|
| 105 | |
|---|
| 106 | /* Do the points belonging to this cell occur on sideA, sideB or |
|---|
| 107 | both? */ |
|---|
| 108 | |
|---|
| 109 | for(i=0;i<points;i++){ |
|---|
| 110 | float *ppt=_Npoint(pointindex[i]); |
|---|
| 111 | long firstentry=membership[pointindex[i]]; |
|---|
| 112 | |
|---|
| 113 | if(firstentry==besti){ |
|---|
| 114 | entryA[reventry[firstentry]]=1; |
|---|
| 115 | if(splitp)temppointsA[pointsA++]=pointindex[i]; |
|---|
| 116 | continue; |
|---|
| 117 | } |
|---|
| 118 | if(firstentry==bestj){ |
|---|
| 119 | entryB[reventry[firstentry]]=1; |
|---|
| 120 | if(splitp)temppointsB[pointsB++]=pointindex[i]; |
|---|
| 121 | continue; |
|---|
| 122 | } |
|---|
| 123 | { |
|---|
| 124 | float distA=_Ndist(dim,ppt,_Nnow(besti)); |
|---|
| 125 | float distB=_Ndist(dim,ppt,_Nnow(bestj)); |
|---|
| 126 | if(distA<distB){ |
|---|
| 127 | entryA[reventry[firstentry]]=1; |
|---|
| 128 | if(splitp)temppointsA[pointsA++]=pointindex[i]; |
|---|
| 129 | }else{ |
|---|
| 130 | entryB[reventry[firstentry]]=1; |
|---|
| 131 | if(splitp)temppointsB[pointsB++]=pointindex[i]; |
|---|
| 132 | } |
|---|
| 133 | } |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | /* The entry splitting isn't total, so that storage has to be |
|---|
| 137 | allocated for recursion. Reuse the entryA/entryB vectors */ |
|---|
| 138 | /* keep the entries in ascending order (relative to the original |
|---|
| 139 | list); we rely on that stability when ordering p/q choice */ |
|---|
| 140 | for(j=0;j<entries;j++){ |
|---|
| 141 | if(entryA[j] && entryB[j])C++; |
|---|
| 142 | if(entryA[j])entryA[A++]=entryindex[j]; |
|---|
| 143 | if(entryB[j])entryB[B++]=entryindex[j]; |
|---|
| 144 | } |
|---|
| 145 | *entriesA=A; |
|---|
| 146 | *entriesB=B; |
|---|
| 147 | *entriesC=C; |
|---|
| 148 | if(splitp){ |
|---|
| 149 | memcpy(pointindex,temppointsA,sizeof(long)*pointsA); |
|---|
| 150 | memcpy(pointindex+pointsA,temppointsB,sizeof(long)*pointsB); |
|---|
| 151 | free(temppointsA); |
|---|
| 152 | free(temppointsB); |
|---|
| 153 | } |
|---|
| 154 | return(pointsA); |
|---|
| 155 | } |
|---|
| 156 | |
|---|
| 157 | int lp_split(float *pointlist,long totalpoints, |
|---|
| 158 | codebook *b, |
|---|
| 159 | long *entryindex,long entries, |
|---|
| 160 | long *pointindex,long points, |
|---|
| 161 | long *membership,long *reventry, |
|---|
| 162 | long depth, long *pointsofar){ |
|---|
| 163 | |
|---|
| 164 | encode_aux_nearestmatch *t=b->c->nearest_tree; |
|---|
| 165 | |
|---|
| 166 | /* The encoder, regardless of book, will be using a straight |
|---|
| 167 | euclidian distance-to-point metric to determine closest point. |
|---|
| 168 | Thus we split the cells using the same (we've already trained the |
|---|
| 169 | codebook set spacing and distribution using special metrics and |
|---|
| 170 | even a midpoint division won't disturb the basic properties) */ |
|---|
| 171 | |
|---|
| 172 | int dim=b->dim; |
|---|
| 173 | float *entrylist=b->valuelist; |
|---|
| 174 | long ret; |
|---|
| 175 | long *entryA=_ogg_calloc(entries,sizeof(long)); |
|---|
| 176 | long *entryB=_ogg_calloc(entries,sizeof(long)); |
|---|
| 177 | long entriesA=0; |
|---|
| 178 | long entriesB=0; |
|---|
| 179 | long entriesC=0; |
|---|
| 180 | long pointsA=0; |
|---|
| 181 | long i,j,k; |
|---|
| 182 | |
|---|
| 183 | long besti=-1; |
|---|
| 184 | long bestj=-1; |
|---|
| 185 | |
|---|
| 186 | char spinbuf[80]; |
|---|
| 187 | sprintf(spinbuf,"splitting [%ld left]... ",totalpoints-*pointsofar); |
|---|
| 188 | |
|---|
| 189 | /* one reverse index needed */ |
|---|
| 190 | for(i=0;i<b->entries;i++)reventry[i]=-1; |
|---|
| 191 | for(i=0;i<entries;i++)reventry[entryindex[i]]=i; |
|---|
| 192 | |
|---|
| 193 | /* We need to find the dividing hyperplane. find the median of each |
|---|
| 194 | axis as the centerpoint and the normal facing farthest point */ |
|---|
| 195 | |
|---|
| 196 | /* more than one way to do this part. For small sets, we can brute |
|---|
| 197 | force it. */ |
|---|
| 198 | |
|---|
| 199 | if(entries<8 || (float)points*entries*entries<16.f*1024*1024){ |
|---|
| 200 | /* try every pair possibility */ |
|---|
| 201 | float best=0; |
|---|
| 202 | float this; |
|---|
| 203 | for(i=0;i<entries-1;i++){ |
|---|
| 204 | for(j=i+1;j<entries;j++){ |
|---|
| 205 | spinnit(spinbuf,entries-i); |
|---|
| 206 | vqsp_count(b->valuelist,pointlist,dim, |
|---|
| 207 | membership,reventry, |
|---|
| 208 | entryindex,entries, |
|---|
| 209 | pointindex,points,0, |
|---|
| 210 | entryA,entryB, |
|---|
| 211 | entryindex[i],entryindex[j], |
|---|
| 212 | &entriesA,&entriesB,&entriesC); |
|---|
| 213 | this=(entriesA-entriesC)*(entriesB-entriesC); |
|---|
| 214 | |
|---|
| 215 | /* when choosing best, we also want some form of stability to |
|---|
| 216 | make sure more branches are pared later; secondary |
|---|
| 217 | weighting isn;t needed as the entry lists are in ascending |
|---|
| 218 | order, and we always try p/q in the same sequence */ |
|---|
| 219 | |
|---|
| 220 | if( (besti==-1) || |
|---|
| 221 | (this>best) ){ |
|---|
| 222 | |
|---|
| 223 | best=this; |
|---|
| 224 | besti=entryindex[i]; |
|---|
| 225 | bestj=entryindex[j]; |
|---|
| 226 | |
|---|
| 227 | } |
|---|
| 228 | } |
|---|
| 229 | } |
|---|
| 230 | }else{ |
|---|
| 231 | float *p=alloca(dim*sizeof(float)); |
|---|
| 232 | float *q=alloca(dim*sizeof(float)); |
|---|
| 233 | float best=0.f; |
|---|
| 234 | |
|---|
| 235 | /* try COG/normal and furthest pairs */ |
|---|
| 236 | /* meanpoint */ |
|---|
| 237 | /* eventually, we want to select the closest entry and figure n/c |
|---|
| 238 | from p/q (because storing n/c is too large */ |
|---|
| 239 | for(k=0;k<dim;k++){ |
|---|
| 240 | spinnit(spinbuf,entries); |
|---|
| 241 | |
|---|
| 242 | p[k]=0.f; |
|---|
| 243 | for(j=0;j<entries;j++) |
|---|
| 244 | p[k]+=b->valuelist[entryindex[j]*dim+k]; |
|---|
| 245 | p[k]/=entries; |
|---|
| 246 | |
|---|
| 247 | } |
|---|
| 248 | |
|---|
| 249 | /* we go through the entries one by one, looking for the entry on |
|---|
| 250 | the other side closest to the point of reflection through the |
|---|
| 251 | center */ |
|---|
| 252 | |
|---|
| 253 | for(i=0;i<entries;i++){ |
|---|
| 254 | float *ppi=_Nnow(entryindex[i]); |
|---|
| 255 | float ref_best=0.f; |
|---|
| 256 | float ref_j=-1; |
|---|
| 257 | float this; |
|---|
| 258 | spinnit(spinbuf,entries-i); |
|---|
| 259 | |
|---|
| 260 | for(k=0;k<dim;k++) |
|---|
| 261 | q[k]=2*p[k]-ppi[k]; |
|---|
| 262 | |
|---|
| 263 | for(j=0;j<entries;j++){ |
|---|
| 264 | if(j!=i){ |
|---|
| 265 | float this=_Ndist(dim,q,_Nnow(entryindex[j])); |
|---|
| 266 | if(ref_j==-1 || this<=ref_best){ /* <=, not <; very important */ |
|---|
| 267 | ref_best=this; |
|---|
| 268 | ref_j=entryindex[j]; |
|---|
| 269 | } |
|---|
| 270 | } |
|---|
| 271 | } |
|---|
| 272 | |
|---|
| 273 | vqsp_count(b->valuelist,pointlist,dim, |
|---|
| 274 | membership,reventry, |
|---|
| 275 | entryindex,entries, |
|---|
| 276 | pointindex,points,0, |
|---|
| 277 | entryA,entryB, |
|---|
| 278 | entryindex[i],ref_j, |
|---|
| 279 | &entriesA,&entriesB,&entriesC); |
|---|
| 280 | this=(entriesA-entriesC)*(entriesB-entriesC); |
|---|
| 281 | |
|---|
| 282 | /* when choosing best, we also want some form of stability to |
|---|
| 283 | make sure more branches are pared later; secondary |
|---|
| 284 | weighting isn;t needed as the entry lists are in ascending |
|---|
| 285 | order, and we always try p/q in the same sequence */ |
|---|
| 286 | |
|---|
| 287 | if( (besti==-1) || |
|---|
| 288 | (this>best) ){ |
|---|
| 289 | |
|---|
| 290 | best=this; |
|---|
| 291 | besti=entryindex[i]; |
|---|
| 292 | bestj=ref_j; |
|---|
| 293 | |
|---|
| 294 | } |
|---|
| 295 | } |
|---|
| 296 | if(besti>bestj){ |
|---|
| 297 | long temp=besti; |
|---|
| 298 | besti=bestj; |
|---|
| 299 | bestj=temp; |
|---|
| 300 | } |
|---|
| 301 | |
|---|
| 302 | } |
|---|
| 303 | |
|---|
| 304 | /* find cells enclosing points */ |
|---|
| 305 | /* count A/B points */ |
|---|
| 306 | |
|---|
| 307 | pointsA=vqsp_count(b->valuelist,pointlist,dim, |
|---|
| 308 | membership,reventry, |
|---|
| 309 | entryindex,entries, |
|---|
| 310 | pointindex,points,1, |
|---|
| 311 | entryA,entryB, |
|---|
| 312 | besti,bestj, |
|---|
| 313 | &entriesA,&entriesB,&entriesC); |
|---|
| 314 | |
|---|
| 315 | /* fprintf(stderr,"split: total=%ld depth=%ld set A=%ld:%ld:%ld=B\n", |
|---|
| 316 | entries,depth,entriesA-entriesC,entriesC,entriesB-entriesC);*/ |
|---|
| 317 | { |
|---|
| 318 | long thisaux=t->aux++; |
|---|
| 319 | if(t->aux>=t->alloc){ |
|---|
| 320 | t->alloc*=2; |
|---|
| 321 | t->ptr0=_ogg_realloc(t->ptr0,sizeof(long)*t->alloc); |
|---|
| 322 | t->ptr1=_ogg_realloc(t->ptr1,sizeof(long)*t->alloc); |
|---|
| 323 | t->p=_ogg_realloc(t->p,sizeof(long)*t->alloc); |
|---|
| 324 | t->q=_ogg_realloc(t->q,sizeof(long)*t->alloc); |
|---|
| 325 | } |
|---|
| 326 | |
|---|
| 327 | t->p[thisaux]=besti; |
|---|
| 328 | t->q[thisaux]=bestj; |
|---|
| 329 | |
|---|
| 330 | if(entriesA==1){ |
|---|
| 331 | ret=1; |
|---|
| 332 | t->ptr0[thisaux]=entryA[0]; |
|---|
| 333 | *pointsofar+=pointsA; |
|---|
| 334 | }else{ |
|---|
| 335 | t->ptr0[thisaux]= -t->aux; |
|---|
| 336 | ret=lp_split(pointlist,totalpoints,b,entryA,entriesA,pointindex,pointsA, |
|---|
| 337 | membership,reventry,depth+1,pointsofar); |
|---|
| 338 | } |
|---|
| 339 | if(entriesB==1){ |
|---|
| 340 | ret++; |
|---|
| 341 | t->ptr1[thisaux]=entryB[0]; |
|---|
| 342 | *pointsofar+=points-pointsA; |
|---|
| 343 | }else{ |
|---|
| 344 | t->ptr1[thisaux]= -t->aux; |
|---|
| 345 | ret+=lp_split(pointlist,totalpoints,b,entryB,entriesB,pointindex+pointsA, |
|---|
| 346 | points-pointsA,membership,reventry, |
|---|
| 347 | depth+1,pointsofar); |
|---|
| 348 | } |
|---|
| 349 | } |
|---|
| 350 | free(entryA); |
|---|
| 351 | free(entryB); |
|---|
| 352 | return(ret); |
|---|
| 353 | } |
|---|
| 354 | |
|---|
| 355 | static int _node_eq(encode_aux_nearestmatch *v, long a, long b){ |
|---|
| 356 | long Aptr0=v->ptr0[a]; |
|---|
| 357 | long Aptr1=v->ptr1[a]; |
|---|
| 358 | long Bptr0=v->ptr0[b]; |
|---|
| 359 | long Bptr1=v->ptr1[b]; |
|---|
| 360 | |
|---|
| 361 | /* the possibility of choosing the same p and q, but switched, can;t |
|---|
| 362 | happen because we always look for the best p/q in the same search |
|---|
| 363 | order and the search is stable */ |
|---|
| 364 | |
|---|
| 365 | if(Aptr0==Bptr0 && Aptr1==Bptr1) |
|---|
| 366 | return(1); |
|---|
| 367 | |
|---|
| 368 | return(0); |
|---|
| 369 | } |
|---|
| 370 | |
|---|
| 371 | void vqsp_book(vqgen *v, codebook *b, long *quantlist){ |
|---|
| 372 | long i,j; |
|---|
| 373 | static_codebook *c=(static_codebook *)b->c; |
|---|
| 374 | encode_aux_nearestmatch *t; |
|---|
| 375 | |
|---|
| 376 | memset(b,0,sizeof(codebook)); |
|---|
| 377 | memset(c,0,sizeof(static_codebook)); |
|---|
| 378 | b->c=c; |
|---|
| 379 | t=c->nearest_tree=_ogg_calloc(1,sizeof(encode_aux_nearestmatch)); |
|---|
| 380 | c->maptype=2; |
|---|
| 381 | |
|---|
| 382 | /* make sure there are no duplicate entries and that every |
|---|
| 383 | entry has points */ |
|---|
| 384 | |
|---|
| 385 | for(i=0;i<v->entries;){ |
|---|
| 386 | /* duplicate? if so, eliminate */ |
|---|
| 387 | for(j=0;j<i;j++){ |
|---|
| 388 | if(_Ndist(v->elements,_now(v,i),_now(v,j))==0.f){ |
|---|
| 389 | fprintf(stderr,"found a duplicate entry! removing...\n"); |
|---|
| 390 | v->entries--; |
|---|
| 391 | memcpy(_now(v,i),_now(v,v->entries),sizeof(float)*v->elements); |
|---|
| 392 | memcpy(quantlist+i*v->elements,quantlist+v->entries*v->elements, |
|---|
| 393 | sizeof(long)*v->elements); |
|---|
| 394 | break; |
|---|
| 395 | } |
|---|
| 396 | } |
|---|
| 397 | if(j==i)i++; |
|---|
| 398 | } |
|---|
| 399 | |
|---|
| 400 | { |
|---|
| 401 | v->assigned=_ogg_calloc(v->entries,sizeof(long)); |
|---|
| 402 | for(i=0;i<v->points;i++){ |
|---|
| 403 | float *ppt=_point(v,i); |
|---|
| 404 | float firstmetric=_Ndist(v->elements,_now(v,0),ppt); |
|---|
| 405 | long firstentry=0; |
|---|
| 406 | |
|---|
| 407 | if(!(i&0xff))spinnit("checking... ",v->points-i); |
|---|
| 408 | |
|---|
| 409 | for(j=0;j<v->entries;j++){ |
|---|
| 410 | float thismetric=_Ndist(v->elements,_now(v,j),ppt); |
|---|
| 411 | if(thismetric<firstmetric){ |
|---|
| 412 | firstmetric=thismetric; |
|---|
| 413 | firstentry=j; |
|---|
| 414 | } |
|---|
| 415 | } |
|---|
| 416 | |
|---|
| 417 | v->assigned[firstentry]++; |
|---|
| 418 | } |
|---|
| 419 | |
|---|
| 420 | for(j=0;j<v->entries;){ |
|---|
| 421 | if(v->assigned[j]==0){ |
|---|
| 422 | fprintf(stderr,"found an unused entry! removing...\n"); |
|---|
| 423 | v->entries--; |
|---|
| 424 | memcpy(_now(v,j),_now(v,v->entries),sizeof(float)*v->elements); |
|---|
| 425 | v->assigned[j]=v->assigned[v->elements]; |
|---|
| 426 | memcpy(quantlist+j*v->elements,quantlist+v->entries*v->elements, |
|---|
| 427 | sizeof(long)*v->elements); |
|---|
| 428 | continue; |
|---|
| 429 | } |
|---|
| 430 | j++; |
|---|
| 431 | } |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | fprintf(stderr,"Building a book with %ld unique entries...\n",v->entries); |
|---|
| 435 | |
|---|
| 436 | { |
|---|
| 437 | long *entryindex=_ogg_malloc(v->entries*sizeof(long *)); |
|---|
| 438 | long *pointindex=_ogg_malloc(v->points*sizeof(long)); |
|---|
| 439 | long *membership=_ogg_malloc(v->points*sizeof(long)); |
|---|
| 440 | long *reventry=_ogg_malloc(v->entries*sizeof(long)); |
|---|
| 441 | long pointssofar=0; |
|---|
| 442 | |
|---|
| 443 | for(i=0;i<v->entries;i++)entryindex[i]=i; |
|---|
| 444 | for(i=0;i<v->points;i++)pointindex[i]=i; |
|---|
| 445 | |
|---|
| 446 | t->alloc=4096; |
|---|
| 447 | t->ptr0=_ogg_malloc(sizeof(long)*t->alloc); |
|---|
| 448 | t->ptr1=_ogg_malloc(sizeof(long)*t->alloc); |
|---|
| 449 | t->p=_ogg_malloc(sizeof(long)*t->alloc); |
|---|
| 450 | t->q=_ogg_malloc(sizeof(long)*t->alloc); |
|---|
| 451 | t->aux=0; |
|---|
| 452 | c->dim=v->elements; |
|---|
| 453 | c->entries=v->entries; |
|---|
| 454 | c->lengthlist=_ogg_calloc(c->entries,sizeof(long)); |
|---|
| 455 | b->valuelist=v->entrylist; /* temporary; replaced later */ |
|---|
| 456 | b->dim=c->dim; |
|---|
| 457 | b->entries=c->entries; |
|---|
| 458 | |
|---|
| 459 | for(i=0;i<v->points;i++)membership[i]=-1; |
|---|
| 460 | for(i=0;i<v->points;i++){ |
|---|
| 461 | float *ppt=_point(v,i); |
|---|
| 462 | long firstentry=0; |
|---|
| 463 | float firstmetric=_Ndist(v->elements,_now(v,0),ppt); |
|---|
| 464 | |
|---|
| 465 | if(!(i&0xff))spinnit("assigning... ",v->points-i); |
|---|
| 466 | |
|---|
| 467 | for(j=1;j<v->entries;j++){ |
|---|
| 468 | if(v->assigned[j]!=-1){ |
|---|
| 469 | float thismetric=_Ndist(v->elements,_now(v,j),ppt); |
|---|
| 470 | if(thismetric<=firstmetric){ |
|---|
| 471 | firstmetric=thismetric; |
|---|
| 472 | firstentry=j; |
|---|
| 473 | } |
|---|
| 474 | } |
|---|
| 475 | } |
|---|
| 476 | |
|---|
| 477 | membership[i]=firstentry; |
|---|
| 478 | } |
|---|
| 479 | |
|---|
| 480 | fprintf(stderr,"Leaves added: %d \n", |
|---|
| 481 | lp_split(v->pointlist,v->points, |
|---|
| 482 | b,entryindex,v->entries, |
|---|
| 483 | pointindex,v->points, |
|---|
| 484 | membership,reventry, |
|---|
| 485 | 0,&pointssofar)); |
|---|
| 486 | |
|---|
| 487 | free(pointindex); |
|---|
| 488 | free(membership); |
|---|
| 489 | free(reventry); |
|---|
| 490 | |
|---|
| 491 | fprintf(stderr,"Paring/rerouting redundant branches... "); |
|---|
| 492 | |
|---|
| 493 | /* The tree is likely big and redundant. Pare and reroute branches */ |
|---|
| 494 | { |
|---|
| 495 | int changedflag=1; |
|---|
| 496 | |
|---|
| 497 | while(changedflag){ |
|---|
| 498 | changedflag=0; |
|---|
| 499 | |
|---|
| 500 | /* span the tree node by node; list unique decision nodes and |
|---|
| 501 | short circuit redundant branches */ |
|---|
| 502 | |
|---|
| 503 | for(i=0;i<t->aux;){ |
|---|
| 504 | int k; |
|---|
| 505 | |
|---|
| 506 | /* check list of unique decisions */ |
|---|
| 507 | for(j=0;j<i;j++) |
|---|
| 508 | if(_node_eq(t,i,j))break; |
|---|
| 509 | |
|---|
| 510 | if(j<i){ |
|---|
| 511 | /* a redundant entry; find all higher nodes referencing it and |
|---|
| 512 | short circuit them to the previously noted unique entry */ |
|---|
| 513 | changedflag=1; |
|---|
| 514 | for(k=0;k<t->aux;k++){ |
|---|
| 515 | if(t->ptr0[k]==-i)t->ptr0[k]=-j; |
|---|
| 516 | if(t->ptr1[k]==-i)t->ptr1[k]=-j; |
|---|
| 517 | } |
|---|
| 518 | |
|---|
| 519 | /* Now, we need to fill in the hole from this redundant |
|---|
| 520 | entry in the listing. Insert the last entry in the list. |
|---|
| 521 | Fix the forward pointers to that last entry */ |
|---|
| 522 | t->aux--; |
|---|
| 523 | t->ptr0[i]=t->ptr0[t->aux]; |
|---|
| 524 | t->ptr1[i]=t->ptr1[t->aux]; |
|---|
| 525 | t->p[i]=t->p[t->aux]; |
|---|
| 526 | t->q[i]=t->q[t->aux]; |
|---|
| 527 | for(k=0;k<t->aux;k++){ |
|---|
| 528 | if(t->ptr0[k]==-t->aux)t->ptr0[k]=-i; |
|---|
| 529 | if(t->ptr1[k]==-t->aux)t->ptr1[k]=-i; |
|---|
| 530 | } |
|---|
| 531 | /* hole plugged */ |
|---|
| 532 | |
|---|
| 533 | }else |
|---|
| 534 | i++; |
|---|
| 535 | } |
|---|
| 536 | |
|---|
| 537 | fprintf(stderr,"\rParing/rerouting redundant branches... " |
|---|
| 538 | "%ld remaining ",t->aux); |
|---|
| 539 | } |
|---|
| 540 | fprintf(stderr,"\n"); |
|---|
| 541 | } |
|---|
| 542 | } |
|---|
| 543 | |
|---|
| 544 | /* run all training points through the decision tree to get a final |
|---|
| 545 | probability count */ |
|---|
| 546 | { |
|---|
| 547 | long *probability=_ogg_malloc(c->entries*sizeof(long)); |
|---|
| 548 | for(i=0;i<c->entries;i++)probability[i]=1; /* trivial guard */ |
|---|
| 549 | b->dim=c->dim; |
|---|
| 550 | |
|---|
| 551 | /* sigh. A necessary hack */ |
|---|
| 552 | for(i=0;i<t->aux;i++)t->p[i]*=c->dim; |
|---|
| 553 | for(i=0;i<t->aux;i++)t->q[i]*=c->dim; |
|---|
| 554 | |
|---|
| 555 | for(i=0;i<v->points;i++){ |
|---|
| 556 | /* we use the linear matcher regardless becuase the trainer |
|---|
| 557 | doesn't convert log to linear */ |
|---|
| 558 | int ret=_best(b,v->pointlist+i*v->elements,1); |
|---|
| 559 | probability[ret]++; |
|---|
| 560 | if(!(i&0xff))spinnit("counting hits... ",v->points-i); |
|---|
| 561 | } |
|---|
| 562 | for(i=0;i<t->aux;i++)t->p[i]/=c->dim; |
|---|
| 563 | for(i=0;i<t->aux;i++)t->q[i]/=c->dim; |
|---|
| 564 | |
|---|
| 565 | build_tree_from_lengths(c->entries,probability,c->lengthlist); |
|---|
| 566 | |
|---|
| 567 | free(probability); |
|---|
| 568 | } |
|---|
| 569 | |
|---|
| 570 | /* Sort the entries by codeword length, short to long (eases |
|---|
| 571 | assignment and packing to do it now) */ |
|---|
| 572 | { |
|---|
| 573 | long *wordlen=c->lengthlist; |
|---|
| 574 | long *index=_ogg_malloc(c->entries*sizeof(long)); |
|---|
| 575 | long *revindex=_ogg_malloc(c->entries*sizeof(long)); |
|---|
| 576 | int k; |
|---|
| 577 | for(i=0;i<c->entries;i++)index[i]=i; |
|---|
| 578 | isortvals=c->lengthlist; |
|---|
| 579 | qsort(index,c->entries,sizeof(long),iascsort); |
|---|
| 580 | |
|---|
| 581 | /* rearrange storage; ptr0/1 first as it needs a reverse index */ |
|---|
| 582 | /* n and c stay unchanged */ |
|---|
| 583 | for(i=0;i<c->entries;i++)revindex[index[i]]=i; |
|---|
| 584 | for(i=0;i<t->aux;i++){ |
|---|
| 585 | if(!(i&0x3f))spinnit("sorting... ",t->aux-i); |
|---|
| 586 | |
|---|
| 587 | if(t->ptr0[i]>=0)t->ptr0[i]=revindex[t->ptr0[i]]; |
|---|
| 588 | if(t->ptr1[i]>=0)t->ptr1[i]=revindex[t->ptr1[i]]; |
|---|
| 589 | t->p[i]=revindex[t->p[i]]; |
|---|
| 590 | t->q[i]=revindex[t->q[i]]; |
|---|
| 591 | } |
|---|
| 592 | free(revindex); |
|---|
| 593 | |
|---|
| 594 | /* map lengthlist and vallist with index */ |
|---|
| 595 | c->lengthlist=_ogg_calloc(c->entries,sizeof(long)); |
|---|
| 596 | b->valuelist=_ogg_malloc(sizeof(float)*c->entries*c->dim); |
|---|
| 597 | c->quantlist=_ogg_malloc(sizeof(long)*c->entries*c->dim); |
|---|
| 598 | for(i=0;i<c->entries;i++){ |
|---|
| 599 | long e=index[i]; |
|---|
| 600 | for(k=0;k<c->dim;k++){ |
|---|
| 601 | b->valuelist[i*c->dim+k]=v->entrylist[e*c->dim+k]; |
|---|
| 602 | c->quantlist[i*c->dim+k]=quantlist[e*c->dim+k]; |
|---|
| 603 | } |
|---|
| 604 | c->lengthlist[i]=wordlen[e]; |
|---|
| 605 | } |
|---|
| 606 | |
|---|
| 607 | free(wordlen); |
|---|
| 608 | } |
|---|
| 609 | |
|---|
| 610 | fprintf(stderr,"Done. \n\n"); |
|---|
| 611 | } |
|---|
| 612 | |
|---|