Planet
navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/libvorbis-1.2.0/vq/latticepare.c @ 16

Last change on this file since 16 was 16, checked in by landauf, 16 years ago

added libvorbis

File size: 16.6 KB
Line 
1/********************************************************************
2 *                                                                  *
3 * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7 *                                                                  *
8 * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001             *
9 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10 *                                                                  *
11 ********************************************************************
12
13 function: utility for paring low hit count cells from lattice codebook
14 last mod: $Id: latticepare.c 13293 2007-07-24 00:09:47Z xiphmont $
15
16 ********************************************************************/
17
18#include <stdlib.h>
19#include <stdio.h>
20#include <math.h>
21#include <string.h>
22#include <errno.h>
23#include "../lib/scales.h"
24#include "bookutil.h"
25#include "vqgen.h"
26#include "vqsplit.h"
27#include "../lib/os.h"
28
29/* Lattice codebooks have two strengths: important fetaures that are
30   poorly modelled by global error minimization training (eg, strong
31   peaks) are not neglected 2) compact quantized representation.
32
33   A fully populated lattice codebook, however, swings point 1 too far
34   in the opposite direction; rare features need not be modelled quite
35   so religiously and as such, we waste bits unless we eliminate the
36   least common cells.  The codebook rep supports unused cells, so we
37   need to tag such cells and build an auxiliary (non-thresh) search
38   mechanism to find the proper match quickly */
39
40/* two basic steps; first is pare the cell for which dispersal creates
41   the least additional error.  This will naturally choose
42   low-population cells and cells that have not taken on points from
43   neighboring paring (but does not result in the lattice collapsing
44   inward and leaving low population ares totally unmodelled).  After
45   paring has removed the desired number of cells, we need to build an
46   auxiliary search for each culled point */
47
48/* Although lattice books (due to threshhold-based matching) do not
49   actually use error to make cell selections (in fact, it need not
50   bear any relation), the 'secondbest' entry finder here is in fact
51   error/distance based, so latticepare is only useful on such books */
52
53/* command line:
54   latticepare latticebook.vqh input_data.vqd <target_cells>
55
56   produces a new output book on stdout
57*/
58
59static float _dist(int el,float *a, float *b){
60  int i;
61  float acc=0.f;
62  for(i=0;i<el;i++){
63    float val=(a[i]-b[i]);
64    acc+=val*val;
65  }
66  return(acc);
67}
68
69static float *pointlist;
70static long points=0;
71
72void add_vector(codebook *b,float *vec,long n){
73  int dim=b->dim,i,j;
74  int step=n/dim;
75  for(i=0;i<step;i++){
76    for(j=i;j<n;j+=step){
77      pointlist[points++]=vec[j];
78    }
79  }
80}
81
82static int bestm(codebook *b,float *vec){
83  encode_aux_threshmatch *tt=b->c->thresh_tree;
84  int dim=b->dim;
85  int i,k,o;
86  int best=0;
87 
88  /* what would be the closest match if the codebook was fully
89     populated? */
90 
91  for(k=0,o=dim-1;k<dim;k++,o--){
92    int i;
93    for(i=0;i<tt->threshvals-1;i++)
94      if(vec[o]<tt->quantthresh[i])break;
95    best=(best*tt->quantvals)+tt->quantmap[i];
96  }
97  return(best);
98}
99
100static int closest(codebook *b,float *vec,int current){
101  encode_aux_threshmatch *tt=b->c->thresh_tree;
102  int dim=b->dim;
103  int i,k,o;
104
105  float bestmetric=0;
106  int bestentry=-1;
107  int best=bestm(b,vec);
108
109  if(current<0 && b->c->lengthlist[best]>0)return best;
110
111  for(i=0;i<b->entries;i++){
112    if(b->c->lengthlist[i]>0 && i!=best && i!=current){
113      float thismetric=_dist(dim, vec, b->valuelist+i*dim);
114      if(bestentry==-1 || thismetric<bestmetric){
115        bestentry=i;
116        bestmetric=thismetric;
117      }
118    }
119  }
120
121  return(bestentry);
122}
123
124static float _heuristic(codebook *b,float *ppt,int secondbest){
125  float *secondcell=b->valuelist+secondbest*b->dim;
126  int best=bestm(b,ppt);
127  float *firstcell=b->valuelist+best*b->dim;
128  float error=_dist(b->dim,firstcell,secondcell);
129  float *zero=alloca(b->dim*sizeof(float));
130  float fromzero;
131 
132  memset(zero,0,b->dim*sizeof(float));
133  fromzero=sqrt(_dist(b->dim,firstcell,zero));
134
135  return(error/fromzero);
136}
137
138static int longsort(const void *a, const void *b){
139  return **(long **)b-**(long **)a;
140}
141
142void usage(void){
143  fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
144          "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
145          "where <target_cells> is the desired number of final cells (or -1\n"
146          "                     for no change)\n"
147          "      <protected_cells> is the number of highest-hit count cells\n"
148          "                     to protect from dispersal\n"
149          "      base is the base name (not including .vqh) of the new\n"
150          "                     book\n\n");
151  exit(1);
152}
153
154int main(int argc,char *argv[]){
155  char *basename;
156  codebook *b=NULL;
157  int entries=0;
158  int dim=0;
159  long i,j,target=-1,protect=-1;
160  FILE *out=NULL;
161
162  int argnum=0;
163
164  argv++;
165  if(*argv==NULL){
166    usage();
167    exit(1);
168  }
169
170  while(*argv){
171    if(*argv[0]=='-'){
172
173      argv++;
174       
175    }else{
176      switch (argnum++){
177      case 0:case 1:
178        {
179          /* yes, this is evil.  However, it's very convenient to parse file
180             extentions */
181         
182          /* input file.  What kind? */
183          char *dot;
184          char *ext=NULL;
185          char *name=strdup(*argv++);
186          dot=strrchr(name,'.');
187          if(dot)
188            ext=dot+1;
189          else{
190            ext="";
191           
192          }
193         
194         
195          /* codebook */
196          if(!strcmp(ext,"vqh")){
197           
198            basename=strrchr(name,'/');
199            if(basename)
200              basename=strdup(basename)+1;
201            else
202              basename=strdup(name);
203            dot=strrchr(basename,'.');
204            if(dot)*dot='\0';
205           
206            b=codebook_load(name);
207            dim=b->dim;
208            entries=b->entries;
209          }
210         
211          /* data file; we do actually need to suck it into memory */
212          /* we're dealing with just one book, so we can de-interleave */ 
213          if(!strcmp(ext,"vqd") && !points){
214            int cols;
215            long lines=0;
216            char *line;
217            float *vec;
218            FILE *in=fopen(name,"r");
219            if(!in){
220              fprintf(stderr,"Could not open input file %s\n",name);
221              exit(1);
222            }
223           
224            reset_next_value();
225            line=setup_line(in);
226            /* count cols before we start reading */
227            {
228              char *temp=line;
229              while(*temp==' ')temp++;
230              for(cols=0;*temp;cols++){
231                while(*temp>32)temp++;
232                while(*temp==' ')temp++;
233              }
234            }
235            vec=alloca(cols*sizeof(float));
236            /* count, then load, to avoid fragmenting the hell out of
237               memory */
238            while(line){
239              lines++;
240              for(j=0;j<cols;j++)
241                if(get_line_value(in,vec+j)){
242                  fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
243                  exit(1);
244                }
245              if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
246              line=setup_line(in);
247            }
248            pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float));
249           
250            rewind(in);
251            line=setup_line(in);
252            while(line){
253              lines--;
254              for(j=0;j<cols;j++)
255                if(get_line_value(in,vec+j)){
256                  fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
257                  exit(1);
258                }
259              /* deinterleave, add to heap */
260              add_vector(b,vec,cols);
261              if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
262             
263              line=setup_line(in);
264            }
265            fclose(in);
266          }
267        }
268        break;
269      case 2:
270        target=atol(*argv++);
271        if(target==0)target=entries;
272        break;
273      case 3:
274        protect=atol(*argv++);
275        break;
276      case 4:
277        {
278          char *buff=alloca(strlen(*argv)+5);
279          sprintf(buff,"%s.vqh",*argv);
280          basename=*argv++;
281
282          out=fopen(buff,"w");
283          if(!out){
284            fprintf(stderr,"unable ot open %s for output",buff);
285            exit(1);
286          }
287        }
288        break;
289      default:
290        usage();
291      }
292    }
293  }
294  if(!entries || !points || !out)usage();
295  if(target==-1)usage();
296
297  /* add guard points */
298  for(i=0;i<entries;i++)
299    for(j=0;j<dim;j++)
300      pointlist[points++]=b->valuelist[i*dim+j];
301 
302  points/=dim;
303
304  /* set up auxiliary vectors for error tracking */
305  {
306    encode_aux_nearestmatch *nt=NULL;
307    long pointssofar=0;
308    long *pointindex;
309    long indexedpoints=0;
310    long *entryindex;
311    long *reventry;
312    long *membership=_ogg_malloc(points*sizeof(long));
313    long *firsthead=_ogg_malloc(entries*sizeof(long));
314    long *secondary=_ogg_malloc(points*sizeof(long));
315    long *secondhead=_ogg_malloc(entries*sizeof(long));
316
317    long *cellcount=_ogg_calloc(entries,sizeof(long));
318    long *cellcount2=_ogg_calloc(entries,sizeof(long));
319    float *cellerror=_ogg_calloc(entries,sizeof(float));
320    float *cellerrormax=_ogg_calloc(entries,sizeof(float));
321    long cellsleft=entries;
322    for(i=0;i<points;i++)membership[i]=-1;
323    for(i=0;i<entries;i++)firsthead[i]=-1;
324    for(i=0;i<points;i++)secondary[i]=-1;
325    for(i=0;i<entries;i++)secondhead[i]=-1;
326
327    for(i=0;i<points;i++){
328      /* assign vectors to the nearest cell.  Also keep track of second
329         nearest for error statistics */
330      float *ppt=pointlist+i*dim;
331      int    firstentry=closest(b,ppt,-1);
332      int    secondentry=closest(b,ppt,firstentry);
333      float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
334      float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
335     
336      if(!(i&0xff))spinnit("initializing... ",points-i);
337   
338      membership[i]=firsthead[firstentry];
339      firsthead[firstentry]=i;
340      secondary[i]=secondhead[secondentry];
341      secondhead[secondentry]=i;
342
343      if(i<points-entries){
344        cellerror[firstentry]+=secondmetric-firstmetric;
345        cellerrormax[firstentry]=max(cellerrormax[firstentry],
346                                     _heuristic(b,ppt,secondentry));
347        cellcount[firstentry]++;
348        cellcount2[secondentry]++;
349      }
350    }
351
352    /* which cells are most heavily populated?  Protect as many from
353       dispersal as the user has requested */
354    {
355      long **countindex=_ogg_calloc(entries,sizeof(long *));
356      for(i=0;i<entries;i++)countindex[i]=cellcount+i;
357      qsort(countindex,entries,sizeof(long *),longsort);
358      for(i=0;i<protect;i++){
359        int ptr=countindex[i]-cellcount;
360        cellerrormax[ptr]=9e50f;
361      }
362    }
363
364    {
365      fprintf(stderr,"\r");
366      for(i=0;i<entries;i++){
367        /* decompose index */
368        int entry=i;
369        for(j=0;j<dim;j++){
370          fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
371          entry/=b->c->thresh_tree->quantvals;
372        }
373       
374        fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
375      }
376      fprintf(stderr,"\n");
377    }
378
379    /* do the automatic cull request */
380    while(cellsleft>target){
381      int bestcell=-1;
382      float besterror=0;
383      float besterror2=0;
384      long head=-1;
385      char spinbuf[80];
386      sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
387
388      /* find the cell with lowest removal impact */
389      for(i=0;i<entries;i++){
390        if(b->c->lengthlist[i]>0){
391          if(bestcell==-1 || cellerrormax[i]<=besterror2){
392            if(bestcell==-1 || cellerrormax[i]<besterror2 || 
393               besterror>cellerror[i]){
394              besterror=cellerror[i];
395              besterror2=cellerrormax[i];
396              bestcell=i;
397            }
398          }
399        }
400      }
401
402      fprintf(stderr,"\reliminating cell %d                              \n"
403              "     dispersal error of %g max/%g total (%ld hits)\n",
404              bestcell,besterror2,besterror,cellcount[bestcell]);
405
406      /* disperse it.  move each point out, adding it (properly) to
407         the second best */
408      b->c->lengthlist[bestcell]=0;
409      head=firsthead[bestcell];
410      firsthead[bestcell]=-1;
411      while(head!=-1){
412        /* head is a point number */
413        float *ppt=pointlist+head*dim;
414        int firstentry=closest(b,ppt,-1);
415        int secondentry=closest(b,ppt,firstentry);
416        float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
417        float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
418        long next=membership[head];
419
420        if(head<points-entries){
421          cellcount[firstentry]++;
422          cellcount[bestcell]--;
423          cellerror[firstentry]+=secondmetric-firstmetric;
424          cellerrormax[firstentry]=max(cellerrormax[firstentry],
425                                       _heuristic(b,ppt,secondentry));
426        }
427
428        membership[head]=firsthead[firstentry];
429        firsthead[firstentry]=head;
430        head=next;
431        if(cellcount[bestcell]%128==0)
432          spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
433
434      }
435
436      /* now see that all points that had the dispersed cell as second
437         choice have second choice reassigned */
438      head=secondhead[bestcell];
439      secondhead[bestcell]=-1;
440      while(head!=-1){
441        float *ppt=pointlist+head*dim;
442        /* who are we assigned to now? */
443        int firstentry=closest(b,ppt,-1);
444        /* what is the new second closest match? */
445        int secondentry=closest(b,ppt,firstentry);
446        /* old second closest is the cell being disbanded */
447        float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
448        /* new second closest error */
449        float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
450        long next=secondary[head];
451
452        if(head<points-entries){
453          cellcount2[secondentry]++;
454          cellcount2[bestcell]--;
455          cellerror[firstentry]+=secondmetric-oldsecondmetric;
456          cellerrormax[firstentry]=max(cellerrormax[firstentry],
457                                       _heuristic(b,ppt,secondentry));
458        }
459       
460        secondary[head]=secondhead[secondentry];
461        secondhead[secondentry]=head;
462        head=next;
463
464        if(cellcount2[bestcell]%128==0)
465          spinnit(spinbuf,cellcount2[bestcell]);
466      }
467
468      cellsleft--;
469    }
470
471    /* paring is over.  Build decision trees using points that now fall
472       through the thresh matcher. */
473    /* we don't free membership; we flatten it in order to use in lp_split */
474
475    for(i=0;i<entries;i++){
476      long head=firsthead[i];
477      spinnit("rearranging membership cache... ",entries-i);
478      while(head!=-1){
479        long next=membership[head];
480        membership[head]=i;
481        head=next;
482      }
483    }
484
485    free(secondhead);
486    free(firsthead);
487    free(cellerror);
488    free(cellerrormax);
489    free(secondary);
490
491    pointindex=_ogg_malloc(points*sizeof(long));
492    /* make a point index of fall-through points */
493    for(i=0;i<points;i++){
494      int best=_best(b,pointlist+i*dim,1);
495      if(best==-1)
496        pointindex[indexedpoints++]=i;
497      spinnit("finding orphaned points... ",points-i);
498    }
499
500    /* make an entry index */
501    entryindex=_ogg_malloc(entries*sizeof(long));
502    target=0;
503    for(i=0;i<entries;i++){
504      if(b->c->lengthlist[i]>0)
505        entryindex[target++]=i;
506    }
507
508    /* make working space for a reverse entry index */
509    reventry=_ogg_malloc(entries*sizeof(long));
510
511    /* do the split */
512    nt=b->c->nearest_tree=
513      _ogg_calloc(1,sizeof(encode_aux_nearestmatch));
514
515    nt->alloc=4096;
516    nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc);
517    nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc);
518    nt->p=_ogg_malloc(sizeof(long)*nt->alloc);
519    nt->q=_ogg_malloc(sizeof(long)*nt->alloc);
520    nt->aux=0;
521
522    fprintf(stderr,"Leaves added: %d              \n",
523            lp_split(pointlist,points,
524                     b,entryindex,target,
525                     pointindex,indexedpoints,
526                     membership,reventry,
527                     0,&pointssofar));
528    free(membership);
529    free(reventry);
530    free(pointindex);
531
532    /* hack alert.  I should just change the damned splitter and
533       codebook writer */
534    for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
535    for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
536   
537    /* recount hits.  Build new lengthlist. reuse entryindex storage */
538    for(i=0;i<entries;i++)entryindex[i]=1;
539    for(i=0;i<points-entries;i++){
540      int best=_best(b,pointlist+i*dim,1);
541      float *a=pointlist+i*dim;
542      if(!(i&0xff))spinnit("counting hits...",i);
543      if(best==-1){
544        fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
545                "codebook entry.  The new decision tree is broken.\n");
546        exit(1);
547      }
548      entryindex[best]++;
549    }
550    for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
551    for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
552   
553    /* the lengthlist builder doesn't actually deal with 0 hit entries.
554       So, we pack the 'sparse' hit list into a dense list, then unpack
555       the lengths after the build */
556    {
557      int upper=0;
558      long *lengthlist=_ogg_calloc(entries,sizeof(long));
559      for(i=0;i<entries;i++){
560        if(b->c->lengthlist[i]>0)
561          entryindex[upper++]=entryindex[i];
562        else{
563          if(entryindex[i]>1){
564            fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
565            exit(1);
566          }
567        }
568      }
569     
570      /* sanity check */
571      if(upper != target){
572        fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
573        exit(1);
574      }
575   
576      build_tree_from_lengths(upper,entryindex,lengthlist);
577     
578      upper=0;
579      for(i=0;i<entries;i++){
580        if(b->c->lengthlist[i]>0)
581          b->c->lengthlist[i]=lengthlist[upper++];
582      }
583
584    }
585  }
586  /* we're done.  write it out. */
587  write_codebook(out,basename,b->c);
588
589  fprintf(stderr,"\r                                        \nDone.\n");
590  return(0);
591}
592
593
594
595
Note: See TracBrowser for help on using the repository browser.