Edinburgh Speech Tools  2.4-release
wagon.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Briemen et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* Added stepwise use of features, Oct 1997 */
42 /* */
43 /*=======================================================================*/
44 
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Token.h"
50 #include "EST_FMatrix.h"
51 #include "EST_multistats.h"
52 #include "EST_Wagon.h"
53 #include "EST_math.h"
54 
// Global discrete-class registry shared by all questions/datasets.
Discretes wgn_discretes;

// Training and (optional) test data, plus auxiliary tracks used by the
// cluster/vector/trajectory prediction modes.
WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;     // distance matrix for wndt_cluster mode
EST_Track wgn_VertexTrack;      // per-vertex parameter vectors (wndt_vector)
EST_Track wgn_VertexFeats;      // which vertex channels participate (>0 = on)
EST_Track wgn_UnitTrack;        // unit -> (start,length) index into VertexTrack

// Command-line configurable build parameters (set by the wagon main).
int wgn_min_cluster_size = 50;
int wgn_max_questions = 2000000; /* not ideal, but adequate */
int wgn_held_out = 0;            // percent of data held out for pruning
float wgn_dropout_feats = 0.0;   // probability of dropping a feature per split
float wgn_dropout_samples = 0.0; // probability of dropping samples per split
int wgn_cos = 1;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;        // field holding per-vector weights (-1 = none)
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;           // index of the field being predicted
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10; // number of slots to partition float ranges
float wgn_balance = 0;
EST_String wgn_opt_param = "";   // optimisation target: "", "entropy", "rmse", "B_NB_F1"
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

// File-local helpers (definitions below).
static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset, ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

// EST template instantiation boilerplate for lists/vectors of WVector*.
Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif
112 
void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file.  The same description is
    // applied to both the training and the test data sets so that
    // field indices/types agree between them.
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}
119 
void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a filename.  One vector per line.
    // Assume all numbers are numbers and non-nums are categorical.
    // Each field is parsed according to the previously loaded data
    // description (see wgn_load_datadescription); a vector with the
    // wrong number of fields is a fatal error.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    // Treat every character as plain token text (no punctuation splitting).
    ts.set_PunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                // NOTE(review): atof() silently yields 0 for non-numeric
                // text, so only NaN/Inf tokens are reported here.
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that */
                /* has an idex into VertexTrack and a number of Vertices */
                /* Thus if its 15, UnitTrack.a(15,0) is the start frame in */
                /* VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                // Categorical field: map the token to its index in the
                // discrete class; unknown values are reported and mapped
                // to index 0.
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec ++;
        // Too few fields on the line is fatal.
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        // Too many fields on the line is also fatal.
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains too many parameters instead of "
                 << dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}
216 
217 float summary_results(WNode &tree,ostream *output)
218 {
219  if (wgn_test_dataset.samples() != 0)
220  return do_summary(tree,wgn_test_dataset,output);
221  else
222  return do_summary(tree,wgn_dataset,output);
223 }
224 
225 static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
226 {
227  if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
228  return test_tree_cluster(tree,ds,output);
229  else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
230  return test_tree_vector(tree,ds,output);
231  else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
232  return test_tree_trajectory(tree,ds,output);
233  else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
234  return test_tree_ols(tree,ds,output);
235  else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
236  return test_tree_class(tree,ds,output);
237  else
238  return test_tree_float(tree,ds,output);
239 }
240 
241 WNode *wgn_build_tree(float &score)
242 {
243  // Build init node and split it while reducing the impurity
244  WNode *top = new WNode();
245  int margin = 0;
246 
247  wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);
248 
249  margin = 0;
250  wagon_split(margin,*top); // recursively split data;
251 
252  if (wgn_held_out > 0)
253  {
254  wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
255  top->held_out_prune();
256  }
257 
258  if (wgn_prune)
259  top->prune();
260 
261  score = summary_results(*top,0);
262 
263  return top;
264 }
265 
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // Set data ommitting held_out percent if in is true
    // or only including 100-held_out percent if in is false.
    // NOTE(review): as written, when in==FALSE the live branch below
    // includes EVERY vector (the percentage filter is the commented-out
    // variant) -- confirm this is the intended held-out behavior before
    // changing it.
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        // Training pass: keep vectors outside the held-out percentage
        // (selection is by position modulo 100, i.e. spread evenly).
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//	else if ((!in) && ((j%100 < held_out)))
//	    data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//	if ((in) && (j < held_out))
//	    data[i++] = ds(d);
//	else if ((!in) && (j >=held_out))
//	    data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}
292 
static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for a
    // categorical (class) predictee.  Accumulates accuracy, entropy,
    // a confusion matrix, and optionally B/NB F1 statistics.
    // Returns (by wgn_opt_param): "entropy" -> -perplexity,
    // "B_NB_F1" -> F1 for class "B", otherwise -> accuracy.
    EST_StrStr_KVL pairs;      // (real,predicted) pairs for the confusion matrix
    EST_StrList lex;           // class member names, in discrete order
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;           // H accumulates sum of count*log(p) for entropy
    int i,type;
    float correct=0,total=0, count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        // Per-vector weight, 1.0 unless a count field was given.
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual +=count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout <<bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        // NOTE(review): the confusion matrix is incremented by 1, not by
        // count, so it is unweighted even when a count field is in use.
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }


    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if(wgn_opt_param == "B_NB_F1")
    {
        // Degenerate cases: perfect precision/recall when nothing was
        // predicted/actual respectively.
        if(bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if(bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if((precision+recall) !=0)
            fmeasure = 2* (precision*recall)/(precision+recall);
        cout<< "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall << " B-Pred:" << bpredicted << " B-Actual:" << bactual << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}
374 
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in vector may
    // have quite different ranges).
    // For each test vector, the prediction per channel is the mean of
    // the leaf's member vertices; errors are accumulated across all
    // enabled channels (wgn_VertexFeats.a(0,j) > 0).
    // Returns -RMSE if wgn_opt_param=="rmse", else the correlation.
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;    // per-channel stats over the leaf's members
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                // Mean of this channel over all vertices in the leaf.
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalized the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                // NOTE(review): the line below overrides the normalized
                // error with the raw error (debug leftover tagged
                // awb_debug) -- the zscore normalization above is
                // currently dead code.
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//	(sqrt(xx.mean()-(x.mean()*x.mean())) *
//	 sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation is X is very small we can
    // go negative, thus cause the sqrt's to give FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
462 
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in vector may
    // have quite different ranges)
    // NOT WRITTEN YET
    // NOTE(review): this body is currently a verbatim copy of
    // test_tree_vector (see the comment above); it scores the predictee
    // as a single vertex index rather than a trajectory.
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;    // per-channel stats over the leaf's members
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                // Mean of this channel over all vertices in the leaf.
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalized the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                // NOTE(review): raw error overrides the normalized error
                // here too (debug leftover tagged awb_debug).
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//	(sqrt(xx.mean()-(x.mean()*x.mean())) *
//	 sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation is X is very small we can
    // go negative, thus cause the sqrt's to give FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
551 
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees.
    // For each vector, check whether the real cluster index falls inside
    // the predicted leaf's cluster, and accumulate mean distance (in
    // standard deviations) and mean ranking of the real index.
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)   // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
            << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}
586 
static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT.
    // Accumulates (optionally count-weighted) squared/absolute error
    // and the statistics needed for Pearson correlation.
    // Returns -RMSE if wgn_opt_param=="rmse", else the correlation.
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        // Per-vector weight, 1.0 unless a count field was given.
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//	(sqrt(xx.mean()-(x.mean()*x.mean())) *
//	 sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation is X is very small we can
    // go negative, thus cause the sqrt's to give FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
651 
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS.
    // NOTE: the per-leaf OLS prediction is not implemented -- predict
    // is hard-coded to 0.0 below, so RMSE/correlation here measure the
    // data against a constant zero predictor.
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict;
        predict = 0.0;  // This is incomplete ! you need to use leaf
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//	(sqrt(xx.mean()-(x.mean()*x.mean())) *
//	 sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation is X is very small we can
    // go negative, thus cause the sqrt's to give FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
719 
static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible).  Finds the best question over the
    // node's data; if it improves on the node's own impurity, attaches
    // two child nodes and recurses into them.  margin is only used for
    // verbose indentation.  Returns TRUE if a split was made.
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    // Global question budget: once exhausted no more splits are made.
    if (wgn_max_questions < 1)
        return FALSE;

    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
	   q.get_score(),
	   node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    // Split only if some question was found (score < WGN_HUGE_VAL) and
    // it actually reduces the impurity.
    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // Ok its worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        wgn_max_questions--;
        // margin is passed by value; these increments only affect the
        // indentation depth handed to the recursive calls.
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}
777 
778 void wgn_find_split(WQuestion &q,WVectorVector &ds,
780 {
781  int i, iy, in;
782 
783  if (wgn_dropout_samples > 0.0)
784  {
785  // You need to count the number of yes/no again in all ds
786  for (iy=in=i=0; i < ds.n(); i++)
787  if (q.ask(*ds(i)) == TRUE)
788  iy++;
789  else
790  in++;
791  }
792  else
793  {
794  // Current counts are corrent (as all data was used)
795  iy = q.get_yes();
796  in = q.get_no();
797  }
798 
799  y.resize(iy);
800  n.resize(in);
801 
802  for (iy=in=i=0; i < ds.n(); i++)
803  if (q.ask(*ds(i)) == TRUE)
804  y[iy++] = ds(i);
805  else
806  n[in++] = ds(i);
807 
808 }
809 
static float wgn_random_number(float x)
{
    // Returns a uniform random number between 0 and x.
    // POSIX random() returns values in [0, 2^31-1]; RAND_MAX only
    // describes rand(), so dividing by RAND_MAX was non-portable
    // (correct only where RAND_MAX == 2^31-1, as on glibc).  Divide by
    // random()'s documented maximum instead -- identical behavior on
    // glibc, correct everywhere.
    return (((float)random())/2147483647.0f)*x;
}
815 
816 #ifdef OMP_WAGON
817 static WQuestion find_best_question(WVectorVector &dset)
818 {
819  // Ask all possible questions and find the best one
820  int i;
821  float bscore,tscore;
822  WQuestion test_ques, best_ques;
823  WQuestion** questions=new WQuestion*[wgn_dataset.width()];
824  float* scores = new float[wgn_dataset.width()];
825  bscore = tscore = WGN_HUGE_VAL;
826  best_ques.set_score(bscore);
827 #pragma omp parallel
828 #pragma omp for
829  for (i=0;i < wgn_dataset.width(); i++)
830  {
831  questions[i] = new WQuestion;
832  questions[i]->set_score(bscore);}
833 #pragma omp parallel
834 #pragma omp for
835  for (i=0;i < wgn_dataset.width(); i++)
836  {
837  if ((wgn_dataset.ignore(i) == TRUE) ||
838  (i == wgn_predictee))
839  scores[i] = WGN_HUGE_VAL; // ignore this feature this time
840  else if (wgn_random_number(1.0) < wgn_dropout_feats)
841  scores[i] = WGN_HUGE_VAL; // randomly dropout feature
842  else if (wgn_dataset.ftype(i) == wndt_binary)
843  {
844  construct_binary_ques(i,*questions[i]);
845  scores[i] = wgn_score_question(*questions[i],dset);
846  }
847  else if (wgn_dataset.ftype(i) == wndt_float)
848  {
849  scores[i] = construct_float_ques(i,*questions[i],dset);
850  }
851  else if (wgn_dataset.ftype(i) == wndt_ignore)
852  scores[i] = WGN_HUGE_VAL; // always ignore this feature
853 #if 0
854  // This doesn't work reasonably
855  else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
856  {
857  wagon_error("subset selection temporarily deleted");
858  tscore = construct_class_ques_subset(i,test_ques,dset);
859  }
860 #endif
861  else if (wgn_dataset.ftype(i) >= wndt_class)
862  scores[i] = construct_class_ques(i,*questions[i],dset);
863  }
864  for (i=0;i < wgn_dataset.width(); i++)
865  {
866  if (scores[i] < bscore)
867  {
868  memcpy(&best_ques,questions[i],sizeof(*questions[i]));
869  best_ques.set_score(scores[i]);
870  bscore = scores[i];
871  }
872  delete questions[i];
873  }
874  delete [] questions;
875  delete [] scores;
876  return best_ques;
877 }
878 #else
879 // No OMP parallelism
880 static WQuestion find_best_question(WVectorVector &dset)
881 {
882  // Ask all possible questions and find the best one
883  int i;
884  float bscore,tscore;
885  WQuestion test_ques, best_ques;
886 
887  bscore = tscore = WGN_HUGE_VAL;
888  best_ques.set_score(bscore);
889  // test each feature with each possible question
890  for (i=0;i < wgn_dataset.width(); i++)
891  {
892  if ((wgn_dataset.ignore(i) == TRUE) ||
893  (i == wgn_predictee))
894  tscore = WGN_HUGE_VAL; // ignore this feature this time
895  else if (wgn_random_number(1.0) < wgn_dropout_feats)
896  tscore = WGN_HUGE_VAL; // randomly dropout feature
897  else if (wgn_dataset.ftype(i) == wndt_binary)
898  {
899  construct_binary_ques(i,test_ques);
900  tscore = wgn_score_question(test_ques,dset);
901  }
902  else if (wgn_dataset.ftype(i) == wndt_float)
903  {
904  tscore = construct_float_ques(i,test_ques,dset);
905  }
906  else if (wgn_dataset.ftype(i) == wndt_ignore)
907  tscore = WGN_HUGE_VAL; // always ignore this feature
908 #if 0
909  // This doesn't work reasonably
910  else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
911  {
912  wagon_error("subset selection temporarily deleted");
913  tscore = construct_class_ques_subset(i,test_ques,dset);
914  }
915 #endif
916  else if (wgn_dataset.ftype(i) >= wndt_class)
917  tscore = construct_class_ques(i,test_ques,dset);
918  if (tscore < bscore)
919  {
920  best_ques = test_ques;
921  best_ques.set_score(tscore);
922  bscore = tscore;
923  }
924  }
925 
926  return best_ques;
927 }
928 #endif
929 
930 static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
931 {
932  // Find out which member of a class gives the best split
933  float tscore,bscore = WGN_HUGE_VAL;
934  int cl;
935  WQuestion test_q;
936 
937  test_q.set_fp(feat);
938  test_q.set_oper(wnop_is);
939  ques = test_q;
940 
941  for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
942  {
943  test_q.set_operand1(EST_Val(cl));
944  tscore = wgn_score_question(test_q,ds);
945  if (tscore < bscore)
946  {
947  ques = test_q;
948  bscore = tscore;
949  }
950  }
951 
952  return bscore;
953 }
954 
955 #if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the subset of the data for each member of
    // of the class.  Then order those splits.  Then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    // NOTE: this function is inside an #if 0 block (see the comment in
    // find_best_question) and is not compiled.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for exists values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    // Walk prefixes of the ordered members looking for the best subset.
    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        // Degenerate subsets collapse back to a simple "is" question.
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}
1023 
1024 static LISP sort_class_scores(int feat,float *scores)
1025 {
1026  // returns sorted list of (non WGN_HUGE_VAL) items
1027  int i;
1028  LISP items = NIL;
1029  LISP l;
1030 
1031  for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
1032  {
1033  if (scores[i] != WGN_HUGE_VAL)
1034  {
1035  if (items == NIL)
1036  items = cons(flocons(i),NIL);
1037  else
1038  {
1039  for (l=items; l != NIL; l=cdr(l))
1040  {
1041  if (scores[i] < scores[get_c_int(car(l))])
1042  {
1043  CDR(l) = cons(car(l),cdr(l));
1044  CAR(l) = flocons(i);
1045  break;
1046  }
1047  }
1048  if (l == NIL)
1049  items = l_append(items,cons(flocons(i),NIL));
1050  }
1051  }
1052  }
1053  return items;
1054 }
1055 #endif
1056 
1057 static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
1058 {
1059  // Find out a split of the range that gives the best score
1060  // Naively does this by partitioning the range into float_range_split slots
1061  float tscore,bscore = WGN_HUGE_VAL;
1062  int d, i;
1063  float p;
1064  WQuestion test_q;
1065  float max,min,val,incr;
1066 
1067  test_q.set_fp(feat);
1068  test_q.set_oper(wnop_lessthan);
1069  ques = test_q;
1070 
1071  min = max = ds(0)->get_flt_val(feat); /* set up some value */
1072  for (d=0; d < ds.n(); d++)
1073  {
1074  val = ds(d)->get_flt_val(feat);
1075  if (val < min)
1076  min = val;
1077  else if (val > max)
1078  max = val;
1079  }
1080  if (max == min) // we're pure
1081  return WGN_HUGE_VAL;
1082  incr = (max-min)/wgn_float_range_split;
1083  // so do float_range-1 splits
1084  /* We calculate this based on the number splits, not the increments, */
1085  /* becuase incr can be so small it doesn't increment p */
1086  for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr )
1087  {
1088  test_q.set_operand1(EST_Val(p));
1089  tscore = wgn_score_question(test_q,ds);
1090  if (tscore < bscore)
1091  {
1092  ques = test_q;
1093  bscore = tscore;
1094  }
1095  }
1096 
1097  return bscore;
1098 }
1099 
1100 static void construct_binary_ques(int feat,WQuestion &test_ques)
1101 {
1102  // construct a question. Not sure about this in general
1103  // of course continuous/categorical features will require different
1104  // rule and non-binary ones will require some test point
1105 
1106  test_ques.set_fp(feat);
1107  test_ques.set_oper(wnop_binary);
1108  test_ques.set_operand1(EST_Val(""));
1109 }
1110 
static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question.
    // Lower is better.  Returns WGN_HUGE_VAL when either side of the
    // split would be smaller than the minimum cluster size.
    // ignorenth: when >= 2, every ignorenth-th sample is held out
    // (supports held-out scoring); 0 or 1 means use all samples.
    WImpurity y,n;      // impurity accumulators for yes/no branches
    int d, num_yes, num_no;
    float count;        // per-sample weight
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
	if (wgn_random_number(1.0) < wgn_dropout_samples)
	{
            continue; // dropout this sample
	}
        else if ((ignorenth < 2) ||
	    (d%ignorenth != ignorenth-1))   // skip every nth sample if asked
	{
	    wv = ds(d);
	    // weight is 1 unless an explicit count field is configured
	    if (wgn_count_field == -1)
		count = 1.0;
	    else
		count = (*wv)[wgn_count_field];

	    if (q.ask(*wv) == TRUE)
	    {
		num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
	    }
	    else
	    {
		num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
	    }
	}
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    // wgn_balance scales the minimum allowed branch size with the
    // amount of data, but never below wgn_min_cluster_size
    if ((wgn_balance == 0.0) ||
	(ds.n()/wgn_balance < wgn_min_cluster_size))
	min_cluster = wgn_min_cluster_size;
    else 
	min_cluster = (int)(ds.n()/wgn_balance);

    // reject splits that leave too small a branch on either side
    if ((y.samples() < min_cluster) ||
	(n.samples() < min_cluster))
	return WGN_HUGE_VAL;

    float ym,nm,bm;
    //    printf("awb_debug score_question_set X %f Y %f\n",
    //	   y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /*    cout << q << endl;
    printf("test question y %f n %f b %f\n",
    ym, nm, bm); */

    return bm/2.0;
}
1185 
float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // Public entry point: score q over all of ds (no held-out samples;
    // ignorenth==1 means score_question_set uses every sample).
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}
1192 
WNode *wagon_stepwise(float limit)
{
    // Find the best single features and incrementally add features
    // that best improve result until it doesn't improve.
    // This is basically to automate what Kurt was doing in building
    // trees, he then automated it in PERL and as it seemed to work
    // I put it into wagon itself.
    // This can be pretty computationally intensive.
    // limit: minimum relative improvement (percent) required to keep
    // adding features.  Returns the best tree found; caller owns it.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;   // count of features accepted so far (for reporting)

    // Set all features to ignore; they are re-enabled one at a time
    // as the greedy search accepts them
    for (i=0; i < wgn_dataset.width(); i++)
	wgn_dataset.set_ignore(i,TRUE);

    // At most one feature can be added per iteration, so width()
    // iterations is an upper bound on the search
    for (i=0; i < wgn_dataset.width(); i++)
    {
	if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
	{
	    // This skips the round not because this has anything to
	    // do with this feature being (user specified) ignored
	    // but because it indicates there is one less cycle that is
	    // necessary
	    continue;
	}
	new_best = wagon_stepwise_find_next_best(bscore,best_feat);

	// Stop when the best candidate improves the score by less
	// than limit percent (relative)
	if ((bscore - fabs(bscore * (limit/100))) <= best_score)
	{
	    // gone as far as we can
	    delete new_best;
	    break;
	}
	else
	{
	    best_score = bscore;
	    delete best;       // new_best supersedes the previous tree
	    best = new_best;
	    // permanently enable the accepted feature
	    wgn_dataset.set_ignore(best_feat,FALSE);
	    if (!wgn_quiet)
	    {
		fprintf(stdout,"FEATURE    %d %s: %2.4f\n",
			nf,
			(const char *)wgn_dataset.feat_name(best_feat),
			best_score);
		fflush(stdout);
		nf++;
	    }
	}
    }

    return best;
}
1248 
1249 static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
1250 {
1251  // Find which of the currently ignored features will best improve
1252  // the result
1253  WNode *best = 0;
1254  float best_score = -WGN_HUGE_VAL;
1255  int best_new_feat = -1;
1256  int i;
1257 
1258  for (i=0; i < wgn_dataset.width(); i++)
1259  {
1260  if (wgn_dataset.ftype(i) == wndt_ignore)
1261  continue; // user wants me to ignore this completely
1262  else if (i == wgn_predictee) // can't use the answer
1263  continue;
1264  else if (wgn_dataset.ignore(i) == TRUE)
1265  {
1266  WNode *current;
1267  float score;
1268 
1269  // Allow this feature to participate
1270  wgn_dataset.set_ignore(i,FALSE);
1271 
1272  current = wgn_build_tree(score);
1273 
1274  if (score > best_score)
1275  {
1276  best_score = score;
1277  delete best;
1278  best = current;
1279  best_new_feat = i;
1280 // fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
1281 // i,
1282 // (const char *)wgn_dataset.feat_name(i),
1283 // best_score);
1284 // fflush(stdout);
1285  }
1286  else
1287  delete current;
1288 
1289  // switch it off again
1290  wgn_dataset.set_ignore(i,TRUE);
1291  }
1292  }
1293 
1294  bscore = best_score;
1295  best_feat = best_new_feat;
1296  return best;
1297 }
const EST_String & name(const int n) const
The name given the index.
double stddev(void) const
standard deviation of currently accumulated values
double mean(void) const
mean of currently accumulated values
void reset(void)
reset internal values
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition: EST_TKVL.cc:248
T & item(const EST_Litem *p)
Definition: EST_TList.h:133
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:191
void resize(int n, int set=1)
Definition: EST_TVector.cc:196
INLINE int n() const
number of items in vector.
Definition: EST_TVector.h:254
int eof()
end of file
Definition: EST_Token.h:356
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:338
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:344
int eoln()
end of line
Definition: EST_Token.cc:818
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:341
EST_Token & peek(void)
peek at next token
Definition: EST_Token.cc:830
void close(void)
Close stream.
Definition: EST_Token.cc:406
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
Definition: EST_Token.cc:200
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486
float & a(int i, int c=0)
Definition: EST_Track.cc:1022
int num_channels() const
return number of channels in track
Definition: EST_Track.h:656