Edinburgh Speech Tools 2.4-release
 
wagon.cc
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Alan W Black                                */
/*                 Date   :  May 1996                                    */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/
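
/* Illustrative usage (an addition to this listing, not original code):  */
/* a typical invocation, assuming the option names documented for wagon  */
/* (-desc, -data, -test, -stop, -output); check wagon -h on your build   */
/* before relying on them:                                               */
/*                                                                       */
/*    wagon -desc feats.desc -data train.feats -test test.feats \        */
/*          -stop 50 -output out.tree                                    */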

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_max_questions = 2000000; /* not ideal, but adequate */
int wgn_held_out = 0;
float wgn_dropout_feats = 0.0;
float wgn_dropout_samples = 0.0;
int wgn_cos = 1;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load the field description for a data file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}
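
/* Sketch of a field description file (illustrative, not from this       */
/* source; format per the Speech Tools wagon documentation): one entry   */
/* per field, "float" for numeric fields, an explicit value list for     */
/* categorical fields, "ignore" to skip a field:                         */
/*                                                                       */
/*    ((duration float)                                                  */
/*     (name aa ae b d ...)                                              */
/*     (stressed 0 1))                                                   */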

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a filename, one vector per line.
    // Assume all numbers are numbers and non-nums are categorical
    EST_TokenStream ts;  // restored declaration: ts is used throughout below
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ; !ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that has an */
                /* index into VertexTrack and a number of vertices.  Thus */
                /* if it's 15, UnitTrack.a(15,0) is the start frame in    */
                /* VertexTrack and UnitTrack.a(15,1) is the number of     */
                /* frames in the unit                                     */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains more than the expected "
                 << dataset.width() << " parameters" << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}
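
/* Illustrative data lines (hypothetical, matching the sketch            */
/* description above): whitespace-separated fields, one vector per       */
/* line, read by the loop above:                                         */
/*                                                                       */
/*    0.150 aa 1                                                         */
/*    0.080 b 0                                                          */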

float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build the initial node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // Set up the data, omitting held_out percent if in is true.
    // When in is false all vectors are currently included (the
    // commented-out alternatives below selected only the complement).
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//        else if ((!in) && ((j%100 < held_out)))
//            data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//        if ((in) && (j < held_out))
//            data[i++] = ds(d);
//        else if ((!in) && (j >= held_out))
//            data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}
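
/* Worked example of the selection above: with held_out=25 and in=TRUE,  */
/* vectors whose index modulo 100 is 0..24 are skipped, leaving roughly  */
/* 75% of the data for growing the tree; the pruning pass then calls     */
/* this function again with in=FALSE.                                    */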

static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;  // restored declaration: real x predicted pairs
    EST_StrList lex;       // restored declaration: the class labels
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;  // restored: missing in listing
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;        // restored: missing in listing
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2.0*(precision*recall)/(precision+recall);  // restored
        cout << "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall
             << " B-Pred:" << bpredicted << " B-Actual:" << bactual
             << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}
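
/* Worked example of the B_NB_F1 measure above: with bcorrect=8,         */
/* bpredicted=10 and bactual=16, precision = 8/10 = 0.8, recall =        */
/* 8/16 = 0.5, and F1 = 2*(0.8*0.5)/(0.8+0.5), roughly 0.615.            */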

static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR.
    // Distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;  // restored declarations: sufficient
    EST_SuffStats b;                  // statistics used below
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
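
/* The guard above computes Pearson's r as cov(x,y)/sqrt(var(x)*var(y))  */
/* with cov(x,y) = E[xy]-E[x]E[y] and var(x) = E[x^2]-E[x]^2; when the   */
/* product of the variances is zero or goes negative through rounding,   */
/* the correlation is reported as 0 rather than risking an FPE in sqrt.  */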

static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY.
    // Distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;  // restored declarations: sufficient
    EST_SuffStats b;                  // statistics used below
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats meandist, ranking;  // restored declarations: used below
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)  // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
            << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;  // restored declaration: sufficient
                                      // statistics used below
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;  // restored declaration: sufficient
                                      // statistics used below
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;  // This is incomplete!  You need to use leaf
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    if (wgn_max_questions < 1)
        return FALSE;

    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
           q.get_score(),
           node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))  // restored: missing in listing
    {
        // Ok, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        wgn_max_questions--;
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)  // restored params
{
    int i, iy, in;

    if (wgn_dropout_samples > 0.0)
    {
        // Need to count the number of yes/no again over all of ds
        for (iy=in=i=0; i < ds.n(); i++)
            if (q.ask(*ds(i)) == TRUE)
                iy++;
            else
                in++;
    }
    else
    {
        // Current counts are correct (as all data was used)
        iy = q.get_yes();
        in = q.get_no();
    }

    y.resize(iy);
    n.resize(in);

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);

}

static float wgn_random_number(float x)
{
    // Returns a random number between 0 and x
    return (((float)random())/RAND_MAX)*x;
}

#ifdef OMP_WAGON
static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion best_ques;  // restored declaration: filled in below
    WQuestion** questions=new WQuestion*[wgn_dataset.width()];
    float* scores = new float[wgn_dataset.width()];
    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        questions[i] = new WQuestion;
        questions[i]->set_score(bscore);
    }
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            scores[i] = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            scores[i] = WGN_HUGE_VAL;  // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,*questions[i]);
            scores[i] = wgn_score_question(*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            scores[i] = construct_float_ques(i,*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            scores[i] = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            scores[i] = construct_class_ques(i,*questions[i],dset);
    }
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (scores[i] < bscore)
        {
            memcpy(&best_ques,questions[i],sizeof(*questions[i]));
            best_ques.set_score(scores[i]);
            bscore = scores[i];
        }
        delete questions[i];
    }
    delete [] questions;
    delete [] scores;
    return best_ques;
}
#else
// No OMP parallelism
static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;  // restored declarations: used below

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            tscore = WGN_HUGE_VAL;  // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;  // restored: missing in listing
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}
#endif

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;  // restored declaration: used below

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)  // restored signature
{
    // Find out which subset of a class gives the best split.
    // We first measure the subset of the data for each member of
    // the class.  Then order those splits.  Then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);  // restored: missing in listing
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns a sorted list of (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score.
    // Naively does this by partitioning the range into
    // float_range_split slots
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;  // restored declaration: used below
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the */
    /* increments, because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}
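
/* Worked example of the partitioning above: with min=0, max=10 and      */
/* wgn_float_range_split=10, incr=1 and the candidate questions are      */
/* "feat < 1", "feat < 2", ..., "feat < 10"; the best-scoring threshold  */
/* is returned in ques.                                                  */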

static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // Construct a question.  Not sure about this in general: of course
    // continuous/categorical features will require a different rule,
    // and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // Score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if (wgn_random_number(1.0) < wgn_dropout_samples)
        {
            continue;  // drop out this sample
        }
        else if ((ignorenth < 2) ||
                 (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not the value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not the value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    // printf("awb_debug score_question_set X %f Y %f\n",
    //        y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /* cout << q << endl;
       printf("test question y %f n %f b %f\n",
              ym, nm, bm); */

    return bm/2.0;
}
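
/* Worked example of the min_cluster logic above: with                   */
/* wgn_min_cluster_size=50 and wgn_balance=2, a node of 1000 samples     */
/* requires each side of a candidate split to keep at least 500          */
/* samples; once ds.n()/wgn_balance drops below 50, the floor falls      */
/* back to the plain 50-sample minimum.                                  */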

float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature, then incrementally add the features
    // that best improve the result until it no longer improves.
    // This is basically to automate what Kurt was doing in building
    // trees; he then automated it in PERL, and as it seemed to work
    // I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // This skips the round not because this has anything to
            // do with this feature being (user specified) ignored,
            // but because it indicates there is one less cycle that is
            // necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;  // restored: missing in listing
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}
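
/* Worked example of the stopping rule above: with limit=2, a round      */
/* whose bscore does not beat the current best_score by more than 2% of  */
/* |bscore| ends the search; otherwise the winning feature is kept       */
/* (un-ignored) and the next round starts.                               */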

static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue;  // user wants me to ignore this completely
        else if (i == wgn_predictee)  // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//                fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
//                        i,
//                        (const char *)wgn_dataset.feat_name(i),
//                        best_score);
//                fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;        // restored: missing in listing
    best_feat = best_new_feat;  // restored: missing in listing
    return best;
}