rf_algorithm.hxx

/************************************************************************/
/*                                                                      */
/*    Copyright 2008-2009 by Rahul Nair                                 */
/*                                                                      */
/*    This file is part of the VIGRA computer vision library.           */
/*    The VIGRA Website is                                              */
/*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
/*    Please direct questions, bug reports, and contributions to        */
/*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
/*        vigra@informatik.uni-hamburg.de                               */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/
#ifndef VIGRA_RF_ALGORITHM_HXX
#define VIGRA_RF_ALGORITHM_HXX
#include <vector>
#include <map>
#include <algorithm>
#include <iterator>
#include "splices.hxx"
#include <queue>
#include <fstream>
namespace vigra
{

namespace rf
{
/** This namespace contains all algorithms developed for feature
 *  selection
 */
namespace algorithms
{

namespace detail
{
    /** create a MultiArray containing only the columns supplied between
        iterators b and e
    */
    template<class OrigMultiArray,
             class Iter,
             class DestMultiArray>
    void choose(OrigMultiArray const & in,
                Iter const & b,
                Iter const & e,
                DestMultiArray & out)
    {
        int columnCount = std::distance(b, e);
        int rowCount = in.shape(0);
        out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
        int ii = 0;
        for(Iter iter = b; iter != e; ++iter, ++ii)
        {
            columnVector(out, ii) = columnVector(in, *iter);
        }
    }
}


/** Standard random forest error-rate callback functor
 *
 *  returns the random forest OOB error estimate when invoked.
 */
class RFErrorCallback
{
    RandomForestOptions options;

    public:
    /** Default constructor
     *
     *  optionally supply options to the random forest classifier
     *  \sa RandomForestOptions
     */
    RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
    : options(opt)
    {}

    /** returns the RF OOB error estimate given features and
     *  labels
     */
    template<class Feature_t, class Response_t>
    double operator() (Feature_t const & features,
                       Response_t const & response)
    {
        RandomForest<> rf(options);
        visitors::OOB_Error oob;
        rf.learn(features,
                 response,
                 visitors::create_visitor(oob));
        return oob.oob_breiman;
    }
};
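/* Illustration (not part of the original file): the error-rate callback is
 * exchangeable - any functor with the same call signature works, e.g. one
 * using a differently parameterised forest. A minimal sketch, assuming only
 * the VIGRA random forest API used above; FastErrorRate and the tree count
 * are hypothetical:
 * \code
 * struct FastErrorRate
 * {
 *     template<class Feature_t, class Response_t>
 *     double operator()(Feature_t const & features, Response_t const & response)
 *     {
 *         // fewer trees: noisier but faster error estimate
 *         RandomForest<> rf(RandomForestOptions().tree_count(64));
 *         visitors::OOB_Error oob;
 *         rf.learn(features, response, visitors::create_visitor(oob));
 *         return oob.oob_breiman;
 *     }
 * };
 * // forward_selection(features, labels, result, FastErrorRate());
 * \endcode
 */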
/** Structure to hold variable selection results
 */
class VariableSelectionResult
{
    bool initialized;

    public:
    VariableSelectionResult()
    : initialized(false)
    {}

    typedef std::vector<int>        FeatureList_t;
    typedef std::vector<double>     ErrorList_t;
    typedef FeatureList_t::iterator Pivot_t;

    Pivot_t pivot;

    /** list of features.
     */
    FeatureList_t selected;

    /** vector of size (number of features)
     *
     *  the i-th entry encodes the error rate obtained
     *  while using features [0 - i] (including i)
     *
     *  if the i-th entry is -1, no error rate was obtained for it.
     *  This may happen if more than one feature is added to the
     *  selected list in one step of the algorithm.
     *
     *  during initialisation, errors.back() is always filled
     */
    ErrorList_t errors;


    /** error-rate reference when using no features
     *  (set to the maximum prior probability over all classes)
     */
    double no_features;
    template<class FeatureT,
             class ResponseT,
             class Iter,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e,
              ErrorRateCallBack errorcallback)
    {
        bool ret_ = init(all_features, response, errorcallback);
        if(!ret_)
            return false;
        vigra_precondition(std::distance(b, e) == static_cast<std::ptrdiff_t>(selected.size()),
            "Number of features in ranking != number of features in matrix");
        std::copy(b, e, selected.begin());
        return true;
    }

    template<class FeatureT,
             class ResponseT,
             class Iter>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e)
    {
        RFErrorCallback ecallback;
        return init(all_features, response, b, e, ecallback);
    }

    template<class FeatureT,
             class ResponseT>
    bool init(FeatureT const & all_features,
              ResponseT const & response)
    {
        return init(all_features, response, RFErrorCallback());
    }

    /** initialization routine. Will be called only once in the lifetime
     *  of a VariableSelectionResult. Subsequent calls will not reinitialize
     *  member variables.
     *
     *  This is intentional, to allow continuing variable selection at the
     *  point where an earlier iteration stopped.
     *
     *  returns true if initialization was successful and false if
     *  the object was already initialized before.
     */
    template<class FeatureT,
             class ResponseT,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              ErrorRateCallBack errorcallback)
    {
        if(initialized)
        {
            return false;
        }
        initialized = true;
        // calculate error with all features
        selected.resize(all_features.shape(1), 0);
        for(unsigned int ii = 0; ii < selected.size(); ++ii)
            selected[ii] = ii;
        errors.resize(all_features.shape(1), -1);
        errors.back() = errorcallback(all_features, response);

        // calculate error rate if no features are chosen
        // corresponds to max(prior probability) of the classes
        std::map<typename ResponseT::value_type, int> res_map;
        std::vector<int> cts;
        int counter = 0;
        for(int ii = 0; ii < response.shape(0); ++ii)
        {
            if(res_map.find(response(ii, 0)) == res_map.end())
            {
                res_map[response(ii, 0)] = counter;
                ++counter;
                cts.push_back(0);
            }
            cts[res_map[response(ii, 0)]] += 1;
        }
        no_features = double(*(std::max_element(cts.begin(),
                                                cts.end())))
                    / double(response.shape(0));

        /*init not_selected vector;
        not_selected.resize(all_features.shape(1), 0);
        for(int ii = 0; ii < not_selected.size(); ++ii)
        {
            not_selected[ii] = ii;
        }
        initialized = true;
        */
        pivot = selected.begin();
        return true;
    }
};
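/* Illustration (not part of the original file): directly after init(),
 * errors.back() holds the OOB error using all features, and no_features
 * holds the majority-class prior that init() computes as the no-feature
 * reference. A minimal sketch, assuming features/labels already exist:
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * double full_model_error = result.errors.back();
 * double majority_prior   = result.no_features;
 * \endcode
 */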
/** Perform forward selection
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm.
 *                         Features between result.selected.begin() and result.pivot will
 *                         be left untouched.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB error.
 *
 * Forward selection successively chooses the feature that decreases the error rate most.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * \endcode
 * To use forward selection but ensure that a specific feature, e.g. feature 5, is always
 * included, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[0], result.selected[5]);
 * result.pivot = result.selected.begin() + 1;
 * forward_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result,
                       ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors = result.errors;
    VariableSelectionResult::Pivot_t & pivot = result.pivot;
    int featureCount = features.shape(1);
    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused - just ensure that the number of features is
        // the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
            "forward_selection(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }


    int not_selected_size = std::distance(pivot, selected.end());
    while(not_selected_size > 1)
    {
        std::vector<double> current_errors;
        VariableSelectionResult::Pivot_t next = pivot;
        for(int ii = 0; ii < not_selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, typename FeatureT::value_type> cur_feats;
            detail::choose( features,
                            selected.begin(),
                            pivot+1,
                            cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::min_element(current_errors.begin(),
                                                 current_errors.end()));
        next = pivot;
        std::advance(next, pos);
        std::swap(*pivot, *next);
        errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
#ifdef RN_VERBOSE
        std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
        std::cerr << "Choosing " << *pivot << " at error of " << current_errors[pos] << std::endl;
#endif
        ++pivot;
        not_selected_size = std::distance(pivot, selected.end());
    }
}
template<class FeatureT, class ResponseT>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result)
{
    forward_selection(features, response, result, RFErrorCallback());
}
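/* Illustration (not part of the original file): after forward selection,
 * result.selected is ordered by inclusion and result.errors[i] is the error
 * obtained with the first i+1 features (-1 marks unevaluated entries).
 * A sketch for picking the subset size with the smallest evaluated error:
 * \code
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * int best_size = 0;
 * double best_error = 2.0;            // error rates lie in [0, 1]
 * for(unsigned int ii = 0; ii < result.errors.size(); ++ii)
 *     if(result.errors[ii] >= 0.0 && result.errors[ii] < best_error)
 *     {
 *         best_error = result.errors[ii];
 *         best_size  = static_cast<int>(ii) + 1;
 *     }
 * \endcode
 */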
/** Perform backward elimination
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm.
 *                         Features between result.pivot and result.selected.end() will
 *                         be left untouched.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB error.
 *
 * Backward elimination successively eliminates the feature that has the least influence
 * on the error rate.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * backward_elimination(features, labels, result);
 * \endcode
 * To use backward elimination but ensure that a specific feature, e.g. feature 5, is always
 * excluded, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
 * result.pivot = result.selected.end() - 1;
 * backward_elimination(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result,
                          ErrorRateCallBack errorcallback)
{
    int featureCount = features.shape(1);
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors = result.errors;
    VariableSelectionResult::Pivot_t & pivot = result.pivot;

    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused - just ensure that the number of features is
        // the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
            "backward_elimination(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }
    pivot = selected.end() - 1;

    int selected_size = std::distance(selected.begin(), pivot);
    while(selected_size > 1)
    {
        VariableSelectionResult::Pivot_t next = selected.begin();
        std::vector<double> current_errors;
        for(int ii = 0; ii < selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, typename FeatureT::value_type> cur_feats;
            detail::choose( features,
                            selected.begin(),
                            pivot+1,
                            cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::min_element(current_errors.begin(),
                                                 current_errors.end()));
        next = selected.begin();
        std::advance(next, pos);
        std::swap(*pivot, *next);
//      std::cerr << std::distance(selected.begin(), pivot) << " " << pos << " " << current_errors.size() << " " << errors.size() << std::endl;
        errors[std::distance(selected.begin(), pivot)-1] = current_errors[pos];
        selected_size = std::distance(selected.begin(), pivot);
#ifdef RN_VERBOSE
        std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
        std::cerr << "Eliminating " << *pivot << " at error of " << current_errors[pos] << std::endl;
#endif
        --pivot;
    }
}

template<class FeatureT, class ResponseT>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result)
{
    backward_elimination(features, response, result, RFErrorCallback());
}
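/* Illustration (not part of the original file): per the documentation above,
 * eliminated features accumulate behind result.pivot, so the front of
 * result.selected holds the variables that survived elimination longest.
 * A sketch (the subset size k is hypothetical):
 * \code
 * VariableSelectionResult result;
 * backward_elimination(features, labels, result);
 * int k = 5;   // hypothetical number of variables to keep
 * std::vector<int> best_k(result.selected.begin(), result.selected.begin() + k);
 * \endcode
 */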
/** Perform rank selection using a predefined ranking
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm. The struct should be initialized with the
 *                         predefined ranking.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB error.
 *
 * Often a variable importance or score measure is used to create the ordering in which
 * variables have to be selected. This method takes such a ranking and calculates the
 * corresponding error rates.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * std::vector<int>      ranking  = createRanking(features);
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void rank_selection (FeatureT const & features,
                     ResponseT const & response,
                     VariableSelectionResult & result,
                     ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors = result.errors;
    VariableSelectionResult::Pivot_t & iter = result.pivot;
    int featureCount = features.shape(1);
    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused - just ensure that the number of features is
        // the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
            "rank_selection(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }

    int ii = 0;
    for(; iter != selected.end(); ++iter)
    {
        ++ii;
        MultiArray<2, typename FeatureT::value_type> cur_feats;
        detail::choose( features,
                        selected.begin(),
                        iter+1,
                        cur_feats);
        double error = errorcallback(cur_feats, response);
        errors[std::distance(selected.begin(), iter)] = error;
#ifdef RN_VERBOSE
        std::copy(selected.begin(), iter+1, std::ostream_iterator<int>(std::cerr, ", "));
        std::cerr << "Choosing " << *(iter+1) << " at error of " << error << std::endl;
#endif

    }
}

template<class FeatureT, class ResponseT>
void rank_selection (FeatureT const & features,
                     ResponseT const & response,
                     VariableSelectionResult & result)
{
    rank_selection(features, response, result, RFErrorCallback());
}
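/* Illustration (not part of the original file): a ranking for rank_selection()
 * can be derived from any importance measure. The sketch below assumes
 * visitors::VariableImportanceVisitor from rf_visitors.hxx, whose
 * variable_importance_ array is assumed to hold the permutation importance in
 * its last column, and uses get_ranking() (defined at the end of this file):
 * \code
 * visitors::VariableImportanceVisitor var_imp;
 * RandomForest<> rf;
 * rf.learn(features, labels, visitors::create_visitor(var_imp));
 * int last = var_imp.variable_importance_.shape(1) - 1;
 * std::vector<double> importance;
 * for(int ii = 0; ii < var_imp.variable_importance_.shape(0); ++ii)
 *     importance.push_back(var_imp.variable_importance_(ii, last));
 * std::vector<int> ranking;
 * get_ranking(importance, ranking);
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 */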

enum ClusterLeafTypes{c_Leaf = 95, c_Node = 99};
543
544/* View of a Node in the hierarchical clustering
545 * class
546 * For internal use only -
547 * \sa NodeBase
548 */
549class ClusterNode
550: public NodeBase
551{
552 public:
553
554 typedef NodeBase BT;
555
556 /**constructors **/
557 ClusterNode():NodeBase(){}
558 ClusterNode( int nCol,
559 BT::T_Container_type & topology,
560 BT::P_Container_type & split_param)
561 : BT(nCol + 5, 5,topology, split_param)
562 {
563 status() = 0;
564 BT::column_data()[0] = nCol;
565 if(nCol == 1)
566 BT::typeID() = c_Leaf;
567 else
568 BT::typeID() = c_Node;
569 }
570
571 ClusterNode( BT::T_Container_type const & topology,
572 BT::P_Container_type const & split_param,
573 int n )
574 : NodeBase(5 , 5,topology, split_param, n)
575 {
576 //TODO : is there a more elegant way to do this?
577 BT::topology_size_ += BT::column_data()[0];
578 }
579
580 ClusterNode( BT & node_)
581 : BT(5, 5, node_)
582 {
583 //TODO : is there a more elegant way to do this?
584 BT::topology_size_ += BT::column_data()[0];
585 BT::parameter_size_ += 0;
586 }
587 int index()
588 {
589 return static_cast<int>(BT::parameters_begin()[1]);
590 }
591 void set_index(int in)
592 {
593 BT::parameters_begin()[1] = in;
594 }
595 double& mean()
596 {
597 return BT::parameters_begin()[2];
598 }
599 double& stdev()
600 {
601 return BT::parameters_begin()[3];
602 }
603 double& status()
604 {
605 return BT::parameters_begin()[4];
606 }
607};

/** Stack entry class for the HClustering class
 */
struct HC_Entry
{
    int parent;
    int level;
    int addr;
    bool infm;
    HC_Entry(int p, int l, int a, bool in)
    : parent(p), level(l), addr(a), infm(in)
    {}
};

/** Hierarchical clustering class.
 *  Performs single linkage clustering.
 *  \code
 *  HClustering linkage;
 *  Matrix<double> distance = get_distance_matrix();
 *  linkage.cluster(distance);
 *  // Draw the clustering tree.
 *  Draw<double, int> draw(features, labels, "linkagetree.graph");
 *  linkage.breadth_first_traversal(draw);
 *  \endcode
 *  \sa ClusterImportanceVisitor
 *
 *  Once the clustering has taken place, information queries can be made
 *  using the breadth_first_traversal() and iterate() methods.
 */
class HClustering
{
public:
    typedef MultiArrayShape<2>::type Shp;
    ArrayVector<int> topology_;
    ArrayVector<double> parameters_;
    int begin_addr;

    // single linkage distance update between two clusters
    double dist_func(double a, double b)
    {
        return std::min(a, b);
    }

    /** Visit each node with a Functor
     *  in creation order (should be depth first)
     */
    template<class Functor>
    void iterate(Functor & tester)
    {

        std::vector<int> stack;
        stack.push_back(begin_addr);
        while(!stack.empty())
        {
            ClusterNode node(topology_, parameters_, stack.back());
            stack.pop_back();
            if(!tester(node))
            {
                if(node.columns_size() != 1)
                {
                    stack.push_back(node.child(0));
                    stack.push_back(node.child(1));
                }
            }
        }
    }

    /** Perform breadth first traversal of the hierarchical cluster tree
     */
    template<class Functor>
    void breadth_first_traversal(Functor & tester)
    {

        std::queue<HC_Entry> queue;
        int level = 0;
        int parent = -1;
        int addr = -1;
        bool infm = false;
        queue.push(HC_Entry(parent, level, begin_addr, infm));
        while(!queue.empty())
        {
            level = queue.front().level;
            parent = queue.front().parent;
            addr = queue.front().addr;
            infm = queue.front().infm;
            ClusterNode node(topology_, parameters_, queue.front().addr);
            ClusterNode parnt;
            if(parent != -1)
            {
                parnt = ClusterNode(topology_, parameters_, parent);
            }
            queue.pop();
            bool istrue = tester(node, level, parnt, infm);
            if(node.columns_size() != 1)
            {
                queue.push(HC_Entry(addr, level + 1, node.child(0), istrue));
                queue.push(HC_Entry(addr, level + 1, node.child(1), istrue));
            }
        }
    }
    /** save to HDF5 - defunct - has to be updated to the new HDF5 interface
     */
#ifdef HasHDF5
    void save(std::string file, std::string prefix)
    {

        vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
                         MultiArrayView<2, int>(
                             Shp(topology_.size(), 1),
                             topology_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
                         MultiArrayView<2, double>(
                             Shp(parameters_.size(), 1),
                             parameters_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
                         MultiArrayView<2, int>(Shp(1, 1), &begin_addr));

    }
#endif

    /** Perform single linkage clustering
     *  \param distance distance matrix used. \sa CorrelationVisitor
     */
    template<class T, class C>
    void cluster(MultiArrayView<2, T, C> distance)
    {
        MultiArray<2, T> dist(distance);
        std::vector<std::pair<int, int> > addr;
        int index = 0;
        for(int ii = 0; ii < distance.shape(0); ++ii)
        {
            addr.push_back(std::make_pair(topology_.size(), ii));
            ClusterNode leaf(1, topology_, parameters_);
            leaf.set_index(index);
            ++index;
            leaf.columns_begin()[0] = ii;
        }

        while(addr.size() != 1)
        {
            // find the two nodes with the smallest distance
            int ii_min = 0;
            int jj_min = 1;
            double min_dist = dist((addr.begin()+ii_min)->second,
                                   (addr.begin()+jj_min)->second);
            for(unsigned int ii = 0; ii < addr.size(); ++ii)
            {
                for(unsigned int jj = ii+1; jj < addr.size(); ++jj)
                {
                    if( dist((addr.begin()+ii_min)->second,
                             (addr.begin()+jj_min)->second)
                      > dist((addr.begin()+ii)->second,
                             (addr.begin()+jj)->second))
                    {
                        min_dist = dist((addr.begin()+ii)->second,
                                        (addr.begin()+jj)->second);
                        ii_min = ii;
                        jj_min = jj;
                    }
                }
            }

            // merge the two nodes
            int col_size = 0;
            // The problem is that creating a new node invalidates the iterators stored
            // in firstChild and secondChild.
            {
                ClusterNode firstChild(topology_,
                                       parameters_,
                                       (addr.begin() + ii_min)->first);
                ClusterNode secondChild(topology_,
                                        parameters_,
                                        (addr.begin() + jj_min)->first);
                col_size = firstChild.columns_size() + secondChild.columns_size();
            }
            int cur_addr = topology_.size();
            begin_addr = cur_addr;
//          std::cerr << col_size << std::endl;
            ClusterNode parent(col_size,
                               topology_,
                               parameters_);
            ClusterNode firstChild(topology_,
                                   parameters_,
                                   (addr.begin() + ii_min)->first);
            ClusterNode secondChild(topology_,
                                    parameters_,
                                    (addr.begin() + jj_min)->first);
            parent.parameters_begin()[0] = min_dist;
            parent.set_index(index);
            ++index;
            std::merge(firstChild.columns_begin(), firstChild.columns_end(),
                       secondChild.columns_begin(), secondChild.columns_end(),
                       parent.columns_begin());
            // merge nodes in addr
            int to_desc;
            int ii_keep;
            if(*parent.columns_begin() == *firstChild.columns_begin())
            {
                parent.child(0) = (addr.begin()+ii_min)->first;
                parent.child(1) = (addr.begin()+jj_min)->first;
                (addr.begin()+ii_min)->first = cur_addr;
                ii_keep = ii_min;
                to_desc = (addr.begin()+jj_min)->second;
                addr.erase(addr.begin()+jj_min);
            }
            else
            {
                parent.child(1) = (addr.begin()+ii_min)->first;
                parent.child(0) = (addr.begin()+jj_min)->first;
                (addr.begin()+jj_min)->first = cur_addr;
                ii_keep = jj_min;
                to_desc = (addr.begin()+ii_min)->second;
                addr.erase(addr.begin()+ii_min);
            }
            // update distances;

            for(int jj = 0; jj < static_cast<int>(addr.size()); ++jj)
            {
                if(jj == ii_keep)
                    continue;
                double bla = dist_func(
                                dist(to_desc, (addr.begin()+jj)->second),
                                dist((addr.begin()+ii_keep)->second,
                                     (addr.begin()+jj)->second));

                dist((addr.begin()+ii_keep)->second,
                     (addr.begin()+jj)->second) = bla;
                dist((addr.begin()+jj)->second,
                     (addr.begin()+ii_keep)->second) = bla;
            }
        }
    }

};
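/* Illustration (not part of the original file): any functor with
 * bool operator()(Node&) can be passed to iterate(); returning true stops
 * the descent below the visited node. A sketch that records the size of
 * every cluster in an already-built linkage tree:
 * \code
 * struct CollectClusterSizes
 * {
 *     std::vector<int> sizes;
 *     template<class Node>
 *     bool operator()(Node & node)
 *     {
 *         sizes.push_back(node.columns_size());
 *         return false;                 // false: continue into the children
 *     }
 * };
 * CollectClusterSizes collect;
 * linkage.iterate(collect);
 * \endcode
 */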

/** Normalize the status value in the HClustering tree (HClustering visitor)
 */
class NormalizeStatus
{
public:
    double n;
    /** Constructor
     *  \param m normalize status() by m
     */
    NormalizeStatus(double m)
    : n(m)
    {}
    template<class Node>
    bool operator()(Node& node)
    {
        node.status() /= n;
        return false;
    }
};

/** Perform permutation importance on HClustering clusters
 *  (See the visit_after_tree() method of visitors::VariableImportance for
 *  the basic idea - just that the permutation is applied not only to single
 *  variables but also to whole clusters.)
 */
template<class Iter, class DT>
class PermuteCluster
{
public:
    Matrix<double> tmp_mem_;
    MultiArrayView<2, double> perm_imp;
    MultiArrayView<2, double> orig_imp;
    Matrix<double> feats_;
    Matrix<int> labels_;
    const int nPerm;
    DT const & dt;
    int index;
    int oob_size;

    template<class Feat_T, class Label_T>
    PermuteCluster(Iter a,
                   Iter b,
                   Feat_T const & feats,
                   Label_T const & labls,
                   MultiArrayView<2, double> p_imp,
                   MultiArrayView<2, double> o_imp,
                   int np,
                   DT const & dt_)
    : tmp_mem_(_spl(a, b).size(), feats.shape(1)),
      perm_imp(p_imp),
      orig_imp(o_imp),
      feats_(_spl(a, b).size(), feats.shape(1)),
      labels_(_spl(a, b).size(), 1),
      nPerm(np),
      dt(dt_),
      index(0),
      oob_size(b - a)
    {
        copy_splice(_spl(a, b),
                    _spl(feats.shape(1)),
                    feats,
                    feats_);
        copy_splice(_spl(a, b),
                    _spl(labls.shape(1)),
                    labls,
                    labels_);
    }

    template<class Node>
    bool operator()(Node& node)
    {
        tmp_mem_ = feats_;
        RandomMT19937 random;
        int class_count = perm_imp.shape(1) - 1;
        // permute the columns of a cluster together
        for(int kk = 0; kk < nPerm; ++kk)
        {
            tmp_mem_ = feats_;
            for(int ii = 0; ii < rowCount(feats_); ++ii)
            {
                int index = random.uniformInt(rowCount(feats_) - ii) + ii;
                for(int jj = 0; jj < node.columns_size(); ++jj)
                {
                    if(node.columns_begin()[jj] != feats_.shape(1))
                        tmp_mem_(ii, node.columns_begin()[jj])
                            = tmp_mem_(index, node.columns_begin()[jj]);
                }
            }

            for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
            {
                if(dt
                    .predictLabel(rowVector(tmp_mem_, ii))
                   == labels_(ii, 0))
                {
                    // per class
                    ++perm_imp(index, labels_(ii, 0));
                    // total
                    ++perm_imp(index, class_count);
                }
            }
        }
        double node_status = perm_imp(index, class_count);
        node_status /= nPerm;
        node_status -= orig_imp(0, class_count);
        node_status *= -1;
        node_status /= oob_size;
        node.status() += node_status;
        ++index;

        return false;
    }
};

/** Convert the clustering tree into a list (HClustering visitor)
 */
class GetClusterVariables
{
public:
    /** NumberOfClusters x NumberOfVariables MultiArrayView containing
     *  in each row the variables belonging to a cluster
     */
    MultiArrayView<2, int> variables;
    int index;
    GetClusterVariables(MultiArrayView<2, int> vars)
    : variables(vars), index(0)
    {}
#ifdef HasHDF5
    void save(std::string file, std::string prefix)
    {
        vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
                         variables);
    }
#endif

    template<class Node>
    bool operator()(Node& node)
    {
        for(int ii = 0; ii < node.columns_size(); ++ii)
            variables(index, ii) = node.columns_begin()[ii];
        ++index;
        return false;
    }
};
/** corrects the status fields of a linkage clustering (HClustering visitor)
 *
 *  such that status(currentNode) = min(status(parent), status(currentNode))
 *  \sa cluster_permutation_importance()
 */
class CorrectStatus
{
public:
    template<class Nde>
    bool operator()(Nde & cur, int /*level*/, Nde parent, bool /*infm*/)
    {
        if(parent.hasData_)
            cur.status() = std::min(parent.status(), cur.status());
        return true;
    }
};

/** draw the current linkage clustering (HClustering visitor)
 *
 *  creates a graphviz .dot file
 *  usage:
 *  \code
 *  HClustering linkage;
 *  Matrix<double> distance = get_distance_matrix();
 *  linkage.cluster(distance);
 *  Draw<double, int> draw(features, labels, "linkagetree.graph");
 *  linkage.breadth_first_traversal(draw);
 *  \endcode
 */
template<class T1,
         class T2,
         class C1 = UnstridedArrayTag,
         class C2 = UnstridedArrayTag>
class Draw
{
public:
    MultiArrayView<2, T1, C1> const & features_;
    MultiArrayView<2, T2, C2> const & labels_;
    std::ofstream graphviz;


    Draw(MultiArrayView<2, T1, C1> const & features,
         MultiArrayView<2, T2, C2> const & labels,
         std::string const gz)
    : features_(features), labels_(labels),
      graphviz(gz.c_str(), std::ios::out)
    {
        graphviz << "digraph G\n{\n node [shape=\"record\"]";
    }
    ~Draw()
    {
        graphviz << "\n}\n";
        graphviz.close();
    }

    template<class Nde>
    bool operator()(Nde & cur, int /*level*/, Nde parent, bool /*infm*/)
    {
        graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: " << cur.columns_size() << "\\n";
        graphviz << " status: " << cur.status() << "\\n";
        for(int kk = 0; kk < cur.columns_size(); ++kk)
        {
            graphviz << cur.columns_begin()[kk] << " ";
            if(kk % 15 == 14)
                graphviz << "\\n";
        }
        graphviz << "\"] [color = \"" << cur.status() << " 1.000 1.000\"];\n";
        if(parent.hasData_)
            graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() << "\";\n";
        return true;
    }
};

/** calculate cluster-based permutation importance while learning (RandomForestVisitor)
 */
class ClusterImportanceVisitor : public visitors::VisitorBase
{
    public:

    /** List of variables as produced by GetClusterVariables
     */
    MultiArray<2, int> variables;
    /** Corresponding importance measures
     */
    MultiArray<2, double> cluster_importance_;
    /** Corresponding error
     */
    MultiArray<2, double> cluster_stdev_;
    int repetition_count_;
    bool in_place_;
    HClustering & clustering;


#ifdef HasHDF5
    void save(std::string filename, std::string prefix)
    {
        std::string prefix1 = "cluster_importance_" + prefix;
        writeHDF5(filename.c_str(),
                  prefix1.c_str(),
                  cluster_importance_);
        prefix1 = "vars_" + prefix;
        writeHDF5(filename.c_str(),
                  prefix1.c_str(),
                  variables);
    }
#endif

    ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
    : repetition_count_(rep_cnt), clustering(clst)

    {}
    /** Allocate enough memory
     */
    template<class RF, class PR>
    void visit_at_beginning(RF const & rf, PR const & /*pr*/)
    {
        Int32 const class_count = rf.ext_param_.class_count_;
        Int32 const column_count = rf.ext_param_.column_count_ + 1;
        cluster_importance_
            .reshape(MultiArrayShape<2>::type(2*column_count-1,
                                              class_count+1));
        cluster_stdev_
            .reshape(MultiArrayShape<2>::type(2*column_count-1,
                                              class_count+1));
        variables
            .reshape(MultiArrayShape<2>::type(2*column_count-1,
                                              column_count), -1);
        GetClusterVariables gcv(variables);
        clustering.iterate(gcv);

    }

    /** compute permutation based variable importance.
     *  (Only an array of size oob_sample_count x 1 is created
     *   - as opposed to oob_sample_count x feature_count in the other method.)
     *
     *  \sa FieldProxy
     */
    template<class RF, class PR, class SM, class ST>
    void after_tree_ip_impl(RF& rf, PR & pr, SM & sm, ST & /*st*/, int index)
    {
        typedef MultiArrayShape<2>::type Shp_t;
        Int32 column_count = rf.ext_param_.column_count_ + 1;
        Int32 class_count = rf.ext_param_.class_count_;

        // remove the const cast on the features (yep, I know what I am
        // doing here.) data is not destroyed.
        typename PR::Feature_t & features
            = const_cast<typename PR::Feature_t &>(pr.features());

        // find the oob indices of the current tree.
        ArrayVector<Int32> oob_indices;
        ArrayVector<Int32>::iterator
                           iter;

        if(rf.ext_param_.actual_msample_ < pr.features().shape(0) - 10000)
        {
            ArrayVector<int> cts(2, 0);
            ArrayVector<Int32> indices(pr.features().shape(0));
            for(int ii = 0; ii < pr.features().shape(0); ++ii)
                indices[ii] = ii;
            std::random_shuffle(indices.begin(), indices.end());
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
            {
                if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
                {
                    oob_indices.push_back(indices[ii]);
                    ++cts[pr.response()(indices[ii], 0)];
                }
            }
        }
        else
        {
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
                if(!sm.is_used()[ii])
                    oob_indices.push_back(ii);
        }

        // Random foo
        RandomMT19937 random(RandomSeed);
        UniformIntRandomFunctor<RandomMT19937>
                      randint(random);

        // make some space for the results
        MultiArray<2, double>
                    oob_right(Shp_t(1, class_count + 1));

        // get the oob success rate with the original samples
        for(iter = oob_indices.begin();
            iter != oob_indices.end();
            ++iter)
        {
            if(rf.tree(index)
                 .predictLabel(rowVector(features, *iter))
               == pr.response()(*iter, 0))
            {
                // per class
                ++oob_right[pr.response()(*iter, 0)];
                // total
                ++oob_right[class_count];
            }
        }

        MultiArray<2, double>
                    perm_oob_right(Shp_t(2*column_count - 1, class_count + 1));

        PermuteCluster<ArrayVector<Int32>::iterator, typename RF::DecisionTree_t>
                    pc(oob_indices.begin(), oob_indices.end(),
                       pr.features(),
                       pr.response(),
                       perm_oob_right,
                       oob_right,
                       repetition_count_,
                       rf.tree(index));
        clustering.iterate(pc);

        perm_oob_right /= repetition_count_;
        for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
            rowVector(perm_oob_right, ii) -= oob_right;

        perm_oob_right *= -1;
        perm_oob_right /= oob_indices.size();
        cluster_importance_ += perm_oob_right;
    }

    /** calculate permutation based impurity after every tree has been
     *  learned. Default behaviour is that this happens out of place.
     *  If you have very big data sets and want to avoid copying of data,
     *  set the in_place_ flag to true.
     */
    template<class RF, class PR, class SM, class ST>
    void visit_after_tree(RF& rf, PR & pr, SM & sm, ST & st, int index)
    {
        after_tree_ip_impl(rf, pr, sm, st, index);
    }

    /** Normalise variable importance after the number of trees is known.
     */
    template<class RF, class PR>
    void visit_at_end(RF & rf, PR & /*pr*/)
    {
        NormalizeStatus nrm(rf.tree_count());
        clustering.iterate(nrm);
        cluster_importance_ /= rf.trees_.size();
    }
};

/** Perform hierarchical clustering of variables and assess the importance of clusters
 *
 * \param features IN:  n x p matrix containing n instances with p attributes/features
 *                      used in the variable selection algorithm
 * \param response IN:  n x 1 matrix containing the corresponding response
 * \param linkage  OUT: hierarchical grouping of variables.
 * \param distance OUT: distance matrix used for creating the linkage
 *
 * Performs hierarchical clustering of variables and calculates the permutation importance
 * measure of each of the clusters. Use the Draw functor to create human readable output.
 * The cluster permutation importance measure corresponds to the normal permutation importance
 * measure with all columns belonging to a cluster permuted together.
 * The importance measure for each cluster is stored as the status() field of each cluster node.
 * \sa HClustering
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * HClustering linkage;
 * MultiArray<2, double> distance;
 * cluster_permutation_importance(features, labels, linkage, distance);
 * // create graphviz output
 *
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 *
 * \endcode
 *
 *
 */
template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage,
                                    MultiArray<2, double> & distance)
{

    RandomForestOptions opt;
    opt.tree_count(100);
    if(features.shape(0) > 40000)
        opt.samples_per_tree(20000).use_stratification(RF_EQUAL);


    RandomForest<> RF(opt);
    visitors::RandomForestProgressVisitor progress;
    visitors::CorrelationVisitor missc;
    RF.learn(features, response,
             create_visitor(missc, progress));
    distance = missc.distance;
    /*
    missc.save(exp_dir + dset.name() + "_result.h5", dset.name()+"MACH");
    */


    // Produce linkage
    linkage.cluster(distance);

    //linkage.save(exp_dir + dset.name() + "_result.h5", "_linkage_CC/");
    ClusterImportanceVisitor ci(linkage);
    RandomForest<> RF2(opt);
    RF2.learn(features,
              response,
              create_visitor(progress, ci));


    CorrectStatus cs;
    linkage.breadth_first_traversal(cs);

    //ci.save(exp_dir + dset.name() + "_result.h5", dset.name());
    //Draw<double, int> draw(dset.features(), dset.response(), exp_dir+ dset.name() + ".graph");
    //linkage.breadth_first_traversal(draw);

}

template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage)
{
    MultiArray<2, double> distance;
    cluster_permutation_importance(features, response, linkage, distance);
}
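/* Illustration (not part of the original file): after clustering, the
 * variables belonging to each cluster can be read out with
 * GetClusterVariables, mirroring the allocation done in
 * ClusterImportanceVisitor::visit_at_beginning() (a linkage tree over p
 * leaves has 2p-1 nodes; unused slots stay -1). A sketch, assuming linkage
 * was built over the p columns of features:
 * \code
 * int p = features.shape(1);
 * MultiArray<2, int> clusters(MultiArrayShape<2>::type(2*p - 1, p), -1);
 * GetClusterVariables gcv(clusters);
 * linkage.iterate(gcv);
 * \endcode
 */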

template<class Array1, class Vector1>
void get_ranking(Array1 const & in, Vector1 & out)
{
    std::map<double, int> mymap;
    for(int ii = 0; ii < in.size(); ++ii)
        mymap[in[ii]] = ii;
    for(std::map<double, int>::reverse_iterator iter = mymap.rbegin(); iter != mymap.rend(); ++iter)
    {
        out.push_back(iter->second);
    }
}
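/* Usage note (not part of the original file): get_ranking() turns an array of
 * importance values into variable indices sorted by decreasing importance.
 * Because it keys a std::map by the importance values, variables with exactly
 * equal importance collapse into a single entry. A sketch:
 * \code
 * double imp_values[] = { 0.10, 0.42, 0.07 };
 * std::vector<double> importance(imp_values, imp_values + 3);
 * std::vector<int> ranking;
 * get_ranking(importance, ranking);   // ranking == {1, 0, 2}
 * \endcode
 */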
}//namespace algorithms
}//namespace rf
}//namespace vigra
#endif //VIGRA_RF_ALGORITHM_HXX