diff options
Diffstat (limited to 'ml/dlib/dlib/random_forest/random_forest_regression.h')
-rw-r--r-- | ml/dlib/dlib/random_forest/random_forest_regression.h | 738 |
1 files changed, 0 insertions, 738 deletions
diff --git a/ml/dlib/dlib/random_forest/random_forest_regression.h b/ml/dlib/dlib/random_forest/random_forest_regression.h deleted file mode 100644 index a61f7a1a2..000000000 --- a/ml/dlib/dlib/random_forest/random_forest_regression.h +++ /dev/null @@ -1,738 +0,0 @@ -// Copyright (C) 2018 Davis E. King (davis@dlib.net) -// License: Boost Software License See LICENSE.txt for the full license. -#ifndef DLIB_RANdOM_FOREST_REGRESSION_H_ -#define DLIB_RANdOM_FOREST_REGRESSION_H_ - -#include "random_forest_regression_abstract.h" -#include <vector> -#include "../matrix.h" -#include <algorithm> -#include "../threads.h" - -namespace dlib -{ - -// ---------------------------------------------------------------------------------------- - - class dense_feature_extractor - { - - public: - typedef uint32_t feature; - typedef matrix<double,0,1> sample_type; - - dense_feature_extractor( - ) = default; - - void setup ( - const std::vector<sample_type>& x, - const std::vector<double>& y - ) - { - DLIB_CASSERT(x.size() > 0); - DLIB_CASSERT(x.size() == y.size()); - for (auto& el : x) - DLIB_CASSERT(el.size() == x[0].size(), "All the vectors in a training set have to have the same dimensionality."); - - DLIB_CASSERT(x[0].size() != 0, "The vectors can't be empty."); - - num_feats = x[0].size(); - } - - - void get_random_features ( - dlib::rand& rnd, - size_t num, - std::vector<feature>& feats - ) const - { - DLIB_ASSERT(max_num_feats() != 0); - num = std::min(num, num_feats); - - feats.clear(); - for (size_t i = 0; i < num_feats; ++i) - feats.push_back(i); - - // now pick num features at random - for (size_t i = 0; i < num; ++i) - { - auto idx = rnd.get_integer_in_range(i,num_feats); - std::swap(feats[i], feats[idx]); - } - feats.resize(num); - } - - double extract_feature_value ( - const sample_type& item, - const feature& f - ) const - { - DLIB_ASSERT(max_num_feats() != 0); - return item(f); - } - - size_t max_num_feats ( - ) const - { - return num_feats; - } - - friend void serialize(const dense_feature_extractor& item, std::ostream& out) - { - serialize("dense_feature_extractor", out); - serialize(item.num_feats, out); - } - - friend void deserialize(dense_feature_extractor& item, std::istream& in) - { - check_serialized_version("dense_feature_extractor", in); - deserialize(item.num_feats, in); - } - - private: - size_t num_feats = 0; - }; - -// ---------------------------------------------------------------------------------------- - - template < - typename feature_extractor - > - struct internal_tree_node - { - uint32_t left; - uint32_t right; - float split_threshold; - typename feature_extractor::feature split_feature; - }; - - template <typename feature_extractor> - void serialize(const internal_tree_node<feature_extractor>& item, std::ostream& out) - { - serialize(item.left, out); - serialize(item.right, out); - serialize(item.split_threshold, out); - serialize(item.split_feature, out); - } - - template <typename feature_extractor> - void deserialize(internal_tree_node<feature_extractor>& item, std::istream& in) - { - deserialize(item.left, in); - deserialize(item.right, in); - deserialize(item.split_threshold, in); - deserialize(item.split_feature, in); - } - -// ---------------------------------------------------------------------------------------- - - template < - typename feature_extractor = dense_feature_extractor - > - class random_forest_regression_function - { - - public: - - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::sample_type sample_type; - - random_forest_regression_function( - ) = default; - - random_forest_regression_function ( - feature_extractor_type&& fe_, - std::vector<std::vector<internal_tree_node<feature_extractor>>>&& trees_, - std::vector<std::vector<float>>&& leaves_ - ) : - fe(std::move(fe_)), - trees(std::move(trees_)), - leaves(std::move(leaves_)) - { - DLIB_ASSERT(trees.size() > 0); - DLIB_ASSERT(trees.size() == leaves.size(), "Every set of tree nodes has to have leaves"); -#ifdef ENABLE_ASSERTS - for (size_t i = 0; i < trees.size(); ++i) - { - DLIB_ASSERT(trees[i].size() > 0, "A tree can't have 0 leaves."); - for (auto& node : trees[i]) - { - DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.left, "left node index in tree is too big. There is no associated tree node or leaf."); - DLIB_ASSERT(trees[i].size()+leaves[i].size() > node.right, "right node index in tree is too big. There is no associated tree node or leaf."); - } - } -#endif - } - - size_t get_num_trees( - ) const - { - return trees.size(); - } - - const std::vector<std::vector<internal_tree_node<feature_extractor>>>& get_internal_tree_nodes ( - ) const { return trees; } - - const std::vector<std::vector<float>>& get_tree_leaves ( - ) const { return leaves; } - - const feature_extractor_type& get_feature_extractor ( - ) const { return fe; } - - double operator() ( - const sample_type& x - ) const - { - DLIB_ASSERT(get_num_trees() > 0); - - double accum = 0; - - for (size_t i = 0; i < trees.size(); ++i) - { - auto& tree = trees[i]; - // walk the tree to the leaf - uint32_t idx = 0; - while(idx < tree.size()) - { - auto feature_value = fe.extract_feature_value(x, tree[idx].split_feature); - if (feature_value < tree[idx].split_threshold) - idx = tree[idx].left; - else - idx = tree[idx].right; - } - // compute leaf index - accum += leaves[i][idx-tree.size()]; - } - - return accum/trees.size(); - } - - friend void serialize(const random_forest_regression_function& item, std::ostream& out) - { - serialize("random_forest_regression_function", out); - serialize(item.fe, out); - serialize(item.trees, out); - serialize(item.leaves, out); - } - - friend void deserialize(random_forest_regression_function& item, std::istream& in) - { - check_serialized_version("random_forest_regression_function", in); - deserialize(item.fe, in); - deserialize(item.trees, in); - deserialize(item.leaves, in); - } - - private: - - /*! - CONVENTION - - trees.size() == leaves.size() - - Any .left or .right index in trees that is larger than the number of - nodes in the tree references a leaf. Moreover, the index of the leaf is - computed by subtracting the number of nodes in the tree. - !*/ - - feature_extractor_type fe; - - // internal nodes of trees - std::vector<std::vector<internal_tree_node<feature_extractor>>> trees; - // leaves of trees - std::vector<std::vector<float>> leaves; - - }; - -// ---------------------------------------------------------------------------------------- - - template < - typename feature_extractor = dense_feature_extractor - > - class random_forest_regression_trainer - { - public: - typedef feature_extractor feature_extractor_type; - typedef random_forest_regression_function<feature_extractor> trained_function_type; - typedef typename feature_extractor::sample_type sample_type; - - - random_forest_regression_trainer ( - ) = default; - - const feature_extractor_type& get_feature_extractor ( - ) const - { - return fe_; - } - - void set_feature_extractor ( - const feature_extractor_type& feat_extractor - ) - { - fe_ = feat_extractor; - } - - void set_seed ( - const std::string& seed - ) - { - random_seed = seed; - } - - const std::string& get_random_seed ( - ) const - { - return random_seed; - } - - size_t get_num_trees ( - ) const - { - return num_trees; - } - - void set_num_trees ( - size_t num - ) - { - DLIB_CASSERT(num > 0); - num_trees = num; - } - - void set_feature_subsampling_fraction ( - double frac - ) - { - DLIB_CASSERT(0 < frac && frac <= 1); - feature_subsampling_frac = frac; - } - - double get_feature_subsampling_frac( - ) const - { - return feature_subsampling_frac; - } - - void set_min_samples_per_leaf ( - size_t num - ) - { - DLIB_ASSERT(num > 0); - min_samples_per_leaf = num; - } - - size_t get_min_samples_per_leaf( - ) const - { - return min_samples_per_leaf; - } - - void be_verbose ( - ) - { - verbose = true; - } - - void be_quiet ( - ) - { - verbose = false; - } - - trained_function_type train ( - const std::vector<sample_type>& x, - const std::vector<double>& y - ) const - { - std::vector<double> junk; - return do_train(x,y,junk,false); - } - - trained_function_type train ( - const std::vector<sample_type>& x, - const std::vector<double>& y, - std::vector<double>& oob_values - ) const - { - return do_train(x,y,oob_values,true); - } - - private: - - trained_function_type do_train ( - const std::vector<sample_type>& x, - const std::vector<double>& y, - std::vector<double>& oob_values, - bool compute_oob_values - ) const - { - DLIB_CASSERT(x.size() == y.size()); - DLIB_CASSERT(x.size() > 0); - - feature_extractor_type fe = fe_; - fe.setup(x,y); - - DLIB_CASSERT(fe.max_num_feats() != 0); - - std::vector<std::vector<internal_tree_node<feature_extractor>>> all_trees(num_trees); - std::vector<std::vector<float>> all_leaves(num_trees); - - const double sumy = sum(mat(y)); - - const size_t feats_per_node = std::max(1.0,std::round(fe.max_num_feats()*feature_subsampling_frac)); - - // Each tree couldn't have more than this many interior nodes. It might - // end up having less though. We need to know this value because the way - // we mark a left or right pointer in a tree as pointing to a leaf is by - // making its index larger than the number of interior nodes in the tree. - // But we don't know the tree's size before we finish building it. So we - // will use max_num_nodes as a proxy during tree construction and then go - // back and fix it once a tree's size is known. - const uint32_t max_num_nodes = y.size(); - - std::vector<uint32_t> oob_hits; - - if (compute_oob_values) - { - oob_values.resize(y.size()); - oob_hits.resize(y.size()); - } - - - std::mutex m; - - // Calling build_tree(i) creates the ith tree and stores the results in - // all_trees and all_leaves. - auto build_tree = [&](long i) - { - dlib::rand rnd(random_seed + std::to_string(i)); - auto& tree = all_trees[i]; - auto& leaves = all_leaves[i]; - - // Check if there are fewer than min_samples_per_leaf and if so then - // don't make any tree. Just average the things and be done. - if (y.size() <= min_samples_per_leaf) - { - leaves.push_back(sumy/y.size()); - return; - } - - - // pick a random bootstrap of the data. - std::vector<std::pair<float,uint32_t>> idxs(y.size()); - for (auto& idx : idxs) - idx = std::make_pair(0,rnd.get_integer(y.size())); - - // We are going to use ranges_to_process as a stack that tracks which - // range of samples we are going to split next. - std::vector<range_t> ranges_to_process; - // start with the root of the tree, i.e. the entire range of training - // samples. - ranges_to_process.emplace_back(sumy,0,y.size()); - // push an unpopulated root node into the tree. We will populate it - // when we process its corresponding range. - tree.emplace_back(); - - std::vector<typename feature_extractor::feature> feats; - - while(ranges_to_process.size() > 0) - { - // Grab the next range/node to process. - const auto range = ranges_to_process.back(); - ranges_to_process.pop_back(); - - - // Get the split features we will consider at this node. - fe.get_random_features(rnd, feats_per_node, feats); - // Then find the best split - auto best_split = find_best_split_among_feats(fe, range, feats, x, y, idxs); - - range_t left_split(best_split.left_sum, range.begin, best_split.split_idx); - range_t right_split(best_split.right_sum, best_split.split_idx, range.end); - - DLIB_ASSERT(left_split.begin < left_split.end); - DLIB_ASSERT(right_split.begin < right_split.end); - - // Now that we know the split we can populate the parent node we popped - // from ranges_to_process. - tree[range.tree_idx].split_threshold = best_split.split_threshold; - tree[range.tree_idx].split_feature = best_split.split_feature; - - // If the left split is big enough to make a new interior leaf - // node. We also stop splitting if all the samples went into this node. - // This could happen if the features are all uniform so there just - // isn't any way to split them anymore. - if (left_split.size() > min_samples_per_leaf && right_split.size() != 0) - { - // allocate an interior leaf node for it. - left_split.tree_idx = tree.size(); - tree.emplace_back(); - // set the pointer in the parent node to the newly allocated - // node. - tree[range.tree_idx].left = left_split.tree_idx; - - ranges_to_process.emplace_back(left_split); - } - else - { - // Add to leaves. Don't forget to set the pointer in the - // parent node to the newly allocated leaf node. - tree[range.tree_idx].left = leaves.size() + max_num_nodes; - leaves.emplace_back(left_split.avg()); - } - - - // If the right split is big enough to make a new interior leaf - // node. We also stop splitting if all the samples went into this node. - // This could happen if the features are all uniform so there just - // isn't any way to split them anymore. - if (right_split.size() > min_samples_per_leaf && left_split.size() != 0) - { - // allocate an interior leaf node for it. - right_split.tree_idx = tree.size(); - tree.emplace_back(); - // set the pointer in the parent node to the newly allocated - // node. - tree[range.tree_idx].right = right_split.tree_idx; - - ranges_to_process.emplace_back(right_split); - } - else - { - // Add to leaves. Don't forget to set the pointer in the - // parent node to the newly allocated leaf node. - tree[range.tree_idx].right = leaves.size() + max_num_nodes; - leaves.emplace_back(right_split.avg()); - } - } // end while (still building tree) - - // Fix the leaf pointers in the tree now that we know the correct - // tree.size() value. - DLIB_CASSERT(max_num_nodes >= tree.size()); - const auto offset = max_num_nodes - tree.size(); - for (auto& n : tree) - { - if (n.left >= max_num_nodes) - n.left -= offset; - if (n.right >= max_num_nodes) - n.right -= offset; - } - - - if (compute_oob_values) - { - std::sort(idxs.begin(), idxs.end(), - [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) {return a.second<b.second; }); - - std::lock_guard<std::mutex> lock(m); - - size_t j = 0; - for (size_t i = 0; i < oob_values.size(); ++i) - { - // check if i is in idxs - while(j < idxs.size() && i > idxs[j].second) - ++j; - - // i isn't in idxs so it's an oob sample and we should process it. - if (j == idxs.size() || idxs[j].second != i) - { - oob_hits[i]++; - - // walk the tree to find the leaf value for this oob sample - uint32_t idx = 0; - while(idx < tree.size()) - { - auto feature_value = fe.extract_feature_value(x[i], tree[idx].split_feature); - if (feature_value < tree[idx].split_threshold) - idx = tree[idx].left; - else - idx = tree[idx].right; - } - oob_values[i] += leaves[idx-tree.size()]; - } - } - } - }; - - if (verbose) - parallel_for_verbose(0, num_trees, build_tree); - else - parallel_for(0, num_trees, build_tree); - - - if (compute_oob_values) - { - double meanval = 0; - double cnt = 0; - for (size_t i = 0; i < oob_values.size(); ++i) - { - if (oob_hits[i] != 0) - { - oob_values[i] /= oob_hits[i]; - meanval += oob_values[i]; - ++cnt; - } - } - - // If there are some elements that didn't get hits, we set their oob values - // to the mean oob value. - if (cnt != 0) - { - const double typical_value = meanval/cnt; - for (size_t i = 0; i < oob_values.size(); ++i) - { - if (oob_hits[i] == 0) - oob_values[i] = typical_value; - } - } - } - - return trained_function_type(std::move(fe), std::move(all_trees), std::move(all_leaves)); - } - - struct range_t - { - range_t( - double sumy, - uint32_t begin, - uint32_t end - ) : sumy(sumy), begin(begin), end(end), tree_idx(0) {} - - double sumy; - uint32_t begin; - uint32_t end; - - // Every range object corresponds to an entry in a tree. This tells you the - // tree node that owns the range. - uint32_t tree_idx; - - uint32_t size() const { return end-begin; } - double avg() const { return sumy/size(); } - }; - - struct best_split_details - { - double score = -std::numeric_limits<double>::infinity(); - double left_sum; - double right_sum; - uint32_t split_idx; - double split_threshold; - typename feature_extractor::feature split_feature; - - bool operator < (const best_split_details& rhs) const - { - return score < rhs.score; - } - }; - - static best_split_details find_best_split ( - const range_t& range, - const std::vector<double>& y, - const std::vector<std::pair<float,uint32_t>>& idxs - ) - /*! - requires - - max(mat(idxs)) < y.size() - - range.sumy == sum of y[idxs[j].second] for all valid j in range [range.begin, range.end). - ensures - - finds a threshold T such that there exists an i satisfying the following: - - y[idxs[j].second] < T for all j <= i - - y[idxs[j].second] > T for all j > i - Therefore, the threshold T partitions the contents of y into two groups, - relative to the ordering established by idxs. Moreover the partitioning - of y values into two groups has the additional requirement that it is - optimal in the sense that the sum of the squared deviations from each - partition's mean is minimized. - !*/ - { - - size_t best_i = range.begin; - double best_score = -1; - double left_sum = 0; - double best_left_sum = y[idxs[range.begin].second]; - const auto size = range.size(); - size_t left_size = 0; - for (size_t i = range.begin; i+1 < range.end; ++i) - { - ++left_size; - left_sum += y[idxs[i].second]; - - // Don't split here because the next element has the same feature value so - // we can't *really* split here. - if (idxs[i].first==idxs[i+1].first) - continue; - - const double right_sum = range.sumy-left_sum; - - const double score = left_sum*left_sum/left_size + right_sum*right_sum/(size-left_size); - - if (score > best_score) - { - best_score = score; - best_i = i; - best_left_sum = left_sum; - } - } - - best_split_details result; - result.score = best_score; - result.left_sum = best_left_sum; - result.right_sum = range.sumy-best_left_sum; - result.split_idx = best_i+1; // one past the end of the left range - result.split_threshold = (idxs[best_i].first+idxs[best_i+1].first)/2; - - return result; - } - - - static best_split_details find_best_split_among_feats( - const feature_extractor& fe, - const range_t& range, - const std::vector<typename feature_extractor::feature>& feats, - const std::vector<sample_type>& x, - const std::vector<double>& y, - std::vector<std::pair<float,uint32_t>>& idxs - ) - { - auto compare_first = [](const std::pair<float,uint32_t>& a, const std::pair<float,uint32_t>& b) { return a.first<b.first; }; - best_split_details best; - for (auto& feat : feats) - { - // Extract feature values for this feature and sort the indexes based on - // that feature so we can then find the best split. - for (auto i = range.begin; i < range.end; ++i) - idxs[i].first = fe.extract_feature_value(x[idxs[i].second], feat); - - std::sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first); - - auto split = find_best_split(range, y, idxs); - - if (best < split) - { - best = split; - best.split_feature = feat; - } - } - - // resort idxs based on winning feat - for (auto i = range.begin; i < range.end; ++i) - idxs[i].first = fe.extract_feature_value(x[idxs[i].second], best.split_feature); - std::sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first); - - return best; - } - - std::string random_seed; - size_t num_trees = 1000; - double feature_subsampling_frac = 1.0/3.0; - size_t min_samples_per_leaf = 5; - feature_extractor_type fe_; - bool verbose = false; - }; - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_RANdOM_FOREST_REGRESSION_H_ - - |