diff options
Diffstat (limited to '')
-rw-r--r-- | ml/dlib/dlib/svm/reduced.h | 613 |
1 files changed, 613 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/reduced.h b/ml/dlib/dlib/svm/reduced.h new file mode 100644 index 000000000..b4c5b63ca --- /dev/null +++ b/ml/dlib/dlib/svm/reduced.h @@ -0,0 +1,613 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_REDUCEd_TRAINERS_ +#define DLIB_REDUCEd_TRAINERS_ + +#include "reduced_abstract.h" +#include "../matrix.h" +#include "../algs.h" +#include "function.h" +#include "kernel.h" +#include "kcentroid.h" +#include "linearly_independent_subset_finder.h" +#include "../optimization.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename trainer_type + > + class reduced_decision_function_trainer + { + public: + typedef typename trainer_type::kernel_type kernel_type; + typedef typename trainer_type::scalar_type scalar_type; + typedef typename trainer_type::sample_type sample_type; + typedef typename trainer_type::mem_manager_type mem_manager_type; + typedef typename trainer_type::trained_function_type trained_function_type; + + reduced_decision_function_trainer ( + ) :num_bv(0) {} + + reduced_decision_function_trainer ( + const trainer_type& trainer_, + const unsigned long num_sb_ + ) : + trainer(trainer_), + num_bv(num_sb_) + { + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0, + "\t reduced_decision_function_trainer()" + << "\n\t you have given invalid arguments to this function" + << "\n\t num_bv: " << num_bv + ); + } + + template < + typename in_sample_vector_type, + typename in_scalar_vector_type + > + const decision_function<kernel_type> train ( + const in_sample_vector_type& x, + const in_scalar_vector_type& y + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0, + "\t reduced_decision_function_trainer::train(x,y)" + << "\n\t You have tried to use an uninitialized version of this object" + << "\n\t num_bv: " << num_bv ); + return do_train(mat(x), mat(y)); + } + + private: + + // ------------------------------------------------------------------------------------ + + template < + typename in_sample_vector_type, + typename in_scalar_vector_type + > + const decision_function<kernel_type> do_train ( + const in_sample_vector_type& x, + const in_scalar_vector_type& y + ) const + { + // get the decision function object we are going to try and approximate + const decision_function<kernel_type>& dec_funct = trainer.train(x,y); + + // now find a linearly independent subset of the training points of num_bv points. + linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv); + fill_lisf(lisf, x); + + // The next few statements just find the best weights with which to approximate + // the dec_funct object with the smaller set of vectors in the lisf dictionary. This + // is really just a simple application of some linear algebra. For the details + // see page 554 of Learning with kernels by Scholkopf and Smola where they talk + // about "Optimal Expansion Coefficients." + + const kernel_type kern(dec_funct.kernel_function); + + matrix<scalar_type,0,1,mem_manager_type> alpha; + + alpha = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha); + + decision_function<kernel_type> new_df(alpha, + 0, + kern, + lisf.get_dictionary()); + + // now we have to figure out what the new bias should be. It might be a little + // different since we just messed with all the weights and vectors. + double bias = 0; + for (long i = 0; i < x.nr(); ++i) + { + bias += new_df(x(i)) - dec_funct(x(i)); + } + + new_df.b = bias/x.nr(); + + return new_df; + } + + // ------------------------------------------------------------------------------------ + + trainer_type trainer; + unsigned long num_bv; + + + }; // end of class reduced_decision_function_trainer + + template <typename trainer_type> + const reduced_decision_function_trainer<trainer_type> reduced ( + const trainer_type& trainer, + const unsigned long num_bv + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0, + "\tconst reduced_decision_function_trainer reduced()" + << "\n\t you have given invalid arguments to this function" + << "\n\t num_bv: " << num_bv + ); + + return reduced_decision_function_trainer<trainer_type>(trainer, num_bv); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace red_impl + { + + // ------------------------------------------------------------------------------------ + + template <typename kernel_type> + class objective + { + /* + This object represents the objective function we will try to + minimize in approximate_distance_function(). + + The objective is the distance, in kernel induced feature space, between + the original distance function and the approximated version. + + */ + typedef typename kernel_type::scalar_type scalar_type; + typedef typename kernel_type::sample_type sample_type; + typedef typename kernel_type::mem_manager_type mem_manager_type; + public: + objective( + const distance_function<kernel_type>& dist_funct_, + matrix<scalar_type,0,1,mem_manager_type>& b_, + matrix<sample_type,0,1,mem_manager_type>& out_vectors_ + ) : + dist_funct(dist_funct_), + b(b_), + out_vectors(out_vectors_) + { + } + + const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector ( + ) const + /*! + ensures + - returns a vector that contains all the information necessary to + reproduce the current state of the approximated distance function + !*/ + { + matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr()); + long i = 0; + for (long j = 0; j < b.nr(); ++j) + { + z(i) = b(j); + ++i; + } + + for (long j = 0; j < out_vectors.size(); ++j) + { + for (long k = 0; k < out_vectors(j).size(); ++k) + { + z(i) = out_vectors(j)(k); + ++i; + } + } + return z; + } + + + void vector_to_state ( + const matrix<scalar_type, 0, 1, mem_manager_type>& z + ) const + /*! + requires + - z came from the state_to_vector() function or has a compatible format + ensures + - loads the vector z into the state variables of the approximate + distance function (i.e. b and out_vectors) + !*/ + { + long i = 0; + for (long j = 0; j < b.nr(); ++j) + { + b(j) = z(i); + ++i; + } + + for (long j = 0; j < out_vectors.size(); ++j) + { + for (long k = 0; k < out_vectors(j).size(); ++k) + { + out_vectors(j)(k) = z(i); + ++i; + } + } + } + + double operator() ( + const matrix<scalar_type, 0, 1, mem_manager_type>& z + ) const + /*! + ensures + - loads the current approximate distance function with z + - returns the distance between the original distance function + and the approximate one. + !*/ + { + vector_to_state(z); + const kernel_type k(dist_funct.get_kernel()); + + double temp = 0; + for (long i = 0; i < out_vectors.size(); ++i) + { + for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j) + { + temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j)); + } + } + + temp *= 2; + + for (long i = 0; i < out_vectors.size(); ++i) + { + for (long j = 0; j < out_vectors.size(); ++j) + { + temp += b(i)*b(j)*k(out_vectors(i), out_vectors(j)); + } + } + + return temp + dist_funct.get_squared_norm(); + } + + private: + + const distance_function<kernel_type>& dist_funct; + matrix<scalar_type,0,1,mem_manager_type>& b; + matrix<sample_type,0,1,mem_manager_type>& out_vectors; + + }; + + // ------------------------------------------------------------------------------------ + + template <typename kernel_type> + class objective_derivative + { + /*! + This object represents the derivative of the objective object + !*/ + typedef typename kernel_type::scalar_type scalar_type; + typedef typename kernel_type::sample_type sample_type; + typedef typename kernel_type::mem_manager_type mem_manager_type; + public: + + + objective_derivative( + const distance_function<kernel_type>& dist_funct_, + matrix<scalar_type,0,1,mem_manager_type>& b_, + matrix<sample_type,0,1,mem_manager_type>& out_vectors_ + ) : + dist_funct(dist_funct_), + b(b_), + out_vectors(out_vectors_) + { + } + + void vector_to_state ( + const matrix<scalar_type, 0, 1, mem_manager_type>& z + ) const + /*! + requires + - z came from the state_to_vector() function or has a compatible format + ensures + - loads the vector z into the state variables of the approximate + distance function (i.e. b and out_vectors) + !*/ + { + long i = 0; + for (long j = 0; j < b.nr(); ++j) + { + b(j) = z(i); + ++i; + } + + for (long j = 0; j < out_vectors.size(); ++j) + { + for (long k = 0; k < out_vectors(j).size(); ++k) + { + out_vectors(j)(k) = z(i); + ++i; + } + } + } + + const matrix<scalar_type,0,1,mem_manager_type>& operator() ( + const matrix<scalar_type, 0, 1, mem_manager_type>& z + ) const + /*! + ensures + - loads the current approximate distance function with z + - returns the derivative of the distance between the original + distance function and the approximate one. + !*/ + { + vector_to_state(z); + res.set_size(z.nr()); + set_all_elements(res,0); + const kernel_type k(dist_funct.get_kernel()); + const kernel_derivative<kernel_type> K_der(k); + + // first compute the gradient for the beta weights + for (long i = 0; i < out_vectors.size(); ++i) + { + for (long j = 0; j < out_vectors.size(); ++j) + { + res(i) += b(j)*k(out_vectors(i), out_vectors(j)); + } + } + for (long i = 0; i < out_vectors.size(); ++i) + { + for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j) + { + res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j)); + } + } + + + // now compute the gradient of the actual vectors that go into the kernel functions + long pos = out_vectors.size(); + const long num = out_vectors(0).nr(); + temp.set_size(num,1); + for (long i = 0; i < out_vectors.size(); ++i) + { + set_all_elements(temp,0); + for (long j = 0; j < out_vectors.size(); ++j) + { + temp += b(j)*K_der(out_vectors(j), out_vectors(i)); + } + for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j) + { + temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) ); + } + + // store the gradient for out_vectors(i) into result in the proper spot + set_subm(res,pos,0,num,1) = b(i)*temp; + pos += num; + } + + + res *= 2; + return res; + } + + private: + + mutable matrix<scalar_type, 0, 1, mem_manager_type> res; + mutable sample_type temp; + + const distance_function<kernel_type>& dist_funct; + matrix<scalar_type,0,1,mem_manager_type>& b; + matrix<sample_type,0,1,mem_manager_type>& out_vectors; + + }; + + // ------------------------------------------------------------------------------------ + + } + + template < + typename K, + typename stop_strategy_type, + typename T + > + distance_function<K> approximate_distance_function ( + stop_strategy_type stop_strategy, + const distance_function<K>& target, + const T& starting_basis + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(target.get_basis_vectors().size() > 0 && + starting_basis.size() > 0, + "\t distance_function approximate_distance_function()" + << "\n\t Invalid inputs were given to this function." + << "\n\t target.get_basis_vectors().size(): " << target.get_basis_vectors().size() + << "\n\t starting_basis.size(): " << starting_basis.size() + ); + + using namespace red_impl; + // The next few statements just find the best weights with which to approximate + // the target object with the set of basis vectors in starting_basis. This + // is really just a simple application of some linear algebra. For the details + // see page 554 of Learning with kernels by Scholkopf and Smola where they talk + // about "Optimal Expansion Coefficients." + + const K kern(target.get_kernel()); + typedef typename K::scalar_type scalar_type; + typedef typename K::sample_type sample_type; + typedef typename K::mem_manager_type mem_manager_type; + + matrix<scalar_type,0,1,mem_manager_type> beta; + + // Now we compute the fist approximate distance function. + beta = pinv(kernel_matrix(kern,starting_basis)) * + (kernel_matrix(kern,starting_basis,target.get_basis_vectors())*target.get_alpha()); + matrix<sample_type,0,1,mem_manager_type> out_vectors(mat(starting_basis)); + + + // Now setup to do a global optimization of all the parameters in the approximate + // distance function. + const objective<K> obj(target, beta, out_vectors); + const objective_derivative<K> obj_der(target, beta, out_vectors); + matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector()); + + + // perform a full optimization of all the parameters (i.e. both beta and the basis vectors together) + find_min(lbfgs_search_strategy(20), + stop_strategy, + obj, obj_der, opt_starting_point, 0); + + // now make sure that the final optimized value is loaded into the beta and + // out_vectors matrices + obj.vector_to_state(opt_starting_point); + + // Do a final reoptimization of beta just to make sure it is optimal given the new + // set of basis vectors. + beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,target.get_basis_vectors())*target.get_alpha()); + + // It is possible that some of the beta weights will be very close to zero. Lets remove + // the basis vectors with these essentially zero weights. + const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon(); + for (long i = 0; i < beta.size(); ++i) + { + // if beta(i) is zero (but leave at least one beta no matter what) + if (std::abs(beta(i)) < eps && beta.size() > 1) + { + beta = remove_row(beta, i); + out_vectors = remove_row(out_vectors, i); + --i; + } + } + + return distance_function<K>(beta, kern, out_vectors); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename trainer_type + > + class reduced_decision_function_trainer2 + { + public: + typedef typename trainer_type::kernel_type kernel_type; + typedef typename trainer_type::scalar_type scalar_type; + typedef typename trainer_type::sample_type sample_type; + typedef typename trainer_type::mem_manager_type mem_manager_type; + typedef typename trainer_type::trained_function_type trained_function_type; + + reduced_decision_function_trainer2 () : num_bv(0) {} + reduced_decision_function_trainer2 ( + const trainer_type& trainer_, + const long num_sb_, + const double eps_ = 1e-3 + ) : + trainer(trainer_), + num_bv(num_sb_), + eps(eps_) + { + COMPILE_TIME_ASSERT(is_matrix<sample_type>::value); + + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0 && eps > 0, + "\t reduced_decision_function_trainer2()" + << "\n\t you have given invalid arguments to this function" + << "\n\t num_bv: " << num_bv + << "\n\t eps: " << eps + ); + } + + template < + typename in_sample_vector_type, + typename in_scalar_vector_type + > + const decision_function<kernel_type> train ( + const in_sample_vector_type& x, + const in_scalar_vector_type& y + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0, + "\t reduced_decision_function_trainer2::train(x,y)" + << "\n\t You have tried to use an uninitialized version of this object" + << "\n\t num_bv: " << num_bv ); + return do_train(mat(x), mat(y)); + } + + private: + + template < + typename in_sample_vector_type, + typename in_scalar_vector_type + > + const decision_function<kernel_type> do_train ( + const in_sample_vector_type& x, + const in_scalar_vector_type& y + ) const + { + // get the decision function object we are going to try and approximate + const decision_function<kernel_type>& dec_funct = trainer.train(x,y); + const kernel_type kern(dec_funct.kernel_function); + + // now find a linearly independent subset of the training points of num_bv points. + linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv); + fill_lisf(lisf,x); + + distance_function<kernel_type> approx, target; + target = dec_funct; + approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, lisf); + + decision_function<kernel_type> new_df(approx.get_alpha(), + 0, + kern, + approx.get_basis_vectors()); + + // now we have to figure out what the new bias should be. It might be a little + // different since we just messed with all the weights and vectors. + double bias = 0; + for (long i = 0; i < x.nr(); ++i) + { + bias += new_df(x(i)) - dec_funct(x(i)); + } + + new_df.b = bias/x.nr(); + + return new_df; + + } + + // ------------------------------------------------------------------------------------ + + trainer_type trainer; + long num_bv; + double eps; + + + }; // end of class reduced_decision_function_trainer2 + + template <typename trainer_type> + const reduced_decision_function_trainer2<trainer_type> reduced2 ( + const trainer_type& trainer, + const long num_bv, + double eps = 1e-3 + ) + { + COMPILE_TIME_ASSERT(is_matrix<typename trainer_type::sample_type>::value); + + // make sure requires clause is not broken + DLIB_ASSERT(num_bv > 0 && eps > 0, + "\tconst reduced_decision_function_trainer2 reduced2()" + << "\n\t you have given invalid arguments to this function" + << "\n\t num_bv: " << num_bv + << "\n\t eps: " << eps + ); + + return reduced_decision_function_trainer2<trainer_type>(trainer, num_bv, eps); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_REDUCEd_TRAINERS_ + |