Diffstat (limited to 'ml/dlib/dlib/svm/svm_c_linear_trainer.h')
-rw-r--r-- | ml/dlib/dlib/svm/svm_c_linear_trainer.h | 706 |
1 file changed, 706 insertions, 0 deletions
diff --git a/ml/dlib/dlib/svm/svm_c_linear_trainer.h b/ml/dlib/dlib/svm/svm_c_linear_trainer.h
new file mode 100644
index 000000000..8d136d711
--- /dev/null
+++ b/ml/dlib/dlib/svm/svm_c_linear_trainer.h
@@ -0,0 +1,706 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SVM_C_LiNEAR_TRAINER_Hh_
+#define DLIB_SVM_C_LiNEAR_TRAINER_Hh_
+
+#include "svm_c_linear_trainer_abstract.h"
+#include "../algs.h"
+#include "../optimization.h"
+#include "../matrix.h"
+#include "function.h"
+#include "kernel.h"
+#include <iostream>
+#include <vector>
+#include "sparse_vector.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type,
+        typename in_sample_vector_type,
+        typename in_scalar_vector_type
+        >
+    class oca_problem_c_svm : public oca_problem<matrix_type>
+    {
+    public:
+        /*
+            This class is used as part of the implementation of the svm_c_linear_trainer
+            defined towards the end of this file.
+
+            The bias parameter is dealt with by imagining that each sample vector has -1
+            as its last element.
+        */
+
+        typedef typename matrix_type::type scalar_type;
+
+        oca_problem_c_svm(
+            const scalar_type C_pos,
+            const scalar_type C_neg,
+            const in_sample_vector_type& samples_,
+            const in_scalar_vector_type& labels_,
+            const bool be_verbose_,
+            const scalar_type eps_,
+            const unsigned long max_iter,
+            const unsigned long dims_
+        ) :
+            samples(samples_),
+            labels(labels_),
+            C(std::min(C_pos,C_neg)),
+            Cpos(C_pos/C),
+            Cneg(C_neg/C),
+            be_verbose(be_verbose_),
+            eps(eps_),
+            max_iterations(max_iter),
+            dims(dims_)
+        {
+            dot_prods.resize(samples.size());
+            is_first_call = true;
+        }
+
+        virtual scalar_type get_c (
+        ) const
+        {
+            return C;
+        }
+
+        virtual long get_num_dimensions (
+        ) const
+        {
+            // plus 1 for the bias term
+            return dims + 1;
+        }
+
+        virtual bool optimization_status (
+            scalar_type current_objective_value,
+            scalar_type current_error_gap,
+            scalar_type current_risk_value,
+            scalar_type current_risk_gap,
+            unsigned long num_cutting_planes,
+            unsigned long num_iterations
+        ) const
+        {
+            if (be_verbose)
+            {
+                using namespace std;
+                cout << "objective:     " << current_objective_value << endl;
+                cout << "objective gap: " << current_error_gap << endl;
+                cout << "risk:          " << current_risk_value << endl;
+                cout << "risk gap:      " << current_risk_gap << endl;
+                cout << "num planes:    " << num_cutting_planes << endl;
+                cout << "iter:          " << num_iterations << endl;
+                cout << endl;
+            }
+
+            if (num_iterations >= max_iterations)
+                return true;
+
+            if (current_risk_gap < eps)
+                return true;
+
+            return false;
+        }
+
+        virtual bool risk_has_lower_bound (
+            scalar_type& lower_bound
+        ) const
+        {
+            lower_bound = 0;
+            return true;
+        }
+
+        virtual void get_risk (
+            matrix_type& w,
+            scalar_type& risk,
+            matrix_type& subgradient
+        ) const
+        {
+            line_search(w);
+
+            subgradient.set_size(w.size(),1);
+            subgradient = 0;
+            risk = 0;
+
+            // loop over all the samples and compute the risk and its subgradient at the
+            // current solution point w
+            for (long i = 0; i < samples.size(); ++i)
+            {
+                // multiply the current SVM output for the ith sample by its label
+                const scalar_type df_val = labels(i)*dot_prods[i];
+
+                if (labels(i) > 0)
+                    risk += Cpos*std::max<scalar_type>(0.0,1 - df_val);
+                else
+                    risk += Cneg*std::max<scalar_type>(0.0,1 - df_val);
+
+                if (df_val < 1)
+                {
+                    if (labels(i) > 0)
+                    {
+                        subtract_from(subgradient, samples(i), Cpos);
+
+                        subgradient(subgradient.size()-1) += Cpos;
+                    }
+                    else
+                    {
+                        add_to(subgradient, samples(i), Cneg);
+
+                        subgradient(subgradient.size()-1) -= Cneg;
+                    }
+                }
+            }
+
+            scalar_type scale = 1.0/samples.size();
+
+            risk *= scale;
+            subgradient = scale*subgradient;
+        }
+
+    private:
+
+    // -----------------------------------------------------
+    // -----------------------------------------------------
+
+        void line_search (
+            matrix_type& w
+        ) const
+        /*!
+            ensures
+                - does a line search to find a better w
+                - for all i: #dot_prods[i] == dot(colm(#w,0,w.size()-1), samples(i)) - #w(w.size()-1)
+        !*/
+        {
+            // The reason for using w_size_m1 and not just w.size()-1 is because doing it
+            // this way avoids an inane warning from gcc that can occur in some cases.
+            const long w_size_m1 = w.size()-1;
+            for (long i = 0; i < samples.size(); ++i)
+                dot_prods[i] = dot(colm(w,0,w_size_m1), samples(i)) - w(w_size_m1);
+
+            if (is_first_call)
+            {
+                is_first_call = false;
+                best_so_far = w;
+                dot_prods_best = dot_prods;
+            }
+            else
+            {
+                // Do a line search going from best_so_far to w and store the result in w.
+                // Here we use the line search algorithm presented in section 3.1.1 of
+                // Franc and Sonnenburg.
+
+                const scalar_type A0 = length_squared(best_so_far - w);
+                const scalar_type BB0 = dot(best_so_far, w - best_so_far);
+
+                const scalar_type scale_pos = (get_c()*Cpos)/samples.size();
+                const scalar_type scale_neg = (get_c()*Cneg)/samples.size();
+
+                ks.clear();
+                ks.reserve(samples.size());
+
+                scalar_type f0 = BB0;
+                for (long i = 0; i < samples.size(); ++i)
+                {
+                    const scalar_type& scale = (labels(i)>0) ? scale_pos : scale_neg;
+
+                    const scalar_type B = scale*labels(i) * (dot_prods_best[i] - dot_prods[i]);
+                    const scalar_type C = scale*(1 - labels(i)*dot_prods_best[i]);
+                    // Note that if B is 0 then it doesn't matter what k is set to, so 0 is fine.
+                    scalar_type k = 0;
+                    if (B != 0)
+                        k = -C/B;
+
+                    if (k > 0)
+                        ks.push_back(helper(k, std::abs(B)));
+
+                    if ( (B < 0 && k > 0) || (B > 0 && k <= 0) )
+                        f0 += B;
+                }
+
+                scalar_type opt_k = 1;
+                // ks.size() == 0 shouldn't happen, but check anyway.
+                if (f0 >= 0 || ks.size() == 0)
+                {
+                    // Getting here means that we aren't searching in a descent direction.
+                    // We could take a zero step, but instead let's just assign w to the new
+                    // best-so-far point to make sure we don't get stuck coming back to this
+                    // case over and over.  This might happen if we never move the best
+                    // point seen so far.
+
+                    // So we let opt_k be 1.
+                }
+                else
+                {
+                    std::sort(ks.begin(), ks.end());
+
+                    // figure out where f0 goes positive
+                    for (unsigned long i = 0; i < ks.size(); ++i)
+                    {
+                        f0 += ks[i].B;
+                        if (f0 + A0*ks[i].k >= 0)
+                        {
+                            opt_k = ks[i].k;
+                            break;
+                        }
+                    }
+
+                }
+
+                // Don't let the step size get too big.  Otherwise we might pick huge steps
+                // over and over that don't improve the cutting plane approximation.
+                if (opt_k > 1.0)
+                {
+                    opt_k = 1.0;
+                }
+
+                // take the step suggested by the line search
+                best_so_far = (1-opt_k)*best_so_far + opt_k*w;
+
+                // update the best_so_far dot products
+                for (unsigned long i = 0; i < dot_prods_best.size(); ++i)
+                    dot_prods_best[i] = (1-opt_k)*dot_prods_best[i] + opt_k*dot_prods[i];
+
+                const scalar_type mu = 0.1;
+                // Make sure we always take a little bit of a step towards w regardless of
+                // what the line search says to do.  We do this since it is possible that
+                // some steps won't advance the best_so_far point, so this ensures we
+                // always make some progress each iteration.
+                w = (1-mu)*best_so_far + mu*w;
+
+                // update the dot products
+                for (unsigned long i = 0; i < dot_prods.size(); ++i)
+                    dot_prods[i] = (1-mu)*dot_prods_best[i] + mu*dot_prods[i];
+            }
+        }
+
+        struct helper
+        {
+            helper(scalar_type k_, scalar_type B_) : k(k_), B(B_) {}
+            scalar_type k;
+            scalar_type B;
+
+            bool operator< (const helper& item) const { return k < item.k; }
+        };
+
+        mutable std::vector<helper> ks;
+
+        mutable bool is_first_call;
+        mutable std::vector<scalar_type> dot_prods;
+
+        mutable matrix_type best_so_far;  // best w seen so far
+        mutable std::vector<scalar_type> dot_prods_best; // dot products between best_so_far and samples
+
+        const in_sample_vector_type& samples;
+        const in_scalar_vector_type& labels;
+        const scalar_type C;
+        const scalar_type Cpos;
+        const scalar_type Cneg;
+
+        const bool be_verbose;
+        const scalar_type eps;
+        const unsigned long max_iterations;
+        const unsigned long dims;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type,
+        typename in_sample_vector_type,
+        typename in_scalar_vector_type,
+        typename scalar_type
+        >
+    oca_problem_c_svm<matrix_type, in_sample_vector_type, in_scalar_vector_type> make_oca_problem_c_svm (
+        const scalar_type C_pos,
+        const scalar_type C_neg,
+        const in_sample_vector_type& samples,
+        const in_scalar_vector_type& labels,
+        const bool be_verbose,
+        const scalar_type eps,
+        const unsigned long max_iterations,
+        const unsigned long dims
+    )
+    {
+        return oca_problem_c_svm<matrix_type, in_sample_vector_type, in_scalar_vector_type>(
+            C_pos, C_neg, samples, labels, be_verbose, eps, max_iterations, dims);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename K
+        >
+    class svm_c_linear_trainer
+    {
+
+    public:
+        typedef K kernel_type;
+        typedef typename kernel_type::scalar_type scalar_type;
+        typedef typename kernel_type::sample_type sample_type;
+        typedef typename kernel_type::mem_manager_type mem_manager_type;
+        typedef decision_function<kernel_type> trained_function_type;
+
+        // You are getting a compiler error on this line because you supplied a non-linear
+        // kernel to the svm_c_linear_trainer object.  You have to use one of the linear
+        // kernels with this trainer.
+        COMPILE_TIME_ASSERT((is_same_type<K, linear_kernel<sample_type> >::value ||
+                             is_same_type<K, sparse_linear_kernel<sample_type> >::value ));
+
+        svm_c_linear_trainer (
+        )
+        {
+            Cpos = 1;
+            Cneg = 1;
+            verbose = false;
+            eps = 0.001;
+            max_iterations = 10000;
+            learn_nonnegative_weights = false;
+            last_weight_1 = false;
+        }
+
+        explicit svm_c_linear_trainer (
+            const scalar_type& C
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(C > 0,
+                "\t svm_c_linear_trainer::svm_c_linear_trainer()"
+                << "\n\t C must be greater than 0"
+                << "\n\t C:    " << C
+                << "\n\t this: " << this
+                );
+
+            Cpos = C;
+            Cneg = C;
+            verbose = false;
+            eps = 0.001;
+            max_iterations = 10000;
+            learn_nonnegative_weights = false;
+            last_weight_1 = false;
+        }
+
+        void set_epsilon (
+            scalar_type eps_
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(eps_ > 0,
+                "\t void svm_c_linear_trainer::set_epsilon()"
+                << "\n\t eps_ must be greater than 0"
+                << "\n\t eps_: " << eps_
+                << "\n\t this: " << this
+                );
+
+            eps = eps_;
+        }
+
+        const scalar_type get_epsilon (
+        ) const { return eps; }
+
+        unsigned long get_max_iterations (
+        ) const { return max_iterations; }
+
+        void set_max_iterations (
+            unsigned long max_iter
+        )
+        {
+            max_iterations = max_iter;
+        }
+
+        void be_verbose (
+        )
+        {
+            verbose = true;
+        }
+
+        void be_quiet (
+        )
+        {
+            verbose = false;
+        }
+
+        void set_oca (
+            const oca& item
+        )
+        {
+            solver = item;
+        }
+
+        const oca get_oca (
+        ) const
+        {
+            return solver;
+        }
+
+        const kernel_type get_kernel (
+        ) const
+        {
+            return kernel_type();
+        }
+
+        bool learns_nonnegative_weights (
+        ) const { return learn_nonnegative_weights; }
+
+        void set_learns_nonnegative_weights (
+            bool value
+        )
+        {
+            learn_nonnegative_weights = value;
+            if (learn_nonnegative_weights)
+                prior.set_size(0);
+        }
+
+        bool forces_last_weight_to_1 (
+        ) const
+        {
+            return last_weight_1;
+        }
+
+        void force_last_weight_to_1 (
+            bool should_last_weight_be_1
+        )
+        {
+            last_weight_1 = should_last_weight_be_1;
+            if (last_weight_1)
+                prior.set_size(0);
+        }
+
+        void set_prior (
+            const trained_function_type& prior_
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(prior_.basis_vectors.size() == 1 &&
+                        prior_.alpha(0) == 1,
+                "\t void svm_c_linear_trainer::set_prior()"
+                << "\n\t The supplied prior could not have been created by this object's train() method."
+                << "\n\t prior_.basis_vectors.size(): " << prior_.basis_vectors.size()
+                << "\n\t prior_.alpha(0):             " << prior_.alpha(0)
+                << "\n\t this: " << this
+                );
+
+            prior = sparse_to_dense(prior_.basis_vectors(0));
+            prior_b = prior_.b;
+            learn_nonnegative_weights = false;
+            last_weight_1 = false;
+        }
+
+        bool has_prior (
+        ) const
+        {
+            return prior.size() != 0;
+        }
+
+        void set_c (
+            scalar_type C
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(C > 0,
+                "\t void svm_c_linear_trainer::set_c()"
+                << "\n\t C must be greater than 0"
+                << "\n\t C:    " << C
+                << "\n\t this: " << this
+                );
+
+            Cpos = C;
+            Cneg = C;
+        }
+
+        const scalar_type get_c_class1 (
+        ) const
+        {
+            return Cpos;
+        }
+
+        const scalar_type get_c_class2 (
+        ) const
+        {
+            return Cneg;
+        }
+
+        void set_c_class1 (
+            scalar_type C
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(C > 0,
+                "\t void svm_c_linear_trainer::set_c_class1()"
+                << "\n\t C must be greater than 0"
+                << "\n\t C:    " << C
+                << "\n\t this: " << this
+                );
+
+            Cpos = C;
+        }
+
+        void set_c_class2 (
+            scalar_type C
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(C > 0,
+                "\t void svm_c_linear_trainer::set_c_class2()"
+                << "\n\t C must be greater than 0"
+                << "\n\t C:    " << C
+                << "\n\t this: " << this
+                );
+
+            Cneg = C;
+        }
+
+        template <
+            typename in_sample_vector_type,
+            typename in_scalar_vector_type
+            >
+        const decision_function<kernel_type> train (
+            const in_sample_vector_type& x,
+            const in_scalar_vector_type& y
+        ) const
+        {
+            scalar_type obj;
+            return do_train(mat(x),mat(y),obj);
+        }
+
+        template <
+            typename in_sample_vector_type,
+            typename in_scalar_vector_type
+            >
+        const decision_function<kernel_type> train (
+            const in_sample_vector_type& x,
+            const in_scalar_vector_type& y,
+            scalar_type& svm_objective
+        ) const
+        {
+            return do_train(mat(x),mat(y),svm_objective);
+        }
+
+    private:
+
+        template <
+            typename in_sample_vector_type,
+            typename in_scalar_vector_type
+            >
+        const decision_function<kernel_type> do_train (
+            const in_sample_vector_type& x,
+            const in_scalar_vector_type& y,
+            scalar_type& svm_objective
+        ) const
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(is_learning_problem(x,y) == true,
+                "\t decision_function svm_c_linear_trainer::train(x,y)"
+                << "\n\t invalid inputs were given to this function"
+                << "\n\t x.nr(): " << x.nr()
+                << "\n\t y.nr(): " << y.nr()
+                << "\n\t x.nc(): " << x.nc()
+                << "\n\t y.nc(): " << y.nc()
+                << "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
+                );
+#ifdef ENABLE_ASSERTS
+            for (long i = 0; i < x.size(); ++i)
+            {
+                DLIB_ASSERT(y(i) == +1 || y(i) == -1,
+                    "\t decision_function svm_c_linear_trainer::train(x,y)"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t y("<<i<<"): " << y(i)
+                    );
+            }
+#endif
+
+            typedef matrix<scalar_type,0,1> w_type;
+            w_type w;
+
+            const unsigned long num_dims = max_index_plus_one(x);
+
+            unsigned long num_nonnegative = 0;
+            if (learn_nonnegative_weights)
+            {
+                num_nonnegative = num_dims;
+            }
+
+            unsigned long force_weight_1_idx = std::numeric_limits<unsigned long>::max();
+            if (last_weight_1)
+            {
+                force_weight_1_idx = num_dims-1;
+            }
+
+            if (has_prior())
+            {
+                if (is_matrix<sample_type>::value)
+                {
+                    // make sure requires clause is not broken
+                    DLIB_CASSERT(num_dims == (unsigned long)prior.size(),
+                        "\t decision_function svm_c_linear_trainer::train(x,y)"
+                        << "\n\t The dimension of the training vectors must match the dimension of"
+                        << "\n\t those used to create the prior."
+                        << "\n\t num_dims:     " << num_dims
+                        << "\n\t prior.size(): " << prior.size()
+                        );
+                }
+                const unsigned long dims = std::max(num_dims, (unsigned long)prior.size());
+                // In the case of sparse sample vectors, it is possible that the input
+                // vector dimensionality is larger than the prior vector dimensionality.
+                // We need to check for this case and pad prior with zeros if it is the
+                // case.
+                matrix<scalar_type,0,1> prior_temp = join_cols(join_cols(prior,
+                                                               zeros_matrix<scalar_type>(dims-prior.size(),1)),
+                                                               mat(prior_b));
+
+                svm_objective = solver(
+                    make_oca_problem_c_svm<w_type>(Cpos, Cneg, x, y, verbose, eps, max_iterations, dims),
+                    w,
+                    prior_temp);
+            }
+            else
+            {
+                svm_objective = solver(
+                    make_oca_problem_c_svm<w_type>(Cpos, Cneg, x, y, verbose, eps, max_iterations, num_dims),
+                    w,
+                    num_nonnegative,
+                    force_weight_1_idx);
+            }
+
+            // put the solution into a decision function and then return it
+            decision_function<kernel_type> df;
+            df.b = static_cast<scalar_type>(w(w.size()-1));
+            df.basis_vectors.set_size(1);
+            // Copy the plane normal into the output basis vector.  The output vector might
+            // be a sparse vector container, so we need to use this special kind of copy to
+            // handle that case.  As an aside, the reason for using max_index_plus_one() and
+            // not just w.size()-1 is because doing it this way avoids an inane warning from
+            // gcc that can occur in some cases.
+            const long out_size = max_index_plus_one(x);
+            assign(df.basis_vectors(0), matrix_cast<scalar_type>(colm(w, 0, out_size)));
+            df.alpha.set_size(1);
+            df.alpha(0) = 1;
+
+            return df;
+        }
+
+        scalar_type Cpos;
+        scalar_type Cneg;
+        oca solver;
+        scalar_type eps;
+        bool verbose;
+        unsigned long max_iterations;
+        bool learn_nonnegative_weights;
+        bool last_weight_1;
+        matrix<scalar_type,0,1> prior;
+        scalar_type prior_b = 0;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_SVM_C_LiNEAR_TRAINER_Hh_